summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Yves Fischer <yvesf-git@xapek.org> 2016-12-09 21:19:19 +0100
committer: Yves Fischer <yvesf-git@xapek.org> 2016-12-09 21:31:44 +0100
commit: 65a19f4c5fdeea6b4afa12d138f9319a063d618f (patch)
tree: b0a421f695349641aaababc2544447e0ea510108
parent: d1fd42b946734843415cd0bab3083679123153d0 (diff)
download: watchnews-65a19f4c5fdeea6b4afa12d138f9319a063d618f.tar.gz
          watchnews-65a19f4c5fdeea6b4afa12d138f9319a063d618f.zip
add --test to test newspaper3k text extraction [HEAD, master]
-rwxr-xr-x  watchnews-cli       5
-rwxr-xr-x  watchnews/fetch.py  14
2 files changed, 18 insertions, 1 deletions
diff --git a/watchnews-cli b/watchnews-cli
index f3c080a..b43d1c2 100755
--- a/watchnews-cli
+++ b/watchnews-cli
@@ -17,6 +17,8 @@ if __name__ == '__main__':
help='Update all known feeds')
parser.add_argument('--update-feed', type=int, metavar='ID',
help='Update feed with ID')
+ parser.add_argument('--test', type=str, metavar='URL',
+ help='Fetch URL with newspaper library and dump result for testing')
parser.add_argument('--remove', type=int, metavar='ID',
help='Remove feed with ID')
parser.add_argument('--list', action='store_true',
@@ -60,5 +62,6 @@ if __name__ == '__main__':
web.get_app().run(debug=args.web_debug)
if args.rss:
print(rss.rss())
-
+ if args.test:
+ fetch.dump_paper(args.test)
diff --git a/watchnews/fetch.py b/watchnews/fetch.py
index e6c5c39..c476be7 100755
--- a/watchnews/fetch.py
+++ b/watchnews/fetch.py
@@ -8,6 +8,20 @@ from newspaper.cleaners import DocumentCleaner
import logging
+def dump_paper(link):
+ paper = Article(link)
+ paper.download()
+ paper.parse()
+ print("""\
+Article Dump:
+Title: {title}
+Url: {url}
+Authors: {authors}
+Text: {text}
+-----
+
+""".format(**paper.__dict__))
+
def update(feed):
logging.info('Update %s', feed.url)
result = feedparser.parse(feed.url)