summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authoryvesf <yvesf-git@xapek.org>2010-11-26 23:59:59 +0100
committeryvesf <yvesf-git@xapek.org>2010-11-26 23:59:59 +0100
commit15f17b4fce5a1e4b70709aac569c18d8cf57deba (patch)
tree89eebcc47aebed71cc86eedfaee8cf894e47c07e
parent9122cfed447a643a8c383be5558fd82fc3be7cc7 (diff)
downloadbooksearch-15f17b4fce5a1e4b70709aac569c18d8cf57deba.tar.gz
booksearch-15f17b4fce5a1e4b70709aac569c18d8cf57deba.zip
add pdfminer for text extraction;
replaced all ' with " metadata hacking
-rw-r--r--INSTALL3
-rw-r--r--indexer.py97
-rw-r--r--web.py46
3 files changed, 111 insertions, 35 deletions
diff --git a/INSTALL b/INSTALL
index c2ff89f..9ec902c 100644
--- a/INSTALL
+++ b/INSTALL
@@ -3,8 +3,6 @@ TODO
clean html
consistent schema (_book _metadata)
? /usr/bin/convert ?
-check pdfminer for better text-extraction (whitespace)
-
------------------------
@@ -21,6 +19,7 @@ Install dependencies
pip install whoosh
pip install pypdf
pip install flask
+ pip install pdfminer
( Clone )
git clone http://xapek.org/~yvesf/repos/booksearch.git
diff --git a/indexer.py b/indexer.py
index 913591a..3ec1e8b 100644
--- a/indexer.py
+++ b/indexer.py
@@ -3,11 +3,90 @@
import os
import sys
import time
-import pyPdf
import whoosh.index as index
import whoosh.writing as writing
import whoosh.fields as fields
from compat import str_format
+import StringIO
+
+
+def pdf_extract_metadata(filepath):
+ from pdfminer.pdfparser import PDFParser, PDFDocument
+ from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager
+ from pdfminer.converter import TextConverter
+ from pdfminer.layout import LAParams
+ from lxml import etree
+
+ outbuf = StringIO.StringIO()
+ rsrcmgr = PDFResourceManager()
+ device = TextConverter(rsrcmgr, outbuf, "utf-8", laparams=LAParams())
+ parser = PDFParser(file(filepath, "rb"))
+ doc = PDFDocument()
+
+ parser.set_document(doc)
+ doc.set_parser(parser)
+ doc.initialize("")
+
+ namespaces={
+ "dc":"http://purl.org/dc/elements/1.1/",
+ "rdf":"http://www.w3.org/1999/02/22-rdf-syntax-ns#",
+ "pdf":"http://ns.adobe.com/pdf/1.3/", }
+
+ if doc.catalog.has_key("Metadata"):
+ obj_ref = doc.catalog["Metadata"]
+ obj_stream = obj_ref.resolve()
+ if obj_stream.attrs['Subtype'].name == "XML":
+ obj_data = obj_stream.get_data()
+ if obj_data.endswith("\nf"):
+ obj_data = obj_data[:-len("\nf")]
+ print obj_data
+ tree = etree.parse(StringIO.StringIO(obj_data))
+ print tree.xpath("//dc:title/rdf:Alt/rdf:li", namespaces=namespaces)[0].text
+ return obj_data
+ else:
+ return None
+
+def pdf_extract_text_pdfminer(filepath):
+ from pdfminer.pdfparser import PDFParser, PDFDocument
+ from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager
+ from pdfminer.converter import TextConverter
+ from pdfminer.layout import LAParams
+
+ outbuf = StringIO.StringIO()
+ rsrcmgr = PDFResourceManager()
+ device = TextConverter(rsrcmgr, outbuf, "utf-8", laparams=LAParams())
+ parser = PDFParser(file(filepath, "rb"))
+ doc = PDFDocument()
+
+ parser.set_document(doc)
+ doc.set_parser(parser)
+ doc.initialize("")
+
+ interpreter = PDFPageInterpreter(rsrcmgr, device)
+ for pagenumber, page in enumerate(doc.get_pages()):
+ interpreter.process_page(page)
+ outbuf.seek(0)
+ content = unicode(outbuf.read(),"utf-8",errors="replace")
+ yield (pagenumber+1, content) #start pages at 1
+ outbuf.seek(0)
+
+def pdf_extract_text_pypdf(filepath):
+ import pyPdf
+ inputfile = pyPdf.PdfFileReader(file(filepath, "r"))
+
+ pagenumber = 1
+ for page in inputfile.pages:
+ content = page.extractText()
+ yield (pagenumber, content)
+ pagenumber += 1
+
+""" Yields (pagenumber, text) """
+def pdf_extract_text(filepath):
+ try:
+ return pdf_extract_text_pdfminer(filepath)
+ except ImportError:
+ print "Fallback to pypdf"
+ return pdf_extract_text_pypdf(filepath)
schema_book = fields.Schema(
pagenumber=fields.NUMERIC(stored=True),
@@ -56,11 +135,11 @@ if not create_index: #update index for deleted files
deleted = 0
processed = 0
for fields in searcher_metadata.all_stored_fields():
- path = fields['path']
+ path = fields["path"]
processed += 1
if not os.path.exists(path):
- writer_book.delete_by_term(u'path', path)
- writer_metadata.delete_by_term('path', path)
+ writer_book.delete_by_term(u"path", path)
+ writer_metadata.delete_by_term("path", path)
deleted += 1
print str_format(u"\r{proc} pages processed. {deleted} deleted", proc=processed, deleted=deleted),
print ""
@@ -73,8 +152,7 @@ searcher_metadata.close()
def process_file(filepath):
try:
print str_format(u"{pid} processing {filepath}", pid=os.getpid(), filepath=filepath)
- inputfile = pyPdf.PdfFileReader(file(filepath, 'r'))
- title = inputfile.getDocumentInfo().title
+ title = u"notimplemented"
writer_metadata = index_metadata.writer()
writer_metadata.add_document(title=title, path=filepath, createtime=time.time())
@@ -82,9 +160,8 @@ def process_file(filepath):
writer_book = writing.BatchWriter(index_book, limit=1000)
pagenumber = 1
- for page in inputfile.pages:
+ for pagenumber, content in pdf_extract_text(filepath):
print str_format(u"{pid} processing {filepath} Page {page}", pid=os.getpid(), filepath=filepath, page=pagenumber)
- content = page.extractText()
writer_book.add_document(pagenumber=pagenumber,
path=filepath,
content=content)
@@ -92,9 +169,9 @@ def process_file(filepath):
writer_book.commit()
except KeyboardInterrupt:
- return 'KeyboardInterrupt'
+ return "KeyboardInterrupt"
except Exception,e:
- print str_format(u"{pid} failed at {filepath}: {err}", pid=os.getpid(), filepath=filepath, err=e)
+ print str_format(u"{pid} failed at {filepath}: {err}", pid=os.getpid(), filepath=filepath, err=str(e))
try:
import multiprocessing as mp
diff --git a/web.py b/web.py
index b675c2c..7f8c0e5 100644
--- a/web.py
+++ b/web.py
@@ -33,9 +33,9 @@ def do_index():
def do_book_file(docnum):
with index_metadata.reader() as reader:
document = reader.stored_fields(docnum)
- r = werkzeug.Response(open(document['path'], "r"), mimetype="application/pdf",)
- r.headers.add('Content-Disposition', 'attachment',
- filename=os.path.basename(document['path']))
+ r = werkzeug.Response(open(document["path"], "r"), mimetype="application/pdf",)
+ r.headers.add("Content-Disposition", "attachment",
+ filename=os.path.basename(document["path"]))
return r
@@ -53,7 +53,7 @@ def pdf_to_image(filepath, page, size):
if stdout:
yield stdout
pdffile = StringIO()
- page = pyPdf.PdfFileReader(file(filepath, 'r')).getPage(page)
+ page = pyPdf.PdfFileReader(file(filepath, "r")).getPage(page)
out = pyPdf.PdfFileWriter()
out.addPage(page)
out.write(pdffile)
@@ -66,24 +66,24 @@ def pdf_to_image(filepath, page, size):
def do_page_image(docnum,size=260):
with index_book.reader() as reader:
document = reader.stored_fields(docnum)
- page = document['pagenumber'] - 1
- return pdf_to_image(document['path'], page, size=size)
+ page = document["pagenumber"] - 1
+ return pdf_to_image(document["path"], page, size=size)
@app.route("/book/frontpage/<int:docnum>", methods=["GET"])
def do_book_frontpage(docnum):
with index_metadata.reader() as reader:
document = reader.stored_fields(docnum)
- return pdf_to_image(document['path'], 0, 260)
+ return pdf_to_image(document["path"], 0, 260)
@app.route("/page/file/<int:docnum>", methods=["GET"])
def do_page_file(docnum):
with index_book.reader() as reader:
document = reader.stored_fields(docnum)
- filepath = document['path']
- page = document['pagenumber'] - 1
+ filepath = document["path"]
+ page = document["pagenumber"] - 1
app.logger.debug(str_format("Extract page={page} from filepath={filepath}", page=page, filepath=filepath))
- inputfile = pyPdf.PdfFileReader(file(filepath, 'r'))
+ inputfile = pyPdf.PdfFileReader(file(filepath, "r"))
pdfpage = inputfile.getPage(page)
outbuf = StringIO()
outfile = pyPdf.PdfFileWriter()
@@ -93,7 +93,7 @@ def do_page_file(docnum):
r = werkzeug.Response(outbuf, mimetype="application/pdf")
client_filename = os.path.basename(filepath)[:-3]
client_filename += str_format(u".Page-{page}.pdf", page=page)
- r.headers.add('Content-Disposition', 'attachment', filename=client_filename)
+ r.headers.add("Content-Disposition", "attachment", filename=client_filename)
return r
class MyHtmlFormatter(highlight.HtmlFormatter):
@@ -110,10 +110,10 @@ def do_excerpt(docnum, term):
q = q.simplify(reader)
terms = [ text for fieldname, text in q.all_terms()
if fieldname == "content" ]
- excerpt = highlight.highlight(document['content'],
+ excerpt = highlight.highlight(document["content"],
terms,
analysis.StandardAnalyzer(),
- highlight.SimpleFragmenter(),
+ highlight.ContextFragmenter(terms, surround=40),
MyHtmlFormatter())
return unicode( excerpt )
@@ -121,7 +121,7 @@ def do_excerpt(docnum, term):
@app.route("/search/", methods=["GET"])
def do_search(term=None):
if not term:
- return flask.render_template('search.html', objects=[], term="")
+ return flask.render_template("search.html", objects=[], term="")
term = term.lower()
searcher = index_book.searcher()
@@ -138,23 +138,23 @@ def do_search(term=None):
with index_metadata.searcher() as searcher:
docnum = searcher.document_number(path=filepath)
with index_metadata.reader() as reader2:
- title = reader2.stored_fields(docnum).get('title')
+ title = reader2.stored_fields(docnum).get("title")
books[docnum] = {
- 'matches' : [],
- 'title':title,
- 'filename' : os.path.basename(filepath),
+ "matches" : [],
+ "title":title,
+ "filename" : os.path.basename(filepath),
}
for match in book[1]:
- pagenumber = reader.stored_fields(match[0])['pagenumber']
+ pagenumber = reader.stored_fields(match[0])["pagenumber"]
match = (match[0], match[1], pagenumber)
- books[docnum]['matches'].append(match)
+ books[docnum]["matches"].append(match)
- return flask.render_template('search.html',
+ return flask.render_template("search.html",
books=books,
term=term)
def log_response(sender, response):
- sender.logger.debug('Request context is about to close down. '
- 'Response: %s', response)
+ sender.logger.debug("Request context is about to close down. "
+ "Response: %s", response)
if __name__ == "__main__":
app.run(host="0.0.0.0", port=8000)