summary refs log tree commit diff
diff options
context:
space:
mode:
author yvesf <yvesf-git@xapek.org> 2010-12-01 17:16:12 +0100
committer yvesf <yvesf-git@xapek.org> 2010-12-01 17:16:12 +0100
commit 9b09da2a0d5806a161c9b25f62193be2d0c2eec1 (patch)
tree de7a96f1aba6ec8b8305380df7494616aa2c2ed1
parent e69a04675e142d8618433a785ad33e8167ef99d2 (diff)
download booksearch-9b09da2a0d5806a161c9b25f62193be2d0c2eec1.tar.gz
booksearch-9b09da2a0d5806a161c9b25f62193be2d0c2eec1.zip
add metadata extraction
-rw-r--r-- indexer.py 41
1 files changed, 31 insertions, 10 deletions
diff --git a/indexer.py b/indexer.py
index 3ec1e8b..c909f68 100644
--- a/indexer.py
+++ b/indexer.py
@@ -15,7 +15,8 @@ def pdf_extract_metadata(filepath):
from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
- from lxml import etree
+ import lxml
+ import lxml.etree
outbuf = StringIO.StringIO()
rsrcmgr = PDFResourceManager()
@@ -28,23 +29,42 @@ def pdf_extract_metadata(filepath):
doc.initialize("")
namespaces={
+ "x":"adobe:ns:meta/",
"dc":"http://purl.org/dc/elements/1.1/",
"rdf":"http://www.w3.org/1999/02/22-rdf-syntax-ns#",
- "pdf":"http://ns.adobe.com/pdf/1.3/", }
+ "pdf":"http://ns.adobe.com/pdf/1.3/",
+ "xap":"http://ns.adobe.com/xap/1.0/",
+ "xmpMM":"http://ns.adobe.com/xap/1.0/mm/"}
+
+ metadata = {}
+ def add_metadata(tree, name, xpath_expr):
+ d=tree.xpath(xpath_expr, namespaces=namespaces)
+ if d:
+ metadata[name] = "".join(d).strip()
if doc.catalog.has_key("Metadata"):
obj_ref = doc.catalog["Metadata"]
obj_stream = obj_ref.resolve()
if obj_stream.attrs['Subtype'].name == "XML":
- obj_data = obj_stream.get_data()
- if obj_data.endswith("\nf"):
- obj_data = obj_data[:-len("\nf")]
- print obj_data
- tree = etree.parse(StringIO.StringIO(obj_data))
- print tree.xpath("//dc:title/rdf:Alt/rdf:li", namespaces=namespaces)[0].text
- return obj_data
+ try:
+ obj_data = obj_stream.get_data()[:-2] # XXX remove trailing chars
+ print obj_data.strip()
+ tree = lxml.etree.parse(StringIO.StringIO(obj_data))
+ add_metadata(tree, "dc:title", "//rdf:Description/dc:title//*/text()")
+ add_metadata(tree, "dc:creator", "//rdf:Description/dc:creator//*/text()")
+ add_metadata(tree, "xap:CreateDate", "//rdf:Description/xap:CreateDate/text()")
+ add_metadata(tree, "xap:CreaterTool", "//rdf:Description/xap:CreatorTool/text()")
+ add_metadata(tree, "xap:ModifyDate", "//rdf:Description/xap:ModifyDate/text()")
+ add_metadata(tree, "xap:CreateDate", "//rdf:Description/xap:CreateDate/text()")
+ add_metadata(tree, "xap:MetadataDate", "//rdf:Description/xap:MetadataDate/text()")
+ add_metadata(tree, "pdf:Producer", "//rdf:Description/pdf:Producer/text()")
+ add_metadata(tree, "xmpMM:DocumentID", "//rdf:Description/xmpMM:DocumentID/text()")
+ add_metadata(tree, "xmpMM:InstanceID", "//rdf:Description/xmpMM:InstanceID/text()")
+ except lxml.etree.XMLSyntaxError,e:
+ print e
else:
- return None
+ pass
+ return metadata
def pdf_extract_text_pdfminer(filepath):
from pdfminer.pdfparser import PDFParser, PDFDocument
@@ -167,6 +187,7 @@ def process_file(filepath):
content=content)
pagenumber += 1
+ print str_format(u"{pid} commit", pid=os.getpid())
writer_book.commit()
except KeyboardInterrupt:
return "KeyboardInterrupt"