xmp.py : » PDF » pyPdf » pyPdf-1.12 » pyPdf » Python Open Source

1.	3.1.2 Python
2.	Ajax
3.	Aspect Oriented
4.	Blog
5.	Build
6.	Business Application
7.	Chart Report
8.	Content Management Systems
9.	Cryptographic
10.	Database
11.	Development
12.	Editor
13.	Email
14.	ERP
15.	Game 2D 3D
16.	GIS
17.	GUI
18.	IDE
19.	Installer
20.	IRC
21.	Issue Tracker
22.	Language Interface
23.	Log
24.	Math
25.	Media Sound Audio
26.	Mobile
27.	Network
28.	Parser
29.	PDF
30.	Project Management
31.	RSS
32.	Search
33.	Security
34.	Template Engines
35.	Test
36.	UML
37.	USB Serial
38.	Web Frameworks
39.	Web Server
40.	Web Services
41.	Web Unit
42.	Wiki
43.	Windows
44.	XML
Python Open Source » PDF » pyPdf
pyPdf » pyPdf 1.12 » pyPdf » xmp.py
import re
import datetime
import decimal
from generic import PdfObject
from xml.dom import getDOMImplementation
from xml.dom.minidom import parseString

RDF_NAMESPACE = "http://www.w3.org/1999/02/22-rdf-syntax-ns#"
DC_NAMESPACE = "http://purl.org/dc/elements/1.1/"
XMP_NAMESPACE = "http://ns.adobe.com/xap/1.0/"
PDF_NAMESPACE = "http://ns.adobe.com/pdf/1.3/"
XMPMM_NAMESPACE = "http://ns.adobe.com/xap/1.0/mm/"

# What is the PDFX namespace, you might ask?  I might ask that too.  It's
# a completely undocumented namespace used to place "custom metadata"
# properties, which are arbitrary metadata properties with no semantic or
# documented meaning.  Elements in the namespace are key/value-style storage,
# where the element name is the key and the content is the value.  The keys
# are transformed into valid XML identifiers by substituting an invalid
# identifier character with \u2182 followed by the unicode hex ID of the
# original character.  A key like "my car" is therefore "my\u21820020car".
#
# \u2182, in case you're wondering, is the unicode character
# \u{ROMAN NUMERAL TEN THOUSAND}, a straightforward and obvious choice for
# escaping characters.
#
# Intentional users of the pdfx namespace should be shot on sight.  A
# custom data schema and sensical XML elements could be used instead, as is
# suggested by Adobe's own documentation on XMP (under "Extensibility of
# Schemas").
#
# Information presented here on the /pdfx/ schema is a result of limited
# reverse engineering, and does not constitute a full specification.
PDFX_NAMESPACE = "http://ns.adobe.com/pdfx/1.3/"

iso8601 = re.compile("""
        (?P<year>[0-9]{4})
        (-
            (?P<month>[0-9]{2})
            (-
                (?P<day>[0-9]+)
                (T
                    (?P<hour>[0-9]{2}):
                    (?P<minute>[0-9]{2})
                    (:(?P<second>[0-9]{2}(.[0-9]+)?))?
                    (?P<tzd>Z|[-+][0-9]{2}:[0-9]{2})
                )?
            )?
        )?
        """, re.VERBOSE)

##
# An object that represents Adobe XMP metadata.
class XmpInformation(PdfObject):

    def __init__(self, stream):
        self.stream = stream
        docRoot = parseString(self.stream.getData())
        self.rdfRoot = docRoot.getElementsByTagNameNS(RDF_NAMESPACE, "RDF")[0]
        self.cache = {}

    def writeToStream(self, stream, encryption_key):
        self.stream.writeToStream(stream, encryption_key)

    def getElement(self, aboutUri, namespace, name):
        for desc in self.rdfRoot.getElementsByTagNameNS(RDF_NAMESPACE, "Description"):
            if desc.getAttributeNS(RDF_NAMESPACE, "about") == aboutUri:
                attr = desc.getAttributeNodeNS(namespace, name)
                if attr != None:
                    yield attr
                for element in desc.getElementsByTagNameNS(namespace, name):
                    yield element

    def getNodesInNamespace(self, aboutUri, namespace):
        for desc in self.rdfRoot.getElementsByTagNameNS(RDF_NAMESPACE, "Description"):
            if desc.getAttributeNS(RDF_NAMESPACE, "about") == aboutUri:
                for i in range(desc.attributes.length):
                    attr = desc.attributes.item(i)
                    if attr.namespaceURI == namespace:
                        yield attr
                for child in desc.childNodes:
                    if child.namespaceURI == namespace:
                        yield child

    def _getText(self, element):
        text = ""
        for child in element.childNodes:
            if child.nodeType == child.TEXT_NODE:
                text += child.data
        return text

    def _converter_string(value):
        return value

    def _converter_date(value):
        m = iso8601.match(value)
        year = int(m.group("year"))
        month = int(m.group("month") or "1")
        day = int(m.group("day") or "1")
        hour = int(m.group("hour") or "0")
        minute = int(m.group("minute") or "0")
        second = decimal.Decimal(m.group("second") or "0")
        seconds = second.to_integral(decimal.ROUND_FLOOR)
        milliseconds = (second - seconds) * 1000000
        tzd = m.group("tzd") or "Z"
        dt = datetime.datetime(year, month, day, hour, minute, seconds, milliseconds)
        if tzd != "Z":
            tzd_hours, tzd_minutes = [int(x) for x in tzd.split(":")]
            tzd_hours *= -1
            if tzd_hours < 0:
                tzd_minutes *= -1
            dt = dt + datetime.timedelta(hours=tzd_hours, minutes=tzd_minutes)
        return dt
    _test_converter_date = staticmethod(_converter_date)

    def _getter_bag(namespace, name, converter):
        def get(self):
            cached = self.cache.get(namespace, {}).get(name)
            if cached:
                return cached
            retval = []
            for element in self.getElement("", namespace, name):
                bags = element.getElementsByTagNameNS(RDF_NAMESPACE, "Bag")
                if len(bags):
                    for bag in bags:
                        for item in bag.getElementsByTagNameNS(RDF_NAMESPACE, "li"):
                            value = self._getText(item)
                            value = converter(value)
                            retval.append(value)
            ns_cache = self.cache.setdefault(namespace, {})
            ns_cache[name] = retval
            return retval
        return get

    def _getter_seq(namespace, name, converter):
        def get(self):
            cached = self.cache.get(namespace, {}).get(name)
            if cached:
                return cached
            retval = []
            for element in self.getElement("", namespace, name):
                seqs = element.getElementsByTagNameNS(RDF_NAMESPACE, "Seq")
                if len(seqs):
                    for seq in seqs:
                        for item in seq.getElementsByTagNameNS(RDF_NAMESPACE, "li"):
                            value = self._getText(item)
                            value = converter(value)
                            retval.append(value)
                else:
                    value = converter(self._getText(element))
                    retval.append(value)
            ns_cache = self.cache.setdefault(namespace, {})
            ns_cache[name] = retval
            return retval
        return get

    def _getter_langalt(namespace, name, converter):
        def get(self):
            cached = self.cache.get(namespace, {}).get(name)
            if cached:
                return cached
            retval = {}
            for element in self.getElement("", namespace, name):
                alts = element.getElementsByTagNameNS(RDF_NAMESPACE, "Alt")
                if len(alts):
                    for alt in alts:
                        for item in alt.getElementsByTagNameNS(RDF_NAMESPACE, "li"):
                            value = self._getText(item)
                            value = converter(value)
                            retval[item.getAttribute("xml:lang")] = value
                else:
                    retval["x-default"] = converter(self._getText(element))
            ns_cache = self.cache.setdefault(namespace, {})
            ns_cache[name] = retval
            return retval
        return get

    def _getter_single(namespace, name, converter):
        def get(self):
            cached = self.cache.get(namespace, {}).get(name)
            if cached:
                return cached
            value = None
            for element in self.getElement("", namespace, name):
                if element.nodeType == element.ATTRIBUTE_NODE:
                    value = element.nodeValue
                else:
                    value = self._getText(element)
                break
            if value != None:
                value = converter(value)
            ns_cache = self.cache.setdefault(namespace, {})
            ns_cache[name] = value
            return value
        return get

    ##
    # Contributors to the resource (other than the authors).  An unsorted
    # array of names.
    # <p>Stability: Added in v1.12, will exist for all future v1.x releases.
    dc_contributor = property(_getter_bag(DC_NAMESPACE, "contributor", _converter_string))

    ##
    # Text describing the extent or scope of the resource.
    # <p>Stability: Added in v1.12, will exist for all future v1.x releases.
    dc_coverage = property(_getter_single(DC_NAMESPACE, "coverage", _converter_string))

    ##
    # A sorted array of names of the authors of the resource, listed in order
    # of precedence.
    # <p>Stability: Added in v1.12, will exist for all future v1.x releases.
    dc_creator = property(_getter_seq(DC_NAMESPACE, "creator", _converter_string))

    ##
    # A sorted array of dates (datetime.datetime instances) of signifigance to
    # the resource.  The dates and times are in UTC.
    # <p>Stability: Added in v1.12, will exist for all future v1.x releases.
    dc_date = property(_getter_seq(DC_NAMESPACE, "date", _converter_date))

    ##
    # A language-keyed dictionary of textual descriptions of the content of the
    # resource.
    # <p>Stability: Added in v1.12, will exist for all future v1.x releases.
    dc_description = property(_getter_langalt(DC_NAMESPACE, "description", _converter_string))

    ##
    # The mime-type of the resource.
    # <p>Stability: Added in v1.12, will exist for all future v1.x releases.
    dc_format = property(_getter_single(DC_NAMESPACE, "format", _converter_string))

    ##
    # Unique identifier of the resource.
    # <p>Stability: Added in v1.12, will exist for all future v1.x releases.
    dc_identifier = property(_getter_single(DC_NAMESPACE, "identifier", _converter_string))

    ##
    # An unordered array specifying the languages used in the resource.
    # <p>Stability: Added in v1.12, will exist for all future v1.x releases.
    dc_language = property(_getter_bag(DC_NAMESPACE, "language", _converter_string))

    ##
    # An unordered array of publisher names.
    # <p>Stability: Added in v1.12, will exist for all future v1.x releases.
    dc_publisher = property(_getter_bag(DC_NAMESPACE, "publisher", _converter_string))

    ##
    # An unordered array of text descriptions of relationships to other
    # documents.
    # <p>Stability: Added in v1.12, will exist for all future v1.x releases.
    dc_relation = property(_getter_bag(DC_NAMESPACE, "relation", _converter_string))

    ##
    # A language-keyed dictionary of textual descriptions of the rights the
    # user has to this resource.
    # <p>Stability: Added in v1.12, will exist for all future v1.x releases.
    dc_rights = property(_getter_langalt(DC_NAMESPACE, "rights", _converter_string))

    ##
    # Unique identifier of the work from which this resource was derived.
    # <p>Stability: Added in v1.12, will exist for all future v1.x releases.
    dc_source = property(_getter_single(DC_NAMESPACE, "source", _converter_string))

    ##
    # An unordered array of descriptive phrases or keywrods that specify the
    # topic of the content of the resource.
    # <p>Stability: Added in v1.12, will exist for all future v1.x releases.
    dc_subject = property(_getter_bag(DC_NAMESPACE, "subject", _converter_string))

    ##
    # A language-keyed dictionary of the title of the resource.
    # <p>Stability: Added in v1.12, will exist for all future v1.x releases.
    dc_title = property(_getter_langalt(DC_NAMESPACE, "title", _converter_string))

    ##
    # An unordered array of textual descriptions of the document type.
    # <p>Stability: Added in v1.12, will exist for all future v1.x releases.
    dc_type = property(_getter_bag(DC_NAMESPACE, "type", _converter_string))

    ##
    # An unformatted text string representing document keywords.
    # <p>Stability: Added in v1.12, will exist for all future v1.x releases.
    pdf_keywords = property(_getter_single(PDF_NAMESPACE, "Keywords", _converter_string))

    ##
    # The PDF file version, for example 1.0, 1.3.
    # <p>Stability: Added in v1.12, will exist for all future v1.x releases.
    pdf_pdfversion = property(_getter_single(PDF_NAMESPACE, "PDFVersion", _converter_string))

    ##
    # The name of the tool that created the PDF document.
    # <p>Stability: Added in v1.12, will exist for all future v1.x releases.
    pdf_producer = property(_getter_single(PDF_NAMESPACE, "Producer", _converter_string))

    ##
    # The date and time the resource was originally created.  The date and
    # time are returned as a UTC datetime.datetime object.
    # <p>Stability: Added in v1.12, will exist for all future v1.x releases.
    xmp_createDate = property(_getter_single(XMP_NAMESPACE, "CreateDate", _converter_date))
    
    ##
    # The date and time the resource was last modified.  The date and time
    # are returned as a UTC datetime.datetime object.
    # <p>Stability: Added in v1.12, will exist for all future v1.x releases.
    xmp_modifyDate = property(_getter_single(XMP_NAMESPACE, "ModifyDate", _converter_date))

    ##
    # The date and time that any metadata for this resource was last
    # changed.  The date and time are returned as a UTC datetime.datetime
    # object.
    # <p>Stability: Added in v1.12, will exist for all future v1.x releases.
    xmp_metadataDate = property(_getter_single(XMP_NAMESPACE, "MetadataDate", _converter_date))

    ##
    # The name of the first known tool used to create the resource.
    # <p>Stability: Added in v1.12, will exist for all future v1.x releases.
    xmp_creatorTool = property(_getter_single(XMP_NAMESPACE, "CreatorTool", _converter_string))

    ##
    # The common identifier for all versions and renditions of this resource.
    # <p>Stability: Added in v1.12, will exist for all future v1.x releases.
    xmpmm_documentId = property(_getter_single(XMPMM_NAMESPACE, "DocumentID", _converter_string))

    ##
    # An identifier for a specific incarnation of a document, updated each
    # time a file is saved.
    # <p>Stability: Added in v1.12, will exist for all future v1.x releases.
    xmpmm_instanceId = property(_getter_single(XMPMM_NAMESPACE, "InstanceID", _converter_string))

    def custom_properties(self):
        if not hasattr(self, "_custom_properties"):
            self._custom_properties = {}
            for node in self.getNodesInNamespace("", PDFX_NAMESPACE):
                key = node.localName
                while True:
                    # see documentation about PDFX_NAMESPACE earlier in file
                    idx = key.find(u"\u2182")
                    if idx == -1:
                        break
                    key = key[:idx] + chr(int(key[idx+1:idx+5], base=16)) + key[idx+5:]
                if node.nodeType == node.ATTRIBUTE_NODE:
                    value = node.nodeValue
                else:
                    value = self._getText(node)
                self._custom_properties[key] = value
        return self._custom_properties

    ##
    # Retrieves custom metadata properties defined in the undocumented pdfx
    # metadata schema.
    # <p>Stability: Added in v1.12, will exist for all future v1.x releases.
    # @return Returns a dictionary of key/value items for custom metadata
    # properties.
    custom_properties = property(custom_properties)
www.java2java.com | Contact Us
All other trademarks are property of their respective owners.