Matt Swain

A lightweight XMP parser for extracting PDF metadata in Python

Metadata (title, author, etc.) can be embedded in PDF files in a number of different ways, and can be a bit of a pain to extract. Older PDFs use “Info” in the XRefs trailer, whereas newer ones use XMP metadata. Using the Python PDFMiner library, it’s possible to extract the “Info” as a python dictionary, but the XMP metadata is just extracted as raw XML.

I couldn’t find a nice lightweight XMP parser in Python, so I put together something that seemed to work on all the PDFs I threw at it.

You can install PDFMiner by downloading the source, then doing:

cd pdfminer
make cmap
python setup.py install

Once installed, use PDFMiner to open the PDF and get the XMP.

from pdfminer.pdfparser import PDFParser, PDFDocument
from pdfminer.pdftypes import resolve1
from xmp import xmp_to_dict

# Open the PDF in binary mode; the file must stay open while PDFMiner reads
# objects from it, so all the work happens inside the `with` block.
with open('mypdf.pdf', 'rb') as fp:
    parser = PDFParser(fp)
    doc = PDFDocument()
    # With this (legacy) PDFMiner API the parser and document must be linked
    # to each other and the document initialized before `doc.info` and
    # `doc.catalog` are available.
    parser.set_document(doc)
    doc.set_parser(parser)
    doc.initialize()

    print(doc.info)        # The "Info" metadata

    if 'Metadata' in doc.catalog:
        metadata = resolve1(doc.catalog['Metadata']).get_data()
        print(metadata)  # The raw XMP metadata
        print(xmp_to_dict(metadata))

The xmp_to_dict function is defined as follows:

#!/usr/bin/env python

"""
Parses XMP metadata from PDF files.

By Matt Swain. Released under the MIT license.
"""

from collections import defaultdict
from xml.etree import ElementTree as ET

# Namespace prefixes in ElementTree's "{uri}" (Clark) notation, ready to be
# concatenated with a local tag or attribute name.
RDF_NS = '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}'
XML_NS = '{http://www.w3.org/XML/1998/namespace}'

# Maps the namespace URIs commonly found in PDF XMP packets to the short
# prefix used as the top-level key of the parsed metadata dictionary.
NS_MAP = {
    'http://www.w3.org/1999/02/22-rdf-syntax-ns#'    : 'rdf',
    'http://purl.org/dc/elements/1.1/'               : 'dc',
    'http://ns.adobe.com/xap/1.0/'                   : 'xap',
    'http://ns.adobe.com/pdf/1.3/'                   : 'pdf',
    'http://ns.adobe.com/xap/1.0/mm/'                : 'xapmm',
    'http://ns.adobe.com/pdfx/1.3/'                  : 'pdfx',
    'http://prismstandard.org/namespaces/basic/2.0/' : 'prism',
    'http://crossref.org/crossmark/1.0/'             : 'crossmark',
    'http://ns.adobe.com/xap/1.0/rights/'            : 'rights',
    'http://www.w3.org/XML/1998/namespace'           : 'xml',
}

class XmpParser(object):
    """Parses an XMP string into a dictionary.

    Usage::

        parser = XmpParser(xmpstring)
        meta = parser.meta

    ``meta`` maps a namespace key (the short prefix from ``NS_MAP`` when the
    namespace URI is listed there, otherwise the raw URI, or ``None`` for an
    un-namespaced tag) to a dictionary of ``{tag: value}``.
    """

    def __init__(self, xmp):
        """Parse the XMP packet and locate its ``rdf:RDF`` element.

        :param xmp: the XMP packet, in any form ``ElementTree.XML`` accepts.
        """
        self.tree = ET.XML(xmp)
        if self.tree.tag == RDF_NS + 'RDF':
            # The packet has no wrapper element: the root is rdf:RDF itself,
            # which find() would not match because it only looks at children.
            self.rdftree = self.tree
        else:
            self.rdftree = self.tree.find(RDF_NS + 'RDF')

    @property
    def meta(self):
        """ A dictionary of all the parsed metadata.

        Returns an empty dictionary when the packet has no ``rdf:RDF``
        element.
        """
        if self.rdftree is None:
            return {}
        meta = defaultdict(dict)
        for desc in self.rdftree.findall(RDF_NS + 'Description'):
            # Iterate the element directly: Element.getchildren() was
            # removed in Python 3.9.
            for el in desc:
                ns, tag = self._parse_tag(el)
                meta[ns][tag] = self._parse_value(el)
        return dict(meta)

    def _parse_tag(self, el):
        """ Extract the namespace and tag from an element.

        Returns ``(ns, tag)``. ``ns`` is ``None`` when the tag carries no
        ``{uri}`` prefix; otherwise it is the ``NS_MAP`` short name for the
        URI, or the URI itself when it is not in ``NS_MAP``.
        """
        ns = None
        tag = el.tag
        if tag.startswith('{'):
            ns, tag = tag[1:].split('}', 1)
            ns = NS_MAP.get(ns, ns)
        return ns, tag

    def _parse_value(self, el):
        """ Extract the metadata value from an element.

        * ``rdf:Bag`` / ``rdf:Seq`` child -> list of the ``rdf:li`` texts
        * ``rdf:Alt`` child -> dict keyed by each item's ``xml:lang``
        * anything else -> the element's own text
        """
        for container in ('Bag', 'Seq'):
            if el.find(RDF_NS + container) is not None:
                path = RDF_NS + container + '/' + RDF_NS + 'li'
                return [li.text for li in el.findall(path)]
        if el.find(RDF_NS + 'Alt') is not None:
            path = RDF_NS + 'Alt/' + RDF_NS + 'li'
            return dict(
                (li.get(XML_NS + 'lang'), li.text) for li in el.findall(path)
            )
        return el.text

def xmp_to_dict(xmp):
    """ Convenience wrapper: build an XmpParser for `xmp` and hand back its `meta`. """
    parser = XmpParser(xmp)
    return parser.meta