Get document summary information


The requirement: Get document summary information from any file


Most files have a tab called Summary among their property tabs. It includes fields such as Title, Subject and Keywords. Certain files, such as those from Microsoft Office, may also carry extra document-specific fields and a user-defined set. All of these are part of the Structured Storage mechanism, which embeds mini-filesystems inside files.

FIXME: Add more explanation here and annotate the code

The Code

import os
import sys

import pythoncom
from win32com import storagecon
from win32com.shell import shell

  pythoncom.FMTID_SummaryInformation : "SummaryInformation",
  pythoncom.FMTID_DocSummaryInformation : "DocSummaryInformation",
  pythoncom.FMTID_UserDefinedProperties : "UserDefinedProperties"
  pythoncom.FMTID_SummaryInformation : dict (
    (getattr (storagecon, d), d) for d in dir (storagecon) if d.startswith ("PIDSI_")
  pythoncom.FMTID_DocSummaryInformation : dict (
    (getattr (storagecon, d), d) for d in dir (storagecon) if d.startswith ("PIDDSI_")


def property_dict (property_set_storage, fmtid):
  """Return the properties of one property set as a {name: value} dict.

  property_set_storage -- object whose Open (fmtid, mode) method returns
    an iterable of (name, property_id, vartype) tuples that also
    offers ReadMultiple ([property_id])
  fmtid -- format id of the property set to read

  A property without a name of its own is labelled with the constant
  name found in PROPERTIES for this fmtid or, failing that, with the
  hex form of its id. Values which cannot be read are stored as None.

  Returns {} if the property set does not exist; any other
  pythoncom.com_error raised while opening it is propagated.
  """
  try:
    property_storage = property_set_storage.Open (fmtid, STORAGE_READ)
  except pythoncom.com_error as error:
    # A missing property set is simply an empty one to the caller;
    # every other failure must surface rather than fall through with
    # property_storage unbound.
    if error.strerror == 'STG_E_FILENOTFOUND':
      return {}
    raise

  properties = {}
  for name, property_id, vartype in property_storage:
    if name is None:
      name = PROPERTIES.get (fmtid, {}).get (property_id, None)
    if name is None:
      name = hex (property_id)
    try:
      for value in property_storage.ReadMultiple ([property_id]):
        properties[name] = value
    #
    # There are certain values we can't read (e.g. the thumbnail);
    # they raise type errors from within the pythoncom implementation
    #
    except TypeError:
      properties[name] = None
  return properties
def property_sets (filepath):
  """Yield (name, properties) for each property set found in a file.

  filepath -- path to the file; it is made absolute before use

  name is the FORMATS label for the set's format id, or the text form
  of the format id when it has no label. properties is the dict built
  by property_dict for that set. Directly after the
  DocSummaryInformation set, the UserDefinedProperties set is yielded
  as well, provided it holds at least one property.
  """
  # Python 2 calls the text type `unicode`; Python 3 only has `str`
  try:
    text_type = unicode
  except NameError:
    text_type = str

  def label (fmtid):
    return FORMATS.get (fmtid, text_type (fmtid))

  # Second item of the result is unused here
  pidl, _ = shell.SHILCreateFromPath (os.path.abspath (filepath), 0)
  property_set_storage = shell.SHGetDesktopFolder ().BindToStorage (pidl, None, pythoncom.IID_IPropertySetStorage)
  for fmtid, _clsid, _flags, _ctime, _mtime, _atime in property_set_storage:
    yield label (fmtid), property_dict (property_set_storage, fmtid)
    if fmtid == pythoncom.FMTID_DocSummaryInformation:
      # Presumably the user-defined set is not listed by the enumeration
      # itself, so it is probed for explicitly -- TODO confirm
      user_fmtid = pythoncom.FMTID_UserDefinedProperties
      user_defined_properties = property_dict (property_set_storage, user_fmtid)
      if user_defined_properties:
        yield label (user_fmtid), user_defined_properties

# Script entry point: dump every property set of the file named on the
# command line, one "key => value" line per property.
if __name__ == '__main__':
  if len (sys.argv) < 2:
    # Exit with a usage hint instead of an IndexError traceback
    sys.exit ("Usage: %s <filepath>" % sys.argv[0])
  for name, properties in property_sets (sys.argv[1]):
    # Single-argument print calls behave the same on Python 2 and 3
    print (name)
    for k, v in properties.items ():
      print ("   %s => %s" % (k, v))