Get document summary information

Introduction

The requirement: Get document summary information from any file

Discussion

In their Properties dialog, most files have a tab called Summary, which includes fields such as Title, Subject and Keywords. Certain files, such as those from Microsoft Office, may also have extra document-specific fields and a user-defined set of properties. All of these are part of the Structured Storage mechanism, which embeds mini-filesystems inside files.

FIXME: Add more explanation here and annotate the code

The Code

import os, sys
import pythoncom
from win32com.shell import shell
from win32com import storagecon

def _storagecon_names (prefix):
  """Map the value of every storagecon attribute whose name starts
  with prefix back to that attribute's name.
  """
  names = {}
  for attribute in dir (storagecon):
    if attribute.startswith (prefix):
      names[getattr (storagecon, attribute)] = attribute
  return names

#
# Display names for the property set format ids
# which this module knows about.
#
FORMATS = dict ([
  (pythoncom.FMTID_SummaryInformation, "SummaryInformation"),
  (pythoncom.FMTID_DocSummaryInformation, "DocSummaryInformation"),
  (pythoncom.FMTID_UserDefinedProperties, "UserDefinedProperties"),
])

#
# For each format id, a lookup from property id to the
# name of the matching storagecon constant.
#
PROPERTIES = dict ([
  (pythoncom.FMTID_SummaryInformation, _storagecon_names ("PIDSI_")),
  (pythoncom.FMTID_DocSummaryInformation, _storagecon_names ("PIDDSI_")),
])

#
# Mode flags passed when opening a property set.
#
STORAGE_READ = storagecon.STGM_SHARE_EXCLUSIVE | storagecon.STGM_READ

def property_dict (property_set_storage, fmtid):
  """Return the properties of one property set as a dictionary
  mapping property name to value.

  property_set_storage - object whose Open (fmtid, mode) method returns
    the property storage to read; property_sets passes the object it
    obtained from BindToStorage.
  fmtid - format id of the property set to open.

  A property with no name of its own is keyed by the name found in
  PROPERTIES for this fmtid or, failing that, by the hex form of its
  property id. A value which cannot be read is stored as None.

  Returns an empty dictionary if the property set is not present.
  Any other pythoncom.com_error from Open is re-raised.
  """
  #
  # STG_E_FILENOTFOUND as an unsigned 32-bit HRESULT
  # (pywin32 may report the same code as a negative number).
  #
  STG_E_FILENOTFOUND = 0x80030002

  try:
    property_storage = property_set_storage.Open (fmtid, STORAGE_READ)
  except pythoncom.com_error as error:
    #
    # The text in strerror is supplied by pywin32 / the operating
    # system and is not guaranteed to be the symbolic name, so
    # the numeric code is checked as well. The original string
    # comparison is kept for backward compatibility.
    #
    set_is_missing = (
      error.strerror == 'STG_E_FILENOTFOUND'
      or (error.hresult & 0xFFFFFFFF) == STG_E_FILENOTFOUND
    )
    if set_is_missing:
      return {}
    raise

  properties = {}
  for name, property_id, vartype in property_storage:
    if name is None:
      name = PROPERTIES.get (fmtid, {}).get (property_id, None)
    if name is None:
      name = hex (property_id)
    try:
      for value in property_storage.ReadMultiple ([property_id]):
        properties[name] = value
    #
    # There are certain values we can't read (the thumbnail,
    # for example); they raise type errors from within the
    # pythoncom implementation. Record them as None rather
    # than abandoning the whole property set.
    #
    except TypeError:
      properties[name] = None
  return properties
  
def property_sets (filepath):
  """Generate a (name, properties) pair for each property set
  found in the file at filepath.

  name is the entry in FORMATS for the set's format id, or the
  unicode form of the format id when it has no entry there.
  properties is the dictionary built by property_dict.

  Immediately after the DocSummaryInformation set, the
  UserDefinedProperties set is tried as well and is yielded
  only if it holds any properties.
  """
  def label (format_id):
    return FORMATS.get (format_id, unicode (format_id))

  absolute_path = os.path.abspath (filepath)
  pidl, _ = shell.SHILCreateFromPath (absolute_path, 0)
  desktop = shell.SHGetDesktopFolder ()
  storage = desktop.BindToStorage (pidl, None, pythoncom.IID_IPropertySetStorage)

  for set_id, _clsid, _flags, _ctime, _mtime, _atime in storage:
    yield label (set_id), property_dict (storage, set_id)
    if set_id == pythoncom.FMTID_DocSummaryInformation:
      user_id = pythoncom.FMTID_UserDefinedProperties
      user_properties = property_dict (storage, user_id)
      if user_properties:
        yield label (user_id), user_properties

if __name__ == '__main__':
  for name, properties in property_sets (sys.argv[1]):
    print name
    for k, v in properties.items ():
      print "  ", k, "=>", v