The requirement: Get document summary information from any file
Most files have a tab called Summary among their property tabs. It includes fields such as Title, Subject and Keywords. Certain files, such as those from Microsoft Office, may also carry extra document-specific fields and a user-defined set. All of these are part of the Structured Storage mechanism, which embeds mini-filesystems inside files.
FIXME: Add more explanation here and annotate the code
"""Read the document summary property sets stored inside a file.

The file is opened through the Windows shell as an IPropertySetStorage and
each property set it contains is returned as a plain dictionary.  Requires
the pywin32 extensions (pythoncom, win32com).
"""
import os
import sys

import pythoncom
from win32com import storagecon
from win32com.shell import shell

# ``unicode`` only exists on Python 2; fall back to ``str`` on Python 3 so
# that unknown format ids can still be turned into a printable label.
try:
    _text_type = unicode
except NameError:
    _text_type = str

# Human-readable labels for the property-set format ids handled here.
FORMATS = {
    pythoncom.FMTID_SummaryInformation: "SummaryInformation",
    pythoncom.FMTID_DocSummaryInformation: "DocSummaryInformation",
    pythoncom.FMTID_UserDefinedProperties: "UserDefinedProperties",
}


def _constant_names(prefix):
    """Map each storagecon constant whose name starts with *prefix* to its name."""
    return dict(
        (getattr(storagecon, d), d)
        for d in dir(storagecon)
        if d.startswith(prefix)
    )


# Per format id: property id -> storagecon constant name, used to label
# properties which carry no name of their own.
PROPERTIES = {
    pythoncom.FMTID_SummaryInformation: _constant_names("PIDSI_"),
    pythoncom.FMTID_DocSummaryInformation: _constant_names("PIDDSI_"),
}

STORAGE_READ = storagecon.STGM_READ | storagecon.STGM_SHARE_EXCLUSIVE

# STG_E_FILENOTFOUND (0x80030002) expressed as a signed 32-bit HRESULT.
_STG_E_FILENOTFOUND = -2147287038


def property_dict(property_set_storage, fmtid):
    """Return the properties of one property set as a ``{name: value}`` dict.

    :param property_set_storage: object whose ``Open (fmtid, mode)`` returns
        an iterable of ``(name, property_id, vartype)`` tuples offering
        ``ReadMultiple``.
    :param fmtid: format id of the property set to open.
    :returns: dict keyed by the property's own name; when it has none, by
        the matching ``PIDSI_*`` / ``PIDDSI_*`` constant name; failing that,
        by the hex form of the property id.  An empty dict is returned when
        the property set does not exist.  Values which cannot be read are
        stored as ``None``.
    :raises pythoncom.com_error: for any failure to open the property set
        other than "not found".
    """
    try:
        property_storage = property_set_storage.Open(fmtid, STORAGE_READ)
    except pythoncom.com_error as error:
        # A missing property set is not an error for our purposes.  The
        # message text is checked as before; the HRESULT is checked as well
        # in case the message text is reported differently.
        if (error.strerror == 'STG_E_FILENOTFOUND'
                or getattr(error, 'hresult', None) == _STG_E_FILENOTFOUND):
            return {}
        raise

    properties = {}
    for name, property_id, _vartype in property_storage:
        if name is None:
            name = PROPERTIES.get(fmtid, {}).get(property_id)
        if name is None:
            name = hex(property_id)
        try:
            for value in property_storage.ReadMultiple([property_id]):
                properties[name] = value
        except TypeError:
            # There are certain values we can't read (for example the
            # thumbnail); they raise type errors from within the pythoncom
            # implementation.
            properties[name] = None
    return properties


def property_sets(filepath):
    """Yield ``(label, properties)`` for each property set found in *filepath*.

    :param filepath: path to the file; it is made absolute before being
        handed to the shell.
    :returns: generator of 2-tuples.  ``label`` comes from ``FORMATS`` or,
        for an unrecognised format id, is the text form of that id;
        ``properties`` is the dict built by ``property_dict``.
    """
    pidl, _flags = shell.SHILCreateFromPath(os.path.abspath(filepath), 0)
    property_set_storage = shell.SHGetDesktopFolder().BindToStorage(
        pidl, None, pythoncom.IID_IPropertySetStorage
    )
    for fmtid, _clsid, _grf, _ctime, _mtime, _atime in property_set_storage:
        yield (
            FORMATS.get(fmtid, _text_type(fmtid)),
            property_dict(property_set_storage, fmtid),
        )
        # The user-defined set is looked up explicitly whenever the
        # document summary set is seen, and only reported if non-empty.
        # NOTE(review): presumably the enumeration does not list it on its
        # own -- confirm against the Structured Storage documentation.
        if fmtid == pythoncom.FMTID_DocSummaryInformation:
            user_fmtid = pythoncom.FMTID_UserDefinedProperties
            user_defined_properties = property_dict(
                property_set_storage, user_fmtid
            )
            if user_defined_properties:
                yield (
                    FORMATS.get(user_fmtid, _text_type(user_fmtid)),
                    user_defined_properties,
                )


if __name__ == '__main__':
    # Usage: pass the path of the file to inspect as the first argument.
    for name, properties in property_sets(sys.argv[1]):
        print(name)
        for k, v in properties.items():
            # Single pre-formatted string so the output is the same under
            # both the Python 2 print statement and the Python 3 function.
            print("  %s => %s" % (k, v))