"""Replace duplicate files with NTFS hard links.

Walks one or more root directories, groups files that share both a file
name and an MD5 checksum, and replaces every copy after the first one
found with a hard link to that first copy.

Usage: pass the root directories as command-line arguments.
"""
import hashlib
import os
import sys

try:
    import win32file
except ImportError:
    # pywin32 is only needed for the relinking step in main(); keeping the
    # module importable lets hash() and visit() be used without it.
    win32file = None

# Only files with one of these (lower-case) extensions are considered.
INCLUDE_EXTENSIONS = (".doc", ".xls", ".ppt", ".txt", ".sql", ".py")
# File names (compared in lower case) that are never considered.
IGNORE_FILES = ("readme.txt", "index.html")

# Suffix for the temporary name a duplicate is parked under while it is
# being replaced by a hard link.
_BACKUP_SUFFIX = ".dedup-bak"


def hash(filename, n_chunks=0):
    """Return the hex MD5 digest of a file's contents.

    filename -- path of the file to read.
    n_chunks -- if non-zero, only the first n_chunks chunks of 128 KiB
                are hashed; 0 (the default) hashes the whole file.

    The file is read in binary mode so that the digest reflects the exact
    bytes on disk (no newline translation, no early stop at Ctrl-Z).
    Errors from opening or reading the file propagate to the caller.
    """
    CHUNK_SIZE = 128 * 1024
    digest = hashlib.md5()
    f = open(filename, "rb")
    try:
        n_read = 0
        while True:
            data = f.read(CHUNK_SIZE)
            if not data:
                break
            digest.update(data)
            n_read += 1
            if n_chunks and n_read >= n_chunks:
                break
    finally:
        f.close()
    return digest.hexdigest()


def visit(files, dirname, fnames):
    """Keep track of reused filenames and the directory they appeared in.

    files   -- dict updated in place: {file name: {checksum: [dirnames]}}.
    dirname -- directory currently being scanned.
    fnames  -- names of the entries found in dirname.

    Names listed in IGNORE_FILES, names whose extension is not in
    INCLUDE_EXTENSIONS (both compared case-insensitively) and entries
    that are not regular files are skipped. A directory is recorded at
    most once per (file name, checksum), so scanning the same directory
    twice never makes a file look like a duplicate of itself.
    """
    # Progress indicator: overwrite one 80-column line on stderr.
    sys.stderr.write("%s\r" % dirname.ljust(80)[:80])
    for fname in fnames:
        if fname.lower() in IGNORE_FILES:
            continue
        ext = os.path.splitext(fname)[1]
        if ext.lower() not in INCLUDE_EXTENSIONS:
            continue
        filepath = os.path.join(dirname, fname)
        if not os.path.isfile(filepath):
            # e.g. a directory whose name happens to end in ".txt"
            continue
        checksum = hash(filepath)
        dirnames = files.setdefault(fname, {}).setdefault(checksum, [])
        if dirname not in dirnames:
            dirnames.append(dirname)


def _relink(master_filepath, slave_filepath):
    """Replace slave_filepath with a hard link to master_filepath.

    The duplicate is renamed aside first and only deleted once the link
    exists; if creating the link fails the duplicate is put back and the
    error is re-raised, so a failure never loses the file.
    """
    backup_filepath = slave_filepath + _BACKUP_SUFFIX
    os.rename(slave_filepath, backup_filepath)
    try:
        win32file.CreateHardLink(slave_filepath, master_filepath)
    except Exception:
        os.rename(backup_filepath, slave_filepath)
        raise
    win32file.DeleteFile(backup_filepath)


def main(args):
    """Scan the directories in args and hard-link duplicate files.

    args -- iterable of root directory paths.

    For every file name / checksum pair found in more than one directory,
    the first directory found holds the "master" copy and each other
    copy is replaced by a hard link to it. Progress is written to stderr
    and a report of masters and links to stdout.

    Raises ImportError if pywin32 (win32file) is not available; errors
    from reading or relinking files propagate.
    """
    if win32file is None:
        # Fail before the (possibly long) scan rather than after it.
        raise ImportError("win32file (pywin32) is required to create hard links")

    roots = [os.path.abspath(p.strip()) for p in args]
    files = {}
    for root in roots:
        for dirpath, _subdirs, filenames in os.walk(root):
            visit(files, dirpath, filenames)

    # Move past the "\r"-terminated progress line.
    print("")
    print("")

    for fname in files:
        for checksum in files[fname]:
            dirnames = files[fname][checksum]
            if len(dirnames) < 2:
                continue
            master_filepath = os.path.join(dirnames[0], fname)
            master_key = os.path.normcase(os.path.abspath(master_filepath))
            print("Master is: %s" % (master_filepath,))
            for dirname in dirnames[1:]:
                slave_filepath = os.path.join(dirname, fname)
                slave_key = os.path.normcase(os.path.abspath(slave_filepath))
                if slave_key == master_key:
                    # Same file reached through a differently spelled
                    # path: relinking it would destroy the master.
                    continue
                _relink(master_filepath, slave_filepath)
                print("Linked: %s" % (slave_filepath,))


if __name__ == '__main__':
    main(sys.argv[1:])