#!/usr/bin/python
# natsume.py - Don Yang (uguu.org)
#
# Ported from natsume0.pl, go there for more comments.
#
# 12/26/05

import os
import sys
import md5

# How many header bytes to read
header_size = 1024

# Number of bytes to read per block when scanning the entire file
read_block_size = 0x10000

# os.O_BINARY only exists on Windows; fall back to 0 elsewhere so the
# open flags still work on Unix-like systems.
O_BINARY = getattr(os, "O_BINARY", 0)

# Statistics
file_count = 0
dup_count = 0
dup_bytes = 0
read_bytes = 0
total_bytes = 0


def PreprocessList(in_list):
    """Canonicalize paths in list, sort it and remove duplicates.
    """
    canonical_list = [os.path.normpath(file) for file in in_list]
    canonical_list.sort()
    last = ""
    out_list = []
    for file in canonical_list:
        if last != file:
            last = file
            out_list.append(last)
    return out_list


def FileList():
    """Return list of files to process: use command line arguments if
    available, otherwise read from stdin.
    """
    if len(sys.argv) > 1:
        return PreprocessList(sys.argv[1:])
    else:
        return PreprocessList([x.strip() for x in sys.stdin.readlines()])


def DigestHeader(file):
    """Compute MD5 for file header.
    """
    global read_bytes
    try:
        infile = os.open(file, os.O_RDONLY | O_BINARY)
        data = os.read(infile, header_size)
        os.close(infile)
        d = md5.new()
        d.update(data)
        read_bytes += len(data)
        return d.digest()
    except OSError, err:
        print "# %s: %s" % (file, str(err))
        return 0


def DigestAll(file):
    """Compute MD5 for the entire file.
    """
    global read_bytes
    try:
        read_bytes += os.path.getsize(file)
        infile = os.open(file, os.O_RDONLY | O_BINARY)
        d = md5.new()
        data = os.read(infile, read_block_size)
        while len(data) > 0:
            d.update(data)
            data = os.read(infile, read_block_size)
        os.close(infile)
        return d.digest()
    except OSError, err:
        print "# %s: %s" % (file, str(err))
        return 0


def FindCollision(hash, file, size):
    """Check file for collision, return original file name if found,
    otherwise update hash and return empty string.
    """
    # hash is a nested dict: hash[size][header_md5][full_md5] = filename.
    # An empty-string key marks a file whose digest has not been computed
    # yet; digests are only computed once a second candidate shows up.

    # Check for size collision
    if not hash.has_key(size):
        hash[size] = {}
        hash[size][''] = file
        return ""

    # Check for header collision
    if hash[size].has_key(''):
        file0 = hash[size]['']
        del hash[size]['']
        d0 = DigestHeader(file0)
        if d0 == 0:
            return ""
        hash[size][d0] = {}
        hash[size][d0][''] = file0
    d_head = DigestHeader(file)
    if d_head == 0:
        return ""
    if not hash[size].has_key(d_head):
        hash[size][d_head] = {}
        hash[size][d_head][''] = file
        return ""

    # Check for content collision
    if hash[size][d_head].has_key(''):
        file0 = hash[size][d_head]['']
        del hash[size][d_head]['']
        d0 = DigestAll(file0)
        if d0 == 0:
            return ""
        hash[size][d_head][d0] = file0
    d_full = DigestAll(file)
    if d_full == 0:
        return ""
    if not hash[size][d_head].has_key(d_full):
        hash[size][d_head][d_full] = file
        return ""
    return hash[size][d_head][d_full]


def PrintCollision(orig_file, new_file):
    """Print command to resolve collision.
    """
    if orig_file[0] != '/' and new_file[0] != '/':
        # Remove common root
        target_parts = new_file.split('/')
        orig_parts = orig_file.split('/')
        while len(target_parts) > 0 and len(orig_parts) > 0 and \
              target_parts[0] == orig_parts[0]:
            target_parts = target_parts[1:]
            orig_parts = orig_parts[1:]
        # Rebuild orig_file as a path relative to new_file's directory
        orig_file = ("../" * (len(target_parts) - 1)) + ('/'.join(orig_parts))
    print "ln -s -f '%s' '%s'" % (orig_file, new_file)


def ProcessFiles(file_list):
    """Process sorted list of files and check for collisions.
    """
    global file_count, dup_count, dup_bytes, total_bytes
    hash = {}
    for file in file_list:
        if not os.path.isfile(file):
            print "# %s: not a file" % file
            continue
        try:
            size = os.path.getsize(file)
        except OSError, err:
            print "# %s: %s" % (file, str(err))
            continue
        file_count += 1
        total_bytes += size
        file0 = FindCollision(hash, file, size)
        if file0 != "":
            PrintCollision(file0, file)
            dup_count += 1
            dup_bytes += size


if __name__ == "__main__":
    ProcessFiles(FileList())
    print "# %d files, %d/%d bytes read" % (file_count, read_bytes, total_bytes)
    if dup_count > 0:
        print "# %d bytes in %d duplicate files" % (dup_bytes, dup_count)
    else:
        print "# No duplicates found"