#!/usr/bin/python
# natsume.py - Don Yang (uguu.org)
#
# Ported from natsume0.pl, go there for more comments.
#
# 12/26/05

import os
import sys
import md5

# How many header bytes to read
header_size = 1024

# Number of bytes to read per block when scanning the entire file
read_block_size = 0x10000

# os.O_BINARY only exists on Windows; fall back to 0 elsewhere so the
# open flags still work on Unix-like systems.
O_BINARY = getattr(os, "O_BINARY", 0)

# Statistics
file_count = 0
dup_count = 0
dup_bytes = 0
read_bytes = 0
total_bytes = 0


def PreprocessList(in_list):
    """Canonicalize paths in list, sort it and remove duplicates.
    """
    canonical_list = [os.path.normpath(file) for file in in_list]
    canonical_list.sort()
    last = ""
    out_list = []
    for file in canonical_list:
        if last != file:
            last = file
            out_list.append(last)
    return out_list


def FileList():
    """Return list of files to process: use command line arguments if
    available, otherwise read from stdin.
    """
    if len(sys.argv) > 1:
        return PreprocessList(sys.argv[1:])
    else:
        return PreprocessList([x.strip() for x in sys.stdin.readlines()])


def DigestHeader(file):
    """Compute MD5 for file header.
    """
    global read_bytes
    try:
        infile = os.open(file, os.O_RDONLY | O_BINARY)
        data = os.read(infile, header_size)
        os.close(infile)
        d = md5.new()
        d.update(data)
        read_bytes += len(data)
        return d.digest()
    except OSError, err:
        print "# %s: %s" % (file, str(err))
        return 0


def DigestAll(file):
    """Compute MD5 for the entire file.
    """
    global read_bytes
    try:
        read_bytes += os.path.getsize(file)
        infile = os.open(file, os.O_RDONLY | O_BINARY)
        d = md5.new()
        data = os.read(infile, read_block_size)
        while len(data) > 0:
            d.update(data)
            data = os.read(infile, read_block_size)
        os.close(infile)
        return d.digest()
    except OSError, err:
        print "# %s: %s" % (file, str(err))
        return 0


def FindCollision(hash, file, size):
    """Check file for collision, return original file name if found,
    otherwise update hash and return empty string.
    """
    # hash is a nested dict: hash[size][header_md5][full_md5] = filename.
    # An empty-string key marks a file whose digest has not been computed
    # yet; digests are only computed once a second candidate shows up.

    # Check for size collision
    if not hash.has_key(size):
        hash[size] = {}
        hash[size][''] = file
        return ""

    # Check for header collision
    if hash[size].has_key(''):
        file0 = hash[size]['']
        del hash[size]['']
        d0 = DigestHeader(file0)
        if d0 == 0:
            return ""
        hash[size][d0] = {}
        hash[size][d0][''] = file0
    d_head = DigestHeader(file)
    if d_head == 0:
        return ""
    if not hash[size].has_key(d_head):
        hash[size][d_head] = {}
        hash[size][d_head][''] = file
        return ""

    # Check for content collision
    if hash[size][d_head].has_key(''):
        file0 = hash[size][d_head]['']
        del hash[size][d_head]['']
        d0 = DigestAll(file0)
        if d0 == 0:
            return ""
        hash[size][d_head][d0] = file0
    d_full = DigestAll(file)
    if d_full == 0:
        return ""
    if not hash[size][d_head].has_key(d_full):
        hash[size][d_head][d_full] = file
        return ""
    return hash[size][d_head][d_full]


def PrintCollision(orig_file, new_file):
    """Print command to resolve collision.
    """
    if orig_file[0] != '/' and new_file[0] != '/':
        # Remove common root
        target_parts = new_file.split('/')
        orig_parts = orig_file.split('/')
        while len(target_parts) > 0 and len(orig_parts) > 0 and \
              target_parts[0] == orig_parts[0]:
            target_parts = target_parts[1:]
            orig_parts = orig_parts[1:]
        # Rebuild orig_file as a path relative to new_file's directory
        orig_file = ("../" * (len(target_parts) - 1)) + ('/'.join(orig_parts))
    print "ln -s -f '%s' '%s'" % (orig_file, new_file)


def ProcessFiles(file_list):
    """Process sorted list of files and check for collisions.
    """
    global file_count, dup_count, dup_bytes, total_bytes
    hash = {}
    for file in file_list:
        if not os.path.isfile(file):
            print "# %s: not a file" % file
            continue
        try:
            size = os.path.getsize(file)
        except OSError, err:
            print "# %s: %s" % (file, str(err))
            continue
        file_count += 1
        total_bytes += size
        file0 = FindCollision(hash, file, size)
        if file0 != "":
            PrintCollision(file0, file)
            dup_count += 1
            dup_bytes += size


if __name__ == "__main__":
    ProcessFiles(FileList())
    print "# %d files, %d/%d bytes read" % (file_count, read_bytes, total_bytes)
    if dup_count > 0:
        print "# %d bytes in %d duplicate files" % (dup_bytes, dup_count)
    else:
        print "# No duplicates found"