#!/usr/bin/perl
# natsume1.pl - Don Yang (uguu.org)
#
# 01/02/06

use strict;
use Digest::MD5;

# Number of leading bytes hashed for the quick first-pass comparison.
my $HEADER_LENGTH = 1024;

my (%Hash);
my ($FileCount, $ReadBytes, $TotalBytes);
my ($DupCount, $DupBytes);

# Remove adjacent duplicates from a sorted list.
sub Uniq(@)
{
   my (@list, $file);

   foreach $file (@_)
   {
      if( $#list < 0 )
      {
         @list = ($file);
      }
      else
      {
         if( $file ne $list[$#list] )
         {
            push @list, $file;
         }
      }
   }
   return @list;
}

# Normalize paths by resolving "." and ".." components, then return
# the sorted, de-duplicated list.
sub Canonicalize(@)
{
   my (@list);

   foreach my $file (@_)
   {
      my @parts = split /\//, $file;
      next if $#parts < 0;

      my @keep_parts = ();
      for(my $i = 0; $i <= $#parts; $i++)
      {
         if( $parts[$i] eq "." ) { next; }
         if( $#keep_parts < 0 )
         {
            push @keep_parts, $parts[$i];
         }
         else
         {
            if( $parts[$i] ne ".." )
            {
               push @keep_parts, $parts[$i];
            }
            elsif( $keep_parts[$#keep_parts] eq "" ||
                   $keep_parts[$#keep_parts] eq ".." )
            {
               push @keep_parts, $parts[$i];
            }
            else
            {
               pop @keep_parts;
            }
         }
      }
      $file = join '/', @keep_parts;
      push @list, $file unless( $file eq '' || $file eq '.' || $file eq '..' );
   }
   return Uniq(sort @list);
}

# MD5 digest of the first $HEADER_LENGTH bytes of a file.
sub DigestHeader($)
{
   my ($file) = @_;
   local (*INFILE);

   unless( open INFILE, "< $file" )
   {
      print "# $file: Can not open: $!\n";
      return undef;
   }

   my $data;
   read INFILE, $data, $HEADER_LENGTH;
   close INFILE;

   my $ctx = Digest::MD5->new;
   $ctx->add($data);
   $ReadBytes += length $data;
   return $ctx->digest;
}

# MD5 digest of the whole file.
sub DigestAll($$)
{
   my ($file, $size) = @_;
   local (*INFILE);

   unless( open INFILE, "< $file" )
   {
      print "# $file: Can not open: $!\n";
      return undef;
   }

   my $ctx = Digest::MD5->new;
   $ctx->addfile(*INFILE);
   $ReadBytes += $size;
   close INFILE;
   return $ctx->digest;
}

# Look up $file in %Hash, keyed by size, then header digest, then full
# digest.  Digests are computed lazily: a file's header or full digest
# is only computed once another file with the same size (or same size
# and header) appears.  Returns the previously seen identical file, or
# undef if $file is unique so far.
sub FindCollision($$)
{
   my ($file, $size) = @_;

   # First file of this size: remember it without hashing anything.
   unless( exists $Hash{$size} )
   {
      $Hash{$size}{''} = $file;
      return undef;
   }

   # Second file of this size: compute the deferred header digest of
   # the first file before comparing.
   if( exists $Hash{$size}{''} )
   {
      my $file0 = $Hash{$size}{''};
      my $header0 = DigestHeader($file0);
      delete $Hash{$size}{''};
      $Hash{$size}{$header0}{''} = $file0;
   }

   my $header = DigestHeader($file);
   unless( exists $Hash{$size}{$header} )
   {
      $Hash{$size}{$header}{''} = $file;
      return undef;
   }

   # Same size and header: compute the deferred full digest of the
   # first file before comparing.
   if( exists $Hash{$size}{$header}{''} )
   {
      my $file0 = $Hash{$size}{$header}{''};
      my $digest0 = DigestAll($file0, -s $file0);
      delete $Hash{$size}{$header}{''};
      $Hash{$size}{$header}{$digest0} = $file0;
   }

   my $digest = DigestAll($file, $size);
   unless( exists $Hash{$size}{$header}{$digest} )
   {
      $Hash{$size}{$header}{$digest} = $file;
      return undef;
   }

   return $Hash{$size}{$header}{$digest};
}

# Print a symlink command that replaces duplicate $new with a link to
# $orig, using a relative link target when both paths are relative.
sub PrintCollision($$)
{
   my ($orig, $new) = @_;

   if( $orig !~ m{^/} && $new !~ m{^/} )
   {
      # Strip the leading directories that both paths have in common.
      my $target = $new;
      while( $orig =~ m{^([^/]+)/(.*)} )
      {
         my ($orig_root, $orig_subpath) = ($1, $2);
         last if( $target !~ m{^([^/]+)/(.*)} );
         my ($target_root, $target_subpath) = ($1, $2);
         if( $orig_root eq $target_root )
         {
            $orig = $orig_subpath;
            $target = $target_subpath;
         }
         else
         {
            last;
         }
      }

      # Prepend one "../" for each directory left in the link's path.
      if( index($target, '/') >= 0 )
      {
         my @parts = split /\//, $target;
         $orig = ("../" x $#parts) . $orig;
      }
   }
   print "ln -s -f '$orig' '$new'\n";
}

# Check each file and print a symlink command for every duplicate,
# followed by a summary of bytes read and bytes held by duplicates.
sub ProcessFiles(@)
{
   $FileCount = $ReadBytes = $TotalBytes = $DupCount = $DupBytes = 0;

   foreach my $file (@_)
   {
      unless( -e $file ) { print "# $file: not found\n";    next; }
      unless( -f _ )     { print "# $file: not a file\n";   next; }
      unless( -r _ )     { print "# $file: not readable\n"; next; }
      my $size = -s _;

      $FileCount++;
      $TotalBytes += $size;

      # Empty files are all identical; link them to /dev/null.
      if( $size <= 0 )
      {
         print "ln -s -f /dev/null '$file'\n";
         next;
      }

      my $file0 = FindCollision($file, $size);
      if( defined $file0 )
      {
         PrintCollision($file0, $file);
         $DupCount++;
         $DupBytes += $size;
      }
   }

   print "# $FileCount files, $ReadBytes/$TotalBytes bytes read\n";
   if( $DupCount > 0 )
   {
      print "# $DupBytes bytes in $DupCount duplicate files\n";
   }
   else
   {
      print "# No duplicates found\n";
   }
}

# Take file names from the command line, or from standard input (one
# per line) when no arguments are given.
if( $#ARGV < 0 )
{
   my (@list);
   while( <STDIN> )
   {
      chomp;
      push @list, $_;
   }
   ProcessFiles(Canonicalize(@list));
}
else
{
   ProcessFiles(Canonicalize(@ARGV));
}
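
# Example usage (illustrative only; the directory and output file names
# below are assumptions, not part of the script).  The script never
# modifies anything itself: it prints "ln -s -f" commands, so the output
# can be reviewed before being fed to a shell.
#
#   find photos -type f | perl natsume1.pl > dedup.sh   # file list on stdin
#   perl natsume1.pl photos/*.jpg > dedup.sh            # or as arguments
#   sh dedup.sh                                         # apply after review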