#!/usr/bin/perl
# natsume1.pl - Don Yang (uguu.org)
#
# 01/02/06

use strict;
use Digest::MD5;

# Number of leading bytes hashed for the quick first-pass comparison.
my $HEADER_LENGTH = 1024;

my (%Hash);
my ($FileCount, $ReadBytes, $TotalBytes);
my ($DupCount, $DupBytes);

# Remove adjacent duplicates from a sorted list.
sub Uniq(@)
{
   my (@list, $file);

   foreach $file (@_)
   {
      if( $#list < 0 )
      {
         @list = ($file);
      }
      else
      {
         if( $file ne $list[$#list] )
         {
            push @list, $file;
         }
      }
   }
   return @list;
}

# Normalize paths by resolving "." and ".." components, then return
# the sorted, de-duplicated list.
sub Canonicalize(@)
{
   my (@list);

   foreach my $file (@_)
   {
      my @parts = split /\//, $file;
      next if $#parts < 0;

      my @keep_parts = ();
      for(my $i = 0; $i <= $#parts; $i++)
      {
         if( $parts[$i] eq "." ) { next; }
         if( $#keep_parts < 0 )
         {
            push @keep_parts, $parts[$i];
         }
         else
         {
            if( $parts[$i] ne ".." )
            {
               push @keep_parts, $parts[$i];
            }
            elsif( $keep_parts[$#keep_parts] eq "" ||
                   $keep_parts[$#keep_parts] eq ".." )
            {
               push @keep_parts, $parts[$i];
            }
            else
            {
               pop @keep_parts;
            }
         }
      }
      $file = join '/', @keep_parts;
      push @list, $file unless( $file eq '' || $file eq '.' || $file eq '..' );
   }
   return Uniq(sort @list);
}

# MD5 digest of the first $HEADER_LENGTH bytes of a file.
sub DigestHeader($)
{
   my ($file) = @_;
   local (*INFILE);

   unless( open INFILE, "< $file" )
   {
      print "# $file: Can not open: $!\n";
      return undef;
   }

   my $data;
   read INFILE, $data, $HEADER_LENGTH;
   close INFILE;

   my $ctx = Digest::MD5->new;
   $ctx->add($data);
   $ReadBytes += length $data;
   return $ctx->digest;
}

# MD5 digest of the whole file.
sub DigestAll($$)
{
   my ($file, $size) = @_;
   local (*INFILE);

   unless( open INFILE, "< $file" )
   {
      print "# $file: Can not open: $!\n";
      return undef;
   }

   my $ctx = Digest::MD5->new;
   $ctx->addfile(*INFILE);
   $ReadBytes += $size;
   close INFILE;
   return $ctx->digest;
}

# Look up $file in %Hash, keyed by size, then header digest, then full
# digest.  Digests are computed lazily: a file's header or full digest
# is only computed once another file with the same size (or same size
# and header) appears.  Returns the previously seen identical file, or
# undef if $file is unique so far.
sub FindCollision($$)
{
   my ($file, $size) = @_;

   # First file of this size: remember it without hashing anything.
   unless( exists $Hash{$size} )
   {
      $Hash{$size}{''} = $file;
      return undef;
   }

   # Second file of this size: compute the deferred header digest of
   # the first file before comparing.
   if( exists $Hash{$size}{''} )
   {
      my $file0 = $Hash{$size}{''};
      my $header0 = DigestHeader($file0);
      delete $Hash{$size}{''};
      $Hash{$size}{$header0}{''} = $file0;
   }

   my $header = DigestHeader($file);
   unless( exists $Hash{$size}{$header} )
   {
      $Hash{$size}{$header}{''} = $file;
      return undef;
   }

   # Same size and header: compute the deferred full digest of the
   # first file before comparing.
   if( exists $Hash{$size}{$header}{''} )
   {
      my $file0 = $Hash{$size}{$header}{''};
      my $digest0 = DigestAll($file0, -s $file0);
      delete $Hash{$size}{$header}{''};
      $Hash{$size}{$header}{$digest0} = $file0;
   }

   my $digest = DigestAll($file, $size);
   unless( exists $Hash{$size}{$header}{$digest} )
   {
      $Hash{$size}{$header}{$digest} = $file;
      return undef;
   }

   return $Hash{$size}{$header}{$digest};
}

# Print a symlink command that replaces duplicate $new with a link to
# $orig, using a relative link target when both paths are relative.
sub PrintCollision($$)
{
   my ($orig, $new) = @_;

   if( $orig !~ m{^/} && $new !~ m{^/} )
   {
      # Strip the leading directories that both paths have in common.
      my $target = $new;
      while( $orig =~ m{^([^/]+)/(.*)} )
      {
         my ($orig_root, $orig_subpath) = ($1, $2);
         last if( $target !~ m{^([^/]+)/(.*)} );
         my ($target_root, $target_subpath) = ($1, $2);
         if( $orig_root eq $target_root )
         {
            $orig = $orig_subpath;
            $target = $target_subpath;
         }
         else
         {
            last;
         }
      }

      # Prepend one "../" for each directory left in the link's path.
      if( index($target, '/') >= 0 )
      {
         my @parts = split /\//, $target;
         $orig = ("../" x $#parts) . $orig;
      }
   }
   print "ln -s -f '$orig' '$new'\n";
}

# Check each file and print a symlink command for every duplicate,
# followed by a summary of bytes read and bytes held by duplicates.
sub ProcessFiles(@)
{
   $FileCount = $ReadBytes = $TotalBytes = $DupCount = $DupBytes = 0;

   foreach my $file (@_)
   {
      unless( -e $file ) { print "# $file: not found\n";    next; }
      unless( -f _ )     { print "# $file: not a file\n";   next; }
      unless( -r _ )     { print "# $file: not readable\n"; next; }
      my $size = -s _;

      $FileCount++;
      $TotalBytes += $size;

      # Empty files are all identical; link them to /dev/null.
      if( $size <= 0 )
      {
         print "ln -s -f /dev/null '$file'\n";
         next;
      }

      my $file0 = FindCollision($file, $size);
      if( defined $file0 )
      {
         PrintCollision($file0, $file);
         $DupCount++;
         $DupBytes += $size;
      }
   }

   print "# $FileCount files, $ReadBytes/$TotalBytes bytes read\n";
   if( $DupCount > 0 )
   {
      print "# $DupBytes bytes in $DupCount duplicate files\n";
   }
   else
   {
      print "# No duplicates found\n";
   }
}

# Take file names from the command line, or from standard input (one
# per line) when no arguments are given.
if( $#ARGV < 0 )
{
   my (@list);
   while( <STDIN> )
   {
      chomp;
      push @list, $_;
   }
   ProcessFiles(Canonicalize(@list));
}
else
{
   ProcessFiles(Canonicalize(@ARGV));
}
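
# Example usage (illustrative only; the directory and output file names
# below are assumptions, not part of the script).  The script never
# modifies anything itself: it prints "ln -s -f" commands, so the output
# can be reviewed before being fed to a shell.
#
#   find photos -type f | perl natsume1.pl > dedup.sh   # file list on stdin
#   perl natsume1.pl photos/*.jpg > dedup.sh            # or as arguments
#   sh dedup.sh                                         # apply after review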