#!/usr/bin/perl # yume0.pl - Don Yang (uguu.org) # Subset of yume.c's features for correctness/benchmark test. # # Supported options: -i -c -f -s -w -k -u -d -h1 # Not binary clean, feed this only text files with uniform end of lines. # # 03/20/04 use strict; local (*INFILE, *OUTFILE); my (%Opt, %Freq, @Text, $f_regexp); sub ParseOptions { my ($iname, $oname, $i, $f, $a); $iname = undef; $oname = undef; for($i = 0; $i <= $#ARGV; $i++) { if( $ARGV[$i] eq '-' ) { $iname = '-'; } elsif( $ARGV[$i] eq '-i' || $ARGV[$i] eq '-c' || $ARGV[$i] eq '-h1' || $ARGV[$i] eq '-k' || $ARGV[$i] eq '-u' || $ARGV[$i] eq '-d' ) { $Opt{$ARGV[$i]}++; } elsif( $ARGV[$i] =~ /^(-[fsw])(\d*)$/ ) { ($f, $a) = ($1, $2); if( length($a) == 0 ) { if( $i == $#ARGV ) { die "not enough arguments for $ARGV[$i]\n"; } $a = $ARGV[++$i]; } $Opt{$f} = $a; if( $f eq '-f' ) { $f_regexp = '^' . ('\S+\s+' x $a) . '(\S.*)$'; } } elsif( $ARGV[$i] =~ /^-[lrh]/ ) { die "unsupported option: $ARGV[$i]\n"; } elsif( $ARGV[$i] =~ /^-/ ) { die "invalid option: $ARGV[$i]\n"; } else { if( $iname ) { $oname = $ARGV[$i]; } else { $iname = $ARGV[$i]; } } } if( $iname && $iname ne '-' ) { open INFILE, "< $iname" or die "can not open $iname: $!\n"; } else { open INFILE, "<&STDIN" or die "can not dup STDIN: $!\n"; } if( $oname ) { open OUTFILE, "> $oname" or die "can not create $oname: $!\n"; } else { open OUTFILE, ">&STDOUT" or die "can not dup STDOUT: $!\n"; } } sub OutputLine { my ($str, $count) = @_; return unless $count; return if( $Opt{'-u'} && $count > 1 ); return if( $Opt{'-d'} && $count < 2 ); if( $Opt{'-c'} ) { print OUTFILE (sprintf "\%7d\t", $count); } print OUTFILE $str; } sub Filter { my ($line) = @_; chomp $line; if( exists $Opt{'-f'} ) { unless( $line =~ s/$f_regexp/$1/o ) { $line = undef unless $Opt{'-k'}; } } elsif( exists $Opt{'-w'} ) { if( length $line >= $Opt{'-w'} ) { $line = substr($line, 0, $Opt{'-w'}); } else { $line = undef unless $Opt{'-k'}; } } elsif( exists $Opt{'-s'} ) { if( length $line > $Opt{'-s'} ) { $line = substr($line, $Opt{'-s'}); } else { $line = undef unless $Opt{'-k'}; } } return undef unless defined $line; return $Opt{'-i'} ? lc($line) : $line; } sub Run { my ($line, $key); while( $line = ) { if( defined($key = Filter($line)) ) { if( ++$Freq{$key} < 2 ) { push @Text, $line; } } } OutputLine($_, $Freq{Filter($_)}) foreach (@Text); } sub FlushLine { my ($line, $count) = @_; return unless defined $line; OutputLine($line, $count); } sub Uniq { my ($line, $lastline, $key, $lastkey, $count); $lastline = undef; $lastkey = undef; $count = 0; while( $line = ) { if( defined($key = Filter($line)) ) { if( $key eq $lastkey ) { $count++; } else { FlushLine($lastline, $count); $lastline = $line; $lastkey = $key; $count = 1; } } else { FlushLine($lastline, $count); $lastline = undef; $lastkey = undef; $count = 0; } } FlushLine($lastline, $count); } %Opt = (); %Freq = (); @Text = (); ParseOptions(); if( $Opt{'-h1'} ) { Uniq(); } else { Run(); } close INFILE; close OUTFILE;