#!/usr/bin/perl # yume1.pl - Don Yang (uguu.org) # Slightly optimized version of yume0.pl # # Supported options: -i -c -f -s -w -k -u -d -h1 # Not binary clean, feed this only text files with uniform end of lines. # # 03/20/04 use strict; local (*INFILE, *OUTFILE); my (%Freq, @Text, $f_regexp); my ($Opt_i, $Opt_c, $Opt_h, $Opt_k, $Opt_u, $Opt_d); my ($Opt_f, $Opt_s, $Opt_w); sub ParseOptions { my ($iname, $oname, $i, $f, $a); $iname = undef; $oname = undef; ($Opt_i, $Opt_c, $Opt_h, $Opt_k, $Opt_u, $Opt_d, $Opt_f, $Opt_s, $Opt_w) = ( undef, undef, undef, undef, undef, undef, undef, undef, undef); for($i = 0; $i <= $#ARGV; $i++) { if( $ARGV[$i] eq '-' ) { $iname = '-'; } elsif( $ARGV[$i] =~ /^-([ichkud])/ ) { eval "\$Opt_$1++;"; } elsif( $ARGV[$i] =~ /^-([fsw])(\d*)$/ ) { ($f, $a) = ($1, $2); if( length($a) == 0 ) { if( $i == $#ARGV ) { die "not enough arguments for $ARGV[$i]\n"; } $a = $ARGV[++$i]; } eval "\$Opt_$f = '$a';"; if( $f eq 'f' ) { $f_regexp = '^' . ('\S+\s+' x $a) . '(\S.*)$'; } } elsif( $ARGV[$i] =~ /^-[lrh]/ ) { die "unsupported option: $ARGV[$i]\n"; } elsif( $ARGV[$i] =~ /^-/ ) { die "invalid option: $ARGV[$i]\n"; } else { if( $iname ) { $oname = $ARGV[$i]; } else { $iname = $ARGV[$i]; } } } if( $iname && $iname ne '-' ) { open INFILE, "< $iname" or die "can not open $iname: $!\n"; } else { open INFILE, "<&STDIN" or die "can not dup STDIN: $!\n"; } if( $oname ) { open OUTFILE, "> $oname" or die "can not create $oname: $!\n"; } else { open OUTFILE, ">&STDOUT" or die "can not dup STDOUT: $!\n"; } } sub OutputLine { my ($str, $count) = @_; return unless $count; return if( $Opt_u && $count > 1 ); return if( $Opt_d && $count < 2 ); if( $Opt_c ) { print OUTFILE (sprintf "\%7d\t", $count); } print OUTFILE $str; } sub Filter { my ($line) = @_; chomp $line; if( defined($Opt_f) ) { unless( $line =~ s/$f_regexp/$1/o ) { $line = undef unless $Opt_k; } } elsif( defined($Opt_w) ) { if( length $line >= $Opt_w ) { $line = substr($line, 0, $Opt_w); } else { $line = undef unless $Opt_k; } } elsif( defined($Opt_s) ) { if( length $line > $Opt_s ) { $line = substr($line, $Opt_s); } else { $line = undef unless $Opt_k; } } return undef unless defined $line; return $Opt_i ? lc($line) : $line; } sub Run { my ($line, $key); while( $line = ) { if( defined($key = Filter($line)) ) { if( ++$Freq{$key} < 2 ) { push @Text, $line; } } } OutputLine($_, $Freq{Filter($_)}) foreach (@Text); } sub FlushLine { my ($line, $count) = @_; return unless defined $line; OutputLine($line, $count); } sub Uniq { my ($line, $lastline, $key, $lastkey, $count); $lastline = undef; $lastkey = undef; $count = 0; while( $line = ) { if( defined($key = Filter($line)) ) { if( $key eq $lastkey ) { $count++; } else { FlushLine($lastline, $count); $lastline = $line; $lastkey = $key; $count = 1; } } else { FlushLine($lastline, $count); $lastline = undef; $lastkey = undef; } } FlushLine($lastline, $count); } %Freq = (); @Text = (); ParseOptions(); if( $Opt_h ) { Uniq(); } else { Run(); } close INFILE; close OUTFILE;