/* yume8.c - Don Yang (uguu.org) 03/19/04 */ #include #include #include typedef struct _line { unsigned int key; int pos, freq, size, rstart, rlength; char *line; struct _line *next; } Line; typedef struct _node { unsigned int key; int bucketsize; Line **lines; struct _node *left, *right; } Node; Node *Keys, **Glob_X, *Glob_N, *Node_l, *Node_r, *Node_p; Line *Head, *Tail, *D, **E; int Glob_I, Glob_J, Glob_K, LineNumber, HistorySize, LineSize, LineLength, OptUniqOnly, OptDupOnly, OptIgnoreCase, OptShowCount, OptKeepUnmatched, Marker_repeat[9], Marker_cursor, Marker_lastsearch, Marker_nest, Enq_i, Proc_maxsize, Proc_buffersize, Proc_start, Proc_end, Proc_eof, Proc_grow, Proc_status; char *LeftExpr, *RightExpr, *Str, *Glob_P, *Glob_Q, GenExpr[32], FreqCount[16], *Proc_buffer, *Proc_p, *Marker_paren[8], *DefaultLeftExpr = "^", *DefaultRightExpr = "$"; FILE *Infile, *Outfile; unsigned int CRCTable[256], CRCKey; int s_alpha(int c) { return (c > 64 && c < 91) || (c > 96 && c < 123); } int s_digit(int c) { return (c > 47 && c < 58); } int s_alnum(int c) { return s_alpha(c) || s_digit(c); } int s_space(int c) { return (c == 32) || (c > 8 && c < 14); } int s_lower(int c) { return (c > 64 && c < 91) ? c + 32 : c; } int s_fclose(FILE *a) { return fclose(a); } int s_fwrite(char *a, int size) { return fwrite(a, size, 1, Outfile); } int s_sprintf(char *a, char *t, int d) { return sprintf(a, t, d); } int s_printf(char *t, char *a) { return printf(t, a); } int s_free(void *a) { free(a); return 1; } void *s_alloc(int size) { return calloc(size, 1); } void s_memcpy(void *dst, void *src, int size) { memcpy(dst, src, size); } void FlushLine(void) { if( !(OptUniqOnly && Head->freq > 1) && !(OptDupOnly && Head->freq < 2) ) { OptShowCount ? ( Head->freq > 9999999 ) ? s_fwrite("*a lot*\t", 8) : s_sprintf(FreqCount, "%7d\t", Head->freq) ? s_fwrite(FreqCount, 8) : 0 : 0; s_fwrite(Head->line, Head->size); } for(Glob_X = &Keys; *Glob_X;) { if( (*Glob_X)->key == Head->key ) break; Glob_X = (*Glob_X)->key > Head->key ? &((*Glob_X)->left) : &((*Glob_X)->right); } if( *Glob_X ) { Glob_N = *Glob_X; for(Glob_I = Glob_J = 0; Glob_I < Glob_N->bucketsize; Glob_I++) { (Glob_N->lines[Glob_I] - Head) ? 0 : (Glob_N->lines[Glob_I] = NULL); Glob_N->lines[Glob_I] ? 0 : Glob_J++; } if( Glob_J >= Glob_N->bucketsize ) { if( Glob_N->left ) if( Glob_N->right ) { Node_p = Node_l = Glob_N->left; if( (Node_r = Node_l->right) ) { for(; Node_r->right; Node_r = Node_r->right) Node_p = Node_r; Node_p->right = Node_r->left; Node_r->left = Node_l; Node_r->right = Glob_N->right; *Glob_X = Node_r; } else { Node_l->right = Glob_N->right; *Glob_X = Node_l; } } else *Glob_X = Glob_N->left; else *Glob_X = Glob_N->right; s_free(Glob_N->lines); s_free(Glob_N); } } D = Head; (Head = Head->next) ? 0 : (Tail = NULL); s_free(D->line); s_free(D); } int s_atoi(void) { return atoi(Str); } int s_abs_atoi(void) { return abs(s_atoi()); } int AddKey0(void) { for(Glob_I = 0; Glob_I < Glob_N->bucketsize; Glob_I++) if( !Glob_N->lines[Glob_I] ) { Glob_N->lines[Glob_I] = D; return 0; } if( !(E = (Line**)s_alloc(Glob_N->bucketsize * 2 * sizeof(Line*))) ) return 1; s_memcpy(E, Glob_N->lines, Glob_N->bucketsize * sizeof(Line*)); s_free(Glob_N->lines); (Glob_N->lines = E)[Glob_N->bucketsize] = D; Glob_N->bucketsize *= 2; return 0; } int GetN(void) { for(Glob_K = (int)*(Glob_P++) - 48; s_digit(*Glob_P); ++Glob_P) Glob_K = Glob_K * 10 + (int)*Glob_P - 48; --Glob_P; return Glob_K; } int SetMarker(int dir, int *pos1, int *pos2, char *expr) { Marker_cursor = *pos1; Marker_repeat[Marker_lastsearch = Marker_nest = 0] = 0; for(Glob_P = expr; *Glob_P; Glob_P++) switch( *Glob_P ) { case 94: Marker_cursor = 0; case 43: dir = 1; break; case 36: Marker_cursor = LineLength - 1; case 45: dir = -1; break; case 64: *pos2 = Marker_cursor; break; case 40: if( Marker_nest > 7 ) return 0; Marker_paren[Marker_nest++] = Glob_P; Marker_repeat[Marker_nest] = 0; break; case 41: if( !Marker_nest ) return 0; Marker_nest--; if( Marker_repeat[Marker_nest] > 1 ) { Glob_P = Marker_paren[Marker_nest]; Marker_repeat[Marker_nest++]--; } else if( Marker_repeat[Marker_nest] == 1 ) { Glob_P++; GetN(); Marker_repeat[Marker_nest]--; } else { Glob_Q = Glob_P++; if( !s_digit(*Glob_P) ) return 0; if( (Marker_repeat[Marker_nest] = GetN()) < 2 ) { Marker_repeat[Marker_nest] = 0; } else { Glob_P = Glob_Q - 1; Marker_nest++; } } break; default: if( s_digit(*Glob_P) ) { Marker_cursor += GetN() * dir; if( Marker_cursor < 0 || Marker_cursor >= LineLength ) return 0; Marker_lastsearch = 0; } else if( strchr("asdiASDI", *Glob_P) ) { if( Marker_lastsearch == -(int)*Glob_P ) Marker_cursor += dir; switch( *Glob_P ) { #define CHARSEARCH1(chartest) \ for(; Marker_cursor > -1 && \ Marker_cursor < LineLength; \ Marker_cursor += dir) \ if( chartest(Str[Marker_cursor]) ) \ break; \ if( Marker_cursor < 0 || Marker_cursor >= LineLength ) \ return 0; case 97: CHARSEARCH1(s_alpha); break; case 115: CHARSEARCH1(s_space); break; case 100: CHARSEARCH1(s_digit); break; case 105: CHARSEARCH1(s_alnum); break; #define CHARSEARCH0(chartest) \ for(; Marker_cursor > -1 && \ Marker_cursor < LineLength; \ Marker_cursor += dir) \ if( !chartest(Str[Marker_cursor]) ) \ break; \ if( Marker_cursor < 0 || Marker_cursor >= LineLength ) \ return 0; case 65: CHARSEARCH0(s_alpha); break; case 83: CHARSEARCH0(s_space); break; case 68: CHARSEARCH0(s_digit); break; case 73: CHARSEARCH0(s_alnum); break; default: break; } Marker_lastsearch = -(int)*Glob_P; } else { if( *Glob_P == 47 ) if( !*++Glob_P ) return 0; for((Marker_lastsearch - (int)*Glob_P) ? 0 : (Marker_cursor += dir); (Marker_cursor > -1 && Marker_cursor < LineLength) ? Str[Marker_cursor] - *Glob_P : 0; Marker_cursor += dir); if( Marker_cursor < 0 || Marker_cursor >= LineLength ) return 0; Marker_lastsearch = (int)*Glob_P; } break; } *pos1 = Marker_cursor; return 1; } int ProcessLine(void) { LineNumber++; if( HistorySize > 0 ) for(; Head ? (LineNumber - Head->pos > HistorySize) : 0; FlushLine()); LineLength = LineSize; for(Glob_K = 0; Glob_K < 2 && LineLength > 0; Glob_K++) { if( (Str[LineLength - 1] - 13) && (Str[LineLength - 1] - 10) ) break; LineLength--; } CRCKey = ~0; if( LineLength > 0 ) { Glob_I = Enq_i = 0; Glob_J = LineLength - 1; if ( SetMarker(1, &Glob_I, &Glob_J, LeftExpr) ? SetMarker(-1, &Glob_J, &Glob_I, RightExpr) ? ((Glob_J = Glob_J - Glob_I + 1) < 0) : 1 : 1 ) { if( !OptKeepUnmatched ) return 0; Glob_I = 0; Glob_J = LineLength; } Glob_K = Glob_J; Glob_P = Str + Glob_I; if( OptIgnoreCase ) for(; Glob_K > 0; Glob_K--) CRCKey = ((CRCKey >> 8) & 0xffffff) ^ CRCTable[(CRCKey ^ (unsigned int)s_lower(*(Glob_P++))) & 0xff]; else for(; Glob_K > 0; Glob_K--) CRCKey = ((CRCKey >> 8) & 0xffffff) ^ CRCTable[(CRCKey ^ (unsigned int)*(Glob_P++)) & 0xff]; } else { Glob_I = Glob_J = 0; if( !SetMarker(1, &Glob_I, &Glob_J, LeftExpr) || !SetMarker(-1, &Glob_J, &Glob_I, RightExpr) ) if( !OptKeepUnmatched ) return 0; Glob_I = Glob_J = 0; } for(Glob_N = Keys; Glob_N ? (Glob_N->key - CRCKey) : 0; Glob_N = (Glob_N->key > CRCKey) ? Glob_N->left : Glob_N->right); if( Glob_N ) { E = Glob_N->lines; for(Glob_K = 0; Glob_K < Glob_N->bucketsize; Glob_K++) { if( *E ) if( (*E)->rlength == Glob_J ) { Glob_P = (*E)->line + (*E)->rstart; Glob_Q = Str + Glob_I; Enq_i = Glob_J; if( OptIgnoreCase ) { for(; Enq_i-- > 0;) if( s_lower(*Glob_P++) - s_lower(*Glob_Q++) ) break; } else for(; Enq_i-- > 0;) if( *Glob_P++ - *Glob_Q++ ) break; if( Enq_i < 0 ) { (*E)->pos = LineNumber; (*E)->freq++; return 0; } } E++; } } if( !(D = (Line*)s_alloc(sizeof(Line))) ) return 1; if( !(D->line = (char*)s_alloc(D->size = LineSize)) ) return s_free(D); s_memcpy(D->line, Str, LineSize); D->next = NULL; D->freq = 1; D->pos = LineNumber; D->rstart = Glob_I; D->rlength = Glob_J; D->key = CRCKey; Tail ? (Tail = (Tail->next = D)) : (Head = Tail = D); if( HistorySize < 0 && LineNumber > -HistorySize ) return 0; for(Glob_X = &Keys; *Glob_X;) { if( (*Glob_X)->key == CRCKey ) { Glob_N = *Glob_X; return AddKey0(); } Glob_X = ((*Glob_X)->key > CRCKey) ? &((*Glob_X)->left) : &((*Glob_X)->right); } if( !(*Glob_X = Glob_N = (Node*)s_alloc(sizeof(Node))) ) return 1; if( !(Glob_N->lines = (Line**)s_alloc((Glob_N->bucketsize = 4) * sizeof(Line*))) ) { *Glob_X = NULL; return s_free(Glob_N); } Glob_N->key = CRCKey; Glob_N->left = Glob_N->right = NULL; return AddKey0(); } int Process(void) { if( !(Proc_buffer = (char*)s_alloc(Proc_maxsize = 0x100)) ) return 1; for(Proc_buffersize = Proc_eof = Proc_start = Proc_status = 0;;) { if( Proc_buffersize < Proc_maxsize ) { if( (Glob_K = fread(Proc_buffer + Proc_buffersize, 1, Proc_maxsize - Proc_buffersize, Infile)) < Proc_maxsize - Proc_buffersize ) Proc_eof = 1; Proc_buffersize += Glob_K; } for(Proc_start = Proc_grow = 0; Proc_start < Proc_buffersize; Proc_start = Proc_end) { Proc_grow = 1; Str = Proc_p = Proc_buffer + Proc_start; for(Proc_end = Proc_start; Proc_end < Proc_buffersize; Proc_end++) { if( *Proc_p == 10 ) { LineSize = ++Proc_end - Proc_start; Proc_status += ProcessLine(); Proc_grow = 0; break; } else if( *Proc_p == 13 ) { if( ++Proc_end < Proc_buffersize ) { if( *++Proc_p == 10 ) Proc_end++; LineSize = Proc_end - Proc_start; Proc_status += ProcessLine(); Proc_grow = 0; } break; } Proc_p++; } if( Proc_grow ) break; } if( Proc_eof ) break; if( Proc_start > 0 ) { Proc_p = Proc_buffer; for(Glob_K = 0; Glob_K < Proc_buffersize - Proc_start; Glob_K++) { *Proc_p = *(Proc_p + Proc_start); Proc_p++; } Proc_buffersize -= Proc_start; Proc_start = 0; } else { if( !(Proc_p = (char*)s_alloc(Proc_maxsize * 2)) ) return s_free(Proc_buffer); s_memcpy(Proc_p, Proc_buffer, Proc_buffersize); s_free(Proc_buffer); Proc_buffer = Proc_p; Proc_maxsize *= 2; } } if( Proc_buffersize > Proc_start ) { Str = Proc_buffer + Proc_start; LineSize = Proc_buffersize - Proc_start; Proc_status += ProcessLine(); } s_free(Proc_buffer); return Proc_status; } int main(int argc, char **argv) { Glob_P = Glob_Q = NULL; OptUniqOnly = OptDupOnly = OptIgnoreCase = OptShowCount = OptKeepUnmatched = HistorySize = LineNumber = 0; LeftExpr = DefaultLeftExpr; RightExpr = DefaultRightExpr; for(Glob_I = 1; Glob_I < argc; Glob_I++) if( argv[Glob_I][0] == 45 ) switch( argv[Glob_I][1] ) { case 0: Glob_P = *argv; break; case 117: OptUniqOnly = 1; break; case 100: OptDupOnly = 1; break; case 105: OptIgnoreCase = 1; break; case 99: OptShowCount = 1; break; case 107: OptKeepUnmatched = 1; break; case 104: case 108: case 114: case 102: case 115: case 119: if( argv[Glob_J = Glob_I][2] ) Str = &argv[Glob_I][2]; else { if( Glob_I > argc ) return s_printf( "not enough arguments for %s\n", argv[Glob_I]); Str = argv[++Glob_I]; } (argv[Glob_J][1] == 104) ? (HistorySize = s_atoi()) : (argv[Glob_J][1] == 108) ? !(LeftExpr = Str) : (argv[Glob_J][1] == 114) ? !(RightExpr = Str) : (LeftExpr = DefaultLeftExpr) ? (RightExpr = DefaultRightExpr) ? (argv[Glob_J][1] - 102) ? (argv[Glob_J][1] - 115) ? s_sprintf(RightExpr = GenExpr, "^%d", abs(s_atoi()-1)) : s_sprintf(LeftExpr = GenExpr, "^%d", s_abs_atoi()) : s_sprintf(LeftExpr = GenExpr, "^S(sS)%d", s_abs_atoi()) :0:0 ; break; default: return s_printf("invalid option: %s\n", argv[Glob_I]); } else Glob_P ? (Glob_Q = argv[Glob_I]) : (Glob_P = argv[Glob_I]); Infile = stdin; Outfile = stdout; if( Glob_P && Glob_P - *argv ) if( !(Infile = fopen(Glob_P, "rb")) ) return s_printf("can not open %s\n", Glob_P); if( Glob_Q ) if( !(Outfile = fopen(Glob_Q, "wb+")) ) return s_fclose(Infile) + s_printf("can not create %s\n", Glob_Q); for(Glob_I = 0; (CRCKey = Glob_I) < 256; CRCTable[Glob_I++] = CRCKey) for(Glob_J = 8; Glob_J > 0; Glob_J--) (CRCKey & 1) ? (CRCKey = (CRCKey >> 1) ^ 0xedb88320) : (CRCKey >>= 1); Keys = NULL; Head = Tail = NULL; for(Process() ? fputs("out of memory\n", stderr) : 0; Head; FlushLine()); return s_fclose(Infile) + s_fclose(Outfile); }