/* yume3.c - Don Yang (uguu.org) 03/18/04 */ #include #include #include #include typedef struct _line { unsigned int key; int pos, freq, size; int rstart, rlength; char *line; struct _line *next; } Line; typedef struct _node { unsigned int key; int bucketsize; Line **lines; struct _node *left, *right; } Node; static Node *Keys, **Glob_X, *Glob_N; static Line *Head, *Tail, *D, **E; static int Glob_I, Glob_J, Glob_K; static int LineNumber, HistorySize, LineSize, LineLength; static int OptUniqOnly, OptDupOnly; static int OptIgnoreCase, OptShowCount, OptKeepUnmatched; static char *LeftExpr, *RightExpr, *Str, *Glob_P, *Glob_Q; static FILE *Infile, *Outfile; static char GenExpr[32], FreqCount[16]; static int Enq_i; static int Proc_maxsize, Proc_buffersize; static int Proc_start, Proc_end, Proc_eof, Proc_grow, Proc_status; static char *Proc_buffer, *Proc_p; #define MAX_NEST 8 static Node *Node_l, *Node_r, *Node_p; static char *Marker_paren[MAX_NEST]; static int Marker_repeat[MAX_NEST + 1], Marker_cursor; static int Marker_lastsearch, Marker_nest; static char *DefaultLeftExpr = "^"; static char *DefaultRightExpr = "$"; static unsigned int CRCTable[256], CRCKey; static int Cmp(char *a, char *b, int size); static void CRC(char *p, int size); static void InitCRC(void); static int EnqueueLine(void); static void DequeueLine(void); static void FlushLine(void); static int AddKey(void); static int AddKey0(void); static void DeleteKey(void); static void FindKey(void); static int GetN(void); static int SetMarker(int dir, int *pos1, int *pos2, char *expr); static int Process(void); static int ProcessLine(void); static int s_lower(int c); static int s_alpha(int c); static int s_alnum(int c); static int s_digit(int c); static int s_space(int c); int main(int argc, char **argv) { Glob_P = Glob_Q = NULL; OptUniqOnly = OptDupOnly = OptIgnoreCase = OptShowCount = OptKeepUnmatched = HistorySize = 0; LeftExpr = DefaultLeftExpr; RightExpr = DefaultRightExpr; for(Glob_I = 1; Glob_I < argc; Glob_I++) { if( argv[Glob_I][0] == '-' ) { switch( argv[Glob_I][1] ) { case '\0': Glob_P = *argv; break; case 'u': OptUniqOnly = 1; break; case 'd': OptDupOnly = 1; break; case 'i': OptIgnoreCase = 1; break; case 'c': OptShowCount = 1; break; case 'k': OptKeepUnmatched = 1; break; case 'h': case 'l': case 'r': case 'f': case 's': case 'w': Glob_J = Glob_I; if( argv[Glob_I][2] != '\0' ) { Str = &argv[Glob_I][2]; } else { if( Glob_I + 1 >= argc ) return printf( "not enough arguments for %s\n", argv[Glob_I]); Str = argv[++Glob_I]; } if( argv[Glob_J][1] == 'h' ) { HistorySize = atoi(Str); } else if( argv[Glob_J][1] == 'l' ) { LeftExpr = Str; } else if( argv[Glob_J][1] == 'r' ) { RightExpr = Str; } else { LeftExpr = DefaultLeftExpr; RightExpr = DefaultRightExpr; if( argv[Glob_J][1] == 'f' ) { sprintf(GenExpr, "^S(sS)%d", abs(atoi(Str))); LeftExpr = GenExpr; } else { if( argv[Glob_J][1] == 's' ) { sprintf(GenExpr, "^%d", abs(atoi(Str))); LeftExpr = GenExpr; } else { sprintf(GenExpr, "^%d", abs(atoi(Str) - 1)); RightExpr = GenExpr; } } } break; default: return printf("invalid option: %s\n", argv[Glob_I]); } } else { if( Glob_P == NULL ) Glob_P = argv[Glob_I]; else Glob_Q = argv[Glob_I]; } } if( Glob_P == NULL || Glob_P == *argv ) { Infile = stdin; } else { if( (Infile = fopen(Glob_P, "rb")) == NULL ) return printf("can not open %s\n", Glob_P); } if( Glob_Q == NULL ) { Outfile = stdout; } else { if( (Outfile = fopen(Glob_Q, "wb+")) == NULL ) { (void)fclose(Infile); return printf("can not create %s\n", Glob_Q); } } InitCRC(); Keys = NULL; Head = Tail = NULL; LineNumber = 0; if( Process() != 0 ) (void)fputs("out of memory\n", stderr); while( Head != NULL ) { FlushLine(); DequeueLine(); } assert(Keys == NULL); (void)fclose(Infile); (void)fclose(Outfile); return 0; } static int Cmp(char *a, char *b, int size) { if( OptIgnoreCase != 0 ) { while( size-- > 0 ) { if( s_lower(*a++) != s_lower(*b++) ) break; } } else { while( size-- > 0 ) { if( *a++ != *b++ ) break; } } return (size >= 0) ? 1 : 0; } static void CRC(char *p, int size) { if( OptIgnoreCase != 0 ) { for(CRCKey = ~0U; size > 0; size--) { CRCKey = ((CRCKey >> 8) & 0xffffff) ^ CRCTable[(CRCKey ^ (unsigned int)s_lower(*p)) & 0xff]; p++; } } else { for(CRCKey = ~0U; size > 0; size--) { CRCKey = ((CRCKey >> 8) & 0xffffff) ^ CRCTable[(CRCKey ^ (unsigned int)*(p++)) & 0xff]; } } } static void InitCRC(void) { for(Glob_I = 0; Glob_I < 256; Glob_I++) { CRCKey = Glob_I; for(Glob_J = 8; Glob_J > 0; Glob_J--) { if( (CRCKey & 1) != 0 ) CRCKey = (CRCKey >> 1) ^ 0xedb88320U; else CRCKey >>= 1; } CRCTable[Glob_I] = CRCKey; } } static int EnqueueLine(void) { assert(LineSize > 0); LineLength = LineSize; for(Glob_K = 0; Glob_K < 2 && LineLength > 0; Glob_K++) { if( Str[LineLength - 1] == '\r' || Str[LineLength - 1] == '\n' ) LineLength--; } if( LineLength > 0 ) { Glob_I = Enq_i = 0; Glob_J = LineLength - 1; if( SetMarker(1, &Glob_I, &Glob_J, LeftExpr) == 0 ) { if( SetMarker(-1, &Glob_J, &Glob_I, RightExpr) == 0 ) { Glob_J = Glob_J - Glob_I + 1; if( Glob_J >= 0 ) Enq_i = 1; } } if( Enq_i == 0 ) { if( OptKeepUnmatched == 0 ) return 0; Glob_I = 0; Glob_J = LineLength; } CRC(Str + Glob_I, Glob_J); } else { Glob_I = Glob_J = 0; if( SetMarker(1, &Glob_I, &Glob_J, LeftExpr) != 0 || SetMarker(-1, &Glob_J, &Glob_I, RightExpr) != 0 ) { if( OptKeepUnmatched == 0 ) return 0; } Glob_I = Glob_J = 0; CRCKey = ~0U; } FindKey(); if( Glob_N != NULL ) { E = Glob_N->lines; for(Glob_K = 0; Glob_K < Glob_N->bucketsize; Glob_K++) { if( *E != NULL ) { if( (*E)->rlength == Glob_J ) { if( Cmp((*E)->line + (*E)->rstart, Str + Glob_I, Glob_J) == 0 ) { (*E)->pos = LineNumber; (*E)->freq++; return 0; } } } E++; } } if( (D = (Line*)malloc(sizeof(Line))) == NULL ) return 1; if( (D->line = (char*)malloc((size_t)(D->size = LineSize))) == NULL ) { free(D); return 1; } memcpy(D->line, Str, (size_t)LineSize); D->next = NULL; D->freq = 1; D->pos = LineNumber; D->rstart = Glob_I; D->rlength = Glob_J; D->key = CRCKey; if( Tail == NULL ) { Head = Tail = D; } else { Tail->next = D; Tail = D; } if( HistorySize < 0 && LineNumber > -HistorySize ) return 0; return AddKey(); } static void DequeueLine(void) { assert(Head != NULL); DeleteKey(); D = Head; if( (Head = Head->next) == NULL ) Tail = NULL; free(D->line); free(D); } static void FlushLine(void) { if( !((OptUniqOnly != 0 && Head->freq > 1) || (OptDupOnly != 0 && Head->freq <= 1)) ) { if( OptShowCount != 0 ) { if( Head->freq < 10000000 ) { sprintf(FreqCount, "%7d\t", Head->freq); (void)fwrite(FreqCount, 8, 1, Outfile); } else { (void)fwrite("*a lot*\t", 8, 1, Outfile); } } (void)fwrite(Head->line, (size_t)Head->size, 1, Outfile); } } static int AddKey(void) { for(Glob_X = &Keys; *Glob_X != NULL;) { if( (*Glob_X)->key == CRCKey ) { Glob_N = *Glob_X; return AddKey0(); } Glob_X = ((*Glob_X)->key > CRCKey) ? &((*Glob_X)->left) : &((*Glob_X)->right); } if( (*Glob_X = Glob_N = (Node*)malloc(sizeof(Node))) == NULL ) return 1; if( (Glob_N->lines = (Line**)calloc((size_t)(Glob_N->bucketsize = 4), sizeof(Line*))) == NULL ) { free(Glob_N); *Glob_X = NULL; return 1; } Glob_N->key = CRCKey; Glob_N->left = Glob_N->right = NULL; return AddKey0(); } static int AddKey0(void) { for(Glob_I = 0; Glob_I < Glob_N->bucketsize; Glob_I++) { if( Glob_N->lines[Glob_I] == NULL ) { Glob_N->lines[Glob_I] = D; return 0; } } if( (E = (Line**)calloc((size_t)(Glob_N->bucketsize * 2), sizeof(Line*))) == NULL ) return 1; memcpy(E, Glob_N->lines, Glob_N->bucketsize * sizeof(Line*)); free(Glob_N->lines); (Glob_N->lines = E)[Glob_N->bucketsize] = D; Glob_N->bucketsize *= 2; return 0; } static void DeleteKey(void) { for(Glob_X = &Keys; *Glob_X != NULL;) { if( (*Glob_X)->key == Head->key ) break; Glob_X = ((*Glob_X)->key > Head->key) ? &((*Glob_X)->left) : &((*Glob_X)->right); } if( *Glob_X != NULL ) { Glob_N = *Glob_X; for(Glob_I = Glob_J = 0; Glob_I < Glob_N->bucketsize; Glob_I++) { if( Glob_N->lines[Glob_I] == Head ) Glob_N->lines[Glob_I] = NULL; if( Glob_N->lines[Glob_I] == NULL ) Glob_J++; } if( Glob_J >= Glob_N->bucketsize ) { if( Glob_N->left == NULL ) { *Glob_X = Glob_N->right; } else if( Glob_N->right == NULL ) { *Glob_X = Glob_N->left; } else { Node_p = Node_l = Glob_N->left; if( (Node_r = Node_l->right) == NULL ) { Node_l->right = Glob_N->right; *Glob_X = Node_l; } else { while( Node_r->right != NULL ) { Node_p = Node_r; Node_r = Node_r->right; } Node_p->right = Node_r->left; Node_r->left = Node_l; Node_r->right = Glob_N->right; *Glob_X = Node_r; } } free(Glob_N->lines); free(Glob_N); } } } static void FindKey(void) { for(Glob_N = Keys; Glob_N != NULL;) { if( Glob_N->key == CRCKey ) break; Glob_N = (Glob_N->key > CRCKey) ? Glob_N->left : Glob_N->right; } } static int GetN(void) { for(Glob_K = (int)*(Glob_P++) - (int)'0'; s_digit(*Glob_P); ++Glob_P) Glob_K = Glob_K * 10 + (int)*Glob_P - (int)'0'; --Glob_P; return Glob_K; } static int SetMarker(int dir, int *pos1, int *pos2, char *expr) { Marker_cursor = *pos1; Marker_lastsearch = Marker_nest = 0; Marker_repeat[0] = 0; for(Glob_P = expr; *Glob_P != '\0'; Glob_P++) { switch( *Glob_P ) { case '^': Marker_cursor = 0; case '+': dir = 1; break; case '$': Marker_cursor = LineLength - 1; case '-': dir = -1; break; case '@': *pos2 = Marker_cursor; break; case '(': if( Marker_nest >= MAX_NEST ) return 5; Marker_paren[Marker_nest++] = Glob_P; Marker_repeat[Marker_nest] = 0; break; case ')': if( Marker_nest == 0 ) return 6; Marker_nest--; if( Marker_repeat[Marker_nest] > 1 ) { assert(Marker_paren[Marker_nest] != NULL); Glob_P = Marker_paren[Marker_nest]; Marker_repeat[Marker_nest]--; Marker_nest++; } else if( Marker_repeat[Marker_nest] == 1 ) { assert(s_digit(*(Glob_P + 1))); Glob_P++; (void)GetN(); Marker_repeat[Marker_nest]--; } else { Glob_Q = Glob_P++; if( !s_digit(*Glob_P) ) return 4; if( (Marker_repeat[Marker_nest] = GetN()) < 2 ) { Marker_repeat[Marker_nest] = 0; } else { Glob_P = Glob_Q - 1; Marker_nest++; } } break; case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': Marker_cursor += GetN() * dir; if( Marker_cursor < 0 || Marker_cursor >= LineLength ) return 3; Marker_lastsearch = 0; break; default: if( strchr("asdiASDI", *Glob_P) != NULL ) { if( Marker_lastsearch == -(int)*Glob_P ) Marker_cursor += dir; switch( *Glob_P ) { #define CHARSEARCH(chartest) \ for(; Marker_cursor >= 0 && \ Marker_cursor < LineLength; \ Marker_cursor += dir) \ { \ if( chartest(Str[Marker_cursor]) ) \ break; \ } \ if( Marker_cursor < 0 || Marker_cursor >= LineLength ) \ return 1; case 'a': CHARSEARCH(s_alpha); break; case 's': CHARSEARCH(s_space); break; case 'd': CHARSEARCH(s_digit); break; case 'i': CHARSEARCH(s_alnum); break; #undef CHARSEARCH #define CHARSEARCH(chartest) \ for(; Marker_cursor >= 0 && \ Marker_cursor < LineLength; \ Marker_cursor += dir) \ { \ if( !chartest(Str[Marker_cursor]) ) \ break; \ } \ if( Marker_cursor < 0 || Marker_cursor >= LineLength ) \ return 1; case 'A': CHARSEARCH(s_alpha); break; case 'S': CHARSEARCH(s_space); break; case 'D': CHARSEARCH(s_digit); break; case 'I': CHARSEARCH(s_alnum); break; #undef CHARSEARCH default: assert(0); break; } Marker_lastsearch = -(int)*Glob_P; } else { if( *Glob_P == '/' ) { if( *++Glob_P == '\0' ) return 2; } if( Marker_lastsearch == (int)*Glob_P ) Marker_cursor += dir; for(; Marker_cursor >= 0 && Marker_cursor < LineLength; Marker_cursor += dir) { if( Str[Marker_cursor] == *Glob_P ) break; } if( Marker_cursor < 0 || Marker_cursor >= LineLength ) return 1; Marker_lastsearch = (int)*Glob_P; } break; } } *pos1 = Marker_cursor; return 0; } static int Process(void) { if( (Proc_buffer = (char*)malloc((size_t)(Proc_maxsize = 0x100))) == NULL ) return 1; Proc_buffersize = Proc_eof = Proc_start = 0; Proc_status = 0; for(;;) { if( Proc_buffersize < Proc_maxsize ) { Glob_K = (int)fread(Proc_buffer + Proc_buffersize, 1, (size_t)(Proc_maxsize - Proc_buffersize), Infile); if( Glob_K < Proc_maxsize - Proc_buffersize ) Proc_eof = 1; Proc_buffersize += Glob_K; } for(Proc_start = Proc_grow = 0; Proc_start < Proc_buffersize; Proc_start = Proc_end) { Proc_grow = 1; Str = Proc_p = Proc_buffer + Proc_start; for(Proc_end = Proc_start; Proc_end < Proc_buffersize; Proc_end++) { if( *Proc_p == '\n' ) { LineSize = ++Proc_end - Proc_start; Proc_status += ProcessLine(); Proc_grow = 0; break; } else if( *Proc_p == '\r' ) { if( ++Proc_end < Proc_buffersize ) { if( *++Proc_p == '\n' ) Proc_end++; LineSize = Proc_end - Proc_start; Proc_status += ProcessLine(); Proc_grow = 0; } break; } Proc_p++; } if( Proc_grow != 0 ) break; } if( Proc_eof != 0 ) break; if( Proc_start > 0 ) { Proc_p = Proc_buffer; for(Glob_K = 0; Glob_K < Proc_buffersize - Proc_start; Glob_K++) { *Proc_p = *(Proc_p + Proc_start); Proc_p++; } Proc_buffersize -= Proc_start; Proc_start = 0; } else { if( (Proc_p = (char*)malloc((size_t)Proc_maxsize * 2)) == NULL ) { free(Proc_buffer); return 1; } memcpy(Proc_p, Proc_buffer, (size_t)Proc_buffersize); free(Proc_buffer); Proc_buffer = Proc_p; Proc_maxsize *= 2; } } if( Proc_buffersize > Proc_start ) { Str = Proc_buffer + Proc_start; LineSize = Proc_buffersize - Proc_start; Proc_status += ProcessLine(); } free(Proc_buffer); return Proc_status; } static int ProcessLine(void) { assert(LineSize > 0); LineNumber++; if( HistorySize > 0 ) { while( Head != NULL ) { if( LineNumber - Head->pos <= HistorySize ) break; FlushLine(); DequeueLine(); } } return EnqueueLine(); } static int s_alpha(int c) { return (c >= 65 && c <= 90) || (c >= 97 && c <= 122); } static int s_alnum(int c) { return s_alpha(c) || s_digit(c); } static int s_digit(int c) { return (c >= 48 && c <= 57); } static int s_space(int c) { return (c == 32) || (c >= 9 && c <= 13); } static int s_lower(int c) { return (c >= 65 && c <= 90) ? c + 32 : c; }