/* yume1.c - Don Yang (uguu.org) 03/18/04 */ #include #include #include #include typedef struct _line { unsigned int key; int pos, freq, size; int rstart, rlength; char *line; struct _line *next; } Line; typedef struct _node { unsigned int key; int bucketsize; Line **lines; struct _node *left, *right; } Node; static Node *Keys, **Glob_X, *Glob_N; static Line *Head, *Tail, *D; static int LineNumber, HistorySize, LineSize, LineLength; static int OptUniqOnly, OptDupOnly; static int OptIgnoreCase, OptShowCount, OptKeepUnmatched; static char *LeftExpr, *RightExpr, *Str, *Glob_P, *Glob_Q; static FILE *Infile, *Outfile; static char *DefaultLeftExpr = "^"; static char *DefaultRightExpr = "$"; static unsigned int CRCTable[256], CRCKey; static int Cmp(char *a, char *b, int size); static void CRC(char *p, int size); static void InitCRC(void); static int EnqueueLine(void); static void DequeueLine(void); static void FlushLine(void); static int AddKey(void); static int AddKey0(void); static void DeleteKey(void); static Node *FindKey(void); static int GetN(void); static int SetMarker(int dir, int *pos1, int *pos2, char *expr); static int Process(void); static int ProcessLine(void); static int s_lower(int c); static int s_alpha(int c); static int s_alnum(int c); static int s_digit(int c); static int s_space(int c); int main(int argc, char **argv) { static char gen_expr[32]; char *arg, *iname, *oname; int i, j; /* Parse options */ iname = oname = NULL; OptUniqOnly = OptDupOnly = OptIgnoreCase = OptShowCount = OptKeepUnmatched = HistorySize = 0; LeftExpr = DefaultLeftExpr; RightExpr = DefaultRightExpr; for(i = 1; i < argc; i++) { if( argv[i][0] == '-' ) { switch( argv[i][1] ) { case '\0': iname = *argv; break; case 'u': OptUniqOnly = 1; break; case 'd': OptDupOnly = 1; break; case 'i': OptIgnoreCase = 1; break; case 'c': OptShowCount = 1; break; case 'k': OptKeepUnmatched = 1; break; case 'h': case 'l': case 'r': case 'f': case 's': case 'w': j = i; if( argv[i][2] != '\0' ) { arg = &argv[i][2]; } else { if( i + 1 >= argc ) return printf("not enough arguments for %s\n", argv[i]); arg = argv[++i]; } if( argv[j][1] == 'h' ) { HistorySize = atoi(arg); } else if( argv[j][1] == 'l' ) { LeftExpr = arg; } else if( argv[j][1] == 'r' ) { RightExpr = arg; } else { LeftExpr = DefaultLeftExpr; RightExpr = DefaultRightExpr; if( argv[j][1] == 'f' ) { sprintf(gen_expr, "^S(sS)%d", abs(atoi(arg))); LeftExpr = gen_expr; } else { if( argv[j][1] == 's' ) { sprintf(gen_expr, "^%d", abs(atoi(arg))); LeftExpr = gen_expr; } else { sprintf(gen_expr, "^%d", abs(atoi(arg) - 1)); RightExpr = gen_expr; } } } break; default: return printf("invalid option: %s\n", argv[i]); } } else { if( iname == NULL ) iname = argv[i]; else oname = argv[i]; } } if( iname == NULL || iname == *argv ) { Infile = stdin; } else { if( (Infile = fopen(iname, "rb")) == NULL ) return printf("can not open %s\n", iname); } if( oname == NULL ) { Outfile = stdout; } else { if( (Outfile = fopen(oname, "wb+")) == NULL ) { (void)fclose(Infile); return printf("can not create %s\n", oname); } } InitCRC(); Keys = NULL; Head = Tail = NULL; LineNumber = 0; if( Process() != 0 ) (void)fputs("out of memory\n", stderr); while( Head != NULL ) { FlushLine(); DequeueLine(); } assert(Keys == NULL); (void)fclose(Infile); (void)fclose(Outfile); return 0; } static int Cmp(char *a, char *b, int size) { if( OptIgnoreCase != 0 ) { while( size-- > 0 ) { if( s_lower(*a) != s_lower(*b) ) return 1; a++; b++; } } else { while( size-- > 0 ) { if( *a++ != *b++ ) return 1; } } return 0; } static void CRC(char *p, int size) { if( OptIgnoreCase != 0 ) { for(CRCKey = ~0U; size > 0; size--) { CRCKey = ((CRCKey >> 8) & 0xffffff) ^ CRCTable[(CRCKey ^ (unsigned int)s_lower(*p)) & 0xff]; p++; } } else { for(CRCKey = ~0U; size > 0; size--) { CRCKey = ((CRCKey >> 8) & 0xffffff) ^ CRCTable[(CRCKey ^ (unsigned int)*(p++)) & 0xff]; } } } static void InitCRC(void) { unsigned int poly = 0xedb88320U, crc, i, j; for(i = 0; i < 256; i++) { crc = i; for(j = 8; j > 0; j--) { if( (crc & 1) != 0 ) crc = (crc >> 1) ^ poly; else crc >>= 1; } CRCTable[i] = crc; } } static int EnqueueLine(void) { int l, r, i; Line **e; Node *n; assert(LineSize > 0); LineLength = LineSize; for(i = 0; i < 2 && LineLength > 0; i++) { if( Str[LineLength - 1] == '\r' || Str[LineLength - 1] == '\n' ) LineLength--; } if( LineLength > 0 ) { l = i = 0; r = LineLength - 1; if( SetMarker(1, &l, &r, LeftExpr) == 0 ) { if( SetMarker(-1, &r, &l, RightExpr) == 0 ) { r = r - l + 1; if( r >= 0 ) i = 1; } } if( i == 0 ) { if( OptKeepUnmatched == 0 ) return 0; l = 0; r = LineLength; } CRC(Str + l, r); } else { r = l = 0; if( SetMarker(1, &l, &r, LeftExpr) != 0 || SetMarker(-1, &r, &l, RightExpr) != 0 ) { if( OptKeepUnmatched == 0 ) return 0; } r = l = 0; CRCKey = ~0U; } /* Find duplicate lines, check for matching CRC before byte compare */ if( (n = FindKey()) != NULL ) { e = n->lines; for(i = 0; i < n->bucketsize; i++) { if( *e != NULL ) { if( (*e)->rlength == r ) { if( Cmp((*e)->line + (*e)->rstart, Str + l, r) == 0 ) { /* Duplicate found */ (*e)->pos = LineNumber; (*e)->freq++; return 0; } } } e++; } } /* Copy line */ if( (D = (Line*)malloc(sizeof(Line))) == NULL ) return 1; if( (D->line = (char*)malloc((size_t)(D->size = LineSize))) == NULL ) { free(D); return 1; } memcpy(D->line, Str, (size_t)LineSize); D->next = NULL; D->freq = 1; D->pos = LineNumber; D->rstart = l; D->rlength = r; D->key = CRCKey; /* Add to queue */ if( Tail == NULL ) { Head = Tail = D; } else { Tail->next = D; Tail = D; } /* Add reference. If line number exceeds history range, then reference is not added, this causes lines past the history range to be recorded but will never be matched. */ if( HistorySize < 0 && LineNumber > -HistorySize ) return 0; return AddKey(); } static void DequeueLine(void) { Line *d; assert(Head != NULL); DeleteKey(); d = Head; if( (Head = Head->next) == NULL ) Tail = NULL; free(d->line); free(d); } static void FlushLine(void) { char count[16]; if( OptUniqOnly != 0 ) { if( Head->freq > 1 ) return; } else if( OptDupOnly != 0 ) { if( Head->freq <= 1 ) return; } if( OptShowCount != 0 ) { if( Head->freq < 10000000 ) { sprintf(count, "%7d\t", Head->freq); (void)fwrite(count, 8, 1, Outfile); } else { (void)fwrite("*a lot*\t", 8, 1, Outfile); } } (void)fwrite(Head->line, (size_t)Head->size, 1, Outfile); } static int AddKey(void) { for(Glob_X = &Keys; *Glob_X != NULL;) { if( (*Glob_X)->key == CRCKey ) /* Add reference to existing key */ { Glob_N = *Glob_X; return AddKey0(); } Glob_X = ((*Glob_X)->key > CRCKey) ? &((*Glob_X)->left) : &((*Glob_X)->right); } /* Create new key */ if( (*Glob_X = Glob_N = (Node*)malloc(sizeof(Node))) == NULL ) return 1; if( (Glob_N->lines = (Line**)calloc((size_t)(Glob_N->bucketsize = 4), sizeof(Line*))) == NULL ) { free(Glob_N); *Glob_X = NULL; return 1; } Glob_N->key = CRCKey; Glob_N->left = Glob_N->right = NULL; return AddKey0(); } static int AddKey0(void) { Line **lines; int i; for(i = 0; i < Glob_N->bucketsize; i++) { if( Glob_N->lines[i] == NULL ) { Glob_N->lines[i] = D; return 0; } } if( (lines = (Line**)calloc((size_t)(Glob_N->bucketsize * 2), sizeof(Line*))) == NULL ) return 1; memcpy(lines, Glob_N->lines, Glob_N->bucketsize * sizeof(Line*)); free(Glob_N->lines); (Glob_N->lines = lines)[Glob_N->bucketsize] = D; Glob_N->bucketsize *= 2; return 0; } static void DeleteKey(void) { Node **x, *d, *l, *r, *p; int i, z; /* Find key */ for(x = &Keys; *x != NULL;) { if( (*x)->key == Head->key ) break; x = ((*x)->key > Head->key) ? &((*x)->left) : &((*x)->right); } if( *x == NULL ) return; /* Remove line reference to key */ d = *x; for(i = z = 0; i < d->bucketsize; i++) { if( d->lines[i] == Head ) d->lines[i] = NULL; if( d->lines[i] == NULL ) z++; } if( z < d->bucketsize ) return; /* All references removed, remove key itself */ if( d->left == NULL ) { *x = d->right; } else if( d->right == NULL ) { *x = d->left; } else { p = l = d->left; if( (r = l->right) == NULL ) { l->right = d->right; *x = l; } else { while( r->right != NULL ) { p = r; r = r->right; } p->right = r->left; r->left = l; r->right = d->right; *x = r; } } free(d->lines); free(d); } static Node *FindKey(void) { Node *r; for(r = Keys; r != NULL;) { if( r->key == CRCKey ) return r; r = (r->key > CRCKey) ? r->left : r->right; } return NULL; } static int GetN(void) { int n; for(n = (int)*(Glob_P++) - (int)'0'; s_digit(*Glob_P); ++Glob_P) n = n * 10 + (int)*Glob_P - (int)'0'; --Glob_P; return n; } static int SetMarker(int dir, int *pos1, int *pos2, char *expr) { #define MAX_NEST 8 char *paren[MAX_NEST]; int repeat[MAX_NEST + 1], cursor, lastsearch, nest; cursor = *pos1; lastsearch = nest = 0; repeat[0] = 0; for(Glob_P = expr; *Glob_P != '\0'; Glob_P++) { switch( *Glob_P ) { /* Set current cursor position / direction */ case '^': cursor = 0; /*@fallthrough@*/ case '+': dir = 1; break; case '$': cursor = LineLength - 1; /*@fallthrough@*/ case '-': dir = -1; break; /* Set other marker position */ case '@': *pos2 = cursor; break; /* Repeat expression */ case '(': if( nest >= MAX_NEST ) return 5; /* Error5: Nest level too deep */ paren[nest++] = Glob_P; repeat[nest] = 0; break; case ')': if( nest == 0 ) return 6; /* Error6: Unmatched parentheses */ nest--; if( repeat[nest] > 1 ) { /* Repeat */ assert(paren[nest] != NULL); Glob_P = paren[nest]; repeat[nest]--; nest++; } else if( repeat[nest] == 1 ) { /* End last iteration */ assert(s_digit(*(Glob_P + 1))); Glob_P++; (void)GetN(); repeat[nest]--; } else { /* Start repeat */ Glob_Q = Glob_P++; if( !s_digit(*Glob_P) ) return 4; /* Error4: No repeat count */ if( (repeat[nest] = GetN()) < 2 ) { repeat[nest] = 0; } else { Glob_P = Glob_Q - 1; nest++; } } break; /* Move cursor */ case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': cursor += GetN() * dir; if( cursor < 0 || cursor >= LineLength ) return 3; /* Error3: Cursor moved out of range */ lastsearch = 0; break; /* Character search */ default: if( strchr("asdiASDI", *Glob_P) != NULL ) { /* Character class */ if( lastsearch == -(int)*Glob_P ) cursor += dir; switch( *Glob_P ) { #define CHARSEARCH(chartest) \ for(; cursor >= 0 && cursor < LineLength; \ cursor += dir) \ { \ if( chartest(Str[cursor]) ) \ break; \ } \ if( cursor < 0 || cursor >= LineLength ) return 1; case 'a': CHARSEARCH(s_alpha); break; case 's': CHARSEARCH(s_space); break; case 'd': CHARSEARCH(s_digit); break; case 'i': CHARSEARCH(s_alnum); break; #undef CHARSEARCH #define CHARSEARCH(chartest) \ for(; cursor >= 0 && cursor < LineLength; \ cursor += dir) \ { \ if( !chartest(Str[cursor]) ) \ break; \ } \ if( cursor < 0 || cursor >= LineLength ) return 1; case 'A': CHARSEARCH(s_alpha); break; case 'S': CHARSEARCH(s_space); break; case 'D': CHARSEARCH(s_digit); break; case 'I': CHARSEARCH(s_alnum); break; #undef CHARSEARCH default: assert(0); break; } lastsearch = -(int)*Glob_P; } else { /* Single character */ if( *Glob_P == '/' ) { if( *++Glob_P == '\0' ) return 2; /* Error2: "/" not followed by char */ } if( lastsearch == (int)*Glob_P ) cursor += dir; for(; cursor >= 0 && cursor < LineLength; cursor += dir) { if( Str[cursor] == *Glob_P ) break; } if( cursor < 0 || cursor >= LineLength ) return 1; /* Error1: Search character not found */ lastsearch = (int)*Glob_P; } break; } } *pos1 = cursor; return 0; } static int Process(void) { int maxsize, buffersize, start, end, eof, grow, i, status; char *buffer, *p; if( (buffer = (char*)malloc((size_t)(maxsize = 0x100))) == NULL ) return 1; buffersize = eof = start = 0; status = 0; for(;;) { /* Fill buffer */ if( buffersize < maxsize ) { i = (int)fread(buffer + buffersize, 1, (size_t)(maxsize - buffersize), Infile); if( i < maxsize - buffersize ) eof = 1; buffersize += i; } /* Tokenize buffer */ for(start = grow = 0; start < buffersize; start = end) { grow = 1; Str = p = buffer + start; for(end = start; end < buffersize; end++) { if( *p == '\n' ) { LineSize = ++end - start; status += ProcessLine(); grow = 0; break; } else if( *p == '\r' ) { if( ++end < buffersize ) { if( *++p == '\n' ) end++; LineSize = end - start; status += ProcessLine(); grow = 0; } break; } p++; } if( grow != 0 ) break; } if( eof != 0 ) break; if( start > 0 ) { /* Shift partial line */ p = buffer; for(i = 0; i < buffersize - start; i++) { *p = *(p + start); p++; } buffersize -= start; start = 0; } else { /* Expand buffer */ if( (p = (char*)malloc((size_t)maxsize * 2)) == NULL ) { free(buffer); return 1; } memcpy(p, buffer, (size_t)buffersize); free(buffer); buffer = p; maxsize *= 2; } } /* Last line */ if( buffersize > start ) { Str = buffer + start; LineSize = buffersize - start; status += ProcessLine(); } free(buffer); return status; } static int ProcessLine(void) { assert(LineSize > 0); LineNumber++; if( HistorySize > 0 ) { while( Head != NULL ) { if( LineNumber - Head->pos <= HistorySize ) break; FlushLine(); DequeueLine(); } } return EnqueueLine(); } /* ctype substitutes... works better with libraries that implements them as macros. */ static int s_alpha(int c) { return (c >= 65 && c <= 90) || (c >= 97 && c <= 122); } static int s_alnum(int c) { return s_alpha(c) || s_digit(c); } static int s_digit(int c) { return (c >= 48 && c <= 57); } static int s_space(int c) { return (c == 32) || (c >= 9 && c <= 13); } static int s_lower(int c) { return (c >= 65 && c <= 90) ? c + 32 : c; }