/* yume0.c - Don Yang (uguu.org) 03/10/04 */ #include #include #include #include #define OPTION_UNIQ_ONLY 1 #define OPTION_DUP_ONLY 2 #define OPTION_IGNORE_CASE 4 #define OPTION_SHOW_COUNT 8 #define OPTION_KEEP_UNMATCHED 16 typedef struct _line { unsigned int key; int pos, freq, size; int rstart, rlength; char *line; struct _line *next; } Line; typedef struct _node { unsigned int key; int bucketsize; Line **lines; struct _node *left, *right; } Node; static Node *Keys; static Line *Head, *Tail; static int LineNumber, HistorySize, OutputOptions; static char *LeftExpr, *RightExpr; static FILE *Infile, *Outfile; static char *DefaultLeftExpr = "^"; static char *DefaultRightExpr = "$"; static unsigned int CRCTable[256]; static int Cmp(char *a, char *b, int size); static unsigned int CRC(char *p, int size); static void InitCRC(void); static int EnqueueLine(/*@observer@*/char *line, int size); static void DequeueLine(void); static void FlushLine(void); static int AddKey(unsigned int key, Line *line); static int AddKey0(Node *node, Line *line); static void DeleteKey(unsigned int key, Line *line); static Node *FindKey(unsigned int key); static int GetN(char **p); static int SetMarker(/*@observer@*/char *str, int size, int dir, int *pos1, int *pos2, char *expr); static int Process(void); static int ProcessLine(/*@observer@*/char *line, int size); static int s_lower(int c); static int s_alpha(int c); static int s_alnum(int c); static int s_digit(int c); static int s_space(int c); int main(int argc, char **argv) { static char gen_expr[32]; char *arg, *iname, *oname; int i, j; /* Parse options */ iname = oname = NULL; OutputOptions = HistorySize = 0; LeftExpr = DefaultLeftExpr; RightExpr = DefaultRightExpr; for(i = 1; i < argc; i++) { if( argv[i][0] == '-' ) { switch( argv[i][1] ) { case '\0': iname = *argv; break; case 'u': OutputOptions |= OPTION_UNIQ_ONLY; break; case 'd': OutputOptions |= OPTION_DUP_ONLY; break; case 'i': OutputOptions |= OPTION_IGNORE_CASE; break; case 'c': OutputOptions |= OPTION_SHOW_COUNT; break; case 'k': OutputOptions |= OPTION_KEEP_UNMATCHED; break; case 'h': case 'l': case 'r': case 'f': case 's': case 'w': j = i; if( argv[i][2] != '\0' ) { arg = &argv[i][2]; } else { if( i + 1 >= argc ) return printf("not enough arguments for %s\n", argv[i]); arg = argv[++i]; } if( argv[j][1] == 'h' ) { HistorySize = atoi(arg); } else if( argv[j][1] == 'l' ) { LeftExpr = arg; } else if( argv[j][1] == 'r' ) { RightExpr = arg; } else { LeftExpr = DefaultLeftExpr; RightExpr = DefaultRightExpr; if( argv[j][1] == 'f' ) { sprintf(gen_expr, "^S(sS)%d", abs(atoi(arg))); LeftExpr = gen_expr; } else { if( argv[j][1] == 's' ) { sprintf(gen_expr, "^%d", abs(atoi(arg))); LeftExpr = gen_expr; } else { sprintf(gen_expr, "^%d", abs(atoi(arg) - 1)); RightExpr = gen_expr; } } } break; default: return printf("invalid option: %s\n", argv[i]); } } else { if( iname == NULL ) iname = argv[i]; else oname = argv[i]; } } if( iname == NULL || iname == *argv ) { Infile = stdin; } else { if( (Infile = fopen(iname, "rb")) == NULL ) return printf("can not open %s\n", iname); } if( oname == NULL ) { Outfile = stdout; } else { if( (Outfile = fopen(oname, "wb+")) == NULL ) { (void)fclose(Infile); return printf("can not create %s\n", oname); } } InitCRC(); Keys = NULL; Head = Tail = NULL; LineNumber = 0; if( Process() != 0 ) (void)fputs("out of memory\n", stderr); while( Head != NULL ) { FlushLine(); DequeueLine(); } assert(Keys == NULL); (void)fclose(Infile); (void)fclose(Outfile); return 0; } static int Cmp(char *a, char *b, int size) { if( (OutputOptions & OPTION_IGNORE_CASE) != 0 ) { while( size-- > 0 ) { if( s_lower(*a) != s_lower(*b) ) return 1; a++; b++; } } else { while( size-- > 0 ) { if( *a++ != *b++ ) return 1; } } return 0; } static unsigned int CRC(char *p, int size) { unsigned int crc; if( (OutputOptions & OPTION_IGNORE_CASE) != 0 ) { for(crc = ~0U; size > 0; size--) { crc = ((crc >> 8) & 0xffffff) ^ CRCTable[(crc ^ (unsigned int)s_lower(*p)) & 0xff]; p++; } } else { for(crc = ~0U; size > 0; size--) { crc = ((crc >> 8) & 0xffffff) ^ CRCTable[(crc ^ (unsigned int)*(p++)) & 0xff]; } } return crc; } static void InitCRC(void) { unsigned int poly = 0xedb88320U, crc, i, j; for(i = 0; i < 256; i++) { crc = i; for(j = 8; j > 0; j--) { if( (crc & 1) != 0 ) crc = (crc >> 1) ^ poly; else crc >>= 1; } CRCTable[i] = crc; } } static int EnqueueLine(/*@observer@*/char *line, int size) { unsigned int key; int length, l, r, i; Line *a, **e; Node *n; assert(size > 0); length = size; for(i = 0; i < 2 && length > 0; i++) { if( line[length - 1] == '\r' || line[length - 1] == '\n' ) length--; } if( length > 0 ) { l = i = 0; r = length - 1; if( SetMarker(line, length, 1, &l, &r, LeftExpr) == 0 ) { if( SetMarker(line, length, -1, &r, &l, RightExpr) == 0 ) { r = r - l + 1; if( r >= 0 ) i = 1; } } if( i == 0 ) { if( (OutputOptions & OPTION_KEEP_UNMATCHED) == 0 ) return 0; l = 0; r = length; } key = CRC(line + l, r); } else { l = r = 0; if( SetMarker(line, length, 1, &l, &r, LeftExpr) != 0 || SetMarker(line, length, -1, &l, &r, RightExpr) != 0 ) { if( (OutputOptions & OPTION_KEEP_UNMATCHED) == 0 ) return 0; } l = r = 0; key = ~0U; } /* Find duplicate lines, check for matching CRC before byte compare */ if( (n = FindKey(key)) != NULL ) { e = n->lines; for(i = 0; i < n->bucketsize; i++) { if( *e != NULL ) { if( (*e)->rlength == r ) { if( Cmp((*e)->line + (*e)->rstart, line + l, r) == 0 ) { /* Duplicate found */ (*e)->pos = LineNumber; (*e)->freq++; return 0; } } } e++; } } /* Copy line */ if( (a = (Line*)malloc(sizeof(Line))) == NULL ) return 1; if( (a->line = (char*)malloc((size_t)(a->size = size))) == NULL ) { free(a); return 1; } memcpy(a->line, line, (size_t)size); a->next = NULL; a->freq = 1; a->pos = LineNumber; a->rstart = l; a->rlength = r; a->key = key; /* Add to queue */ if( Tail == NULL ) { Head = Tail = a; } else { Tail->next = a; Tail = a; } /* Add reference. If line number exceeds history range, then reference is not added, this causes lines past the history range to be recorded but will never be matched. */ if( HistorySize < 0 && LineNumber > -HistorySize ) return 0; return AddKey(a->key, a); } static void DequeueLine(void) { Line *d; assert(Head != NULL); DeleteKey(Head->key, Head); d = Head; if( (Head = Head->next) == NULL ) Tail = NULL; free(d->line); free(d); } static void FlushLine(void) { char count[16]; if( (OutputOptions & OPTION_UNIQ_ONLY) != 0 ) { if( Head->freq > 1 ) return; } else if( (OutputOptions & OPTION_DUP_ONLY) != 0 ) { if( Head->freq <= 1 ) return; } if( (OutputOptions & OPTION_SHOW_COUNT) != 0 ) { if( Head->freq < 10000000 ) { sprintf(count, "%7d\t", Head->freq); (void)fwrite(count, 8, 1, Outfile); } else { (void)fwrite("*a lot*\t", 8, 1, Outfile); } } (void)fwrite(Head->line, (size_t)Head->size, 1, Outfile); } static int AddKey(unsigned int key, Line *line) { Node **x, *n; for(x = &Keys; *x != NULL;) { if( (*x)->key == key ) /* Add reference to existing key */ return AddKey0(*x, line); x = ((*x)->key > key) ? &((*x)->left) : &((*x)->right); } /* Create new key */ if( (*x = n = (Node*)malloc(sizeof(Node))) == NULL ) return 1; if( (n->lines = (Line**)calloc((size_t)(n->bucketsize = 4), sizeof(Line*))) == NULL ) { free(n); *x = NULL; return 1; } n->key = key; n->left = n->right = NULL; return AddKey0(n, line); } static int AddKey0(Node *node, Line *line) { Line **lines; int i; for(i = 0; i < node->bucketsize; i++) { if( node->lines[i] == NULL ) { node->lines[i] = line; return 0; } } if( (lines = (Line**)calloc((size_t)(node->bucketsize * 2), sizeof(Line*))) == NULL ) return 1; memcpy(lines, node->lines, node->bucketsize * sizeof(Line*)); free(node->lines); (node->lines = lines)[node->bucketsize] = line; node->bucketsize *= 2; return 0; } static void DeleteKey(unsigned int key, Line *line) { Node **x, *d, *l, *r, *p; int i, z; /* Find key */ for(x = &Keys; *x != NULL;) { if( (*x)->key == key ) break; x = ((*x)->key > key) ? &((*x)->left) : &((*x)->right); } if( *x == NULL ) return; /* Remove line reference to key */ d = *x; for(i = z = 0; i < d->bucketsize; i++) { if( d->lines[i] == line ) d->lines[i] = NULL; if( d->lines[i] == NULL ) z++; } if( z < d->bucketsize ) return; /* All references removed, remove key itself */ if( d->left == NULL ) { *x = d->right; } else if( d->right == NULL ) { *x = d->left; } else { p = l = d->left; if( (r = l->right) == NULL ) { l->right = d->right; *x = l; } else { while( r->right != NULL ) { p = r; r = r->right; } p->right = r->left; r->left = l; r->right = d->right; *x = r; } } free(d->lines); free(d); } static Node *FindKey(unsigned int key) { Node *r; for(r = Keys; r != NULL;) { if( r->key == key ) return r; r = (r->key > key) ? r->left : r->right; } return NULL; } static int GetN(char **p) { int n; for(n = (int)*((*p)++) - (int)'0'; s_digit(**p); ++*p) n = n * 10 + (int)**p - (int)'0'; --*p; return n; } static int SetMarker(/*@observer@*/char *str, int size, int dir, int *pos1, int *pos2, char *expr) { #define MAX_NEST 8 char *paren[MAX_NEST], *p, *q; int repeat[MAX_NEST + 1], cursor, lastsearch, nest; cursor = *pos1; lastsearch = nest = 0; repeat[0] = 0; for(p = expr; *p != '\0'; p++) { switch( *p ) { /* Set current cursor position / direction */ case '^': cursor = 0; /*@fallthrough@*/ case '+': dir = 1; break; case '$': cursor = size - 1; /*@fallthrough@*/ case '-': dir = -1; break; /* Set other marker position */ case '@': *pos2 = cursor; break; /* Repeat expression */ case '(': if( nest >= MAX_NEST ) return 5; /* Error5: Nest level too deep */ paren[nest++] = p; repeat[nest] = 0; break; case ')': if( nest == 0 ) return 6; /* Error6: Unmatched parentheses */ nest--; if( repeat[nest] > 1 ) { /* Repeat */ assert(paren[nest] != NULL); p = paren[nest]; repeat[nest]--; nest++; } else if( repeat[nest] == 1 ) { /* End last iteration */ assert(s_digit(*(p + 1))); p++; (void)GetN(&p); repeat[nest]--; } else { /* Start repeat */ q = p + 1; if( !s_digit(*q) ) return 4; /* Error4: No repeat count */ if( (repeat[nest] = GetN(&q)) < 2 ) { p = q; repeat[nest] = 0; } else { p--; nest++; } } break; /* Move cursor */ case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': cursor += GetN(&p) * dir; if( cursor < 0 || cursor >= size ) return 3; /* Error3: Cursor moved out of range */ lastsearch = 0; break; /* Character search */ default: if( strchr("asdiASDI", *p) != NULL ) { /* Character class */ if( lastsearch == -(int)*p ) cursor += dir; switch( *p ) { #define CHARSEARCH(chartest) \ for(; cursor >= 0 && cursor < size; cursor += dir) \ { \ if( chartest(str[cursor]) ) \ break; \ } \ if( cursor < 0 || cursor >= size ) return 1; case 'a': CHARSEARCH(s_alpha); break; case 's': CHARSEARCH(s_space); break; case 'd': CHARSEARCH(s_digit); break; case 'i': CHARSEARCH(s_alnum); break; #undef CHARSEARCH #define CHARSEARCH(chartest) \ for(; cursor >= 0 && cursor < size; cursor += dir) \ { \ if( !chartest(str[cursor]) ) \ break; \ } \ if( cursor < 0 || cursor >= size ) return 1; case 'A': CHARSEARCH(s_alpha); break; case 'S': CHARSEARCH(s_space); break; case 'D': CHARSEARCH(s_digit); break; case 'I': CHARSEARCH(s_alnum); break; #undef CHARSEARCH default: assert(0); break; } lastsearch = -(int)*p; } else { /* Single character */ if( *p == '/' ) { if( *++p == '\0' ) return 2; /* Error2: "/" not followed by char */ } if( lastsearch == (int)*p ) cursor += dir; for(; cursor >= 0 && cursor < size; cursor += dir) { if( str[cursor] == *p ) break; } if( cursor < 0 || cursor >= size ) return 1; /* Error1: Search character not found */ lastsearch = (int)*p; } break; } } *pos1 = cursor; return 0; } static int Process(void) { int maxsize, buffersize, start, end, eof, grow, i, status; char *buffer, *p; if( (buffer = (char*)malloc((size_t)(maxsize = 0x100))) == NULL ) return 1; buffersize = eof = start = 0; status = 0; for(;;) { /* Fill buffer */ if( buffersize < maxsize ) { i = (int)fread(buffer + buffersize, 1, (size_t)(maxsize - buffersize), Infile); if( i < maxsize - buffersize ) eof = 1; buffersize += i; } /* Tokenize buffer */ for(start = grow = 0; start < buffersize; start = end) { grow = 1; p = buffer + start; for(end = start; end < buffersize; end++) { if( *p == '\n' ) { status += ProcessLine(buffer + start, ++end - start); grow = 0; break; } else if( *p == '\r' ) { if( ++end < buffersize ) { if( *++p == '\n' ) end++; status += ProcessLine(buffer + start, end - start); grow = 0; } break; } p++; } if( grow != 0 ) break; } if( eof != 0 ) break; if( start > 0 ) { /* Shift partial line */ p = buffer; for(i = 0; i < buffersize - start; i++) { *p = *(p + start); p++; } buffersize -= start; start = 0; } else { /* Expand buffer */ if( (p = (char*)malloc((size_t)maxsize * 2)) == NULL ) { free(buffer); return 1; } memcpy(p, buffer, (size_t)buffersize); free(buffer); buffer = p; maxsize *= 2; } } /* Last line */ if( buffersize > start ) status += ProcessLine(buffer + start, buffersize - start); free(buffer); return status; } static int ProcessLine(/*@observer@*/char *line, int size) { assert(size > 0); LineNumber++; if( HistorySize > 0 ) { while( Head != NULL ) { if( LineNumber - Head->pos <= HistorySize ) break; FlushLine(); DequeueLine(); } } return EnqueueLine(line, size); } /* ctype substitutes... works better with libraries that implements them as macros. */ static int s_alpha(int c) { return (c >= 65 && c <= 90) || (c >= 97 && c <= 122); } static int s_alnum(int c) { return s_alpha(c) || s_digit(c); } static int s_digit(int c) { return (c >= 48 && c <= 57); } static int s_space(int c) { return (c == 32) || (c >= 9 && c <= 13); } static int s_lower(int c) { return (c >= 65 && c <= 90) ? c + 32 : c; }