/* clean.c - Cleanup text file 3.0 (4/26/00)
   Don Yang (uguu.org)

   Usage:
      clean [options] [files]

   Options:
      -e[tabsize]
         set input tab size to [tabsize] and expand tabs.
      -t[tabsize]
         set output tab size to [tabsize] and compress tabs.

         -e8 -t3 will convert file with 8 spaces per tab to file with 3
         spaces per tab, preserving indents.  -t3 -e8 will expand tabs to
         8 spaces (-t3 is ignored).

         Only leading spaces are compressed.  This best preserves original
         layout of file in case it is viewed with some other tab size.
         Also, compressing internal spaces would result in problems such
         as quoted spaces, etc.

         If any leading spaces are compressed, file is considered
         modified, even if the new tabs matched original ones.

      -r use CR end of line sequence.
      -l use LF end of line sequence.
      -c use CR-LF end of line sequence.

         Whatever the original end of line was, it will be converted to
         CR, LF, or CR-LF.  Output will always end in newline even if the
         original file doesn't.  Only exception is an empty file input,
         which results in empty file output.

      -f overwrite backup file.  Original filename~ will be lost, even if
         clean failed.

      -h display help.  Any unrecognized first option results in help
         screen.

      files
         list of files.  Unlimited file size, unlimited file name length,
         unless out of memory...

   Default option is to expand tabs and use LF end of line sequence.
   Default input/output tab size is 8 spaces.

   If no input file is supplied, clean reads from stdin and writes to
   stdout.  Otherwise, file is renamed to filename~, and original file is
   overwritten.  Messages are sent to stdout if input is a file, or stderr
   if input is stdin.

   Trailing spaces/embedded NULLs always removed.

   Program returns 0 for success, 1 for error.

   Notes:
   This is the third version of my text file utility (and I still use C
   instead of some scripting language).  All previous limits (line size,
   file name length, etc) are removed, and added support to compress
   leading spaces, etc.  So it's 1.5 times better than my last version and
   2 times better than the first :)

   Really, I just needed this utility under Linux, and decided to write
   one from scratch since my previous version was win32 (not so reliable).
   This one compiles with gcc instead of msc so it should be more
   portable, unless your int is not 32bits:  Because line buffer start at
   16K initially and doubles whenever line gets too long, eventually it
   might get over 64K and become too big to be stored in a 16bit int, at
   which point bad things will happen.  The same problem will happen with
   this version if your line is longer than 2GB, but I think 16KB is
   pretty generous already :)
   (Review note: expand() now refuses to grow the buffer past INT_MAX and
   the line is reported as "Out of memory." instead.)

   This program should be pretty stable.  But of course, if anything
   breaks, that's your problem...

   05/24/98: v1.0
   03/18/99: v2.0
   04/26/00: v3.0
*/

#include <ctype.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Constants */
#define DEFAULT_TAB_SIZE         8
#define DEFAULT_LINE_SIZE        0x4000

/* Bits recorded in `modified`; each one maps to a phrase in changes(). */
#define MODIFIED_STRIPNULL       1
#define MODIFIED_CUTLINE         2
#define MODIFIED_EXPANDTAB       4
#define MODIFIED_COMPRESSSPACE   8
#define MODIFIED_ENDOFLINE       16
#define MODIFIED_LASTLINE        32

/* Returned by cleanline() once the input is exhausted (not an error). */
#define CLEAN_EOF                (-9)

enum
{
    ENDL_CR,
    ENDL_LF,
    ENDL_CRLF,
    TAB_ET,          /* expand tabs */
    TAB_CT,          /* compress leading spaces into tabs */
    BACKUP_SAVE,     /* refuse to clobber an existing filename~ */
    BACKUP_KILL      /* overwrite an existing filename~ */
};

/* Globals */
static int endl = ENDL_LF;
static int tab = TAB_ET;
static int itabsize = DEFAULT_TAB_SIZE, otabsize = DEFAULT_TAB_SIZE;
static int backup = BACKUP_SAVE;
static int modified, linecount;
static int linesize = DEFAULT_LINE_SIZE;   /* bytes allocated for `line` */
static char *line;                         /* shared, growable line buffer */

/* Prototypes */
static void changes(FILE *output);
static int clean(char *name);
static int clean0(void);
static int cleanline(FILE *infile, FILE *outfile, FILE *error);
static int expand(void);
static int parsetabsize(const char *digits);

/******************************************************************** main
   Parse options, then clean every file named on the command line, or
   stdin if no file was named.  Options apply to the files that follow
   them.  Returns 0 if everything succeeded, 1 otherwise.
*/
int main(int argc, char **argv)
{
    int i, f, e;

    /* Allocate memory */
    if ((line = malloc(DEFAULT_LINE_SIZE)) == NULL) {
        puts("Not enough memory.");
        return 1;
    }

    /* Process arguments */
    for (i = 1, f = e = 0; i < argc; i++) {
        if (argv[i][0] != '-') {
            /* Process file */
            e |= clean(argv[i]);
            f++;
            continue;
        }

        /* Set options.  The cast keeps tolower() defined for bytes that
           are negative when char is signed. */
        switch (tolower((unsigned char)argv[i][1])) {
        case 'e':
            itabsize = parsetabsize(argv[i] + 2);
            tab = TAB_ET;
            break;
        case 't':
            otabsize = parsetabsize(argv[i] + 2);
            tab = TAB_CT;
            break;
        case 'c':
            endl = ENDL_CRLF;
            break;
        case 'l':
            endl = ENDL_LF;
            break;
        case 'r':
            endl = ENDL_CR;
            break;
        case 'f':
            backup = BACKUP_KILL;
            break;
        default:
            /* An unknown option is only a warning unless it is the very
               first argument or an explicit -h; those show the help. */
            if (i > 1 && tolower((unsigned char)argv[i][1]) != 'h') {
                printf("Unrecognized option: %s\n", argv[i]);
                break;
            }
            free(line);
            printf(
                "Clean 3.0 (4/26/00) - Don Yang (uguu.org)\n\n"
                "clean [options] [files]\n\n"
                "Options:\n"
                " -e[tabsize] Set input tab size and expand tabs\n"
                " -t[tabsize] Set output tab size and compress spaces\n"
                " -l Write LF at end of line\n"
                " -c Write CRLF at end of line\n"
                " -r Write CR at end of line\n"
                " -f Overwrite backup file\n\n"
                " Default is to expand tabs and write LFs.\n"
                " Default input/output tab size is %d spaces.\n\n"
                "Original files are renamed as filename~\n"
                "Reads stdin/writes stdout if no files specified.\n",
                DEFAULT_TAB_SIZE);
            return 0;
        }
    }

    if (!f) {
        /* Clean stdin to stdout */
        e = clean0();
    }

    /* End */
    free(line);
    return e ? 1 : 0;
} /* main() */

/************************************************************ parsetabsize
   Convert the digits following -e / -t into a tab size.  An empty string,
   a value below 1, or a value that does not fit in an int all fall back
   to DEFAULT_TAB_SIZE.  strtol() is used instead of atoi() because atoi()
   has undefined behaviour on overflow.
*/
static int parsetabsize(const char *digits)
{
    long value;

    if (*digits == '\0')
        return DEFAULT_TAB_SIZE;
    value = strtol(digits, NULL, 10);
    if (value < 1 || value > INT_MAX)
        return DEFAULT_TAB_SIZE;
    return (int)value;
} /* parsetabsize() */

/******************************************************************* changes
   Write the list of modifications recorded in `modified` to `output`,
   as a run of ", ..." phrases terminated by ".\n".
*/
static void changes(FILE *output)
{
    if (modified & MODIFIED_STRIPNULL)
        fprintf(output, ", nulls stripped");
    if (modified & MODIFIED_EXPANDTAB)
        fprintf(output, ", tabs expanded");
    if (modified & MODIFIED_CUTLINE)
        fprintf(output, ", trailing spaces removed");
    if (modified & MODIFIED_COMPRESSSPACE)
        fprintf(output, ", leading spaces compressed");
    if (modified & MODIFIED_ENDOFLINE)
        fprintf(output, ", end of lines converted");
    if (modified & MODIFIED_LASTLINE)
        fprintf(output, ", last line adjusted");
    fprintf(output, ".\n");
} /* changes() */

/********************************************************************* clean
   Clean one named file in place.  The original is renamed to name~ and
   the cleaned text is written to name; if nothing changed, name~ is
   renamed back over the output.  Progress and errors go to stdout.
   Returns 0 on success, nonzero on failure.
*/
static int clean(char *name)
{
    FILE *infile, *outfile;
    char *bname;
    int e, closefailed;

    /* Check existence */
    if ((infile = fopen(name, "rb")) == NULL) {
        printf("Can not open %s\n", name);
        return 1;
    }
    fclose(infile);

    /* Set backup name: name + "~" + terminator */
    if ((bname = malloc(strlen(name) + 2)) == NULL) {
        printf("Not enough memory.\n");
        return 1;
    }
    strcpy(bname, name);
    strcat(bname, "~");

    /* Check backup */
    if ((infile = fopen(bname, "rb")) != NULL) {
        fclose(infile);
        if (backup == BACKUP_SAVE) {
            /* Report before freeing: the message reads bname. */
            printf("Backup file %s already exists.\n", bname);
            free(bname);
            return 1;
        }
        printf("Backup file %s overwritten.\n", bname);
    }

    /* Create backup */
    if (rename(name, bname)) {
        printf("Can not rename %s to %s.\n", name, bname);
        free(bname);
        return 1;
    }

    /* Open input; on failure put the original back */
    if ((infile = fopen(bname, "rb")) == NULL) {
        rename(bname, name);
        free(bname);
        puts("Can not open input.");
        return 1;
    }

    /* Open output; on failure put the original back */
    if ((outfile = fopen(name, "wb+")) == NULL) {
        fclose(infile);
        rename(bname, name);
        free(bname);
        puts("Can not create output.");
        return 1;
    }

    /* Clean lines until cleanline() reports end of file or an error */
    modified = linecount = 0;
    do {
        e = cleanline(infile, outfile, stdout);
    } while (!e);

    /* End */
    fclose(infile);
    /* fclose() flushes buffered output, so it can be where a write
       failure (e.g. disk full) first becomes visible. */
    closefailed = (fclose(outfile) == EOF);
    if (closefailed)
        printf("Write error.\n");

    printf("%s: %d lines", name, linecount);
    if (modified) {
        changes(stdout);
    } else {
        puts(" (not changed).");
        /* Nothing changed: restore the original instead of keeping a
           backup.  Best effort; the result is not checked. */
        rename(bname, name);
    }
    free(bname);
    return (e == CLEAN_EOF && !closefailed) ? 0 : 1;
} /* clean() */

/******************************************************************** clean0
   Clean stdin to stdout.  The summary and errors go to stderr so they do
   not mix with the cleaned text.  Returns 0 on success, 1 on failure.
*/
static int clean0(void)
{
    int e;

    /* Process lines */
    modified = linecount = 0;
    do {
        e = cleanline(stdin, stdout, stderr);
    } while (!e);

    /* End */
    fprintf(stderr, "(stdin): %d lines", linecount);
    if (modified)
        changes(stderr);
    else
        fprintf(stderr, " (not changed)\n");
    return e == CLEAN_EOF ? 0 : 1;
} /* clean0() */

/***************************************************************** cleanline
   Clean one line: read it from `infile` into the global line buffer with
   tabs expanded and NULs dropped, strip trailing spaces, append the
   selected end of line sequence, optionally turn leading spaces back into
   tabs, and write the result to `outfile`.  Error messages are written to
   `error`.  Every change is recorded in `modified`.

   Returns 0 when a line was written, CLEAN_EOF when the input is
   exhausted, and 1 on read error, write error, or out of memory.
*/
static int cleanline(FILE *infile, FILE *outfile, FILE *error)
{
    int chr, cursor, start;

    /* Check end of file */
    if (feof(infile))
        return CLEAN_EOF;

    /* Read line.  Stop on EOF from fgetc() itself: testing only feof()
       would spin forever on a read error, because the error indicator
       never sets the end-of-file indicator. */
    cursor = 0;
    while ((chr = fgetc(infile)) != EOF) {
        /* Check end of line */
        if (chr == '\r' || chr == '\n')
            break;

        if (chr == '\t') {
            /* Expand tabs.  Grow until a full tab stop plus the end of
               line sequence fits; one doubling is not enough when the
               tab size exceeds the current buffer.  The comparison is
               arranged so that it cannot overflow. */
            modified |= MODIFIED_EXPANDTAB;
            while (itabsize >= linesize - cursor - 4) {
                if (expand()) {
                    fprintf(error, "Out of memory.\n");
                    return 1;
                }
            }
            do {
                line[cursor++] = ' ';
            } while (cursor % itabsize);
        } else if (chr) {
            /* Normal characters; keep room for the end of line sequence */
            if (cursor + 4 >= linesize) {
                if (expand()) {
                    fprintf(error, "Out of memory.\n");
                    return 1;
                }
            }
            line[cursor++] = (char)chr;
        } else {
            /* NULLs */
            modified |= MODIFIED_STRIPNULL;
        }
    }

    if (chr == EOF) {
        if (ferror(infile)) {
            fprintf(error, "Read error\n");
            return 1;
        }
        /* End of file with nothing pending: no line to emit */
        if (cursor == 0)
            return CLEAN_EOF;
    }

    /* Check end of line */
    if (chr == '\r') {
        chr = fgetc(infile);
        if (chr == '\n') {
            if (endl != ENDL_CRLF)
                modified |= MODIFIED_ENDOFLINE;
        } else {
            /* Lone CR: the byte just read belongs to the next line */
            if (chr != EOF)
                ungetc(chr, infile);
            if (endl != ENDL_CR)
                modified |= MODIFIED_ENDOFLINE;
        }
    } else if (chr == '\n') {
        if (endl != ENDL_LF)
            modified |= MODIFIED_ENDOFLINE;
    } else {
        /* Input ended without an end of line sequence */
        modified |= MODIFIED_LASTLINE;
    }

    /* Strip trailing white spaces (tabs were already expanded) */
    while (cursor > 0 && line[cursor - 1] == ' ') {
        cursor--;
        modified |= MODIFIED_CUTLINE;
    }

    /* Set end of line; this also NUL-terminates the buffer */
    if (endl == ENDL_LF)
        strcpy(line + cursor, "\n");
    else if (endl == ENDL_CRLF)
        strcpy(line + cursor, "\r\n");
    else
        strcpy(line + cursor, "\r");

    /* Compress leading spaces: one tab per full otabsize run; whatever
       spaces are left over are written as part of the line. */
    start = 0;
    if (tab == TAB_CT) {
        for (cursor = 0; line[cursor] == ' '; cursor++)
            ;
        for (; cursor - start >= otabsize; start += otabsize) {
            modified |= MODIFIED_COMPRESSSPACE;
            if (fputc('\t', outfile) == EOF) {
                fprintf(error, "Write error.\n");
                return 1;
            }
        }
    }

    /* Write output */
    if (fputs(line + start, outfile) == EOF) {
        fprintf(error, "Write error.\n");
        return 1;
    }

    linecount++;
    return 0;
} /* cleanline() */

/******************************************************************** expand
   Double the size of the global line buffer.  Returns 0 on success and 1
   if the buffer cannot grow; in that case `line` and `linesize` are left
   untouched, so the old buffer stays valid and is still freed by main().
*/
static int expand(void)
{
    char *newblock;

    /* Doubling must not overflow the int that tracks the size */
    if (linesize > INT_MAX / 2)
        return 1;

    /* realloc() already falls back to allocate-and-copy when it cannot
       grow in place, so a separate malloc() retry gains nothing. */
    newblock = realloc(line, (size_t)linesize * 2);
    if (newblock == NULL)
        return 1;

    line = newblock;
    linesize *= 2;
    return 0;
} /* expand() */