/* shindou7_no_ucase.c - Don Yang (uguu.org) 02/24/07 */
/*
 * Filter for UTF-8, Shift_JIS, EUC-JP and UCS-2 (LE/BE) text.
 *
 * With file arguments: print the detected encoding of each file.
 * Without arguments:   detect the encoding of stdin, then copy it to stdout
 *                      with these changes:
 *   - text is split into phrases, each ended by a punctuation character;
 *   - a phrase ended by end-of-sentence punctuation is written with a space
 *     after every character;
 *   - the punctuation itself is replaced by an exclamation mark (two if it
 *     already was one); a line feed is written unchanged.
 */
#include <assert.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#ifdef _WIN32
#include <fcntl.h>
#include <io.h>
#endif

typedef enum
{
   UTF8 = 1,
   SJIS = 2,
   EUCJP = 4,
   UCS2LE = 8,
   UCS2BE = 16
} Encoding;

enum
{
   /* Longest character any parse table can produce.  Table entries above
      this value are byte-range bounds, entries at or below it are lengths. */
   MAX_CHAR_BYTES = 4,
   /* Zeroed bytes kept after the valid data so that NextChar() may look
      ahead from the last real byte without leaving the allocation. */
   LOOKAHEAD_PAD = MAX_CHAR_BYTES - 1,
   INITIAL_BUFFER_SIZE = 0x1000
};

/* Byte offsets into const_str[] */
enum
{
   STR_SPACE = 0,            /* 20 [00]: ASCII / UCS-2LE space */
   STR_EXCL_UCS2BE = 1,      /* 00 21 */
   STR_EXCL = 2,             /* 21 [00]: ASCII / UCS-2LE '!' */
   STR_SPACE_UCS2BE = 3,     /* 00 20 */
   STR_WIDE_EXCL_UCS2LE = 5, /* 01 ff */
   STR_WIDE_EXCL_UCS2BE = 6, /* ff 01 */
   STR_WIDE_EXCL_UTF8 = 8,   /* ef bc 81 */
   STR_WIDE_EXCL_SJIS = 10,  /* 81 49 */
   STR_WIDE_EXCL_EUCJP = 12  /* a1 aa */
};

static FILE *infile;
static unsigned char *tmp, *buffer;

/* Overlapping encodings of space, '!' and fullwidth '!' (U+FF01); see the
   STR_* offsets above. */
static const unsigned char const_str[] =
{
   0x20, 0x00, 0x21, 0x00, 0x20, 0x01, 0xff, 0x01,
   0xef, 0xbc, 0x81, 0x49, 0xa1, 0xaa
};

static int a, state, buffer_size, read_size, i, i0, last_char_offset, c,
           utf8_kana, sjis_kana, eucjp_kana, encoding;

/*
 * Parse tables.  Each rule is a list of (low, high) byte ranges, one pair per
 * byte of the character, followed by the character length.  The final lone
 * length is the fallback used when no rule matches the first byte.
 */
static const unsigned char parse_ucs2[] = { 2 };
static const unsigned char parse_utf8[] =
{
   0xf0, 0xf7, 0x80, 0xbf, 0x80, 0xbf, 0x80, 0xbf, 4,
   0xe0, 0xef, 0x80, 0xbf, 0x80, 0xbf, 3,
   0xc0, 0xdf, 0x80, 0xbf, 2,
   1
};
static const unsigned char parse_sjis[] =
{
   0x81, 0x9f, 0x40, 0xfc, 2,
   0xe0, 0xef, 0x40, 0xfc, 2,
   1
};
static const unsigned char parse_eucjp[] =
{
   0xa1, 0xa8, 0xa1, 0xfe, 2,
   0xad, 0xad, 0xa1, 0xfe, 2,
   0xb0, 0xf4, 0xa1, 0xfe, 2,
   0xf9, 0xfc, 0xa1, 0xfe, 2,
   0x8e, 0x8e, 0xa1, 0xfe, 2,
   0x8f, 0x8f, 0xa1, 0xfe, 0xa1, 0xfe, 3,
   1
};
static const unsigned char *parser;

/* Report a fatal error on stderr and terminate. */
static void Die(const char *message)
{
   fprintf(stderr, "shindou7: %s\n", message);
   exit(EXIT_FAILURE);
}

/* Write size bytes starting at data to stdout; a failed write is fatal. */
static void Output(const void *data, int size)
{
   if( size > 0 && fwrite(data, (size_t)size, 1, stdout) != 1 )
      Die("write error");
}

/* Nonzero once infile cannot deliver more data (end of file or read error).
   Testing feof() alone would loop forever after a read error. */
static int InputExhausted(void)
{
   return feof(infile) || ferror(infile);
}

/*
 * Append input to buffer[read_size..], doubling the buffer first if it is
 * already full.  The LOOKAHEAD_PAD bytes after the valid data are zeroed so
 * that look-ahead past the end of input sees deterministic bytes that match
 * no multi-byte trail range.
 */
static void ExtendBuffer(void)
{
   if( read_size == buffer_size )
   {
      if( buffer_size > (INT_MAX - LOOKAHEAD_PAD) / 2 )
         Die("input too large");
      tmp = realloc(buffer, (size_t)buffer_size * 2 + LOOKAHEAD_PAD);
      if( tmp == NULL )
         Die("out of memory");
      buffer = tmp;
      buffer_size *= 2;
   }
   read_size += (int)fread(buffer + read_size, 1,
                           (size_t)(buffer_size - read_size), infile);
   memset(buffer + read_size, 0, LOOKAHEAD_PAD);
}

/*
 * Decode the character at buffer[i] using the current parse table.
 *
 * On success c holds the character's bytes packed big-endian (byte-swapped
 * for UCS-2LE so both UCS-2 flavours yield the code point) and i is advanced
 * past the character.  If the first byte selects a rule but a later byte is
 * out of range, c is set to -1 and i is advanced by one byte.
 *
 * The caller must ensure i < read_size.
 */
static void NextChar(void)
{
   const unsigned char *rule = parser;
   unsigned long value = 0;
   int matched = 0;   /* bytes of the current rule matched so far */
   int valid = 1;
   int remaining;

   while( *rule > MAX_CHAR_BYTES && valid )
   {
      const unsigned char byte = buffer[i + matched];

      if( byte >= rule[0] && byte <= rule[1] )
      {
         /* Byte within range -> move on to the next (low, high) pair */
         matched++;
         rule += 2;
      }
      else if( matched )
      {
         /* Trail byte out of range -> character does not fit encoding */
         valid = 0;
      }
      else
      {
         /* First byte out of range -> skip this rule and its length byte */
         while( *rule > MAX_CHAR_BYTES )
            rule++;
         rule++;
      }
   }

   if( !valid )
   {
      c = -1;
      i++;
      return;
   }

   /* rule now points at the length of the matched (or fallback) rule */
   for(remaining = *rule; remaining > 0; remaining--)
      value = (value << 8) | buffer[i++];
   if( encoding & UCS2LE )
      value = (value >> 8) | ((value & 0xff) << 8);

   /* Accumulate unsigned and drop the top bit: a 4-byte UTF-8 sequence would
      otherwise overflow int and be mistaken for the -1 error marker.  No
      value the program compares against is affected by the mask. */
   c = (int)(value & 0x7fffffffUL);

   /* A fixed-width character truncated by end of input must not extend past
      the valid data. */
   if( i > read_size )
      i = read_size;
}

/* Look for a line feed stored as a 16-bit unit on an even offset.
   Returns UCS2LE, UCS2BE, or 0 when none is found. */
static int FindUcs2LineFeed(void)
{
   int k;

   for(k = 0; k < read_size - 1; k += 2)
   {
      if( buffer[k] == 0x0a && buffer[k + 1] == 0 )
         return UCS2LE;
      if( buffer[k] == 0 && buffer[k + 1] == 0x0a )
         return UCS2BE;
   }
   return 0;
}

/*
 * Read from infile until its encoding can be decided, leaving the data read
 * so far in buffer[0..read_size) and the result (exactly one Encoding value)
 * in encoding.
 *
 * Byte order marks and UCS-2 line feeds decide immediately.  Otherwise the
 * data is parsed as UTF-8, Shift_JIS and EUC-JP; an encoding is dropped on
 * the first invalid character, and among the survivors the one with the most
 * hiragana wins (ties prefer UTF-8, then Shift_JIS; EUC-JP if none survive).
 */
static void DetectFileEncoding(void)
{
   int ucs2;

   read_size = utf8_kana = sjis_kana = eucjp_kana = 0;
   encoding = UTF8 | SJIS | EUCJP;

   /* encoding & (encoding - 1) is nonzero while more than one candidate
      bit is still set */
   while( !InputExhausted() && (encoding & (encoding - 1)) )
   {
      ExtendBuffer();

      if( read_size >= 2 && buffer[0] == 0xff && buffer[1] == 0xfe )
      {
         encoding = UCS2LE;
         break;
      }
      if( read_size >= 2 && buffer[0] == 0xfe && buffer[1] == 0xff )
      {
         encoding = UCS2BE;
         break;
      }
      ucs2 = FindUcs2LineFeed();
      if( ucs2 )
      {
         encoding = ucs2;
         break;
      }
      if( read_size >= 3 &&
          buffer[0] == 0xef && buffer[1] == 0xbb && buffer[2] == 0xbf )
      {
         encoding = UTF8;
         break;
      }

      /* Each scan stops short of the end so that a character split across
         two reads is not judged before its remaining bytes arrive. */
      parser = parse_utf8;
      for(i = 0; i < read_size - 3 && (encoding & UTF8);)
      {
         NextChar();
         if( c < 0 )
            encoding &= ~UTF8;
         else if( (c >= 0xe38181 && c <= 0xe381bf) ||
                  (c >= 0xe38280 && c <= 0xe38293) )
            utf8_kana++;
      }

      parser = parse_sjis;
      for(i = 0; i < read_size - 1 && (encoding & SJIS);)
      {
         NextChar();
         if( c < 0 )
            encoding &= ~SJIS;
         else if( c >= 0x829f && c <= 0x82f1 )
            sjis_kana++;
      }

      parser = parse_eucjp;
      for(i = 0; i < read_size - 2 && (encoding & EUCJP);)
      {
         NextChar();
         if( c < 0 )
            encoding &= ~EUCJP;
         else if( c >= 0xa4a1 && c <= 0xa4f3 )
            eucjp_kana++;
      }
   }

   if( encoding & (UCS2LE | UCS2BE) )
      return;

   /* Rejected encodings lose every comparison below */
   if( !(encoding & UTF8) )
      utf8_kana = -1;
   if( !(encoding & SJIS) )
      sjis_kana = -1;
   if( !(encoding & EUCJP) )
      eucjp_kana = -1;

   if( utf8_kana < sjis_kana )
      encoding = (sjis_kana < eucjp_kana) ? EUCJP : SJIS;
   else if( utf8_kana < eucjp_kana )
      encoding = EUCJP;
   else
      encoding = (encoding & UTF8) ? UTF8 : EUCJP;
}

/* Nonzero if c is '!' or the fullwidth exclamation mark (U+FF01) in the
   current encoding. */
static int IsExclaimationPunct(void)
{
   return c == 0x21 ||
          (encoding & (UCS2LE|UCS2BE) && c == 0xff01) ||
          (encoding & UTF8 && c == 0xefbc81) ||
          (encoding & SJIS && c == 0x8149) ||
          (encoding & EUCJP && c == 0xa1aa);
}

/* Nonzero if c ends a sentence: '.', '?', an exclamation mark, or (UCS-2
   code points shown) U+FF1F, U+FF0E, U+3002, U+FF61 in the current
   encoding. */
static int IsEOSPunct(void)
{
   return c == 0x2e || c == 0x3f ||
          (encoding & (UCS2LE|UCS2BE) &&
           (c == 0xff1f || c == 0xff0e || c == 0x3002 || c == 0xff61)) ||
          (encoding & UTF8 &&
           (c == 0xefbc9f || c == 0xefbc8e || c == 0xe38082 ||
            c == 0xefbda1)) ||
          (encoding & SJIS &&
           (c == 0x8148 || c == 0x8144 || c == 0x8142)) ||
          (encoding & EUCJP &&
           (c == 0xa1a9 || c == 0xa1a5 || c == 0xa1a3)) ||
          IsExclaimationPunct();
}

/* Nonzero if c ends a phrase: ',', line feed, end-of-sentence punctuation,
   or (UCS-2 code points shown) U+FF0C, U+3001, U+FF64 in the current
   encoding. */
static int IsPunct(void)
{
   return c == 0x2c || c == 0x0a ||
          (encoding & (UCS2LE|UCS2BE) &&
           (c == 0xff0c || c == 0x3001 || c == 0xff64)) ||
          (encoding & UTF8 &&
           (c == 0xefbc8c || c == 0xe38081 || c == 0xefbda4)) ||
          (encoding & SJIS && (c == 0x8143 || c == 0x8141)) ||
          (encoding & EUCJP && (c == 0xa1a4 || c == 0xa1a2)) ||
          IsEOSPunct();
}

/* Write one space in the current encoding. */
static void WriteSpace(void)
{
   Output(const_str + (encoding & UCS2BE ? STR_SPACE_UCS2BE : STR_SPACE),
          encoding & (UCS2LE|UCS2BE) ? 2 : 1);
}

/* Write one exclamation mark in the current encoding: the fullwidth form
   when the current character c is wider than one byte, '!' otherwise. */
static void WriteExclamationMark(void)
{
   int offset, length;

   if( c > 255 )
   {
      length = (encoding & UTF8) ? 3 : 2;
      if( encoding & UCS2BE )
         offset = STR_WIDE_EXCL_UCS2BE;
      else if( encoding & UCS2LE )
         offset = STR_WIDE_EXCL_UCS2LE;
      else if( encoding & UTF8 )
         offset = STR_WIDE_EXCL_UTF8;
      else if( encoding & SJIS )
         offset = STR_WIDE_EXCL_SJIS;
      else
         offset = STR_WIDE_EXCL_EUCJP;
   }
   else
   {
      length = (encoding & (UCS2LE|UCS2BE)) ? 2 : 1;
      offset = (encoding & UCS2BE) ? STR_EXCL_UCS2BE : STR_EXCL;
   }
   Output(const_str + offset, length);
}

/*
 * Copy the rest of infile (starting with whatever DetectFileEncoding() left
 * in the buffer) to stdout, applying the transformation described at the top
 * of this file.
 *
 * state is 1 while a phrase is pending; the phrase then occupies
 * buffer[i0..i) and has not been written yet.
 */
static void Filter(void)
{
   int consumed;

   i = i0 = state = 0;
   for(;;)
   {
      /* Refill only when less than one full character of look-ahead is
         left.  Everything before the pending phrase (or before the cursor
         when no phrase is pending) has already been written and can be
         discarded, so the buffer grows only if a single phrase outgrows it. */
      while( read_size - i < MAX_CHAR_BYTES && !InputExhausted() )
      {
         consumed = state ? i0 : i;
         if( consumed > 0 )
            memmove(buffer, buffer + consumed, (size_t)(read_size - consumed));
         read_size -= consumed;
         i -= consumed;
         i0 = 0;
         ExtendBuffer();
      }
      if( i >= read_size )
         break;

      last_char_offset = i;
      NextChar();

      if( !state )
      {
         if( c < 0 || IsPunct() )
         {
            /* Outside a phrase: echo punctuation and invalid bytes */
            Output(buffer + last_char_offset, i - last_char_offset);
         }
         else
         {
            /* First character of a new phrase */
            state = 1;
            i0 = last_char_offset;
         }
         continue;
      }

      if( !IsPunct() )
         continue;   /* phrase keeps growing */

      if( IsEOSPunct() )
      {
         /* Re-read the phrase, writing a space after each character */
         for(i = i0; i < last_char_offset; i0 = i)
         {
            NextChar();
            Output(buffer + i0, i - i0);
            WriteSpace();
         }
         /* Re-read the punctuation to restore c and i */
         NextChar();
      }

      /* Phrase ended by non-sentence punctuation is written unchanged */
      if( i0 < last_char_offset )
         Output(buffer + i0, last_char_offset - i0);

      if( IsExclaimationPunct() )
      {
         WriteExclamationMark();
         WriteExclamationMark();
      }
      else if( c != 0x0a )
      {
         WriteExclamationMark();
      }
      else
      {
         /* Line feed is kept as is */
         Output(buffer + last_char_offset, i - last_char_offset);
      }
      state = 0;
   }

   /* Input ended inside a phrase: write it unchanged rather than drop it */
   if( state )
      Output(buffer + i0, i - i0);
}

/* Display name of the detected encoding. */
static const char *EncodingName(void)
{
   switch( encoding )
   {
      case UCS2LE: return "UCS-2LE";
      case UCS2BE: return "UCS-2BE";
      case UTF8:   return "UTF-8";
      case SJIS:   return "Shift_JIS";
      default:     return "EUC-JP";
   }
}

/* Parse table matching the detected encoding. */
static const unsigned char *ParserForEncoding(void)
{
   switch( encoding )
   {
      case UTF8:  return parse_utf8;
      case SJIS:  return parse_sjis;
      case EUCJP: return parse_eucjp;
      default:    return parse_ucs2;
   }
}

int main(int argc, char **argv)
{
   #ifdef _WIN32
      setmode(fileno(stdin), O_BINARY);
      setmode(fileno(stdout), O_BINARY);
   #endif

   buffer_size = INITIAL_BUFFER_SIZE;
   buffer = calloc((size_t)buffer_size + LOOKAHEAD_PAD, 1);
   if( buffer == NULL )
      Die("out of memory");

   if( argc > 1 )
   {
      /* Report the encoding of each named file */
      for(a = 1; a < argc; a++)
      {
         infile = fopen(argv[a], "rb");
         if( infile == NULL )
         {
            printf("%s: can not open file\n", argv[a]);
            continue;
         }
         DetectFileEncoding();
         printf("%s: %s\n", argv[a], EncodingName());
         fclose(infile);
      }
   }
   else
   {
      /* Filter stdin to stdout */
      infile = stdin;
      DetectFileEncoding();
      parser = ParserForEncoding();
      Filter();
      if( fflush(stdout) != 0 )
         Die("write error");
   }

   free(buffer);
   return 0;
}