/* shindou6.c - Don Yang (uguu.org) 02/19/07 */

/*
 * Encoding detector and "shouting" text filter.
 *
 * With file arguments: prints the detected encoding of each file
 * (UCS-2LE, UCS-2BE, UTF-8, Shift_JIS or EUC-JP).
 *
 * Without arguments: filters stdin to stdout.
 *   - A run of ASCII letters/digits/'-'/'_'/space/tab that is terminated by
 *     sentence-ending punctuation is upper-cased.
 *   - A run started by a non-ASCII character that is terminated by
 *     sentence-ending punctuation is written with a space after every
 *     character.
 *   - An exclamation mark is doubled; any other punctuation except newline
 *     is replaced by one exclamation mark (full-width if the original
 *     punctuation was a wide character).
 *
 * The functions communicate through the file-scope variables below.
 */

#include <assert.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#ifdef _WIN32
#include <fcntl.h>
#include <io.h>
#endif

/* Bit flags stored in `encoding`.  During detection several multibyte
 * candidates may be set at once. */
enum {
    ENC_UCS2LE = 1,
    ENC_UCS2BE = 2,
    ENC_UCS2 = ENC_UCS2LE | ENC_UCS2BE,
    ENC_UTF8 = 4,
    ENC_SJIS = 8,
    ENC_EUCJP = 16,
    ENC_MULTIBYTE = ENC_UTF8 | ENC_SJIS | ENC_EUCJP
};

/* Values of `state` inside Filter(). */
enum {
    STATE_IDLE = 0,       /* nothing pending */
    STATE_ASCII_RUN = 1,  /* collecting an ASCII run starting at i0 */
    STATE_WIDE_RUN = 2    /* collecting a run started by a non-ASCII char */
};

enum {
    LOOKAHEAD_PAD = 3,    /* decoders peek up to 3 bytes past the current one */
    MAX_CHAR_BYTES = 4,   /* longest encoded character (UTF-8) */
    INITIAL_BUFFER_SIZE = 4096
};

/* Offsets into const_str. */
enum {
    STR_SPACE = 0,             /* 20       ASCII space (1 byte)      */
    STR_SPACE_UCS2LE = 0,      /* 20 00                              */
    STR_BANG_UCS2BE = 1,       /* 00 21                              */
    STR_BANG = 2,              /* 21       ASCII '!' (1 byte)        */
    STR_BANG_UCS2LE = 2,       /* 21 00                              */
    STR_SPACE_UCS2BE = 3,      /* 00 20                              */
    STR_WIDE_BANG_UTF8 = 5,    /* ef bc 81                           */
    STR_WIDE_BANG_SJIS = 7,    /* 81 49                              */
    STR_WIDE_BANG_UCS2LE = 9,  /* 01 ff                              */
    STR_WIDE_BANG_UCS2BE = 10, /* ff 01                              */
    STR_WIDE_BANG_EUCJP = 12   /* a1 aa                              */
};

/* Overlapping byte table holding every space / exclamation mark encoding
 * the filter can emit; see the STR_* offsets above. */
static const unsigned char const_str[] = {
    0x20, 0x00, 0x21, 0x00, 0x20,
    0xef, 0xbc, 0x81, 0x49,
    0x01, 0xff, 0x01,
    0xa1, 0xaa
};

static FILE *infile;

/* Input window.  The allocation is always buffer_size + LOOKAHEAD_PAD
 * bytes, and the LOOKAHEAD_PAD bytes following the valid data are kept
 * zeroed by ExtendBuffer(). */
static unsigned char *buffer;

static int u,                 /* cursor used by the ToUpper* helpers       */
           state,             /* Filter() state, one of STATE_*            */
           buffer_size,       /* capacity of buffer, excluding the pad     */
           read_size,         /* number of valid bytes in buffer           */
           i,                 /* offset of the next undecoded byte         */
           i0,                /* offset of the first pending (unwritten) byte */
           last_char_offset,  /* offset where the current character starts */
           c,                 /* current character as packed bytes or UCS-2
                                 code unit; -1 when a decoder rejects input */
           utf8_kana,         /* hiragana seen while validating as UTF-8   */
           sjis_kana,         /* ... as Shift_JIS                          */
           eucjp_kana,        /* ... as EUC-JP                             */
           encoding;          /* ENC_* flags                               */

/* Print a message to stderr and terminate with a failure status. */
static void Die(const char *message)
{
    fprintf(stderr, "shindou6: %s\n", message);
    exit(EXIT_FAILURE);
}

/* Write `size` bytes starting at `data` to stdout. */
static void Output(const void *data, size_t size)
{
    fwrite(data, size, 1, stdout);
}

/* Nonzero once no more bytes can be read from infile (end of file or a
 * read error).  Checking ferror() too keeps the read loops from spinning
 * forever when the stream fails without reaching EOF. */
static int InputExhausted(void)
{
    return feof(infile) || ferror(infile);
}

/*
 * Make room for / fetch more input.
 *
 * If the buffer has free space, read from infile to fill it; otherwise
 * double the buffer capacity (no read happens in that call).  Either way
 * the lookahead pad after the valid data is zeroed so that decoders peeking
 * past the end see deterministic bytes.
 */
static void ExtendBuffer(void)
{
    if (read_size != buffer_size) {
        read_size += (int)fread(buffer + read_size, 1,
                                (size_t)(buffer_size - read_size), infile);
    } else {
        unsigned char *grown;

        if (buffer_size > (INT_MAX - LOOKAHEAD_PAD) / 2)
            Die("input too large");
        grown = realloc(buffer, (size_t)buffer_size * 2 + LOOKAHEAD_PAD);
        if (grown == NULL)
            Die("out of memory");
        buffer = grown;
        buffer_size *= 2;
    }
    memset(buffer + read_size, 0, LOOKAHEAD_PAD);
}

/* Append the byte at buffer[i] to c (big-endian packing) and advance i. */
static void NextByte(void)
{
    c = (c << 8) | buffer[i++];
}

/* Decode one big-endian UCS-2 code unit into c. */
static void NextUCS2BEChar(void)
{
    c = 0;
    NextByte();
    NextByte();
}

/* Decode one little-endian UCS-2 code unit into c. */
static void NextUCS2LEChar(void)
{
    c = buffer[i] | (buffer[i + 1] << 8);
    i += 2;
}

/*
 * Decode one UTF-8 sequence.
 *
 * On success c holds the raw bytes of the sequence packed big-endian
 * (e.g. 0xefbc81 for U+FF01) and i is advanced past the sequence.  For
 * 4-byte sequences the lead byte is reduced to its low 3 bits before
 * packing: packing the full lead byte would overflow a signed int, and no
 * caller compares against 4-byte values.  If a continuation byte is
 * missing, c is set to -1 and i is left unchanged.
 */
static void NextUTF8Char(void)
{
    const unsigned char lead = buffer[i];
    int continuation_count = 0;
    int k;

    c = 0;
    if ((lead & 0xe0) == 0xc0)
        continuation_count = 1;
    else if ((lead & 0xf0) == 0xe0)
        continuation_count = 2;
    else if ((lead & 0xf8) == 0xf0)
        continuation_count = 3;

    for (k = 1; k <= continuation_count; k++) {
        if ((buffer[i + k] & 0xc0) != 0x80) {
            c = -1;
            return;
        }
    }

    if (continuation_count == 3) {
        c = lead & 0x07;
        i++;
    } else {
        NextByte();
    }
    for (k = 0; k < continuation_count; k++)
        NextByte();
}

/* Nonzero if `byte` is treated as the first byte of a 2-byte Shift_JIS
 * character. */
static int IsSJISLeadByte(int byte)
{
    return (byte >= 0x81 && byte <= 0x9f) || (byte >= 0xe0 && byte <= 0xef);
}

/* Nonzero if `byte` is treated as the first byte of a 2-byte EUC-JP
 * character. */
static int IsEUCJPLeadByte(int byte)
{
    return (byte >= 0xa1 && byte <= 0xa8) ||
           byte == 0xad ||
           (byte >= 0xb0 && byte <= 0xf4) ||
           (byte >= 0xf9 && byte <= 0xfc);
}

/* Decode one Shift_JIS character into c (packed bytes).  Sets c to -1 and
 * leaves i unchanged if a lead byte is followed by an invalid trail byte. */
static void NextSJISChar(void)
{
    c = 0;
    if (IsSJISLeadByte(buffer[i])) {
        if (buffer[i + 1] < 0x40 || buffer[i + 1] > 0xfc) {
            c = -1;
            return;
        }
        NextByte();
    }
    NextByte();
}

/* Decode one EUC-JP character into c (packed bytes).  Sets c to -1 and
 * leaves i unchanged if a lead byte is followed by an invalid trail byte. */
static void NextEUCJPChar(void)
{
    c = 0;
    if (IsEUCJPLeadByte(buffer[i])) {
        if (buffer[i + 1] < 0xa1 || buffer[i + 1] > 0xfe) {
            c = -1;
            return;
        }
        NextByte();
    }
    NextByte();
}

/*
 * Narrow down `encoding` using the bytes currently in the buffer.
 *
 * Order of checks: UCS-2 byte order marks, a newline encoded as UCS-2 at an
 * even offset, the UTF-8 byte order mark, and finally validation of the
 * buffer against each remaining multibyte candidate.  A candidate whose
 * decoder rejects the data is removed from `encoding`; hiragana seen during
 * validation are counted per candidate.  Clobbers i and c.
 */
static void DetectEncoding(void)
{
    if (encoding & ENC_UCS2)
        return;

    if (buffer[0] == 0xff && buffer[1] == 0xfe) {
        encoding = ENC_UCS2LE;
        return;
    }
    if (buffer[0] == 0xfe && buffer[1] == 0xff) {
        encoding = ENC_UCS2BE;
        return;
    }

    for (i = 0; i < read_size - 1; i += 2) {
        if (buffer[i] == '\n' && buffer[i + 1] == 0) {
            encoding = ENC_UCS2LE;
            return;
        }
        if (buffer[i] == 0 && buffer[i + 1] == '\n') {
            encoding = ENC_UCS2BE;
            return;
        }
    }

    if (buffer[0] == 0xef && buffer[1] == 0xbb && buffer[2] == 0xbf) {
        encoding = ENC_UTF8;
        return;
    }

    if (encoding & ENC_UTF8) {
        /* Stop 3 bytes early so a sequence cut off by the end of the
         * buffer is not mistaken for invalid data. */
        for (i = 0; i < read_size - 3;) {
            NextUTF8Char();
            if (c < 0) {
                encoding &= ~ENC_UTF8;
                break;
            }
            if ((c >= 0xe38181 && c <= 0xe381bf) ||
                (c >= 0xe38280 && c <= 0xe38293))
                utf8_kana++;
        }
    }

    if (encoding & ENC_SJIS) {
        for (i = 0; i < read_size - 1;) {
            NextSJISChar();
            if (c < 0) {
                encoding &= ~ENC_SJIS;
                break;
            }
            if (c >= 0x829f && c <= 0x82f1)
                sjis_kana++;
        }
    }

    if (encoding & ENC_EUCJP) {
        for (i = 0; i < read_size - 1;) {
            NextEUCJPChar();
            if (c < 0) {
                encoding &= ~ENC_EUCJP;
                break;
            }
            if (c >= 0xa4a1 && c <= 0xa4f3)
                eucjp_kana++;
        }
    }
}

/*
 * Detect the encoding of infile and leave exactly one ENC_* flag in
 * `encoding`.
 *
 * Input is read (and kept in the buffer) until a single candidate remains
 * or the input is exhausted.  If several multibyte candidates survive, the
 * one with the most hiragana wins.  When no candidate has more hiragana
 * than the others, UTF-8 is preferred if still valid, then Shift_JIS if it
 * is the only valid candidate, otherwise EUC-JP.
 */
static void DetectFileEncoding(void)
{
    read_size = utf8_kana = sjis_kana = eucjp_kana = 0;
    encoding = ENC_MULTIBYTE;

    /* encoding & (encoding - 1) is nonzero while more than one bit is set. */
    while (!InputExhausted() && (encoding & (encoding - 1))) {
        ExtendBuffer();
        DetectEncoding();
    }

    if (encoding & ENC_UCS2)
        return;

    /* Counts gathered before a candidate was rejected must not vote. */
    if (!(encoding & ENC_UTF8))
        utf8_kana = 0;
    if (!(encoding & ENC_SJIS))
        sjis_kana = 0;
    if (!(encoding & ENC_EUCJP))
        eucjp_kana = 0;

    if (utf8_kana < sjis_kana) {
        encoding = (sjis_kana < eucjp_kana) ? ENC_EUCJP : ENC_SJIS;
    } else if (utf8_kana < eucjp_kana) {
        encoding = ENC_EUCJP;
    } else if (encoding & ENC_UTF8) {
        encoding = ENC_UTF8;
    } else if ((encoding & ENC_SJIS) && !(encoding & ENC_EUCJP)) {
        /* Shift_JIS is the sole surviving candidate; do not report EUC-JP
         * just because no hiragana happened to be counted. */
        encoding = ENC_SJIS;
    } else {
        encoding = ENC_EUCJP;
    }
}

/* Nonzero if c belongs to an ASCII run: letter, digit, '-', '_', space or
 * tab. */
static int IsAlpha(void)
{
    return (c >= 'a' && c <= 'z') ||
           (c >= 'A' && c <= 'Z') ||
           (c >= '0' && c <= '9') ||
           c == '-' || c == '_' || c == ' ' || c == '\t';
}

/* Nonzero if c is an ASCII or full-width exclamation mark in the current
 * encoding. */
static int IsExclaimationPunct(void)
{
    return c == '!' ||
           ((encoding & ENC_UCS2) && c == 0xff01) ||
           ((encoding & ENC_UTF8) && c == 0xefbc81) ||
           ((encoding & ENC_SJIS) && c == 0x8149) ||
           ((encoding & ENC_EUCJP) && c == 0xa1aa);
}

/* Nonzero if c ends a sentence: '.', '?', their wide counterparts in the
 * current encoding, or any exclamation mark. */
static int IsEOSPunct(void)
{
    return c == '.' || c == '?' ||
           ((encoding & ENC_UCS2) &&
            (c == 0xff1f || c == 0xff0e || c == 0x3002 || c == 0xff61)) ||
           ((encoding & ENC_UTF8) &&
            (c == 0xefbc9f || c == 0xefbc8e || c == 0xe38082 ||
             c == 0xefbda1)) ||
           ((encoding & ENC_SJIS) &&
            (c == 0x8148 || c == 0x8144 || c == 0x8142)) ||
           ((encoding & ENC_EUCJP) &&
            (c == 0xa1a9 || c == 0xa1a5 || c == 0xa1a3)) ||
           IsExclaimationPunct();
}

/* Nonzero if c is any punctuation the filter reacts to: ',', newline, wide
 * commas in the current encoding, or sentence-ending punctuation. */
static int IsPunct(void)
{
    return c == ',' || c == '\n' ||
           ((encoding & ENC_UCS2) &&
            (c == 0xff0c || c == 0x3001 || c == 0xff64)) ||
           ((encoding & ENC_UTF8) &&
            (c == 0xefbc8c || c == 0xe38081 || c == 0xefbda4)) ||
           ((encoding & ENC_SJIS) && (c == 0x8143 || c == 0x8141)) ||
           ((encoding & ENC_EUCJP) && (c == 0xa1a4 || c == 0xa1a2)) ||
           IsEOSPunct();
}

/* Upper-case buffer[u] if it is an ASCII lowercase letter, then advance u. */
static void ToUpperChar(void)
{
    if (buffer[u] >= 'a' && buffer[u] <= 'z')
        buffer[u] -= 32;
    u++;
}

/* Upper-case every other byte in [i0, i), starting at i0 (the low bytes of
 * little-endian UCS-2 code units).  u advances by two per iteration: once
 * in ToUpperChar() and once in the loop header. */
static void ToUpperUCS2LE(void)
{
    for (u = i0; u < i; u++)
        ToUpperChar();
}

/* Same as ToUpperUCS2LE() but starting at i0 + 1, where big-endian code
 * units keep their low byte. */
static void ToUpperUCS2BE(void)
{
    i0++;
    ToUpperUCS2LE();
    i0--;
}

/* Upper-case the single-byte characters in [i0, i), skipping over
 * multibyte UTF-8 sequences. */
static void ToUpperUTF8(void)
{
    for (u = i0; u < i;) {
        const unsigned char lead = buffer[u];

        if ((lead & 0xf8) == 0xf0)
            u += 4;
        else if ((lead & 0xf0) == 0xe0)
            u += 3;
        else if ((lead & 0xe0) == 0xc0)
            u += 2;
        else
            ToUpperChar();
    }
}

/* Upper-case the single-byte characters in [i0, i), skipping both bytes of
 * a character that starts with a Shift_JIS lead byte so that a trail byte
 * in the 'a'..'z' range is never modified.  Also used for EUC-JP. */
static void ToUpperJIS(void)
{
    for (u = i0; u < i;) {
        if (IsSJISLeadByte(buffer[u]))
            u += 2;
        else
            ToUpperChar();
    }
}

/* Write one exclamation mark in the current encoding: full-width if the
 * current character c is wider than one byte, ASCII otherwise. */
static void WriteExclamationMark(void)
{
    if (c > 255) {
        if (encoding & ENC_UCS2BE)
            Output(const_str + STR_WIDE_BANG_UCS2BE, 2);
        else if (encoding & ENC_UCS2LE)
            Output(const_str + STR_WIDE_BANG_UCS2LE, 2);
        else if (encoding & ENC_UTF8)
            Output(const_str + STR_WIDE_BANG_UTF8, 3);
        else if (encoding & ENC_SJIS)
            Output(const_str + STR_WIDE_BANG_SJIS, 2);
        else
            Output(const_str + STR_WIDE_BANG_EUCJP, 2);
    } else {
        if (encoding & ENC_UCS2BE)
            Output(const_str + STR_BANG_UCS2BE, 2);
        else if (encoding & ENC_UCS2LE)
            Output(const_str + STR_BANG_UCS2LE, 2);
        else
            Output(const_str + STR_BANG, 1);
    }
}

/*
 * Write the pending bytes [i0, last_char_offset), then the current
 * character: doubled '!' for an exclamation mark, a single '!' for other
 * punctuation except newline, and the character's own bytes otherwise.
 */
static void Flush(void)
{
    /* '<' rather than '!=': a spaced flush may already have consumed
     * everything, and the length must never go negative. */
    if (i0 < last_char_offset)
        Output(buffer + i0, (size_t)(last_char_offset - i0));

    if (IsExclaimationPunct()) {
        WriteExclamationMark();
        WriteExclamationMark();
    } else if (IsPunct() && c != '\n') {
        WriteExclamationMark();
    } else {
        Output(buffer + last_char_offset, (size_t)(i - last_char_offset));
    }
}

/* Write up to `width` pending bytes starting at i0 and advance i0.  The
 * width is clamped so that bytes of the current character (at
 * last_char_offset) are never written as part of the pending run. */
static void EmitPending(int width)
{
    assert(i0 < last_char_offset);
    if (width > last_char_offset - i0)
        width = last_char_offset - i0;
    Output(buffer + i0, (size_t)width);
    i0 += width;
}

/* Write the pending UCS-2 run with a space after each code unit. */
static void FlushUCS2WithSpace(void)
{
    while (i0 < last_char_offset) {
        EmitPending(2);
        if (encoding & ENC_UCS2LE)
            Output(const_str + STR_SPACE_UCS2LE, 2);
        else
            Output(const_str + STR_SPACE_UCS2BE, 2);
    }
}

/* Write the pending UTF-8 run with an ASCII space after each character;
 * character length is taken from the lead byte. */
static void FlushUTF8WithSpace(void)
{
    while (i0 < last_char_offset) {
        const unsigned char lead = buffer[i0];

        if ((lead & 0xf8) == 0xf0)
            EmitPending(4);
        else if ((lead & 0xf0) == 0xe0)
            EmitPending(3);
        else if ((lead & 0xe0) == 0xc0)
            EmitPending(2);
        else
            EmitPending(1);
        Output(const_str + STR_SPACE, 1);
    }
}

/* Write the pending Shift_JIS / EUC-JP run with an ASCII space after each
 * character; any byte above 127 is taken to start a 2-byte character. */
static void FlushJISWithSpace(void)
{
    while (i0 < last_char_offset) {
        EmitPending(buffer[i0] > 127 ? 2 : 1);
        Output(const_str + STR_SPACE, 1);
    }
}

/* Ensure a full character of lookahead is buffered at offset i, unless the
 * input is exhausted.  Already-written bytes before i0 are discarded to
 * make room; if nothing can be discarded, ExtendBuffer() grows the buffer. */
static void FillLookahead(void)
{
    while (read_size - i < MAX_CHAR_BYTES && !InputExhausted()) {
        if (i0 > 0) {
            memmove(buffer, buffer + i0, (size_t)(read_size - i0));
            read_size -= i0;
            i -= i0;
            i0 = 0;
        }
        ExtendBuffer();
    }
}

/*
 * Filter infile to stdout, starting with whatever DetectFileEncoding()
 * left in the buffer.
 *
 * NextChar        decodes one character at offset i into c and advances i.
 * ToUpper         upper-cases the pending ASCII run in place.
 * FlushWithSpace  writes the pending run with a space after each character.
 */
static void Filter(void (*NextChar)(void),
                   void (*ToUpper)(void),
                   void (*FlushWithSpace)(void))
{
    i = i0 = state = STATE_IDLE;

    for (;;) {
        FillLookahead();
        if (i >= read_size)
            break;

        last_char_offset = i;
        NextChar();
        if (i == last_char_offset) {
            /* The decoder rejected the sequence without consuming it;
             * pass one raw byte through so the loop always advances. */
            c = 0;
            NextByte();
        }
        if (i > read_size)
            i = read_size;  /* character truncated by end of input */

        switch (state) {
        case STATE_ASCII_RUN:
            if (!IsAlpha() || IsPunct()) {
                if (IsEOSPunct())
                    ToUpper();
                Flush();
                state = STATE_IDLE;
            }
            break;

        case STATE_WIDE_RUN:
            if (IsAlpha() || IsPunct()) {
                if (IsEOSPunct())
                    FlushWithSpace();
                Flush();
                state = STATE_IDLE;
            }
            break;

        default:
            if (c > 127)
                state = STATE_WIDE_RUN;
            else if (IsAlpha())
                state = STATE_ASCII_RUN;
            else
                Output(buffer + last_char_offset,
                       (size_t)(i - last_char_offset));
            i0 = last_char_offset;
            break;
        }
    }

    /* A run that reaches end of input without terminating punctuation is
     * written unchanged instead of being dropped. */
    if (state != STATE_IDLE && i0 < i)
        Output(buffer + i0, (size_t)(i - i0));
}

/* Human-readable name for the single flag left in `encoding`. */
static const char *EncodingName(void)
{
    switch (encoding) {
    case ENC_UCS2LE:
        return "UCS-2LE";
    case ENC_UCS2BE:
        return "UCS-2BE";
    case ENC_UTF8:
        return "UTF-8";
    case ENC_SJIS:
        return "Shift_JIS";
    default:
        return "EUC-JP";
    }
}

/* With arguments: report the detected encoding of each named file.
 * Without arguments: filter stdin to stdout. */
int main(int argc, char **argv)
{
    int a;

#ifdef _WIN32
    setmode(fileno(stdin), O_BINARY);
    setmode(fileno(stdout), O_BINARY);
#endif

    buffer_size = INITIAL_BUFFER_SIZE;
    /* calloc so the lookahead pad starts out zeroed. */
    buffer = calloc((size_t)buffer_size + LOOKAHEAD_PAD, 1);
    if (buffer == NULL)
        Die("out of memory");

    if (argc > 1) {
        for (a = 1; a < argc; a++) {
            if ((infile = fopen(argv[a], "rb")) != NULL) {
                DetectFileEncoding();
                printf("%s: %s\n", argv[a], EncodingName());
                fclose(infile);
            } else {
                printf("%s: can not open file\n", argv[a]);
            }
        }
    } else {
        infile = stdin;
        DetectFileEncoding();
        if (encoding & ENC_UCS2LE)
            Filter(NextUCS2LEChar, ToUpperUCS2LE, FlushUCS2WithSpace);
        else if (encoding & ENC_UCS2BE)
            Filter(NextUCS2BEChar, ToUpperUCS2BE, FlushUCS2WithSpace);
        else if (encoding & ENC_UTF8)
            Filter(NextUTF8Char, ToUpperUTF8, FlushUTF8WithSpace);
        else if (encoding & ENC_SJIS)
            Filter(NextSJISChar, ToUpperJIS, FlushJISWithSpace);
        else
            Filter(NextEUCJPChar, ToUpperJIS, FlushJISWithSpace);
    }

    free(buffer);
    return 0;
}