/* shindou9.c - Don Yang (uguu.org) 02/19/07 */
/*
 * Usage:
 *   shindou9 FILE...   print "FILE: <encoding>" for every argument.
 *   shindou9 < input   guess the encoding of stdin and copy it to stdout,
 *                      rewriting punctuation as described below.
 *
 * Recognised encodings: UCS-2LE, UCS-2BE, UTF-8, Shift_JIS and EUC-JP.
 *
 * Filter behaviour (see Filter()):
 *   - Text is split into runs.  An "ASCII run" is made of ASCII letters,
 *     digits, '-', '_', space and tab; a "wide run" starts with a character
 *     whose code is >= 0x80.
 *   - Punctuation that directly ends a run is replaced: exclamation marks are
 *     doubled, any other punctuation except newline becomes one exclamation
 *     mark (full-width if the original character was not ASCII).
 *   - If the run is ended by sentence-ending punctuation, an ASCII run is
 *     upper-cased and a wide run is written with a space after each character.
 *   - Everything else is copied through unchanged.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#ifdef _WIN32
#include <io.h>
#include <fcntl.h>
#endif

/* Encoding bit flags.  During detection several candidate bits may be set. */
enum {
    ENC_UCS2LE = 1,
    ENC_UCS2BE = 2,
    ENC_UTF8 = 4,
    ENC_SJIS = 8,
    ENC_EUCJP = 16,
    ENC_UCS2 = ENC_UCS2LE | ENC_UCS2BE,
    ENC_CANDIDATES = ENC_UTF8 | ENC_SJIS | ENC_EUCJP
};

/* Filter() state: what kind of run is currently being collected. */
enum { STATE_IDLE = 0, STATE_ASCII = 1, STATE_WIDE = 2 };

enum {
    INITIAL_BUFFER_SIZE = 4096,
    /* Decoders look at up to 3 bytes after the current one. */
    LOOKAHEAD = 3
};

/*
 * Reference characters as they appear in the decoded value `c`
 * (the character's bytes packed big-endian into one integer).
 * Related punctuation is addressed as an offset from these.
 */
enum {
    UCS2_IDEO_COMMA = 0x3001,   /* U+3001 */
    UCS2_FW_EXCLAIM = 0xFF01,   /* U+FF01 */
    UTF8_IDEO_COMMA = 0xE38081, /* U+3001 as UTF-8 bytes */
    UTF8_FW_EXCLAIM = 0xEFBC81, /* U+FF01 as UTF-8 bytes */
    UTF8_HW_PERIOD = 0xEFBDA1,  /* U+FF61 as UTF-8 bytes */
    SJIS_IDEO_COMMA = 0x8141,
    EUCJP_IDEO_COMMA = 0xA1A2
};

/*
 * Output fragments, addressed by byte offset:
 *   0: ' ' 00      space (single byte, or UCS-2LE when 2 bytes are written)
 *   1: 00 '!'      '!' in UCS-2BE
 *   2: '!' 00      '!' (single byte, or UCS-2LE)
 *   3: 00 ' '      space in UCS-2BE
 *   5: EF BC 81    full-width '!' in UTF-8
 *   7: 81 49       full-width '!' in Shift_JIS
 *   9: 01 FF       full-width '!' in UCS-2LE
 *  10: FF 01       full-width '!' in UCS-2BE
 *  12: A1 AA       full-width '!' in EUC-JP
 */
static const unsigned char const_str[] =
    " \0!\0 \xef\xbc\x81I\1\xff\1\xa1\xaa";

static FILE *infile;
static unsigned char *buffer;  /* buffer_size + LOOKAHEAD bytes */
static int buffer_size;        /* usable capacity of buffer */
static int read_size;          /* number of valid bytes in buffer */
static int i;                  /* offset of the next byte to decode */
static int i0;                 /* offset where the current run starts */
static int last_char_offset;   /* offset of the most recently decoded char */
static int state;
static int encoding;
static int utf8_kana, sjis_kana, eucjp_kana;

/*
 * Most recently decoded character, or -1 if the bytes at `i` are not a valid
 * sequence.  long long so that four packed UTF-8 bytes stay non-negative.
 */
static long long c;

/* Print a message to stderr and terminate with a failure status. */
static void Die(const char *message)
{
    fputs(message, stderr);
    fputc('\n', stderr);
    exit(EXIT_FAILURE);
}

/* Write `size` bytes to stdout; non-positive sizes write nothing. */
static void Output(const void *data, int size)
{
    if (size > 0)
        fwrite(data, 1, (size_t)size, stdout);
}

/* True once no more bytes can be read from infile (end of file or error). */
static int InputExhausted(void)
{
    return feof(infile) || ferror(infile);
}

/*
 * Make more input available: if the buffer has free space, fill it from
 * infile; if it is already full, double its capacity (without reading).
 * The LOOKAHEAD bytes after the valid data are always zeroed so decoders
 * never inspect uninitialised memory.
 */
static void ExtendBuffer(void)
{
    if (read_size != buffer_size) {
        read_size += (int)fread(buffer + read_size, 1,
                                (size_t)(buffer_size - read_size), infile);
    } else {
        unsigned char *grown =
            realloc(buffer, (size_t)buffer_size * 2 + LOOKAHEAD);

        if (grown == NULL)
            Die("out of memory");
        buffer = grown;
        buffer_size *= 2;
    }
    memset(buffer + read_size, 0, LOOKAHEAD);
}

/* Append the byte at `i` to `c` and advance `i`. */
static void NextByte(void)
{
    c = (c << 8) | buffer[i++];
}

/* Decode one big-endian 16-bit unit. */
static void NextUCS2BEChar(void)
{
    c = 0;
    NextByte();
    NextByte();
}

/* Decode one little-endian 16-bit unit. */
static void NextUCS2LEChar(void)
{
    c = buffer[i] | (buffer[i + 1] << 8);
    i += 2;
}

/*
 * Decode one UTF-8 sequence into `c` (its bytes packed big-endian).
 * If a continuation byte is malformed, `c` is set to -1 and `i` is left
 * untouched.  Bytes that are not a 2/3/4-byte lead are taken as single bytes.
 */
static void NextUTF8Char(void)
{
    const unsigned char *p = buffer + i;
    int length, k;

    c = 0;
    if ((p[0] & 0xE0) == 0xC0)
        length = 2;
    else if ((p[0] & 0xF0) == 0xE0)
        length = 3;
    else if ((p[0] & 0xF8) == 0xF0)
        length = 4;
    else
        length = 1;

    for (k = 1; k < length; k++) {
        if ((p[k] & 0xC0) != 0x80) {
            c = -1;
            return;
        }
    }
    for (k = 0; k < length; k++)
        NextByte();
}

/* True if `b` is treated as the first byte of a Shift_JIS double-byte char. */
static int IsSJISLead(unsigned char b)
{
    return (b > 0x80 && b < 0xA0) || (b >= 0xE0 && b < 0xF0);
}

/*
 * Decode one Shift_JIS character.  A lead byte followed by an out-of-range
 * trail byte sets `c` to -1 without advancing `i`.
 */
static void NextSJISChar(void)
{
    c = 0;
    if (!IsSJISLead(buffer[i])) {
        NextByte();
    } else if (buffer[i + 1] < 0x40 || buffer[i + 1] > 0xFC) {
        c = -1;
    } else {
        NextByte();
        NextByte();
    }
}

/*
 * Decode one EUC-JP character.  A lead byte followed by an out-of-range
 * trail byte sets `c` to -1 without advancing `i`.
 */
static void NextEUCJPChar(void)
{
    const unsigned char lead = buffer[i];
    const int is_lead = (lead > 0xA0 && lead < 0xA9) || lead == 0xAD ||
                        (lead > 0xAF && lead < 0xF5) ||
                        (lead > 0xF8 && lead < 0xFD);

    c = 0;
    if (!is_lead) {
        NextByte();
    } else if (buffer[i + 1] <= 0xA0 || buffer[i + 1] >= 0xFF) {
        c = -1;
    } else {
        NextByte();
        NextByte();
    }
}

/*
 * Look for a 16-bit newline at an even offset.
 * Returns ENC_UCS2LE, ENC_UCS2BE, or 0 if none is found.
 */
static int FindUCS2Newline(void)
{
    for (i = 0; i < read_size - 1; i += 2) {
        if (buffer[i] == '\n' && buffer[i + 1] == 0)
            return ENC_UCS2LE;
        if (buffer[i] == 0 && buffer[i + 1] == '\n')
            return ENC_UCS2BE;
    }
    return 0;
}

/*
 * Guess the encoding of infile and store exactly one ENC_* bit in `encoding`.
 * Input is read into `buffer` (read_size is reset to 0 first) until the
 * choice is unambiguous or the input is exhausted.
 *
 * Byte-order marks and 16-bit newlines decide immediately.  Otherwise each
 * multi-byte candidate is dropped at its first invalid sequence, and the
 * surviving candidate with the most hiragana wins (ties prefer UTF-8, then
 * Shift_JIS).  The hiragana counters are not reset between passes over the
 * buffer; this matches the original behaviour.
 */
static void DetectFileEncoding(void)
{
    read_size = utf8_kana = sjis_kana = eucjp_kana = 0;
    encoding = ENC_CANDIDATES;

    /* encoding & (encoding - 1) is non-zero while more than one bit is set. */
    while (!InputExhausted() && (encoding & (encoding - 1))) {
        int ucs2;

        ExtendBuffer();

        if (buffer[0] == 0xFF && buffer[1] == 0xFE) {
            encoding = ENC_UCS2LE;
            continue;
        }
        if (buffer[0] == 0xFE && buffer[1] == 0xFF) {
            encoding = ENC_UCS2BE;
            continue;
        }

        ucs2 = FindUCS2Newline();
        if (ucs2) {
            encoding = ucs2;
            break;
        }

        if (buffer[0] == 0xEF && buffer[1] == 0xBB && buffer[2] == 0xBF) {
            encoding = ENC_UTF8;
            continue;
        }

        for (i = 0; i < read_size - 3 && (encoding & ENC_UTF8);) {
            NextUTF8Char();
            if (c < 0)
                encoding &= ~ENC_UTF8;
            else if ((c > 0xE38180 && c < 0xE381C0) ||
                     (c > 0xE3827F && c < 0xE38294))
                utf8_kana++;
        }
        for (i = 0; i < read_size - 1 && (encoding & ENC_SJIS);) {
            NextSJISChar();
            if (c < 0)
                encoding &= ~ENC_SJIS;
            else if (c > 0x829E && c < 0x82F2)
                sjis_kana++;
        }
        for (i = 0; i < read_size - 1 && (encoding & ENC_EUCJP);) {
            NextEUCJPChar();
            if (c < 0)
                encoding &= ~ENC_EUCJP;
            else if (c > 0xA4A0 && c < 0xA4F4)
                eucjp_kana++;
        }
    }

    if (encoding & ENC_UCS2)
        return;

    /* Rejected candidates must not win on counts collected earlier. */
    if (!(encoding & ENC_UTF8))
        utf8_kana = 0;
    if (!(encoding & ENC_SJIS))
        sjis_kana = 0;
    if (!(encoding & ENC_EUCJP))
        eucjp_kana = 0;

    if (utf8_kana < sjis_kana)
        encoding = sjis_kana < eucjp_kana ? ENC_EUCJP : ENC_SJIS;
    else if (utf8_kana < eucjp_kana)
        encoding = ENC_EUCJP;
    else
        encoding = (encoding & ENC_UTF8) ? ENC_UTF8 : ENC_SJIS;
}

/* ASCII letter, digit, '-', '_', space or tab. */
static int IsAlpha(void)
{
    return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') ||
           (c >= '0' && c <= '9') ||
           c == '-' || c == '_' || c == ' ' || c == '\t';
}

/* '!' or the full-width exclamation mark of the current encoding. */
static int IsExclamationPunct(void)
{
    return c == '!' ||
           ((encoding & ENC_UCS2) && c == UCS2_FW_EXCLAIM) ||
           ((encoding & ENC_UTF8) && c == UTF8_FW_EXCLAIM) ||
           ((encoding & ENC_SJIS) && c == SJIS_IDEO_COMMA + 8) ||
           ((encoding & ENC_EUCJP) && c == EUCJP_IDEO_COMMA + 8);
}

/* Sentence-ending punctuation: '.', '?', their wide forms, or exclamation. */
static int IsEOSPunct(void)
{
    return c == '.' || c == '?' ||
           ((encoding & ENC_UCS2) &&
            (c == UCS2_FW_EXCLAIM + 30 || c == UCS2_FW_EXCLAIM + 13 ||
             c == UCS2_IDEO_COMMA + 1 || c == UCS2_FW_EXCLAIM + 96)) ||
           ((encoding & ENC_UTF8) &&
            (c == UTF8_FW_EXCLAIM + 30 || c == UTF8_FW_EXCLAIM + 13 ||
             c == UTF8_IDEO_COMMA + 1 || c == UTF8_HW_PERIOD)) ||
           ((encoding & ENC_SJIS) &&
            (c == SJIS_IDEO_COMMA + 7 || c == SJIS_IDEO_COMMA + 3 ||
             c == SJIS_IDEO_COMMA + 1)) ||
           ((encoding & ENC_EUCJP) &&
            (c == EUCJP_IDEO_COMMA + 7 || c == EUCJP_IDEO_COMMA + 3 ||
             c == EUCJP_IDEO_COMMA + 1)) ||
           IsExclamationPunct();
}

/* Any punctuation that ends a run: ',', newline, wide commas, or EOS. */
static int IsPunct(void)
{
    return c == ',' || c == '\n' ||
           ((encoding & ENC_UCS2) &&
            (c == UCS2_FW_EXCLAIM + 11 || c == UCS2_IDEO_COMMA ||
             c == UCS2_FW_EXCLAIM + 99)) ||
           ((encoding & ENC_UTF8) &&
            (c == UTF8_FW_EXCLAIM + 11 || c == UTF8_IDEO_COMMA ||
             c == UTF8_HW_PERIOD + 3)) ||
           ((encoding & ENC_SJIS) &&
            (c == SJIS_IDEO_COMMA + 2 || c == SJIS_IDEO_COMMA)) ||
           ((encoding & ENC_EUCJP) &&
            (c == EUCJP_IDEO_COMMA + 2 || c == EUCJP_IDEO_COMMA)) ||
           IsEOSPunct();
}

/* Upper-case the byte at `offset` in place if it is an ASCII lower-case letter. */
static void ToUpperChar(int offset)
{
    if (buffer[offset] >= 'a' && buffer[offset] <= 'z')
        buffer[offset] -= 32;
}

/* Upper-case the low byte of every UCS-2LE unit in [i0, i). */
static void ToUpperUCS2LE(void)
{
    int k;

    for (k = i0; k < i; k += 2)
        ToUpperChar(k);
}

/* Upper-case the low byte of every UCS-2BE unit in [i0, i). */
static void ToUpperUCS2BE(void)
{
    int k;

    for (k = i0 + 1; k < i; k += 2)
        ToUpperChar(k);
}

/* Upper-case single-byte characters in [i0, i), skipping multi-byte ones. */
static void ToUpperUTF8(void)
{
    int k = i0;

    while (k < i) {
        const unsigned char lead = buffer[k];

        if ((lead & 0xF8) == 0xF0) {
            k += 4;
        } else if ((lead & 0xF0) == 0xE0) {
            k += 3;
        } else if ((lead & 0xE0) == 0xC0) {
            k += 2;
        } else {
            ToUpperChar(k);
            k++;
        }
    }
}

/*
 * Upper-case single-byte characters in [i0, i) for Shift_JIS and EUC-JP
 * input.  Double-byte characters recognised by IsSJISLead() are skipped as
 * a pair so their trail byte is never mistaken for an ASCII letter.
 */
static void ToUpperJIS(void)
{
    int k = i0;

    while (k < i) {
        if (IsSJISLead(buffer[k])) {
            k += 2;
        } else {
            ToUpperChar(k);
            k++;
        }
    }
}

/*
 * Write one exclamation mark in the current encoding: full-width when the
 * character being replaced (`c`) is not a single byte, ASCII otherwise.
 */
static void WriteExclamationMark(void)
{
    int offset, size;

    if (c > 0xFF) {
        if (encoding & ENC_UCS2LE)
            offset = 9;
        else if (encoding & ENC_UCS2BE)
            offset = 10;
        else if (encoding & ENC_UTF8)
            offset = 5;
        else if (encoding & ENC_SJIS)
            offset = 7;
        else
            offset = 12;
        size = (encoding & ENC_UTF8) ? 3 : 2;
    } else {
        offset = (encoding & ENC_UCS2BE) ? 1 : 2;
        size = (encoding & ENC_UCS2) ? 2 : 1;
    }
    Output(const_str + offset, size);
}

/*
 * Write the pending run [i0, last_char_offset) with a space after every
 * character, leaving i0 == last_char_offset.  Character width is estimated
 * from the lead byte and clamped so it never reaches past the run.
 */
static void FlushWithSpace(void)
{
    while (i0 < last_char_offset) {
        const unsigned char lead = buffer[i0];
        int width;

        if (encoding & ENC_UCS2)
            width = 2;
        else if (!(encoding & ENC_UTF8))
            width = lead >= 0x80 ? 2 : 1;
        else if ((lead & 0xF8) == 0xF0)
            width = 4;
        else if ((lead & 0xF0) == 0xE0)
            width = 3;
        else if ((lead & 0xE0) == 0xC0)
            width = 2;
        else
            width = 1;

        if (width > last_char_offset - i0)
            width = last_char_offset - i0;

        Output(buffer + i0, width);
        Output(const_str + ((encoding & ENC_UCS2BE) ? 3 : 0),
               (encoding & ENC_UCS2) ? 2 : 1);
        i0 += width;
    }
}

/*
 * Copy infile to stdout, rewriting punctuation (see the file header).
 *
 * next_char  decodes the character at `i` into `c` and advances `i`.
 * to_upper   upper-cases ASCII letters in buffer[i0, i).
 *
 * Bytes already present in `buffer` (left there by DetectFileEncoding) are
 * processed first; more input is pulled in as decoding proceeds.
 */
static void Filter(void (*next_char)(void), void (*to_upper)(void))
{
    i = i0 = state = STATE_IDLE;

    while (!InputExhausted() || i < read_size) {
        last_char_offset = i;
        next_char();
        if (c < 0) {
            /* Invalid sequence: pass the offending byte through so the
             * loop always makes progress. */
            c = buffer[i++];
        }
        if (i > read_size)
            i = read_size;  /* truncated final character */

        /*
         * Drop everything before the pending run and top the buffer up.
         * NOTE(review): this fires on every character that is not within
         * the last 3 bytes, and ExtendBuffer() doubles the buffer whenever
         * it is full, so large inputs end up held in memory.  Kept as in
         * the original.
         */
        if (i < read_size - 3) {
            memmove(buffer, buffer + i0, (size_t)(read_size - i0));
            read_size -= i0;
            i -= i0;
            last_char_offset -= i0;
            i0 = 0;
            ExtendBuffer();
        }

        if (state == STATE_IDLE) {
            if (c >= 0x80)
                state = STATE_WIDE;
            else if (IsAlpha())
                state = STATE_ASCII;
            else
                Output(buffer + last_char_offset, i - last_char_offset);
            i0 = last_char_offset;
            continue;
        }

        /* A run ends at punctuation, or when the character class flips. */
        if (!(IsPunct() ||
              (state != STATE_ASCII && IsAlpha()) ||
              (state != STATE_WIDE && !IsAlpha())))
            continue;

        if (IsEOSPunct()) {
            if (state == STATE_WIDE)
                FlushWithSpace();
            else
                to_upper();
        }
        if (i0 != last_char_offset)
            Output(buffer + i0, last_char_offset - i0);

        if (IsExclamationPunct()) {
            WriteExclamationMark();
            WriteExclamationMark();
        } else if (IsPunct() && c != '\n') {
            WriteExclamationMark();
        } else {
            Output(buffer + last_char_offset, i - last_char_offset);
        }
        state = STATE_IDLE;
    }

    /* A run still pending at end of input has not been written yet. */
    if (state != STATE_IDLE)
        Output(buffer + i0, i - i0);
}

/* Human-readable name of the single encoding bit stored in `encoding`. */
static const char *EncodingName(void)
{
    switch (encoding) {
    case ENC_UCS2LE:
        return "UCS-2LE";
    case ENC_UCS2BE:
        return "UCS-2BE";
    case ENC_UTF8:
        return "UTF-8";
    case ENC_SJIS:
        return "Shift_JIS";
    default:
        return "EUC-JP";
    }
}

int main(int argc, char **argv)
{
    int a;

#ifdef _WIN32
    setmode(fileno(stdin), O_BINARY);
    setmode(fileno(stdout), O_BINARY);
#endif

    buffer_size = INITIAL_BUFFER_SIZE;
    buffer = malloc((size_t)buffer_size + LOOKAHEAD);
    if (buffer == NULL)
        Die("out of memory");

    if (argc > 1) {
        /* Report mode: one "name: encoding" line per argument. */
        for (a = 1; a < argc; a++) {
            const char *label;

            fputs(argv[a], stdout);
            fputs(": ", stdout);
            infile = fopen(argv[a], "rb");
            if (infile != NULL) {
                DetectFileEncoding();
                label = EncodingName();
                fclose(infile);
            } else {
                label = "can not open file";
            }
            puts(label);
        }
    } else {
        /* Filter mode: stdin to stdout. */
        infile = stdin;
        DetectFileEncoding();
        if (encoding & ENC_UCS2LE)
            Filter(NextUCS2LEChar, ToUpperUCS2LE);
        else if (encoding & ENC_UCS2BE)
            Filter(NextUCS2BEChar, ToUpperUCS2BE);
        else if (encoding & ENC_UTF8)
            Filter(NextUTF8Char, ToUpperUTF8);
        else if (encoding & ENC_SJIS)
            Filter(NextSJISChar, ToUpperJIS);
        else
            Filter(NextEUCJPChar, ToUpperJIS);
    }

    free(buffer);
    return 0;
}