/* shindou4.c - Don Yang (uguu.org) 02/18/07 */

#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#ifdef _WIN32
#include <fcntl.h>
#include <io.h>
#endif

/* Text encodings the program distinguishes.  Every value is a distinct bit
   so that `encoding` can hold a set of candidates during detection. */
typedef enum {
    UCS2LE = 1,
    UCS2BE = 2,
    UTF8 = 4,
    SJIS = 8,
    EUCJP = 16
} Encoding;

/* File-wide state shared by all routines.
     infile       stream that ExtendBuffer() reads from
     buffer       input bytes; read_size of them are valid, buffer_size is
                  the nominal capacity
     tmp          scratch pointer used while growing buffer
     i            read cursor into buffer, advanced by the Next*Char decoders
     c            code of the most recently decoded character, or -1 when a
                  decoder rejected the byte sequence
   The remaining variables are used by routines further down the file. */
static FILE *infile;
static unsigned char *tmp, *buffer;
static int u, a, state, buffer_size, read_size, i, i0, last_char_offset, c,
           utf8_kana, sjis_kana, eucjp_kana, encoding;

/* Reads more input into buffer.  When the buffer is already full its
   capacity is doubled first; three extra bytes are allocated beyond
   buffer_size so the decoders may look ahead up to three bytes past the
   last one they consume. */
static void ExtendBuffer(void)
{
    if (read_size == buffer_size) {
        buffer_size *= 2;
        tmp = realloc(buffer, (size_t)buffer_size + 3);
        assert(tmp != NULL);
        buffer = tmp;
    }
    read_size += (int)fread(buffer + read_size, 1,
                            (size_t)(buffer_size - read_size), infile);
}

/* Appends the byte at the cursor to c (big-endian accumulation) and advances
   the cursor.  The shift is done on an unsigned value because a four-byte
   UTF-8 sequence fills all 32 bits, which would overflow a signed shift. */
static void NextByte(void)
{
    c = (int)(((unsigned)c << 8) | buffer[i++]);
}

/* Decodes one big-endian 16-bit code unit into c. */
static void NextUCS2BEChar(void)
{
    c = 0;
    NextByte();
    NextByte();
}

/* Decodes one little-endian 16-bit code unit into c. */
static void NextUCS2LEChar(void)
{
    c = buffer[i] | (buffer[i + 1] << 8);
    i += 2;
}

/* Nonzero when the byte `ahead` positions past the cursor has the UTF-8
   continuation form 10xxxxxx. */
static int IsUTF8Continuation(int ahead)
{
    return (buffer[i + ahead] & 0xc0) == 0x80;
}

/* Decodes one UTF-8 sequence.  c receives the raw bytes of the sequence
   packed big-endian (not the Unicode code point).  If a lead byte is not
   followed by the required continuation bytes, c is set to -1 and the cursor
   is left where it was.  Bytes that are not a 2-, 3- or 4-byte lead are
   consumed as single-byte characters. */
static void NextUTF8Char(void)
{
    int lead = buffer[i];
    int length;
    int k;

    c = 0;
    if ((lead & 0xe0) == 0xc0) {
        length = 2;
    } else if ((lead & 0xf0) == 0xe0) {
        length = 3;
    } else if ((lead & 0xf8) == 0xf0) {
        length = 4;
    } else {
        length = 1;
    }

    /* Every trailing byte must be a continuation byte, including the fourth
       byte of a four-byte sequence. */
    for (k = 1; k < length; k++) {
        if (!IsUTF8Continuation(k)) {
            c = -1;
            return;
        }
    }

    for (k = 0; k < length; k++) {
        NextByte();
    }
}

/* Decodes one Shift_JIS character.  Lead bytes 0x81-0x9f and 0xe0-0xef start
   a two-byte character whose second byte must lie in 0x40-0xfc; otherwise c
   is set to -1 and the cursor does not move.  Any other byte is a
   single-byte character. */
static void NextSJISChar(void)
{
    int lead = buffer[i];
    int is_lead = (lead >= 0x81 && lead <= 0x9f) ||
                  (lead >= 0xe0 && lead <= 0xef);

    c = 0;
    if (!is_lead) {
        NextByte();
        return;
    }
    if (buffer[i + 1] < 0x40 || buffer[i + 1] > 0xfc) {
        c = -1;
        return;
    }
    NextByte();
    NextByte();
}

/* Decodes one EUC-JP character.  Lead bytes 0xa1-0xa8, 0xad, 0xb0-0xf4 and
   0xf9-0xfc start a two-byte character whose second byte must lie in
   0xa1-0xfe; otherwise c is set to -1 and the cursor does not move.  Any
   other byte is a single-byte character. */
static void NextEUCJPChar(void)
{
    int lead = buffer[i];
    int is_lead = (lead >= 0xa1 && lead <= 0xa8) ||
                  (lead == 0xad) ||
                  (lead >= 0xb0 && lead <= 0xf4) ||
                  (lead >= 0xf9 && lead <= 0xfc);

    c = 0;
    if (!is_lead) {
        NextByte();
        return;
    }
    if (buffer[i + 1] < 0xa1 || buffer[i + 1] > 0xfe) {
        c = -1;
        return;
    }
    NextByte();
    NextByte();
}
static void DetectEncoding(void) { if( !(encoding & (UCS2LE|UCS2BE)) ) { if( buffer[0] == 0xff && buffer[1] == 0xfe ) { encoding = UCS2LE; } else if( buffer[0] == 0xfe && buffer[1] == 0xff ) { encoding = UCS2BE; } else { for(i = 0; i < read_size - 1; i += 2) { if( buffer[i] == 0x0a && buffer[i + 1] == 0x00 ) { encoding = UCS2LE; return; } if( buffer[i] == 0x00 && buffer[i + 1] == 0x0a ) { encoding = UCS2BE; return; } } if( buffer[0] == 0xef && buffer[1] == 0xbb && buffer[2] == 0xbf ) { encoding = UTF8; } else { if( encoding & UTF8 ) { for(i = 0; i < read_size - 3;) { NextUTF8Char(); if( c == -1 ) { encoding &= ~UTF8; break; } if( (c >= 0xe38181 && c <= 0xe381bf) || (c >= 0xe38280 && c <= 0xe38293) ) { utf8_kana++; } } } if( encoding & SJIS ) { for(i = 0; i < read_size - 1;) { NextSJISChar(); if( c == -1 ) { encoding &= ~SJIS; break; } if( c >= 0x829f && c <= 0x82f1 ) sjis_kana++; } } if( encoding & EUCJP ) { for(i = 0; i < read_size - 1;) { NextEUCJPChar(); if( c == -1 ) { encoding &= ~EUCJP; break; } if( c >= 0xa4a1 && c <= 0xa4f3 ) eucjp_kana++; } } } } } } static void DetectFileEncoding(void) { read_size = utf8_kana = sjis_kana = eucjp_kana = 0; encoding = UTF8 | SJIS | EUCJP; for(; !feof(infile) && (encoding & (encoding - 1));) { ExtendBuffer(); DetectEncoding(); } if( !(encoding & (UCS2LE|UCS2BE)) ) { if( !(encoding & UTF8) ) utf8_kana = 0; if( !(encoding & SJIS) ) sjis_kana = 0; if( !(encoding & EUCJP) ) eucjp_kana = 0; if( utf8_kana < sjis_kana ) { encoding = (sjis_kana < eucjp_kana) ? EUCJP : SJIS; } else { if( utf8_kana < eucjp_kana ) encoding = EUCJP; else encoding = (encoding & UTF8) ? 
UTF8 : EUCJP; } } } static int IsAlpha(void) { return (c >= 0x61 && c <= 0x7a) || (c >= 0x41 && c <= 0x5a) || (c >= 0x30 && c <= 0x39) || c == 0x2d || c == 0x5f || c == 0x20 || c == 0x09; } static int IsExclaimationPunct(void) { return c == 0x21 || (encoding & (UCS2LE|UCS2BE) && c == 0xff01) || (encoding & UTF8 && c == 0xefbc81) || (encoding & SJIS && c == 0x8149) || (encoding & EUCJP && c == 0xa1aa); } static int IsEOSPunct(void) { return c == 0x2e || c == 0x3f || (encoding & (UCS2LE|UCS2BE) && (c == 0xff1f || c == 0xff0e || c == 0x3002 || c == 0xff61)) || (encoding & UTF8 && (c == 0xefbc9f || c == 0xefbc8e || c == 0xe38082 || c == 0xefbda1)) || (encoding & SJIS && (c == 0x8148 || c == 0x8144 || c == 0x8142)) || (encoding & EUCJP && (c == 0xa1a9 || c == 0xa1a5 || c == 0xa1a3)) || IsExclaimationPunct(); } static int IsPunct(void) { return c == 0x2c || c == 0x0a || (encoding & (UCS2LE|UCS2BE) && (c == 0xff0c || c == 0x3001 || c == 0xff64)) || (encoding & UTF8 && (c == 0xefbc8c || c == 0xe38081 || c == 0xefbda4)) || (encoding & SJIS && (c == 0x8143 || c == 0x8141)) || (encoding & EUCJP && (c == 0xa1a4 || c == 0xa1a2)) || IsEOSPunct(); } static void ToUpperUCS2BE(void) { for(u = i0; u < i; u += 2) { if( buffer[u + 1] >= 0x61 && buffer[u + 1] <= 0x7a ) buffer[u + 1] -= 32; } } static void ToUpperUCS2LE(void) { for(u = i0; u < i; u += 2) { if( buffer[u] >= 0x61 && buffer[u] <= 0x7a ) buffer[u] -= 32; } } static void ToUpperUTF8(void) { for(u = i0; u < i;) { if( (buffer[u] & 0xf8) != 0xf0 ) { if( (buffer[u] & 0xf0) != 0xe0 ) { if( (buffer[u] & 0xe0) != 0xc0 ) { if( buffer[u] >= 0x61 && buffer[u] <= 0x7a ) buffer[u] -= 32; u++; } else { u += 2; } } else { u += 3; } } else { u += 4; } } } static void ToUpperJIS(void) { for(u = i0; u < i; u++) { if( (buffer[u] >= 0x81 && buffer[i] <= 0x9f) || (buffer[u] >= 0xe0 && buffer[i] <= 0xef) ) { u++; } else { if( buffer[u] >= 0x61 && buffer[u] <= 0x7a ) buffer[u] -= 32; } } } static void WriteExclamationMark(void) { if( c > 255 ) { 
if( encoding & UCS2BE ) { putchar(0xff); putchar(0x01); } else if( encoding & UCS2LE ) { putchar(0x01); putchar(0xff); } else if( encoding & UTF8 ) { putchar(0xef); putchar(0xbc); putchar(0x81); } else if( encoding & SJIS ) { putchar(0x81); putchar(0x49); } else { putchar(0xa1); putchar(0xaa); } } else { if( encoding & UCS2BE ) putchar(0); putchar('!'); if( encoding & UCS2LE ) putchar(0); } } static void Flush(void) { if( i0 != last_char_offset ) fwrite(buffer + i0, last_char_offset - i0, 1, stdout); if( IsExclaimationPunct() ) { WriteExclamationMark(); WriteExclamationMark(); } else if( IsPunct() && c != 0x0a ) { WriteExclamationMark(); } else { fwrite(buffer + last_char_offset, i - last_char_offset, 1, stdout); } } static void FlushUCS2WithSpace(void) { for(; i0 < last_char_offset; i0 += 2) { fwrite(buffer + i0, 2, 1, stdout); if( encoding & UCS2LE ) { putchar(32); putchar(0); } else { putchar(0); putchar(32); } } } static void FlushUTF8WithSpace(void) { for(; i0 < last_char_offset; putchar(32)) { if( (buffer[i0] & 0xf8) != 0xf0 ) { if( (buffer[i0] & 0xf0) != 0xe0 ) { if( (buffer[i0] & 0xe0) != 0xc0 ) { putchar(buffer[i0]); i0++; } else { fwrite(buffer + i0, 2, 1, stdout); i0 += 2; } } else { fwrite(buffer + i0, 3, 1, stdout); i0 += 3; } } else { fwrite(buffer + i0, 4, 1, stdout); i0 += 4; } } } static void FlushJISWithSpace(void) { for(; i0 < last_char_offset; i0++) { if( buffer[i0] > 127 ) { fwrite(buffer + i0, 2, 1, stdout); i0++; } else { putchar(buffer[i0]); } putchar(32); } } static void Filter(void (*NextChar)(void), void (*ToUpper)(void), void (*FlushWithSpace)(void)) { i = i0 = state = 0; for(; !feof(stdin) || i < read_size;) { last_char_offset = i; NextChar(); if( i < read_size - 3 ) { memmove(buffer, buffer + i0, read_size - i0); read_size -= i0; i -= i0; last_char_offset -= i0; i0 = 0; ExtendBuffer(); } if( state != 1 ) { if( state != 2 ) { if( c > 127 ) { state = 2; } else if( IsAlpha() ) { state = 1; } else { fwrite(buffer + last_char_offset, i - 
last_char_offset, 1, stdout); } i0 = last_char_offset; } else { if( IsAlpha() || IsPunct() ) { if( IsEOSPunct() ) FlushWithSpace(); Flush(); state = 0; } } } else { if( !IsAlpha() || IsPunct() ) { if( IsEOSPunct() ) ToUpper(); Flush(); state = 0; } } } } int main(int argc, char **argv) { #ifdef _WIN32 setmode(fileno(stdin), O_BINARY); setmode(fileno(stdout), O_BINARY); #endif buffer_size = 0x1000; buffer = malloc(buffer_size); assert(buffer != NULL); if( argc > 1 ) { for(a = 1; a < argc; a++) { if( (infile = fopen(argv[a], "rb")) ) { DetectFileEncoding(); printf("%s: %s\n", argv[a], encoding != UCS2LE ? encoding != UCS2BE ? encoding != UTF8 ? encoding != SJIS ? "EUC-JP" : "Shift_JIS" : "UTF-8" : "UCS-2BE" : "UCS-2LE"); fclose(infile); } else { printf("%s: can not open file\n", argv[a]); } } } else { infile = stdin; DetectFileEncoding(); switch( encoding ) { case UCS2BE: Filter(NextUCS2BEChar, ToUpperUCS2BE, FlushUCS2WithSpace); break; case UCS2LE: Filter(NextUCS2LEChar, ToUpperUCS2LE, FlushUCS2WithSpace); break; case UTF8: Filter(NextUTF8Char, ToUpperUTF8, FlushUTF8WithSpace); break; case SJIS: Filter(NextSJISChar, ToUpperJIS, FlushJISWithSpace); break; case EUCJP: Filter(NextEUCJPChar, ToUpperJIS, FlushJISWithSpace); break; default: assert(0); } } free(buffer); return 0; }