/* shindou2.c - Don Yang (uguu.org) 02/18/07 */

/* NOTE(review): the header names were missing from the #include lines in
   the copy under review.  The names below are reconstructed from the
   library calls this file makes (assert, FILE/fread, malloc, memmove,
   setmode/fileno/O_BINARY) - confirm against the original. */
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#ifdef _WIN32
#include <fcntl.h>
#include <io.h>
#endif

/* Encoding bitmasks */
typedef enum
{
    UCS2LE = 1,
    UCS2BE = 2,
    UTF8 = 4,
    SJIS = 8,
    EUCJP = 16
} Encoding;

/* Local variables (loop counters and a scratch pointer shared by helpers) */
static int u, a;
static unsigned char *tmp;

/* Filter states */
static int state;

/* Read buffer: buffer holds read_size valid bytes out of buffer_size */
static FILE *infile;
static unsigned char *buffer;
static int buffer_size, read_size;

/* Read offset/char */
static int i, i0, last_char_offset;
static int c;

/* Decoding stats.

   It is often the case that we read through the entire file and are still
   unable to determine which encoding it is.  The conflicts are resolved by
   assuming the input to be Japanese, and selecting the encoding that
   decodes the largest number of hiragana characters.  This only applies to
   the 3 encodings below.

   For UCS-2 encodings, we select the encoding based on the presence of a
   linefeed character, since U+000A is linefeed and U+0A00 is not a
   character, and no other encodings allow zeroes.  This only works if the
   input contains a linefeed, of course. */
static int utf8_kana;
static int sjis_kana;
static int eucjp_kana;

/* Buffer encoding (a combination of Encoding bits) */
static int encoding;

/* Read more data into buffer.

   When the buffer is already full its capacity is doubled first.  Three
   bytes are allocated beyond buffer_size, presumably so that decoders
   peeking a few bytes ahead of the last data byte stay inside the
   allocation.  read_size grows by however many bytes fread delivered. */
static void ExtendBuffer(void)
{
    if( read_size == buffer_size )
    {
        /* Grow through a temporary so the old block is never lost */
        tmp = realloc(buffer, (size_t)buffer_size * 2 + 3);
        assert(tmp != NULL);
        buffer = tmp;
        buffer_size *= 2;
    }
    read_size += (int)fread(buffer + read_size,
                            1,
                            (size_t)(buffer_size - read_size),
                            infile);
}

/* Shift buffer to discard leading bytes.

   Everything before offset i0 is dropped and all offsets into the buffer
   (i, last_char_offset, i0) are rebased accordingly. */
static void ShiftBuffer(void)
{
    if( i0 == 0 )
        return;   /* nothing to discard, skip the no-op move */

    memmove(buffer, buffer + i0, (size_t)(read_size - i0));
    read_size -= i0;
    i -= i0;
    last_char_offset -= i0;
    i0 = 0;
}

/* Get next byte and update offset.

   The byte is appended to the low end of c.  The shift is done on an
   unsigned value so that a 4-byte sequence, which fills the top bits of a
   32-bit int, does not left-shift a signed value into its sign bit. */
static void NextByte(void)
{
    c = (int)(((unsigned int)c << 8) | buffer[i++]);
}

/* Get UCS-2BE bytes for next character. */
static void NextUCS2BEChar(void)
{
    c = 0;
    NextByte();
    NextByte();
}

/* Get UCS-2LE bytes for next character, assembled low byte first.
*/ static void NextUCS2LEChar(void) { c = buffer[i] | (buffer[i + 1] << 8); i += 2; } /* Get UTF-8 bytes for next character. */ static void NextUTF8Char(void) { c = 0; if( (buffer[i] & 0xe0) != 0xc0 ) { if( (buffer[i] & 0xf0) != 0xe0 ) { if( (buffer[i] & 0xf8) != 0xf0 ) { NextByte(); } else { if( (buffer[i + 1] & 0xc0) != 0x80 || (buffer[i + 2] & 0xc0) != 0x80 || (buffer[i + 2] & 0xc0) != 0x80 ) { c--; } else { NextByte(); NextByte(); NextByte(); NextByte(); } } } else { if( (buffer[i + 1] & 0xc0) != 0x80 || (buffer[i + 2] & 0xc0) != 0x80 ) { c--; } else { NextByte(); NextByte(); NextByte(); } } } else { if( (buffer[i + 1] & 0xc0) != 0x80 ) { c--; } else { NextByte(); NextByte(); } } } /* Get Shift_JIS bytes for next character. */ static void NextSJISChar(void) { c = 0; if( (buffer[i] >= 0x81 && buffer[i] <= 0x9f) || (buffer[i] >= 0xe0 && buffer[i] <= 0xef) ) { if( buffer[i + 1] < 0x40 || buffer[i + 1] > 0xfc ) { c--; } else { NextByte(); NextByte(); } } else { NextByte(); } } /* Get EUC-JP bytes for next character. 
*/ static void NextEUCJPChar(void) { c = 0; if( (buffer[i] >= 0xa1 && buffer[i] <= 0xa8) || (buffer[i] == 0xad) || (buffer[i] >= 0xb0 && buffer[i] <= 0xf4) || (buffer[i] >= 0xf9 && buffer[i] <= 0xfc) ) { if( buffer[i + 1] < 0xa1 || buffer[i + 1] > 0xfe ) { c--; } else { NextByte(); NextByte(); } } else { NextByte(); } } /* Check if buffer matches one of the accepted input encodings */ static void DetectEncoding(void) { /* Check for UCS-2 first */ if( encoding & (UCS2LE|UCS2BE) ) return; /* Test UCS-2 byte order mark */ if( buffer[0] == 0xff && buffer[1] == 0xfe ) { encoding = UCS2LE; return; } if( buffer[0] == 0xfe && buffer[1] == 0xff ) { encoding = UCS2BE; return; } /* Test UCS-2 linefeed */ for(i = 0; i < read_size - 1; i += 2) { if( buffer[i] == 0x0a && buffer[i + 1] == 0x00 ) { encoding = UCS2LE; return; } if( buffer[i] == 0x00 && buffer[i + 1] == 0x0a ) { encoding = UCS2BE; return; } } /* Test UTF-8 byte order mark */ if( buffer[0] == 0xef && buffer[1] == 0xbb && buffer[2] == 0xbf ) { encoding = UTF8; return; } /* Try UTF-8 */ if( encoding & UTF8 ) { /* Stop reading when there is less than 4 bytes left in buffer. This is so that if the read block boundaries happen to land in the middle of a character, the uninitialized memory might cause the buffer to be marked invalid for UTF-8, so we avoid that by stopping a few bytes early. 
*/ for(i = 0; i < read_size - 3;) { NextUTF8Char(); if( c == -1 ) { encoding &= ~UTF8; break; } /* Try decoding hiragana characters */ if( (c >= 0xe38181 && c <= 0xe381bf) || (c >= 0xe38280 && c <= 0xe38293) ) { utf8_kana++; } } } /* Try Shift_JIS */ if( encoding & SJIS ) { /* Maximum Shift_JIS character is 2 bytes, so we stop 1 byte early */ for(i = 0; i < read_size - 1;) { NextSJISChar(); if( c == -1 ) { encoding &= ~SJIS; break; } /* Try decoding hiragana characters */ if( c >= 0x829f && c <= 0x82f1 ) sjis_kana++; } } /* Try EUC-JP */ if( encoding & EUCJP ) { for(i = 0; i < read_size - 1;) { NextEUCJPChar(); if( c == -1 ) { encoding &= ~EUCJP; break; } /* Try decoding hiragana characters */ if( c >= 0xa4a1 && c <= 0xa4f3 ) eucjp_kana++; } } } /* Detect encoding from file */ static void DetectFileEncoding(void) { read_size = 0; /* Buffer data until only a single encoding left */ utf8_kana = sjis_kana = eucjp_kana = 0; encoding = UTF8 | SJIS | EUCJP; while( !feof(infile) && (encoding & (encoding - 1)) ) { ExtendBuffer(); DetectEncoding(); } /* Finalize encoding based on character frequency */ if( !(encoding & (UCS2LE|UCS2BE)) ) { if( !(encoding & UTF8) ) utf8_kana = 0; if( !(encoding & SJIS) ) sjis_kana = 0; if( !(encoding & EUCJP) ) eucjp_kana = 0; /* Prefer UTF-8 > Shift_JIS > EUC-JP */ if( utf8_kana < sjis_kana ) { encoding = (sjis_kana < eucjp_kana) ? EUCJP : SJIS; } else { if( utf8_kana < eucjp_kana ) encoding = EUCJP; else encoding = ((encoding & UTF8) != 0) ? UTF8 : EUCJP; } } } /* Check if character is alphanumeric. */ static int IsAlpha(void) { return (c >= 0x61 && c <= 0x7a) || (c >= 0x41 && c <= 0x5a) || (c >= 0x30 && c <= 0x39) || c == 0x2d || c == 0x5f || c == 0x20 || c == 0x09; } /* Check if character is an exclamation mark. 
*/ static int IsExclaimationPunct(void) { return c == 0x21 || ((encoding == UCS2LE || encoding == UCS2BE) && c == 0xff01) || (encoding == UTF8 && c == 0xefbc81) || (encoding == SJIS && c == 0x8149) || (encoding == EUCJP && c == 0xa1aa); } /* Check if character is a end-of-sentence punctuation character. */ static int IsEOSPunct(void) { return c == 0x2e || c == 0x3f || ((encoding == UCS2LE || encoding == UCS2BE) && (c == 0xff1f || c == 0xff0e || c == 0x3002 || c == 0xff61)) || (encoding == UTF8 && (c == 0xefbc9f || c == 0xefbc8e || c == 0xe38082 || c == 0xefbda1)) || (encoding == SJIS && (c == 0x8148 || c == 0x8144 || c == 0x8142)) || (encoding == EUCJP && (c == 0xa1a9 || c == 0xa1a5 || c == 0xa1a3)) || IsExclaimationPunct(); } /* Check if character is a punctuation character. */ static int IsPunct(void) { return c == 0x2c || c == 0x0a || ((encoding == UCS2LE || encoding == UCS2BE) && (c == 0xff0c || c == 0x3001 || c == 0xff64)) || (encoding == UTF8 && (c == 0xefbc8c || c == 0xe38081 || c == 0xefbda4)) || (encoding == SJIS && (c == 0x8143 || c == 0x8141)) || (encoding == EUCJP && (c == 0xa1a4 || c == 0xa1a2)) || IsEOSPunct(); } /* Convert UCS-2BE characters to uppercase. */ static void ToUpperUCS2BE(void) { for(u = i0; u < i; u += 2) { if( buffer[u + 1] >= 0x61 && buffer[u + 1] <= 0x7a ) buffer[u + 1] -= 32; } } /* Convert UCS-2LE characters to uppercase. */ static void ToUpperUCS2LE(void) { for(u = i0; u < i; u += 2) { if( buffer[u] >= 0x61 && buffer[u] <= 0x7a ) buffer[u] -= 32; } } /* Convert UTF-8 characters to uppercase. */ static void ToUpperUTF8(void) { for(u = i0; u < i;) { if( (buffer[u] & 0xf8) != 0xf0 ) { if( (buffer[u] & 0xf0) != 0xe0 ) { if( (buffer[u] & 0xe0) != 0xc0 ) { if( buffer[u] >= 0x61 && buffer[u] <= 0x7a ) buffer[u] -= 32; u++; } else { u += 2; } } else { u += 3; } } else { u += 4; } } } /* Convert Shift_JIS or EUC-JP characters to uppercase. 
*/ static void ToUpperJIS(void) { for(u = i0; u < i; u++) { if( (buffer[u] >= 0x81 && buffer[i] <= 0x9f) || (buffer[u] >= 0xe0 && buffer[i] <= 0xef) ) { u++; } else { if( buffer[u] >= 0x61 && buffer[u] <= 0x7a ) buffer[u] -= 32; } } } /* Write exclamation mark to stdout. */ static void WriteExclamationMark(void) { if( c > 255 ) { /* Write full-width exclamation mark */ if( encoding == UCS2BE ) { putchar(0xff); putchar(0x01); } else if( encoding == UCS2LE ) { putchar(0x01); putchar(0xff); } else if( encoding == UTF8 ) { putchar(0xef); putchar(0xbc); putchar(0x81); } else if( encoding == SJIS ) { putchar(0x81); putchar(0x49); } else /* encoding == EUCJP */ { putchar(0xa1); putchar(0xaa); } } else { /* Write half-width exclamation mark */ if( encoding == UCS2BE ) { putchar(0); putchar('!'); } else if( encoding == UCS2LE ) { putchar('!'); putchar(0); } else { putchar('!'); } } } /* Flush buffered characters to stdout, echo exclamation mark if needed. */ static void Flush(void) { /* Write all characters up to last punctuation */ if( i0 != last_char_offset ) fwrite(buffer + i0, last_char_offset - i0, 1, stdout); if( IsExclaimationPunct() != 0 ) { /* Write 2 exclamation marks */ WriteExclamationMark(); WriteExclamationMark(); } else if( IsPunct() != 0 && c != 0x0a ) { /* Write 1 exclamation mark */ WriteExclamationMark(); } else { /* Unrecognized punctuation */ fwrite(buffer + last_char_offset, i - last_char_offset, 1, stdout); } } /* Write UCS-2BE/UCS-2LE characters with white spaces inserted. */ static void FlushUCS2WithSpace(void) { for(; i0 < last_char_offset; i0 += 2) { fwrite(buffer + i0, 2, 1, stdout); if( encoding == UCS2LE ) { putchar(' '); putchar(0); } else { putchar(0); putchar(' '); } } } /* Write UTF-8 characters with white spaces inserted. 
*/ static void FlushUTF8WithSpace(void) { while( i0 < last_char_offset ) { if( (buffer[i0] & 0xf8) == 0xf0 ) { fwrite(buffer + i0, 4, 1, stdout); i0 += 4; } else if( (buffer[i0] & 0xf0) == 0xe0 ) { fwrite(buffer + i0, 3, 1, stdout); i0 += 3; } else if( (buffer[i0] & 0xe0) == 0xc0 ) { fwrite(buffer + i0, 2, 1, stdout); i0 += 2; } else { putchar(buffer[i0]); i0++; } putchar(' '); } } /* Write Shift_JIS/EUC-JP characters with white spaces inserted. */ static void FlushJISWithSpace(void) { for(; i0 < last_char_offset; i0++) { if( buffer[i0] > 127 ) { fwrite(buffer + i0, 2, 1, stdout); i0++; } else { putchar(buffer[i0]); } putchar(32); } } /* Filter stdin. */ static void Filter(void (*NextChar)(void), void (*ToUpper)(void), void (*FlushWithSpace)(void)) { i = i0 = 0; state = 0; do { /* Decode next character */ last_char_offset = i; NextChar(); if( i < read_size - 3 ) { ShiftBuffer(); ExtendBuffer(); } if( state == 1 ) { /* Buffer characters until punctuation or non-EN character, flush with uppercase as appropriate. */ if( IsAlpha() == 0 || IsPunct() != 0 ) { if( IsEOSPunct() != 0 ) ToUpper(); Flush(); state = 0; } } else if( state == 2 ) { /* Buffer characters until punctuation or non-JP character, flush with single space inserted as appropriate. 
*/ if( IsAlpha() != 0 || IsPunct() != 0 ) { if( IsEOSPunct() != 0 ) FlushWithSpace(); Flush(); state = 0; } } else { /* Initial state, buffer characters until punctuation */ if( c > 127 ) { state = 2; } else if( IsAlpha() != 0 ) { state = 1; } else { fwrite(buffer + last_char_offset, i - last_char_offset, 1, stdout); } i0 = last_char_offset; } } while( feof(stdin) == 0 || i < read_size ); } int main(int argc, char **argv) { #ifdef _WIN32 setmode(fileno(stdin), O_BINARY); setmode(fileno(stdout), O_BINARY); #endif buffer_size = 0x1000; buffer = malloc(buffer_size); assert(buffer != NULL); if( argc == 1 ) { /* Translate stdin */ infile = stdin; DetectFileEncoding(); switch( encoding ) { case UCS2BE: Filter(NextUCS2BEChar, ToUpperUCS2BE, FlushUCS2WithSpace); break; case UCS2LE: Filter(NextUCS2LEChar, ToUpperUCS2LE, FlushUCS2WithSpace); break; case UTF8: Filter(NextUTF8Char, ToUpperUTF8, FlushUTF8WithSpace); break; case SJIS: Filter(NextSJISChar, ToUpperJIS, FlushJISWithSpace); break; case EUCJP: Filter(NextEUCJPChar, ToUpperJIS, FlushJISWithSpace); break; default: assert(0); } } else { /* Output encoding for each file specified on command line */ for(a = 1; a < argc; a++) { if( (infile = fopen(argv[a], "rb")) == NULL ) { printf("%s: can not open file\n", argv[a]); continue; } DetectFileEncoding(); printf("%s: ", argv[a]); switch( encoding ) { case UCS2LE: puts("UCS-2LE"); break; case UCS2BE: puts("UCS-2BE"); break; case UTF8: puts("UTF-8"); break; case SJIS: puts("Shift_JIS"); break; case EUCJP: puts("EUC-JP"); break; default: assert(0); } fclose(infile); } } /* Cleanup */ free(buffer); return 0; }