/* shindou1.c - Don Yang (uguu.org) 02/18/07

   With no arguments: detect the encoding of stdin (UCS-2LE, UCS-2BE,
   UTF-8, Shift_JIS or EUC-JP) and copy it to stdout through Filter().
   With arguments: print the detected encoding of each named file.
*/

/* NOTE(review): the header names were missing from the source as
   received; this list is reconstructed from the identifiers the file
   uses (assert, FILE/printf, malloc/free, memcpy/memmove, and on Windows
   setmode/fileno/O_BINARY).  <limits.h> is new and provides INT_MAX.
*/
#include <assert.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#ifdef _WIN32
#include <fcntl.h>
#include <io.h>
#endif

/* Encoding bitmasks */
typedef enum
{
    UCS2LE = 1,
    UCS2BE = 2,
    UTF8 = 4,
    SJIS = 8,
    EUCJP = 16
} Encoding;

enum
{
    /* Initial capacity of the read buffer, in bytes */
    INITIAL_BUFFER_SIZE = 0x1000,

    /* Extra bytes allocated (and kept zeroed) past the end of the data.
       The decoders look ahead at most 3 bytes from the start of a
       character, so they can always read them without leaving the
       allocation. */
    BUFFER_SLACK = 3
};

/* Filter states: 0 = initial, 1 = buffering an ASCII alphanumeric run,
   2 = buffering a non-ASCII run */
static int state;

/* Read buffer */
static FILE *infile;
static unsigned char *buffer;
static int buffer_size, read_size;

/* Read offset/char */
static int i, i0, last_char_offset;
static int c;

/* Decoding stats.

   It's often the case that we will read through the entire file, and
   still not be able to determine which encoding it is.  The conflicts are
   resolved by assuming input to be Japanese, and selecting the encoding
   that decodes the largest number of hiragana characters.  This only
   applies to the 3 encodings below.

   For UCS-2 encodings, we select the encoding based on the presence of a
   linefeed character, since U+000A is linefeed and U+0A00 is not a
   character, and no other encodings allow zeroes.  Only works if input
   contains a linefeed, of course.
*/
static int utf8_kana;
static int sjis_kana;
static int eucjp_kana;

/* Buffer encoding */
static int encoding;

/* Report an unrecoverable error on stderr and exit with failure status. */
static void Die(const char *message)
{
    fprintf(stderr, "shindou1: %s\n", message);
    exit(EXIT_FAILURE);
}

/* Read more data into buffer, doubling its capacity first if it is full.

   buffer = read buffer (may be reallocated)
   read_size = number of bytes in buffer (updated on return)
   buffer_size = maximum buffer size (updated on return)

   The BUFFER_SLACK bytes following the data are zeroed on return.
*/
static void ExtendBuffer(void)
{
    if( read_size == buffer_size )
    {
        unsigned char *grown;

        if( buffer_size > (INT_MAX - BUFFER_SLACK) / 2 )
            Die("input too large");
        grown = realloc(buffer, (size_t)buffer_size * 2 + BUFFER_SLACK);
        if( grown == NULL )
            Die("out of memory");
        buffer = grown;
        buffer_size *= 2;
    }
    read_size += (int)fread(buffer + read_size, 1,
                            (size_t)(buffer_size - read_size), infile);
    memset(buffer + read_size, 0, BUFFER_SLACK);
}

/* Shift buffer to discard leading bytes.

   buffer = read buffer
   read_size = number of bytes in buffer (updated on return)
   i0 = start offset of buffered characters (zero on return)
   i = next read offset (updated on return)
   last_char_offset = start offset of last buffered character
                      (updated on return)
*/
static void ShiftBuffer(void)
{
    if( i0 > 0 )
    {
        /* The zeroed slack bytes travel with the data */
        memmove(buffer, buffer + i0,
                (size_t)(read_size - i0) + BUFFER_SLACK);
        read_size -= i0;
        i -= i0;
        last_char_offset -= i0;
        i0 = 0;
    }
}

/* Get next byte and update offset.

   buffer = read buffer
   i = read offset (updated on return)
   c = current character (updated on return)
*/
static void NextByte(void)
{
    /* Shift as unsigned: the fourth byte of a UTF-8 sequence would
       overflow a signed int. */
    c = (int)(((unsigned)c << 8) | buffer[i++]);
}

/* Read `length` bytes into c, or set c to -1 (leaving i alone) when
   length is zero, which is how the *CharLength helpers report an invalid
   sequence.
*/
static void ReadChar(int length)
{
    if( length == 0 )
    {
        c = -1;
        return;
    }
    c = 0;
    while( length-- > 0 )
        NextByte();
}

/* Byte length of the UTF-8 sequence starting at p, or 0 if p[0] is not a
   lead byte or a continuation byte is missing.  Reads at most p[3].
*/
static int UTF8CharLength(const unsigned char *p)
{
    int length, k;

    if( p[0] < 0x80 )
        length = 1;
    else if( (p[0] & 0xe0) == 0xc0 )
        length = 2;
    else if( (p[0] & 0xf0) == 0xe0 )
        length = 3;
    else if( (p[0] & 0xf8) == 0xf0 )
        length = 4;
    else
        return 0;       /* stray continuation byte, or 0xf8..0xff */

    for(k = 1; k < length; k++)
    {
        if( (p[k] & 0xc0) != 0x80 )
            return 0;
    }
    return length;
}

/* Byte length of the Shift_JIS character starting at p, or 0 if a lead
   byte is followed by a byte outside 0x40..0xfc.  Reads at most p[1].
*/
static int SJISCharLength(const unsigned char *p)
{
    if( (p[0] >= 0x81 && p[0] <= 0x9f) || (p[0] >= 0xe0 && p[0] <= 0xef) )
        return (p[1] < 0x40 || p[1] > 0xfc) ? 0 : 2;
    return 1;
}

/* Byte length of the EUC-JP character starting at p, or 0 if a lead byte
   is followed by a byte outside 0xa1..0xfe.  Reads at most p[1].
*/
static int EUCJPCharLength(const unsigned char *p)
{
    if( (p[0] >= 0xa1 && p[0] <= 0xa8) ||
        (p[0] == 0xad) ||
        (p[0] >= 0xb0 && p[0] <= 0xf4) ||
        (p[0] >= 0xf9 && p[0] <= 0xfc) )
    {
        return (p[1] < 0xa1 || p[1] > 0xfe) ? 0 : 2;
    }
    return 1;
}

/* Byte length of the character at p for whichever of Shift_JIS / EUC-JP
   is the current buffer encoding.
*/
static int JISCharLength(const unsigned char *p)
{
    return encoding == SJIS ? SJISCharLength(p) : EUCJPCharLength(p);
}

/* Get UCS-2BE bytes for next character.

   buffer = read buffer
   i = start offset of next character (updated on return)
   c = next character (updated on return)
*/
static void NextUCS2BEChar(void)
{
    ReadChar(2);
}

/* Get UCS-2LE bytes for next character, assembled so that c holds the
   code point (same value NextUCS2BEChar would produce).

   buffer = read buffer
   i = start offset of next character (updated on return)
   c = next character (updated on return)
*/
static void NextUCS2LEChar(void)
{
    c = buffer[i] | (buffer[i + 1] << 8);
    i += 2;
}

/* Get UTF-8 bytes for next character.

   buffer = read buffer
   i = start offset of next character (updated on return)
   c = bytes of next character packed big-endian, or -1 if invalid
       (updated on return; i is not advanced when invalid)
*/
static void NextUTF8Char(void)
{
    ReadChar(UTF8CharLength(buffer + i));
}

/* Get Shift_JIS bytes for next character.

   buffer = read buffer
   i = start offset of next character (updated on return)
   c = next character, or -1 if invalid (updated on return; i is not
       advanced when invalid)
*/
static void NextSJISChar(void)
{
    ReadChar(SJISCharLength(buffer + i));
}

/* Get EUC-JP bytes for next character.

   buffer = read buffer
   i = start offset of next character (updated on return)
   c = next character, or -1 if invalid (updated on return; i is not
       advanced when invalid)
*/
static void NextEUCJPChar(void)
{
    ReadChar(EUCJPCharLength(buffer + i));
}

/* Check which of the accepted input encodings the buffer still matches.

   buffer = read buffer
   read_size = number of bytes in buffer
   encoding = encoding bitmask (updated on return)
   utf8_kana, sjis_kana, eucjp_kana = hiragana counts for the encodings
                                      still set in the bitmask
                                      (updated on return)
   i, c = destroyed on return
*/
static void DetectEncoding(void)
{
    /* Check for UCS-2 first */
    if( (encoding & (UCS2LE | UCS2BE)) != 0 )
        return;

    /* Test UCS-2 byte order mark */
    if( read_size >= 2 )
    {
        if( buffer[0] == 0xff && buffer[1] == 0xfe )
        {
            encoding = UCS2LE;
            return;
        }
        if( buffer[0] == 0xfe && buffer[1] == 0xff )
        {
            encoding = UCS2BE;
            return;
        }
    }

    /* Test UCS-2 linefeed */
    for(i = 0; i < read_size - 1; i += 2)
    {
        if( buffer[i] == 0x0a && buffer[i + 1] == 0x00 )
        {
            encoding = UCS2LE;
            return;
        }
        if( buffer[i] == 0x00 && buffer[i + 1] == 0x0a )
        {
            encoding = UCS2BE;
            return;
        }
    }

    /* Test UTF-8 byte order mark */
    if( read_size >= 3 &&
        buffer[0] == 0xef && buffer[1] == 0xbb && buffer[2] == 0xbf )
    {
        encoding = UTF8;
        return;
    }

    /* Every call rescans from offset 0, so the counters are recomputed
       rather than accumulated. */
    utf8_kana = sjis_kana = eucjp_kana = 0;

    /* Try UTF-8 */
    if( (encoding & UTF8) != 0 )
    {
        /* Stop reading when there are less than 4 bytes left in buffer.
           If the read block boundary lands in the middle of a character,
           the bytes that have not been read yet would make the buffer
           look invalid for UTF-8, so we stop a few bytes early. */
        for(i = 0; i < read_size - 3;)
        {
            NextUTF8Char();
            if( c == -1 )
            {
                encoding &= ~UTF8;
                break;
            }

            /* Try decoding hiragana characters */
            if( (c >= 0xe38181 && c <= 0xe381bf) ||
                (c >= 0xe38280 && c <= 0xe38293) )
            {
                utf8_kana++;
            }
        }
    }

    /* Try Shift_JIS */
    if( (encoding & SJIS) != 0 )
    {
        /* Maximum Shift_JIS character is 2 bytes, so we stop 1 byte
           early */
        for(i = 0; i < read_size - 1;)
        {
            NextSJISChar();
            if( c == -1 )
            {
                encoding &= ~SJIS;
                break;
            }

            /* Try decoding hiragana characters */
            if( c >= 0x829f && c <= 0x82f1 )
                sjis_kana++;
        }
    }

    /* Try EUC-JP */
    if( (encoding & EUCJP) != 0 )
    {
        for(i = 0; i < read_size - 1;)
        {
            NextEUCJPChar();
            if( c == -1 )
            {
                encoding &= ~EUCJP;
                break;
            }

            /* Try decoding hiragana characters */
            if( c >= 0xa4a1 && c <= 0xa4f3 )
                eucjp_kana++;
        }
    }
}

/* Detect encoding from file.

   infile = stream to read
   buffer, read_size = data read so far (updated on return; reading stops
                       as soon as at most one encoding is left)
   encoding = exactly one Encoding bit on return
*/
static void DetectFileEncoding(void)
{
    read_size = 0;

    /* Buffer data until only a single encoding is left.  Stop on read
       errors as well, since feof() never becomes true after one. */
    utf8_kana = sjis_kana = eucjp_kana = 0;
    encoding = UTF8 | SJIS | EUCJP;
    while( feof(infile) == 0 && ferror(infile) == 0 )
    {
        ExtendBuffer();
        DetectEncoding();
        if( (encoding & (encoding - 1)) == 0 )
            break;
    }

    /* Finalize encoding based on character frequency */
    if( (encoding & (UCS2LE | UCS2BE)) == 0 )
    {
        /* Only encodings that are still valid compete.  Candidates are
           visited in preference order UTF-8 > Shift_JIS > EUC-JP and a
           later one wins only with a strictly larger hiragana count. */
        int best = 0;
        int best_kana = -1;

        if( (encoding & UTF8) != 0 )
        {
            best = UTF8;
            best_kana = utf8_kana;
        }
        if( (encoding & SJIS) != 0 && sjis_kana > best_kana )
        {
            best = SJIS;
            best_kana = sjis_kana;
        }
        if( (encoding & EUCJP) != 0 && eucjp_kana > best_kana )
        {
            best = EUCJP;
            best_kana = eucjp_kana;
        }

        /* Nothing validated: fall back to EUC-JP */
        encoding = (best != 0) ? best : EUCJP;
    }
    assert(encoding != 0 && (encoding & (encoding - 1)) == 0);
}

/* Check if character is alphanumeric (ASCII letter, digit, '-', '_',
   space or tab).

   c = character to check
*/
static int IsAlpha(void)
{
    return (c >= 0x61 && c <= 0x7a) ||
           (c >= 0x41 && c <= 0x5a) ||
           (c >= 0x30 && c <= 0x39) ||
           c == 0x2d || c == 0x5f || c == 0x20 || c == 0x09;
}

/* Check if character is an exclamation mark (half- or full-width).

   c = character to check
   encoding = buffer encoding
*/
static int IsExclaimationPunct(void)
{
    return c == 0x21 ||
           ((encoding == UCS2LE || encoding == UCS2BE) && c == 0xff01) ||
           (encoding == UTF8 && c == 0xefbc81) ||
           (encoding == SJIS && c == 0x8149) ||
           (encoding == EUCJP && c == 0xa1aa);
}

/* Check if character is an end-of-sentence punctuation character.

   c = character to check
   encoding = buffer encoding
*/
static int IsEOSPunct(void)
{
    return c == 0x2e || c == 0x3f ||
           ((encoding == UCS2LE || encoding == UCS2BE) &&
            (c == 0xff1f || c == 0xff0e || c == 0x3002 || c == 0xff61)) ||
           (encoding == UTF8 &&
            (c == 0xefbc9f || c == 0xefbc8e || c == 0xe38082 ||
             c == 0xefbda1)) ||
           (encoding == SJIS &&
            (c == 0x8148 || c == 0x8144 || c == 0x8142)) ||
           (encoding == EUCJP &&
            (c == 0xa1a9 || c == 0xa1a5 || c == 0xa1a3)) ||
           IsExclaimationPunct();
}

/* Check if character is a punctuation character (comma, linefeed, or any
   end-of-sentence mark).

   c = character to check
   encoding = buffer encoding
*/
static int IsPunct(void)
{
    return c == 0x2c || c == 0x0a ||
           ((encoding == UCS2LE || encoding == UCS2BE) &&
            (c == 0xff0c || c == 0x3001 || c == 0xff64)) ||
           (encoding == UTF8 &&
            (c == 0xefbc8c || c == 0xe38081 || c == 0xefbda4)) ||
           (encoding == SJIS && (c == 0x8143 || c == 0x8141)) ||
           (encoding == EUCJP && (c == 0xa1a4 || c == 0xa1a2)) ||
           IsEOSPunct();
}

/* Replace an ASCII lowercase letter at buffer[offset] with its uppercase
   form; any other byte is left alone.
*/
static void UpperByte(int offset)
{
    if( buffer[offset] >= 0x61 && buffer[offset] <= 0x7a )
        buffer[offset] -= 32;
}

/* Convert UCS-2BE characters to uppercase.

   buffer = read buffer (updated on return)
   i0 = start offset of buffered characters
   i = next read offset
*/
static void ToUpperUCS2BE(void)
{
    int offset;

    for(offset = i0; offset + 1 < i; offset += 2)
    {
        /* Only U+0061..U+007A: the high byte must be zero */
        if( buffer[offset] == 0 )
            UpperByte(offset + 1);
    }
}

/* Convert UCS-2LE characters to uppercase.

   buffer = read buffer (updated on return)
   i0 = start offset of buffered characters
   i = next read offset
*/
static void ToUpperUCS2LE(void)
{
    int offset;

    for(offset = i0; offset + 1 < i; offset += 2)
    {
        /* Only U+0061..U+007A: the high byte must be zero */
        if( buffer[offset + 1] == 0 )
            UpperByte(offset);
    }
}

/* Convert UTF-8 characters to uppercase.

   buffer = read buffer (updated on return)
   i0 = start offset of buffered characters
   i = next read offset
*/
static void ToUpperUTF8(void)
{
    int offset = i0;

    while( offset < i )
    {
        int length = UTF8CharLength(buffer + offset);

        if( length <= 1 )
        {
            /* Single byte (or an invalid byte, which is stepped over) */
            UpperByte(offset);
            length = 1;
        }
        offset += length;
    }
}

/* Convert Shift_JIS or EUC-JP characters to uppercase.  Trail bytes of
   double-byte characters are skipped so that a Shift_JIS trail byte in
   the ASCII range is not mistaken for a letter.

   buffer = read buffer (updated on return)
   i0 = start offset of buffered characters
   i = next read offset
   encoding = buffer encoding
*/
static void ToUpperJIS(void)
{
    int offset = i0;

    while( offset < i )
    {
        int length = JISCharLength(buffer + offset);

        if( length <= 1 )
        {
            UpperByte(offset);
            length = 1;
        }
        offset += length;
    }
}

/* Write exclamation mark to stdout: full-width if the character being
   replaced was multi-byte (c > 255), half-width otherwise.

   c = last character
   encoding = buffer encoding
*/
static void WriteExclamationMark(void)
{
    if( c > 255 )
    {
        /* Write full-width exclamation mark */
        if( encoding == UCS2BE )
        {
            putchar(0xff);
            putchar(0x01);
        }
        else if( encoding == UCS2LE )
        {
            putchar(0x01);
            putchar(0xff);
        }
        else if( encoding == UTF8 )
        {
            putchar(0xef);
            putchar(0xbc);
            putchar(0x81);
        }
        else if( encoding == SJIS )
        {
            putchar(0x81);
            putchar(0x49);
        }
        else /* encoding == EUCJP */
        {
            putchar(0xa1);
            putchar(0xaa);
        }
    }
    else
    {
        /* Write half-width exclamation mark */
        if( encoding == UCS2BE )
        {
            putchar(0);
            putchar('!');
        }
        else if( encoding == UCS2LE )
        {
            putchar('!');
            putchar(0);
        }
        else
        {
            putchar('!');
        }
    }
}

/* Flush buffered characters to stdout, then handle the character that
   ended the run: exclamation marks are doubled, other punctuation except
   linefeed becomes one exclamation mark, anything else is copied as is.

   buffer = read buffer
   c = last character
   i0 = start offset of buffered characters
   last_char_offset = start offset of last character
   i = next read offset
*/
static void Flush(void)
{
    /* Write all characters up to the last one */
    if( i0 < last_char_offset )
        fwrite(buffer + i0, 1, (size_t)(last_char_offset - i0), stdout);

    if( IsExclaimationPunct() != 0 )
    {
        /* Write 2 exclamation marks */
        WriteExclamationMark();
        WriteExclamationMark();
    }
    else if( IsPunct() != 0 && c != 0x0a )
    {
        /* Write 1 exclamation mark */
        WriteExclamationMark();
    }
    else if( i > last_char_offset )
    {
        /* Not punctuation (or linefeed): copy unchanged */
        fwrite(buffer + last_char_offset, 1,
               (size_t)(i - last_char_offset), stdout);
    }
}

/* Write UCS-2BE/UCS-2LE characters with white spaces inserted.

   buffer = read buffer
   i0 = start offset of buffered characters (destroyed on return)
   last_char_offset = offset where the buffered run ends
   encoding = buffer encoding
*/
static void FlushUCS2WithSpace(void)
{
    for(; i0 + 1 < last_char_offset; i0 += 2)
    {
        fwrite(buffer + i0, 1, 2, stdout);
        if( encoding == UCS2LE )
        {
            putchar(' ');
            putchar(0);
        }
        else
        {
            putchar(0);
            putchar(' ');
        }
    }
}

/* Write the buffered run one character at a time, each followed by a
   space.  Character boundaries come from CharLength; an invalid sequence
   counts as one byte, and no character is allowed to extend past the end
   of the run.

   buffer = read buffer
   i0 = start offset of buffered characters (destroyed on return)
   last_char_offset = offset where the buffered run ends
*/
static void FlushMultibyteWithSpace(int (*CharLength)(const unsigned char *))
{
    while( i0 < last_char_offset )
    {
        int length = CharLength(buffer + i0);

        if( length < 1 )
            length = 1;
        if( length > last_char_offset - i0 )
            length = last_char_offset - i0;
        fwrite(buffer + i0, 1, (size_t)length, stdout);
        putchar(' ');
        i0 += length;
    }
}

/* Write UTF-8 characters with white spaces inserted.

   buffer = read buffer
   i0 = start offset of buffered characters (destroyed on return)
   last_char_offset = offset where the buffered run ends
*/
static void FlushUTF8WithSpace(void)
{
    FlushMultibyteWithSpace(UTF8CharLength);
}

/* Write Shift_JIS/EUC-JP characters with white spaces inserted.

   buffer = read buffer
   i0 = start offset of buffered characters (destroyed on return)
   last_char_offset = offset where the buffered run ends
   encoding = buffer encoding
*/
static void FlushJISWithSpace(void)
{
    FlushMultibyteWithSpace(JISCharLength);
}

/* Filter infile to stdout, starting with whatever DetectFileEncoding()
   left in the buffer.

   Characters are copied through until an ASCII alphanumeric (state 1) or
   a non-ASCII character (state 2) starts a run.  The run is buffered
   until a character of the other kind or punctuation ends it.  If it ends
   with an end-of-sentence mark, a state 1 run is upper-cased and a state
   2 run is written with a space after every character.  Flush() then
   decides what to write for the ending character.

   NextChar = decoder for the buffer encoding
   ToUpper = upper-case conversion for the buffer encoding
   FlushWithSpace = spaced writer for the buffer encoding

   buffer = read buffer
   read_size = number of bytes in buffer
   i = start offset of next character (destroyed on return)
   i0 = start offset of buffered characters (destroyed on return)
   c = current character (destroyed on return)
   last_char_offset = start offset of last buffered character
                      (destroyed on return)
*/
static void Filter(void (*NextChar)(void),
                   void (*ToUpper)(void),
                   void (*FlushWithSpace)(void))
{
    i = i0 = last_char_offset = 0;
    state = 0;
    for(;;)
    {
        /* Make sure a whole character (at most BUFFER_SLACK + 1 bytes) is
           available unless the input is exhausted. */
        while( read_size - i <= BUFFER_SLACK &&
               feof(infile) == 0 && ferror(infile) == 0 )
        {
            /* Nothing is pending in the initial state, so everything
               before the read offset can be discarded. */
            if( state == 0 )
                i0 = i;
            ShiftBuffer();
            ExtendBuffer();
        }
        if( i >= read_size )
            break;

        /* Decode next character */
        last_char_offset = i;
        NextChar();
        if( c == -1 )
        {
            /* Invalid sequence: the decoder did not advance.  Pass the
               offending byte through as a character of its own. */
            c = buffer[i];
            i++;
        }
        if( i > read_size )
            i = read_size;      /* truncated final character */

        if( state == 1 )
        {
            /* Buffer characters until punctuation or non-EN character,
               flush with uppercase as appropriate. */
            if( IsAlpha() == 0 || IsPunct() != 0 )
            {
                if( IsEOSPunct() != 0 )
                    ToUpper();
                Flush();
                state = 0;
            }
        }
        else if( state == 2 )
        {
            /* Buffer characters until punctuation or non-JP character,
               flush with single space inserted as appropriate. */
            if( IsAlpha() != 0 || IsPunct() != 0 )
            {
                if( IsEOSPunct() != 0 )
                    FlushWithSpace();
                Flush();
                state = 0;
            }
        }
        else
        {
            /* Initial state, buffer characters until punctuation.
               A 4-byte UTF-8 character fills the sign bit of c, so
               negative values are non-ASCII as well. */
            if( c > 127 || c < 0 )
            {
                state = 2;
            }
            else if( IsAlpha() != 0 )
            {
                state = 1;
            }
            else
            {
                fwrite(buffer + last_char_offset, 1,
                       (size_t)(i - last_char_offset), stdout);
            }
            i0 = last_char_offset;
        }
    }

    /* Input ended in the middle of a run: write it out unchanged */
    if( state != 0 && i > i0 )
        fwrite(buffer + i0, 1, (size_t)(i - i0), stdout);
}

/* Program entry point.

   argc == 1: filter stdin to stdout.
   argc > 1: print "<name>: <encoding>" for every file named on the
             command line.

   Returns 0, or EXIT_FAILURE if writing to stdout failed.
*/
int main(int argc, char **argv)
{
    int status = 0;

#ifdef _WIN32
    setmode(fileno(stdin), O_BINARY);
    setmode(fileno(stdout), O_BINARY);
#endif

    buffer_size = INITIAL_BUFFER_SIZE;
    buffer = calloc((size_t)buffer_size + BUFFER_SLACK, 1);
    if( buffer == NULL )
        Die("out of memory");

    if( argc == 1 )
    {
        /* Translate stdin */
        infile = stdin;
        DetectFileEncoding();
        switch( encoding )
        {
            case UCS2BE:
                Filter(NextUCS2BEChar, ToUpperUCS2BE, FlushUCS2WithSpace);
                break;
            case UCS2LE:
                Filter(NextUCS2LEChar, ToUpperUCS2LE, FlushUCS2WithSpace);
                break;
            case UTF8:
                Filter(NextUTF8Char, ToUpperUTF8, FlushUTF8WithSpace);
                break;
            case SJIS:
                Filter(NextSJISChar, ToUpperJIS, FlushJISWithSpace);
                break;
            case EUCJP:
                Filter(NextEUCJPChar, ToUpperJIS, FlushJISWithSpace);
                break;
            default:
                assert(0);
        }
    }
    else
    {
        /* Output encoding for each file specified on command line */
        int a;

        for(a = 1; a < argc; a++)
        {
            if( (infile = fopen(argv[a], "rb")) == NULL )
            {
                printf("%s: can not open file\n", argv[a]);
                continue;
            }
            DetectFileEncoding();
            printf("%s: ", argv[a]);
            switch( encoding )
            {
                case UCS2LE: puts("UCS-2LE");   break;
                case UCS2BE: puts("UCS-2BE");   break;
                case UTF8:   puts("UTF-8");     break;
                case SJIS:   puts("Shift_JIS"); break;
                case EUCJP:  puts("EUC-JP");    break;
                default:     assert(0);
            }
            fclose(infile);
        }
    }

    if( fflush(stdout) != 0 || ferror(stdout) != 0 )
        status = EXIT_FAILURE;

    /* Cleanup */
    free(buffer);
    buffer = NULL;
    return status;
}