/* shindou2.c - Don Yang (uguu.org) 02/25/07 */

/* Usage, as implemented by main() below:
     - with file arguments: print "<name>: <detected encoding label>" for
       each file;
     - without arguments: read stdin, detect its encoding, and write a
       filtered copy to stdout (see Filter()).
*/

#include <assert.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#ifdef _WIN32
#include <fcntl.h>
#include <io.h>
#endif

/* Number of code points between the first and last kana counted by
   DetectEncoding() (0x3041..0x3093 in UCS-2).
*/
#define KANA_RANGE (0x3093-0x3041)

enum
{
   INITIAL_BUFFER_SIZE = 0x1000,
   /* Zeroed bytes kept after the input so that the parser may look ahead
      up to 3 bytes past the last input byte without leaving the buffer.
   */
   BUFFER_PADDING = 4
};

static FILE *infile;
static unsigned char *buffer;       /* whole input, followed by padding */
static int buffer_size, read_size;  /* allocated bytes / input bytes */
static int i, c, encoding;          /* parse offset, current char, encoding */
static int last_char_start, i0;     /* Filter() state */

/* Packed string table.  All offsets stored in data[] and in the byte order
   mark table of DetectEncoding() index into this string, so its layout
   must not change.

   literal encoding: one length byte (31 + number of data bytes), then data
   parser encoding:  (lowerbound ^ 208, upperbound ^ 208) pairs, one pair
                     per byte of a sequence, then the sequence length that
                     applies if every pair matched.  Length bytes are <= 4,
                     range bytes are > 4.
*/
static const char *const_str =
   /* literals 1: length=8, offset=0
         [ucs2le literal] +0   "!" + "!\0"
         [ucs2be literal] +1   "!" + "\0 "
         [shared literal] +3   " " + " "
         [shared literal] +4   " " + "!"
         [ucs2be literal] +5   "!" + "\xff\1"
   */
   "!!\0  !\xff\1"

   /* utf8 label: length=6, offset=8 */
   "UTF-8\0"

   /* literals 5: length=7, offset=14
         [sjis literal]   +0   "!" + "\x81I"
   */
   "!\x81IOCCC"

   /* literals 2: length=22, offset=21
         [ucs2le literal] +0   "!" + "\1\xff"
         [ucs2le bom]     +2   "\xff\xfe"
         [ucs2be bom]     +3   "\xfe\xff"
         [text]           +5   "can not open"
         [ucs2be literal] +17  "!" + "\0!"
         [ucs2le literal] +19  "!" + " \0"
   */
   "!\1\xff\xfe\xff""can not open!\0! \0"

   /* literals 3: length=3, offset=43
         [utf8 bom]       +0   "\xef\xbb\xbf"
   */
   "\xef\xbb\xbf"

   /* eucjp label: length=7, offset=46 */
   "EUC-JP\0"

   /* utf8 parser: length=22, offset=53 */
   " 'PoPoPo\4""0?PoPo\3\20\17Po\2\1"

   /* literals 4: length=4, offset=75
         [utf8 literal]   +0   "\"" + "\xef\xbc\x81"
   */
   "\"\xef\xbc\x81"

   /* sjis label: length=10, offset=79 */
   "Shift_JIS\0"

   /* literals 6: length=3, offset=89
         [eucjp literal]  +0   "!" + "\xa1\xaa"
   */
   "!\xa1\xaa"

   /* sjis parser: length=11, offset=92 */
   "QO\220,\2""0?\220,\2\1"

   /* ucs2 parser: length=1, offset=96
      (shares the "\2" length byte inside the sjis parser)
   */

   /* ucs2le label: length=8, offset=103 */
   "UCS-2LE\0"

   /* eucjp parser: length=33, offset=111 */
   "qxq.\2}}q.\2`$q.\2),q.\2^^q.\2__q.q.\3\1"

   /* ucs2be label: length=8, offset=144 */
   "UCS-2BE\0"
   ;

/* Encodings, used as the first index of data[] */
enum
{
   UCS2LE,
   UCS2BE,
   UTF8,
   SJIS,
   EUCJP
};

/* Fields of one data[] row.  The PUNCT_* fields come first so that
   IsPunct() can test a prefix of the row.
*/
enum
{
   PUNCT_HALF_EX,
   PUNCT_FULL_EX,
   PUNCT_HALF_EN_P,
   PUNCT_HALF_Q,
   PUNCT_FULL_EN_P,
   PUNCT_FULL_Q,
   PUNCT_HALF_JP_P,
   PUNCT_FULL_JP_P,
   PUNCT_HALF_EN_C,
   PUNCT_FULL_EN_C,
   PUNCT_HALF_JP_C,
   PUNCT_FULL_JP_C,

   PARSER_OFFSET,
   LITERAL_HALF_EX_OFFSET,
   LITERAL_FULL_EX_OFFSET,
   LITERAL_SPACE_OFFSET,
   LABEL_OFFSET,
   KANA_START,
   KANA_COUNT,

   FIELD_COUNT
};

/* Number of leading PUNCT_* fields that make up each punctuation class */
enum
{
   PUNCT_EXCLAMATION_RANGE = 2,
   PUNCT_EOS_RANGE = 8,
   PUNCT_GENERIC_RANGE = 12
};

/* Per-encoding tables.  UTF-8 shares the UCS-2 punctuation values because
   NextChar() converts 3-byte UTF-8 sequences to UCS-2.  KANA_COUNT is
   scratch space written by DetectEncoding().
*/
static int data[5][FIELD_COUNT] =
{
   /* ucs2le */
   {
      0x0021,  /* PUNCT_HALF_EX */
      0xff01,  /* PUNCT_FULL_EX */
      0x002e,  /* PUNCT_HALF_EN_P */
      0x003f,  /* PUNCT_HALF_Q */
      0xff0e,  /* PUNCT_FULL_EN_P */
      0xff1f,  /* PUNCT_FULL_Q */
      0xff61,  /* PUNCT_HALF_JP_P */
      0x3002,  /* PUNCT_FULL_JP_P */
      0x002c,  /* PUNCT_HALF_EN_C */
      0xff0c,  /* PUNCT_FULL_EN_C */
      0xff64,  /* PUNCT_HALF_JP_C */
      0x3001,  /* PUNCT_FULL_JP_C */
      96,      /* PARSER_OFFSET */
      0,       /* LITERAL_HALF_EX_OFFSET */
      21,      /* LITERAL_FULL_EX_OFFSET */
      40,      /* LITERAL_SPACE_OFFSET */
      103,     /* LABEL_OFFSET */
      0x3041,  /* KANA_START */
      0        /* KANA_COUNT */
   },
   /* ucs2be */
   {
      0x0021,  /* PUNCT_HALF_EX */
      0xff01,  /* PUNCT_FULL_EX */
      0x002e,  /* PUNCT_HALF_EN_P */
      0x003f,  /* PUNCT_HALF_Q */
      0xff0e,  /* PUNCT_FULL_EN_P */
      0xff1f,  /* PUNCT_FULL_Q */
      0xff61,  /* PUNCT_HALF_JP_P */
      0x3002,  /* PUNCT_FULL_JP_P */
      0x002c,  /* PUNCT_HALF_EN_C */
      0xff0c,  /* PUNCT_FULL_EN_C */
      0xff64,  /* PUNCT_HALF_JP_C */
      0x3001,  /* PUNCT_FULL_JP_C */
      96,      /* PARSER_OFFSET */
      38,      /* LITERAL_HALF_EX_OFFSET */
      5,       /* LITERAL_FULL_EX_OFFSET */
      1,       /* LITERAL_SPACE_OFFSET */
      144,     /* LABEL_OFFSET */
      0x3041,  /* KANA_START */
      0        /* KANA_COUNT */
   },
   /* utf8 */
   {
      0x0021,  /* PUNCT_HALF_EX */
      0xff01,  /* PUNCT_FULL_EX */
      0x002e,  /* PUNCT_HALF_EN_P */
      0x003f,  /* PUNCT_HALF_Q */
      0xff0e,  /* PUNCT_FULL_EN_P */
      0xff1f,  /* PUNCT_FULL_Q */
      0xff61,  /* PUNCT_HALF_JP_P */
      0x3002,  /* PUNCT_FULL_JP_P */
      0x002c,  /* PUNCT_HALF_EN_C */
      0xff0c,  /* PUNCT_FULL_EN_C */
      0xff64,  /* PUNCT_HALF_JP_C */
      0x3001,  /* PUNCT_FULL_JP_C */
      53,      /* PARSER_OFFSET */
      4,       /* LITERAL_HALF_EX_OFFSET */
      75,      /* LITERAL_FULL_EX_OFFSET */
      3,       /* LITERAL_SPACE_OFFSET */
      8,       /* LABEL_OFFSET */
      0x3041,  /* KANA_START */
      0        /* KANA_COUNT */
   },
   /* sjis */
   {
      0x21,    /* PUNCT_HALF_EX */
      0x8149,  /* PUNCT_FULL_EX */
      0x2e,    /* PUNCT_HALF_EN_P */
      0x3f,    /* PUNCT_HALF_Q */
      0x8144,  /* PUNCT_FULL_EN_P */
      0x8148,  /* PUNCT_FULL_Q */
      0xa1,    /* PUNCT_HALF_JP_P */
      0x8142,  /* PUNCT_FULL_JP_P */
      0x2c,    /* PUNCT_HALF_EN_C */
      0x8143,  /* PUNCT_FULL_EN_C */
      0xa4,    /* PUNCT_HALF_JP_C */
      0x8141,  /* PUNCT_FULL_JP_C */
      92,      /* PARSER_OFFSET */
      4,       /* LITERAL_HALF_EX_OFFSET */
      14,      /* LITERAL_FULL_EX_OFFSET */
      3,       /* LITERAL_SPACE_OFFSET */
      79,      /* LABEL_OFFSET */
      0x829f,  /* KANA_START */
      0        /* KANA_COUNT */
   },
   /* eucjp */
   {
      0x21,    /* PUNCT_HALF_EX */
      0xa1aa,  /* PUNCT_FULL_EX */
      0x2e,    /* PUNCT_HALF_EN_P */
      0x3f,    /* PUNCT_HALF_Q */
      0xa1a5,  /* PUNCT_FULL_EN_P */
      0xa1a9,  /* PUNCT_FULL_Q */
      0x8ea1,  /* PUNCT_HALF_JP_P */
      0xa1a3,  /* PUNCT_FULL_JP_P */
      0x2c,    /* PUNCT_HALF_EN_C */
      0xa1a4,  /* PUNCT_FULL_EN_C */
      0x8ea4,  /* PUNCT_HALF_JP_C */
      0xa1a2,  /* PUNCT_FULL_JP_C */
      111,     /* PARSER_OFFSET */
      4,       /* LITERAL_HALF_EX_OFFSET */
      89,      /* LITERAL_FULL_EX_OFFSET */
      3,       /* LITERAL_SPACE_OFFSET */
      46,      /* LABEL_OFFSET */
      0xa4a1,  /* KANA_START */
      0        /* KANA_COUNT */
   }
};

/* Print message to stderr and terminate with a failure status */
static void Die(const char *message)
{
   fputs(message, stderr);
   fputc('\n', stderr);
   exit(EXIT_FAILURE);
}

/* Read all of infile into a freshly allocated buffer.

   On return, buffer holds read_size input bytes followed by BUFFER_PADDING
   zero bytes, and buffer_size is the allocated size.  The caller owns
   buffer and releases it with free().  Terminates the program if memory
   can not be allocated.
*/
static void LoadInput(void)
{
   unsigned char *grown;
   size_t want, got;

   buffer_size = INITIAL_BUFFER_SIZE;
   buffer = malloc((size_t)buffer_size);
   if( buffer == NULL )
      Die("out of memory");

   read_size = 0;
   for(;;)
   {
      if( buffer_size - read_size <= BUFFER_PADDING )
      {
         /* Sizes are tracked in int, so refuse to double past INT_MAX */
         if( buffer_size > INT_MAX / 2 )
            Die("input too large");
         grown = realloc(buffer, (size_t)buffer_size * 2);
         if( grown == NULL )
            Die("out of memory");
         buffer = grown;
         buffer_size *= 2;
      }

      want = (size_t)(buffer_size - read_size - BUFFER_PADDING);
      got = fread(buffer + read_size, 1, want, infile);
      read_size += (int)got;

      /* A short count means end of file or a read error.  Stopping here
         (instead of polling feof) also terminates on read errors.
      */
      if( got < want )
         break;
   }
   memset(buffer + read_size, 0, BUFFER_PADDING);
}

/* Decode one character at buffer[i] using the parser for the current
   encoding.

   On success c holds the character and i is advanced past it.  3-byte
   UTF-8 sequences and UCS-2LE characters are converted to UCS-2BE values
   so that they can be compared against the data[] tables; 4-byte UTF-8
   sequences are reduced to their code point so the value fits in an int;
   everything else is the raw bytes in big endian order.

   If a multi-byte sequence starts but does not complete, c is set to -1
   and i is left unchanged.

   Requires 0 <= i < read_size.
*/
static void NextChar(void)
{
   const unsigned char *p;
   unsigned long acc;
   unsigned char byte;
   int byte_offset, n;

   c = byte_offset = 0;
   p = (const unsigned char*)const_str + data[encoding][PARSER_OFFSET];
   for(; *p > 4 && c >= 0; p++)
   {
      /* At most 3 bytes past i, which the buffer padding covers */
      byte = buffer[i + byte_offset];
      if( byte < (*p ^ 208) || byte > (p[1] ^ 208) )
      {
         /* Byte not within range */
         if( byte_offset )
         {
            /* Not matching first byte -> character does not fit encoding */
            c = -1;
         }
         else
         {
            /* Try next range: skip to the length byte of this sequence,
               the loop increment then steps over it.
            */
            for(; *p > 4; p++);
         }
      }
      else
      {
         /* Byte within range, step over (lowerbound, upperbound) */
         byte_offset++;
         p++;
      }
   }
   if( c < 0 )
      return;

   /* *p is the sequence length.  Accumulate unsigned, since four bytes do
      not fit in a signed 32bit int.
   */
   acc = 0;
   for(n = 0; n < *p; n++)
      acc = (acc << 8) | buffer[i++];

   /* Only an unconditional 2-byte UCS-2 read can run into the padding */
   if( i > read_size )
      i = read_size;

   if( encoding == UTF8 && acc > 0xffffffUL )
   {
      /* 4-byte sequence -> code point */
      c = (int)(((acc >> 6) & 0x1c0000UL) |
                ((acc >> 4) & 0x3f000UL) |
                ((acc >> 2) & 0xfc0UL) |
                (acc & 0x3fUL));
   }
   else if( encoding == UTF8 && acc > 0xe00000UL )
   {
      /* 3-byte sequence -> UCS2BE */
      c = (int)(((acc >> 4) & 0xf000UL) |
                ((acc >> 2) & 0xfc0UL) |
                (acc & 0x3fUL));
   }
   else if( encoding == UCS2LE )
   {
      /* Convert to UCS2BE */
      c = (int)((acc >> 8) | ((acc & 0xffUL) << 8));
   }
   else
   {
      c = (int)acc;
   }
}

/* Like NextChar, but always makes progress: a byte that does not fit the
   encoding is skipped, leaving c at -1.
*/
static void NextCharUnconditional(void)
{
   NextChar();
   if( c == -1 )
      i++;
}

/* Set encoding for the loaded buffer.

   Tried in order: byte order marks, UCS-2 newlines at even offsets, then
   whichever of UTF-8 / Shift_JIS / EUC-JP parses the input without error
   and yields the most kana.  Ties go to the earlier encoding in that list.
*/
static void DetectEncoding(void)
{
   /* const_str offsets and lengths of the byte order marks, indexed by
      UCS2LE, UCS2BE, UTF8.
   */
   static const int bom_offset[3] = {23, 24, 43};
   static const int bom_length[3] = {2, 2, 3};
   int k;

   /* Look for byte order mark */
   for(k = 0; k < 3; k++)
   {
      if( read_size >= bom_length[k] &&
          memcmp(buffer, const_str + bom_offset[k],
                 (size_t)bom_length[k]) == 0 )
      {
         encoding = k;
         return;
      }
   }

   /* Look for UCS2 newline */
   for(k = 0; k < read_size - 1; k += 2)
   {
      if( buffer[k] == 0x00 && buffer[k + 1] == 0x0a )
      {
         encoding = UCS2BE;
         return;
      }
      if( buffer[k] == 0x0a && buffer[k + 1] == 0x00 )
      {
         encoding = UCS2LE;
         return;
      }
   }

   /* Try each encoding.  NextChar reads the globals encoding and i, so
      both are used as loop variables here.
   */
   for(encoding = UTF8; encoding <= EUCJP; encoding++)
   {
      data[encoding][KANA_COUNT] = 0;
      for(i = 0; i < read_size - 3;)
      {
         NextChar();
         if( c == -1 )
         {
            /* Input does not parse as this encoding */
            data[encoding][KANA_COUNT] = -1;
            break;
         }
         if( c >= data[encoding][KANA_START] &&
             c <= data[encoding][KANA_START] + KANA_RANGE )
         {
            data[encoding][KANA_COUNT]++;
         }
      }
   }

   /* Select encoding with most kana characters matched */
   encoding = UTF8;
   for(k = UTF8; k <= EUCJP; k++)
   {
      if( data[k][KANA_COUNT] > data[encoding][KANA_COUNT] )
         encoding = k;
   }
}

/* Return 1 if c equals one of the first range PUNCT_* entries of the
   current encoding, 0 otherwise.
*/
static int IsPunct(int range)
{
   int t;

   for(t = 0; t < range; t++)
   {
      if( c == data[encoding][t] )
         return 1;
   }
   return 0;
}

/* Write size bytes starting at bin to stdout, nothing if size <= 0 */
static void Output(const void *bin, int size)
{
   if( size > 0 )
      fwrite(bin, (size_t)size, 1, stdout);
}

/* Write one exclamation mark in the current encoding: the full literal if
   c is above 255, the half literal otherwise.
*/
static void WriteExclamationMark(void)
{
   const char *literal =
      const_str + data[encoding][c > 255 ? LITERAL_FULL_EX_OFFSET
                                         : LITERAL_HALF_EX_OFFSET];

   /* First byte of a literal is 31 + data length */
   Output(literal + 1, *literal - 31);
}

/* Copy the loaded buffer to stdout with these changes:
     - text that ends with one of the first PUNCT_EOS_RANGE marks is written
       with a space literal after every character, followed by one
       exclamation mark (two if the mark was itself an exclamation mark);
     - text that ends with one of the remaining punctuation marks is written
       unchanged, with the mark replaced by an exclamation mark;
     - text that ends with a newline (c == 10) is written unchanged;
     - text after the last mark or newline is written unchanged.
*/
static void Filter(void)
{
   const char *literal;

   for(i = i0 = 0; i < read_size;)
   {
      /* Read next character */
      last_char_start = i;
      NextCharUnconditional();
      if( !IsPunct(PUNCT_GENERIC_RANGE) && c != 10 )
         continue;

      /* Flush buffered characters [i0, last_char_start) */
      if( IsPunct(PUNCT_EOS_RANGE) )
      {
         /* Write characters with spaces inserted.  This parses the pending
            text a second time, one character per iteration.
         */
         for(i = i0; i < last_char_start;)
         {
            NextCharUnconditional();
            Output(buffer + i0, i - i0);
            literal = const_str + data[encoding][LITERAL_SPACE_OFFSET];
            Output(literal + 1, *literal - 31);
            i0 = i;
         }

         /* Read the punctuation mark again to restore c, then write
            exclamation mark(s).
         */
         NextCharUnconditional();
         if( IsPunct(PUNCT_EXCLAMATION_RANGE) )
            WriteExclamationMark();
         WriteExclamationMark();
      }
      else
      {
         /* Write characters up to last punctuation mark */
         Output(buffer + i0, last_char_start - i0);

         /* Convert punctuation mark to exclamation mark, keep newlines */
         if( c != 10 )
            WriteExclamationMark();
         else
            Output(buffer + last_char_start, i - last_char_start);
      }
      i0 = i;
   }

   /* Input that does not end with punctuation or a newline still has
      pending text, write it instead of dropping it.
   */
   if( i0 < read_size )
      Output(buffer + i0, read_size - i0);
}

/* With arguments: report the detected encoding of each named file.
   Without arguments: filter stdin to stdout.
*/
int main(int argc, char **argv)
{
   if( argc > 1 )
   {
      for(++argv; *argv != NULL; ++argv)
      {
         printf("%s: ", *argv);
         infile = fopen(*argv, "rb");
         if( infile != NULL )
         {
            LoadInput();
            fclose(infile);
            DetectEncoding();
            puts(const_str + data[encoding][LABEL_OFFSET]);
            free(buffer);
            buffer = NULL;
         }
         else
         {
            puts("Can not open");
         }
      }
   }
   else
   {
      #ifdef _WIN32
         /* Input and output are raw bytes, avoid newline translation */
         setmode(fileno(stdin), O_BINARY);
         setmode(fileno(stdout), O_BINARY);
      #endif
      infile = stdin;
      LoadInput();
      DetectEncoding();
      Filter();
      free(buffer);
      buffer = NULL;
   }
   return 0;
}