/* shindou1.c - Don Yang (uguu.org) 02/24/07 */

/*
 * Detects the encoding of Japanese text (UCS-2LE, UCS-2BE, UTF-8,
 * Shift_JIS or EUC-JP).
 *
 * With file arguments: prints "<name>: <encoding label>" for each file.
 * With no arguments:   filters stdin to stdout, rewriting punctuation as
 *                      exclamation marks (see Filter()).
 */

#include <assert.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#ifdef _WIN32
#include <fcntl.h>
#include <io.h>
#endif

/* Input stream and the whole input held in memory. */
static FILE *infile;
static unsigned char *buffer, *new_buffer;
static int buffer_size, read_size;

/* Report an unrecoverable error on stderr and terminate. */
static void Fatal(const char *message)
{
    fprintf(stderr, "shindou1: %s\n", message);
    exit(EXIT_FAILURE);
}

/*
 * Read all of `infile` into `buffer`, doubling the allocation whenever it
 * fills up.  On return `read_size` is the number of bytes read and
 * `buffer_size` the allocated capacity.
 *
 * Reading stops at end of file or at the first read error; in the latter
 * case whatever was read so far is kept.  Allocation failure is fatal.
 */
static void LoadInput(void)
{
    buffer_size = 0x1000;
    buffer = malloc((size_t)buffer_size);
    if (buffer == NULL)
        Fatal("out of memory");

    read_size = 0;
    /* Testing ferror() as well as feof() keeps a failing stream from
       looping forever: fread() then returns 0 without ever setting EOF. */
    while (!feof(infile) && !ferror(infile)) {
        if (read_size == buffer_size) {
            /* Sizes and offsets are kept in int, so refuse to double
               past INT_MAX rather than overflow. */
            if (buffer_size > INT_MAX / 2) {
                free(buffer);
                Fatal("input too large");
            }
            new_buffer = realloc(buffer, (size_t)buffer_size * 2);
            if (new_buffer == NULL) {
                free(buffer);
                Fatal("out of memory");
            }
            buffer = new_buffer;
            buffer_size *= 2;
        }
        read_size += (int)fread(buffer + read_size, 1,
                                (size_t)(buffer_size - read_size), infile);
    }
}

/**********************************************************************/

/* Indices into db[] and kana_count[]. */
enum { UCS2LE, UCS2BE, UTF8, SJIS, EUCJP };

/* Per-encoding kana tally used by DetectEncoding(); -1 = input did not
   parse as that encoding. */
static int kana_count[5];

/* Distance from the first to the last kana code counted by
   DetectEncoding(), applied on top of each encoding's kana_start. */
#define KANA_RANGE (0x3093-0x3041)

/*
 * Parser state shared by all functions below:
 *   i        - byte offset of the next character in buffer
 *   c        - most recently decoded character, or -1 if the bytes at i
 *              did not fit the current encoding
 *   encoding - index of the current encoding in db[]
 */
static int i, c, encoding;

/* Byte order marks for UCS2LE, UCS2BE and UTF8, indexed by encoding. */
static const char *bom[3] = { "\xff\xfe", "\xfe\xff", "\xef\xbb\xbf" };

/*
 * Parser tables, interpreted by NextChar().
 *
 * A table is a list of entries.  Each entry is a run of (low, high)
 * inclusive byte ranges, one pair per byte of the character, followed by
 * the character length in bytes.  Values greater than 4 are range bounds;
 * values of 4 or less are lengths.  The final entry is a bare length that
 * applies when no earlier entry accepts the first byte.
 */
static const unsigned char parse_ucs2[] = { 2 };
static const unsigned char parse_utf8[] = {
    0xf0, 0xf7, 0x80, 0xbf, 0x80, 0xbf, 0x80, 0xbf, 4,
    0xe0, 0xef, 0x80, 0xbf, 0x80, 0xbf, 3,
    0xc0, 0xdf, 0x80, 0xbf, 2,
    1
};
static const unsigned char parse_sjis[] = {
    0x81, 0x9f, 0x40, 0xfc, 2,
    0xe0, 0xef, 0x40, 0xfc, 2,
    1
};
static const unsigned char parse_eucjp[] = {
    0xa1, 0xa8, 0xa1, 0xfe, 2,
    0xad, 0xad, 0xa1, 0xfe, 2,
    0xb0, 0xf4, 0xa1, 0xfe, 2,
    0xf9, 0xfc, 0xa1, 0xfe, 2,
    0x8e, 0x8e, 0xa1, 0xfe, 2,
    0x8f, 0x8f, 0xa1, 0xfe, 0xa1, 0xfe, 3,
    1
};

/*
 * Punctuation tables share one ordering so that a prefix length selects a
 * class: the first PUNCT_EXCLAMATION_RANGE entries are exclamation marks,
 * the first PUNCT_EOS_RANGE entries end a sentence, and all
 * PUNCT_GENERIC_RANGE entries are punctuation of any kind.
 */
#define PUNCT_EXCLAMATION_RANGE 2
#define PUNCT_EOS_RANGE 8
#define PUNCT_GENERIC_RANGE 12

static const int punct_ucs2[] = {
    /* half ! */         0x0021,
    /* full ! */         0xff01,
    /* half . */         0x002e,
    /* half ? */         0x003f,
    /* full . */         0xff0e,
    /* full ? */         0xff1f,
    /* half jp period */ 0xff61,
    /* full jp period */ 0x3002,
    /* half , */         0x002c,
    /* full , */         0xff0c,
    /* half jp comma */  0xff64,
    /* full jp comma */  0x3001
};
static const int punct_sjis[] = {
    /* half ! */         0x21,
    /* full ! */         0x8149,
    /* half . */         0x2e,
    /* half ? */         0x3f,
    /* full . */         0x8144,
    /* full ? */         0x8148,
    /* half jp period */ 0xa1,
    /* full jp period */ 0x8142,
    /* half , */         0x2c,
    /* full , */         0x8143,
    /* half jp comma */  0xa4,
    /* full jp comma */  0x8141
};
static const int punct_eucjp[] = {
    /* half ! */         0x21,
    /* full ! */         0xa1aa,
    /* half . */         0x2e,
    /* half ? */         0x3f,
    /* full . */         0xa1a5,
    /* full ? */         0xa1a9,
    /* half jp period */ 0x8ea1,
    /* full jp period */ 0xa1a3,
    /* half , */         0x2c,
    /* full , */         0xa1a4,
    /* half jp comma */  0x8ea4,
    /* full jp comma */  0xa1a2
};

/*
 * Everything needed to handle one encoding.
 *
 * Each literal[] string starts with an ASCII digit giving the number of
 * bytes that follow: [0] half-width '!', [1] full-width '!', [2] space.
 * kana_start is the code of the first kana counted by DetectEncoding().
 */
typedef struct
{
    const unsigned char *parser;
    const int *punct;
    const char *label, *literal[3];
    int kana_start;
} EncodingSet;

static const EncodingSet db[5] = {
    { parse_ucs2,  punct_ucs2,  "UCS-2LE",
      { "2!\0", "2\1\xff", "2 \0" },        0x3041 },
    { parse_ucs2,  punct_ucs2,  "UCS-2BE",
      { "2\0!", "2\xff\1", "2\0 " },        0x3041 },
    { parse_utf8,  punct_ucs2,  "UTF-8",
      { "1!", "3\xef\xbc\x81", "1 " },      0x3041 },
    { parse_sjis,  punct_sjis,  "Shift_JIS",
      { "1!", "2\x81I", "1 " },             0x829f },
    { parse_eucjp, punct_eucjp, "EUC-JP",
      { "1!", "2\xa1\xaa", "1 " },          0xa4a1 }
};

/* Entry of db[] for the current encoding. */
static const EncodingSet *e;

/*
 * Decode the character at buffer[i] using e->parser.
 *
 * On success c holds the character and i has advanced past it.  The bytes
 * are packed big-endian into c, except that 3-byte UTF-8 and UCS-2LE are
 * converted to the UCS-2BE value so they can be compared against
 * punct_ucs2 and kana_start.
 *
 * On failure (a trailing byte falls outside its allowed range, or the
 * character would extend past the end of the input) c is -1 and i is
 * left unchanged.
 */
static void NextChar(void)
{
    const unsigned char *p;
    unsigned long packed = 0;
    int byte_offset = 0;
    int length, k;

    c = 0;
    for (p = e->parser; *p > 4 && c >= 0; p++) {
        if (i + byte_offset >= read_size ||
            buffer[i + byte_offset] < p[0] ||
            buffer[i + byte_offset] > p[1]) {
            /* Byte missing or not within range */
            if (byte_offset) {
                /* Not matching first byte ->
                   character does not fit encoding */
                c = -1;
            } else {
                /* Try next entry: stop on this entry's length so the
                   loop increment steps over it */
                for (; *p > 4; p++)
                    ;
            }
        } else {
            /* Byte within range: step over the (low, high) pair */
            byte_offset++;
            p++;
        }
    }
    if (c < 0)
        return;

    /* p now points at the length of the matched entry */
    length = *p;
    if (i + length > read_size) {
        /* Character is cut off by the end of the input */
        c = -1;
        return;
    }

    /* Pack in an unsigned type: a 4-byte sequence does not fit in a
       signed 32-bit int and shifting it there would overflow. */
    for (k = 0; k < length; k++)
        packed = (packed << 8) | buffer[i++];

    /* Clearing the top bit only affects 4-byte sequences; it keeps them
       positive, distinct from -1 and above every table entry. */
    c = (int)(packed & 0x7fffffffUL);

    if (encoding == UTF8 && length == 3) {
        /* Convert to UCS2BE */
        c = ((c >> 4) & 0xf000) | ((c >> 2) & 0xfc0) | (c & 0x3f);
    } else if (encoding == UCS2LE) {
        /* Convert to UCS2BE */
        c = (c >> 8) | ((c & 0xff) << 8);
    }
}

/* Like NextChar(), but always makes progress: bytes that do not fit the
   encoding are skipped one at a time, leaving c == -1. */
static void NextCharUnconditional(void)
{
    NextChar();
    if (c == -1)
        i++;
}

/*
 * Choose the encoding of the loaded input and set `encoding` and `e`.
 *
 * Tried in order:
 *   1. a byte order mark at the start of the input;
 *   2. a UCS-2 newline (00 0A or 0A 00) on an even byte offset;
 *   3. whichever of UTF-8, Shift_JIS and EUC-JP parses the input without
 *      error and yields the most kana.  Ties, and input that parses as
 *      none of them, resolve to UTF-8.
 */
static void DetectEncoding(void)
{
    int bom_length;

    /* Look for byte order mark */
    for (i = 0; i < 3; i++) {
        bom_length = (int)strlen(bom[i]);
        /* Only compare bytes that were actually read */
        if (read_size >= bom_length &&
            memcmp(buffer, bom[i], (size_t)bom_length) == 0) {
            e = &db[encoding = i];
            return;
        }
    }

    /* Look for UCS2 newline */
    for (i = 0; i < read_size - 1; i += 2) {
        if (buffer[i] == 0x00 && buffer[i + 1] == 0x0a) {
            e = &db[encoding = UCS2BE];
            return;
        }
        if (buffer[i] == 0x0a && buffer[i + 1] == 0x00) {
            e = &db[encoding = UCS2LE];
            return;
        }
    }

    /* Try each encoding */
    for (encoding = UTF8; encoding <= EUCJP; encoding++) {
        e = &db[encoding];
        kana_count[encoding] = 0;
        /* Stop 3 bytes early so that a character cut off at the end of
           the input is never examined */
        for (i = 0; i < read_size - 3;) {
            NextChar();
            if (c == -1) {
                kana_count[encoding] = -1;
                break;
            }
            if (c >= e->kana_start && c <= e->kana_start + KANA_RANGE)
                kana_count[encoding]++;
        }
    }

    /* Select encoding with most kana characters matched */
    for (i = encoding = UTF8; i <= EUCJP; i++) {
        if (kana_count[i] > kana_count[encoding])
            encoding = i;
    }
    e = &db[encoding];
}

/*
 * Filter state:
 *   last_char_start - offset of the character most recently read
 *   i0              - offset of the first byte not yet written
 */
static int last_char_start, i0;

/* Return 1 if c is among the first `range` entries of the current
   encoding's punctuation table, 0 otherwise. */
static int IsPunct(int range)
{
    int t;

    for (t = 0; t < range; t++) {
        if (c == e->punct[t])
            return 1;
    }
    return 0;
}

/* Write `size` bytes from `data` to stdout; does nothing if size <= 0. */
static void Output(const void *data, int size)
{
    if (size > 0)
        fwrite(data, (size_t)size, 1, stdout);
}

/* Write one exclamation mark: full-width if c is above 255, half-width
   otherwise. */
static void WriteExclamationMark(void)
{
    const char *mark = e->literal[c > 255 ? 1 : 0];

    Output(mark + 1, mark[0] - '0');
}

/*
 * Copy the loaded input to stdout with punctuation rewritten.
 *
 * Text is buffered until a punctuation mark or newline is read:
 *   - Sentence-ending mark: the buffered characters are written with a
 *     space after each one, followed by an exclamation mark.  An
 *     existing exclamation mark becomes two.
 *   - Comma: the buffered characters are written unchanged, followed by
 *     an exclamation mark in place of the comma.
 *   - Newline: the buffered characters and the newline are written
 *     unchanged.
 * Text left over after the last mark or newline is written unchanged.
 */
static void Filter(void)
{
    for (i = i0 = 0; i < read_size;) {
        /* Read next character */
        last_char_start = i;
        NextCharUnconditional();
        if (!IsPunct(PUNCT_GENERIC_RANGE) && c != 10)
            continue;

        /* Flush buffered characters */
        if (IsPunct(PUNCT_EOS_RANGE)) {
            /* Write characters with spaces inserted */
            for (i = i0; i < last_char_start;) {
                NextCharUnconditional();
                Output(buffer + i0, i - i0);
                Output(e->literal[2] + 1, *e->literal[2] - '0');
                i0 = i;
            }

            /* Re-read the punctuation mark and write exclamation mark */
            NextCharUnconditional();
            if (IsPunct(PUNCT_EXCLAMATION_RANGE))
                WriteExclamationMark();
            WriteExclamationMark();
        } else {
            /* Write characters up to last punctuation mark */
            Output(buffer + i0, last_char_start - i0);

            /* Convert a comma to an exclamation mark; keep a newline */
            if (c != 10)
                WriteExclamationMark();
            else
                Output(buffer + last_char_start, i - last_char_start);
        }
        i0 = i;
    }

    /* Input that does not end in punctuation or a newline still has
       unwritten text; emit it rather than dropping it. */
    assert(i0 <= read_size);
    Output(buffer + i0, read_size - i0);
}

/*
 * With arguments: print the detected encoding of each named file, or
 * "Can not open" for files that cannot be opened.
 * Without arguments: filter stdin to stdout.
 */
int main(int argc, char **argv)
{
    (void)argc;

    if (*++argv != NULL) {
        for (; *argv != NULL; ++argv) {
            printf("%s: ", *argv);
            if ((infile = fopen(*argv, "rb")) != NULL) {
                LoadInput();
                fclose(infile);
                DetectEncoding();
                puts(e->label);
                free(buffer);
            } else {
                puts("Can not open");
            }
        }
    } else {
#ifdef _WIN32
        /* Keep the C runtime from translating line endings */
        setmode(fileno(stdin), O_BINARY);
        setmode(fileno(stdout), O_BINARY);
#endif
        infile = stdin;
        LoadInput();
        DetectEncoding();
        Filter();
        free(buffer);
    }
    return 0;
}