/* shindou10_no_ucase.c - Don Yang (uguu.org) 02/24/07 */
/*
 * With file arguments: prints "<name>: <encoding>" for every file, where the
 * encoding is one of UTF-8, Shift_JIS, EUC-JP, UCS-2LE or UCS-2BE.
 *
 * Without arguments: filters stdin to stdout.  The text is cut into runs of
 * ordinary characters ended by a punctuation character:
 *   - a run ended by end-of-sentence punctuation is written with a space
 *     after every character;
 *   - a run ended by other punctuation is written unchanged;
 *   - the ending punctuation itself becomes "!!" if it was an exclamation
 *     mark, "!" for any other punctuation except newline, and a newline is
 *     copied through.  The full-width '!' is used when the original
 *     punctuation was a multi-byte character.
 * Punctuation and undecodable bytes found outside a run are copied through.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Bit flags held in `encoding`.  During detection several of the multi-byte
 * candidates may be set at once; afterwards exactly one flag remains. */
enum {
    ENC_UTF8 = 1,
    ENC_SJIS = 2,
    ENC_EUCJP = 4,
    ENC_UCS2LE = 8,
    ENC_UCS2BE = 16,
    ENC_UCS2 = ENC_UCS2LE | ENC_UCS2BE,
    ENC_CANDIDATES = ENC_UTF8 | ENC_SJIS | ENC_EUCJP
};

enum {
    /* NextChar() inspects up to 3 bytes past the current one, so the input
     * buffer always carries this many zeroed bytes after the valid data. */
    LOOKAHEAD_PAD = 3,
    /* Range bytes in the parser tables are stored XORed with this value. */
    RANGE_KEY = 208,
    /* Offsets of the parser tables inside const_str. */
    TABLE_UTF8 = 14,
    TABLE_UCS2 = 34,
    TABLE_SJIS = 36,
    TABLE_EUCJP = 47
};

FILE *infile;
int state, buffer_size, read_size, i, i0, last_char_offset,
    utf8_kana, sjis_kana, eucjp_kana, encoding, byte_offset,
    k255 = 255,
    k0x8141 = 0x8141,
    k0xa1a2 = 0xa1a2,
    k0x3001 = 0x3001,
    k0xe38081 = 0xe38081,
    k0xefbc81 = 0xefbc81,
    k0xff01 = 0xff01;

/* Value of the character most recently decoded by NextChar(): its bytes
 * packed big-endian, or -1 for an invalid sequence.  It is wider than int
 * because a 4-byte UTF-8 sequence (lead byte 0xF0..0xF7) does not fit in a
 * non-negative 32-bit int and would otherwise be mistaken for "invalid". */
long long c;

unsigned char *tmp, *buffer;

/*
 * Constant data.
 *
 * Bytes 0..13 are output snippets:
 *   0..4   20 00 21 00 20  space and '!' as overlapping UCS-2LE / UCS-2BE /
 *                          single-byte strings
 *   5..7   01 FF 01        full-width '!' as UCS-2LE (5) and UCS-2BE (6)
 *   8..10  EF BC 81        full-width '!' in UTF-8
 *   10..11 81 49           full-width '!' in Shift_JIS
 *   12..13 A1 AA           full-width '!' in EUC-JP
 *
 * Bytes 14.. are the parser tables used by NextChar().  A table is a list of
 * alternatives; each alternative is one (low, high) byte pair per input
 * byte, stored XORed with RANGE_KEY (always > 4), followed by the character
 * length in bytes (<= 4).  A table ends with a bare length byte of 1, which
 * accepts any single byte.  The UCS-2 "table" is just the length byte 2 at
 * offset 34.
 *
 * NOTE(review): the EUC-JP alternative ",q.\2" has three range bytes instead
 * of an even number, so its first range decodes to 0xFC..0xA1 and can never
 * match.  It is reproduced as found - confirm against the upstream source.
 */
const unsigned char *const_str = (const unsigned char *)
    " \0!\0 \1\xff\1\xef\xbc\x81I\xa1\xaa"
    /* UTF-8 (offset 14) */
    " 'PoPoPo\4""0?PoPo\3\20\17Po\2\1"
    /* Shift_JIS (offset 36) */
    "QO\220,\2""0?\220,\2\1"
    /* EUC-JP (offset 47) */
    "qxq.\2}}q.\2`$q.\2,q.\2^^q.\2__q.q.\3\1",
    *parser, *p;

/* Write `size` bytes starting at `buffer` to stdout. */
void Output(const void *buffer, int size)
{
    fwrite(buffer, size, 1, stdout);
}

/* malloc() that terminates the program instead of returning NULL. */
static void *AllocOrDie(size_t size)
{
    void *memory = malloc(size);

    if (!memory) {
        fputs("out of memory\n", stderr);
        exit(EXIT_FAILURE);
    }
    return memory;
}

/* True once no more bytes can be read from infile (end of file or error). */
static int InputDone(void)
{
    return feof(infile) || ferror(infile);
}

/* Read as much input as fits after the valid data, doubling the buffer
 * first when it is already full. */
static void ExtendBuffer(void)
{
    if (read_size == buffer_size) {
        tmp = buffer;
        buffer_size *= 2;
        buffer = AllocOrDie((size_t)buffer_size + LOOKAHEAD_PAD);
        memcpy(buffer, tmp, read_size);
        free(tmp);
    }
    read_size += (int)fread(buffer + read_size, 1,
                            (size_t)(buffer_size - read_size), infile);

    /* Keep the lookahead bytes deterministic: zero never falls inside any
     * trail-byte range, so a truncated character is reported as invalid. */
    memset(buffer + read_size, 0, LOOKAHEAD_PAD);
}

/* Decode the character at buffer[i] using the table at `parser`.
 * On success `c` holds the packed bytes and `i` points past the character.
 * If the lead byte matched an alternative but a later byte did not, `c` is
 * -1 and `i` is left unchanged. */
static void NextChar(void)
{
    c = byte_offset = 0;
    for (p = parser; *p > 4 && c + 1; p++) {
        if (buffer[i + byte_offset] < (*p ^ RANGE_KEY) ||
            buffer[i + byte_offset] > (p[1] ^ RANGE_KEY)) {
            if (byte_offset) {
                /* Lead byte matched but a trail byte did not. */
                c = -1;
            } else {
                /* Lead byte mismatch: skip to this alternative's length
                 * byte; the loop increment then moves to the next one. */
                for (; *p > 4; p++)
                    ;
            }
        } else {
            byte_offset++;
            p++;
        }
    }

    /* *p is now the length byte of the matched alternative. */
    for (byte_offset = 0; c + 1 && byte_offset < *p; byte_offset++)
        c = (c << 8) | buffer[i++];

    /* Little-endian UCS-2: swap the two bytes. */
    if (encoding & ENC_UCS2LE)
        c = (c >> 8) | ((c & k255) << 8);
}

/* Read infile until its encoding is decided and leave the single winning
 * ENC_* flag in `encoding`.  The data read so far stays in `buffer`. */
static void DetectFileEncoding(void)
{
    read_size = utf8_kana = sjis_kana = eucjp_kana = 0;

    /* Keep reading while more than one candidate bit is still set. */
    for (encoding = ENC_CANDIDATES;
         !InputDone() && (encoding & (encoding - 1));) {
        ExtendBuffer();
        if (!(encoding & ENC_UCS2)) {
            if (buffer[0] == k255 && buffer[1] == k255 - 1) {
                encoding = ENC_UCS2LE;          /* BOM FF FE */
            } else if (buffer[0] == k255 - 1 && buffer[1] == k255) {
                encoding = ENC_UCS2BE;          /* BOM FE FF */
            } else {
                /* No BOM: a 16-bit newline gives the byte order away. */
                for (i = 0; i < read_size - 1; i += 2) {
                    if (buffer[i] == 10 && !buffer[i + 1]) {
                        encoding = ENC_UCS2LE;
                        goto finalize;
                    }
                    if (!buffer[i] && buffer[i + 1] == 10) {
                        encoding = ENC_UCS2BE;
                        goto finalize;
                    }
                }

                if (buffer[0] == 0xef && buffer[1] == 0xbb &&
                    buffer[2] == 0xbf) {
                    encoding = ENC_UTF8;        /* UTF-8 BOM */
                } else {
                    /* Parse the data under every remaining candidate.  A
                     * candidate is dropped at its first invalid sequence;
                     * otherwise characters in its kana range are counted. */
                    parser = const_str + TABLE_UTF8;
                    for (i = 0; i < read_size - 3 && encoding & ENC_UTF8;) {
                        NextChar();
                        if (c < 0) {
                            encoding &= ~ENC_UTF8;
                        } else if ((c > k0xe38081 + k255 &&
                                    c < k0xe38081 + 319) ||
                                   (c > k0xe38081 + k255 + k255 &&
                                    c < k0xe38081 + 531)) {
                            utf8_kana++;
                        }
                    }

                    parser = const_str + TABLE_SJIS;
                    for (i = 0; i < read_size - 1 && encoding & ENC_SJIS;) {
                        NextChar();
                        if (c < 0) {
                            encoding &= ~ENC_SJIS;
                        } else if (c > k0x8141 + 349 && c < k0x8141 + 433) {
                            sjis_kana++;
                        }
                    }

                    parser = const_str + TABLE_EUCJP;
                    for (i = 0; i < read_size - 2 && encoding & ENC_EUCJP;) {
                        NextChar();
                        if (c < 0) {
                            encoding &= ~ENC_EUCJP;
                        } else if (c > k0xa1a2 + 766 && c < k0xa1a2 + 850) {
                            eucjp_kana++;
                        }
                    }
                }
            }
        }
    }

finalize:
    if (!(encoding & ENC_UCS2)) {
        /* Rejected candidates can never win the kana count. */
        if (!(encoding & ENC_UTF8))
            utf8_kana = -1;
        if (!(encoding & ENC_SJIS))
            sjis_kana = -1;
        if (!(encoding & ENC_EUCJP))
            eucjp_kana = -1;

        /* Highest kana count wins.  Ties favor UTF-8 over Shift_JIS over
         * EUC-JP; if every candidate was rejected the result is EUC-JP. */
        if (utf8_kana < sjis_kana)
            encoding = sjis_kana < eucjp_kana ? ENC_EUCJP : ENC_SJIS;
        else if (utf8_kana < eucjp_kana)
            encoding = ENC_EUCJP;
        else
            encoding = encoding & ENC_UTF8 ? ENC_UTF8 : ENC_EUCJP;
    }
}

/* True if `c` is '!' or the full-width exclamation mark of the current
 * encoding. */
static int IsExclaimationPunct(void)
{
    return c == 33 ||
           (encoding & ENC_UCS2 && c == k0xff01) ||
           (encoding & ENC_UTF8 && c == k0xefbc81) ||
           (encoding & ENC_SJIS && c == k0x8141 + 8) ||
           (encoding & ENC_EUCJP && c == k0xa1a2 + 8);
}

/* True if `c` ends a sentence: '.', '?', their wide forms in the current
 * encoding, or any exclamation mark. */
static int IsEOSPunct(void)
{
    return c == 46 || c == 63 ||
           (encoding & ENC_UCS2 &&
            (c == k0xff01 + 30 || c == k0xff01 + 13 ||
             c == k0x3001 + 1 || c == k0xff01 + 96)) ||
           (encoding & ENC_UTF8 &&
            (c == k0xefbc81 + 30 || c == k0xefbc81 + 13 ||
             c == k0xe38081 + 1 || c == k0xefbc81 + 288)) ||
           (encoding & ENC_SJIS &&
            (c == k0x8141 + 7 || c == k0x8141 + 3 || c == k0x8141 + 1)) ||
           (encoding & ENC_EUCJP &&
            (c == k0xa1a2 + 7 || c == k0xa1a2 + 3 || c == k0xa1a2 + 1)) ||
           IsExclaimationPunct();
}

/* True if `c` is any punctuation: ',', newline, the wide commas of the
 * current encoding, or end-of-sentence punctuation. */
static int IsPunct(void)
{
    return c == 44 || c == 10 ||
           (encoding & ENC_UCS2 &&
            (c == k0xff01 + 11 || c == k0x3001 || c == k0xff01 + 99)) ||
           (encoding & ENC_UTF8 &&
            (c == k0xefbc81 + 11 || c == k0xe38081 ||
             c == k0xefbc81 + 291)) ||
           (encoding & ENC_SJIS &&
            (c == k0x8141 + 2 || c == k0x8141)) ||
           (encoding & ENC_EUCJP &&
            (c == k0xa1a2 + 2 || c == k0xa1a2)) ||
           IsEOSPunct();
}

/* Write one exclamation mark in the current encoding: the full-width form
 * if the character in `c` is wider than one byte, ASCII '!' otherwise. */
static void WriteExclamationMark(void)
{
    int offset, length;

    if (c > k255) {
        offset = encoding & ENC_UCS2BE ? 6
               : encoding & ENC_UCS2LE ? 5
               : encoding & ENC_UTF8 ? 8
               : encoding & ENC_SJIS ? 10
               : 12;
        length = encoding & ENC_UTF8 ? 3 : 2;
    } else {
        offset = encoding & ENC_UCS2BE ? 1 : 2;
        length = encoding & ENC_UCS2 ? 2 : 1;
    }
    Output(const_str + offset, length);
}

int main(int argc, char **argv)
{
    (void)argc;

    buffer_size = 0x1000;
    buffer = AllocOrDie((size_t)buffer_size + LOOKAHEAD_PAD);

    if (*++argv) {
        /* File arguments: report the detected encoding of each file. */
        for (; *argv; ++argv) {
            Output(*argv, strlen(*argv));
            Output(": ", 2);
            if ((infile = fopen(*argv, "rb"))) {
                DetectFileEncoding();
                puts(encoding == ENC_UCS2LE ? "UCS-2LE"
                     : encoding == ENC_UCS2BE ? "UCS-2BE"
                     : encoding == ENC_UTF8 ? "UTF-8"
                     : encoding == ENC_SJIS ? "Shift_JIS"
                     : "EUC-JP");
                fclose(infile);
            } else {
                puts("can not open");
            }
        }
    } else {
        /* No arguments: filter stdin. */
        infile = stdin;
        DetectFileEncoding();
        parser = const_str + (encoding == ENC_UTF8 ? TABLE_UTF8
                              : encoding == ENC_SJIS ? TABLE_SJIS
                              : encoding == ENC_EUCJP ? TABLE_EUCJP
                              : TABLE_UCS2);

        /* state is 1 while inside a run; the run starts at i0. */
        for (i = i0 = state = 0; !InputDone() || i < read_size;) {
            last_char_offset = i;
            NextChar();
            if (c < 0)
                i++;    /* invalid sequence: step over a single byte */

            if (i < read_size - 3) {
                /* Drop everything before i0 and top the buffer up. */
                memmove(buffer, buffer + i0, read_size - i0);
                read_size -= i0;
                i -= i0;
                last_char_offset -= i0;
                i0 = 0;
                ExtendBuffer();
            }

            if (state) {
                if (IsPunct()) {
                    if (IsEOSPunct()) {
                        /* Re-decode the run, writing a space after every
                         * character. */
                        for (i = i0; i < last_char_offset; i0 = i) {
                            NextChar();
                            /* Same one-byte skip as the main loop; without
                             * it an invalid sequence never advances i and
                             * this loop would not terminate. */
                            if (c < 0)
                                i++;
                            Output(buffer + i0, i - i0);
                            Output(const_str + (encoding & ENC_UCS2BE ? 3 : 0),
                                   encoding & ENC_UCS2 ? 2 : 1);
                        }
                        /* Decode the punctuation again to restore c and i. */
                        NextChar();
                    }
                    if (i0 < last_char_offset)
                        Output(buffer + i0, last_char_offset - i0);

                    if (IsExclaimationPunct()) {
                        WriteExclamationMark();
                        WriteExclamationMark();
                    } else if (IsPunct() && c - 10) {
                        WriteExclamationMark();
                    } else {
                        /* Newline is copied through unchanged. */
                        Output(buffer + last_char_offset,
                               i - last_char_offset);
                    }
                    state = 0;
                }
            } else if (c < 0 || IsPunct()) {
                Output(buffer + last_char_offset, i - last_char_offset);
            } else {
                state = 1;
                i0 = last_char_offset;
            }
        }

        /* Input ended inside a run that no punctuation closed: write it
         * unchanged instead of dropping it. */
        if (state && i0 < read_size)
            Output(buffer + i0, read_size - i0);
    }

    free(buffer);
    return 0;
}