// unicode.h - Don Yang (uguu.org) // // 06/22/11 #include"unicode.h" #include #include"util.h" // Parse a UTF-8 character unsigned int ParseUtf8Char(const string &input, string::const_iterator *i) { assert(*i != input.end()); const int c1 = static_cast(**i) & 0xff; assert((c1 & 0x80) != 0); do { #define READ_CONTINUATION_BYTE(x) \ ++*i; \ if( *i == input.end() ) { --*i; break; } \ const int x = static_cast(**i) & 0xff; \ if( (x & 0xc0) != 0x80 ) break if( (c1 & 0xe0) == 0xc0 ) { // 2 bytes READ_CONTINUATION_BYTE(c2); const unsigned int u = ((c1 & 0x1f) << 6) | (c2 & 0x3f); return u; } else if( (c1 & 0xf0) == 0xe0 ) { // 3 bytes READ_CONTINUATION_BYTE(c2); READ_CONTINUATION_BYTE(c3); return ((c1 & 0x0f) << 12) | ((c2 & 0x3f) << 6) | (c3 & 0x3f); } else if( (c1 & 0xf8) == 0xf0 ) { // 4 bytes READ_CONTINUATION_BYTE(c2); READ_CONTINUATION_BYTE(c3); READ_CONTINUATION_BYTE(c4); return ((c1 & 0x07) << 18) | ((c2 & 0x3f) << 12) | ((c3 & 0x3f) << 6) | (c4 & 0x3f); } else if( (c1 & 0xfc) == 0xf8 ) { // 5 bytes READ_CONTINUATION_BYTE(c2); READ_CONTINUATION_BYTE(c3); READ_CONTINUATION_BYTE(c4); READ_CONTINUATION_BYTE(c5); return ((c1 & 0x03) << 24) | ((c2 & 0x3f) << 18) | ((c3 & 0x3f) << 12) | ((c4 & 0x3f) << 6) | (c5 & 0x3f); } else if( (c1 & 0xfe) == 0xfc ) { // 6 bytes READ_CONTINUATION_BYTE(c2); READ_CONTINUATION_BYTE(c3); READ_CONTINUATION_BYTE(c4); READ_CONTINUATION_BYTE(c5); READ_CONTINUATION_BYTE(c6); return ((c1 & 0x01) << 30) | ((c2 & 0x3f) << 24) | ((c3 & 0x3f) << 18) | ((c4 & 0x3f) << 12) | ((c5 & 0x3f) << 6) | (c6 & 0x3f); } #undef READ_CONTINUATION_BYTE } while( false ); // Invalid input, return Unicode replacement character return 0xfffd; } // Advance an iterator to the next character. 
// NextChar: moves *i past the character that starts at **i.
//
// input: string that *i iterates over; only used to find end().
// i:     in/out cursor.  Must not be at input->end() (asserted).
//
// An ASCII byte advances the cursor by one.  For a UTF-8 lead byte the
// cursor advances over the lead byte plus the continuation bytes
// (10xxxxxx) that actually follow it, up to the length announced by the
// lead byte (2 to 6 bytes in total).  The cursor never moves past
// input->end(), and it stops early at the first byte that is not a
// continuation byte, so a truncated sequence does not swallow the
// characters after it.
void NextChar(string *input, string::iterator *i)
{
  assert(*i != input->end());
  const unsigned int lead = static_cast<unsigned char>(**i);

  // Total length of the sequence according to the lead byte.
  int size;
  if( (lead & 0x80) == 0 )
  {
    size = 1;
  }
  else if( (lead & 0xe0) == 0xc0 )
  {
    size = 2;
  }
  else if( (lead & 0xf0) == 0xe0 )
  {
    size = 3;
  }
  else if( (lead & 0xf8) == 0xf0 )
  {
    size = 4;
  }
  else if( (lead & 0xfc) == 0xf8 )
  {
    size = 5;
  }
  else if( (lead & 0xfe) == 0xfc )
  {
    size = 6;
  }
  else
  {
    // Broken character, cursor is probably in the middle of
    // continuation bytes.
    size = 1;
  }

  // Always consume the first byte, then only real continuation bytes.
  ++*i;
  for(int consumed = 1; consumed < size; ++consumed)
  {
    if( *i == input->end() )
      break;
    if( (static_cast<unsigned char>(**i) & 0xc0) != 0x80 )
      break;
    ++*i;
  }
}