Okay, that seems horribly complicated. Oh the irony.
Here's my code for UTF-8 to UCS-4 conversion (and back again):
#include <string>
#include <vector>
/* So much code for so little meaning. This is so-called purity. */
/* Result record for one decode attempt: a success flag plus the
 * sequence length in bytes, the decoded value, and a shift type
 * (the shift type is carried but not used by the UTF-8 routines). */
class SeqResult
{
    bool Bool;          /* did the decode succeed? */
    unsigned Length;    /* length of the recognized sequence, in bytes */
    unsigned Value;     /* decoded code point value */
    int ShiftType;      /* shift-state tag, for stateful encodings */
public:
    /* Covers the 0-3 argument forms: failure by default, or a
     * success flag with optional length and value. */
    SeqResult(bool v = false, unsigned len = 0, unsigned val = 0)
        : Bool(v), Length(len), Value(val), ShiftType(0) { }
    /* A successful result that carries a shift type and a length. */
    SeqResult(int seq, unsigned len)
        : Bool(true), Length(len), Value(0), ShiftType(seq) { }

    /* Allows "if(r)" to test for success. */
    operator bool() const { return Bool; }

    unsigned GetLength() const { return Length; }
    unsigned GetValue() const  { return Value; }
    int GetShiftType() const   { return ShiftType; }

    void SetBool(bool b)       { Bool = b; }
    void SetLength(unsigned l) { Length = l; }
    void SetValue(unsigned n)  { Value = n; }
    void SetShift(int s)       { ShiftType = s; }
};
namespace UTF8
{
    /* Append the UTF-8 encoding of code point 'ch' to 'result'.
     * Handles 1- to 4-byte sequences (code points up to 21 bits).
     * The unused int& parameter exists for interface uniformity with
     * shift-state encoders; it is never read or written.
     *
     * NOTE(review): the posted listing lost the emission statements on
     * the "<=11 / <=16 / <=21 bits" lines (everything after '<' was
     * swallowed, apparently by HTML tag stripping), which left the
     * if/else chain without controlled statements. Reconstructed here
     * to match the surviving continuation-byte lines.
     */
    void SeqValue(std::string& result, wchar_t ch, int&)
    {
        const unsigned n = ch;
        if(n < 0x80)                 /* <=7 bits: single byte 0xxxxxxx */
            result += (char)n;
        else
        {
            if(n < 0x800)            /* <=11 bits: lead byte 110xxxxx */
                result += (char)(0xC0 + (n>>6));
            else
            {
                if(n < 0x10000)      /* <=16 bits: lead byte 1110xxxx */
                    result += (char)(0xE0 + (n>>12));
                else                 /* <=21 bits: lead byte 11110xxx */
                {
                    result += (char)(0xF0 + (n>>18));
                    result += (char)(0x80 + ((n>>12)&63));
                }
                result += (char)(0x80 + ((n>>6)&63));
            }
            result += (char)(0x80 + (n&63));
        }
    }
}
namespace UTF8
{
    /* Decode one UTF-8 sequence starting at input[pos].
     *
     * Returns a SeqResult that converts to true on success, carrying the
     * sequence length in bytes and the decoded code point value. Returns
     * false when the bytes at 'pos' do not form a complete, minimally
     * encoded UTF-8 sequence. The trailing int parameter is unused
     * (interface uniformity).
     *
     * Structural fix: the listing closed 'namespace UTF8' before this
     * function and left a stray '}' after it, yet the caller invokes it
     * as UTF8::IsData — the namespace is reopened here so the qualified
     * call resolves and the braces balance.
     */
    const SeqResult IsData(const std::string& input, unsigned pos, int)
    {
        unsigned char headbyte = input[pos];

        /* Sequence length implied by the top four bits of the head byte:
         * 0xxx.... = 1 byte, 10...... = continuation (invalid as a head),
         * 110..... = 2 bytes, 1110.... = 3 bytes, 1111.... = 4 bytes. */
        static const char sizes[16] =
        { 1,1,1,1,1,1,1,1,
          0,0,0,0,2,2,3,4 };
        /* Smallest value each length may encode (overlong-coding check). */
        static const unsigned minimums[4] = { 0, 0x80, 0x800, 0x10000 };
        /* Payload-bit mask for the head byte of each length. */
        static const char masks[4] = { 0x7F, 0x1F, 0x0F, 0x07 };

        unsigned len = sizes[headbyte >> 4];
        /* Reject continuation bytes used as sequence heads. */
        if(len == 0) return false;
        /* Reject sequences truncated by the end of the input. */
        if(pos+len > input.size()) return false;

        unsigned result=0, shl=0;
        /* Accumulate the tail bytes, last to first: 6 payload bits each. */
        for(unsigned n = len; --n > 0; )
        {
            unsigned char byte = input[pos+n];
            /* Every tail byte must match 10xxxxxx. */
            if((byte & 0xC0) != 0x80) return false;
            unsigned bits = byte & 0x3F;
            result |= bits << shl;
            shl += 6;
        }
        /* Merge the head byte's payload bits on top.
         * The head's top bits were already validated via sizes[]. */
        unsigned bits = headbyte & masks[len-1];
        result |= bits << shl;
        if(result < minimums[len-1])
        {
            /* Non-minimal (overlong) coding - not valid UTF-8. */
            return false;
        }
        /* NOTE(review): lead bytes 0xF8-0xFF are accepted as 4-byte heads,
         * and surrogates / values above 0x10FFFF are not rejected; stricter
         * RFC 3629 validation may be desirable — confirm requirements. */
        return SeqResult(true, len, result);
    }
}
void UTF8toUCS4(const std::string& input,
std::vector<unsigned>& result)
{
for(std::string::size_type pos = 0; pos < input.size(); )
{
SeqResult r = UTF8::IsData(input, pos, 0);
if(r)
{
result.push_back(r.GetValue());
pos += r.GetLength();
}
else
{
result.push_back('?');
++pos;
}
}
}
/* Encode a sequence of UCS-4 code points as UTF-8, appended to 'result'. */
void UCS4toUTF8(const std::vector<unsigned>& input,
                std::string& result)
{
    /* SeqValue never touches its shift-state parameter; pass a dummy. */
    int dummy_shift = 0;
    for(std::vector<unsigned>::size_type i = 0; i < input.size(); ++i)
        UTF8::SeqValue(result, input[i], dummy_shift);
}
I've got simpler code too, but this is very robust code that handles all kinds of error situations neatly and is suitably well commented.