// Provenance: introduced as components/misc/utf8stream.hpp by commit
// 75757cb675f91b3e010221cb9687842c70f89009 (Nathan Jeffords, 2013-01-27):
// "Created a class to represent a stream of UTF8 characters."

#ifndef MISC_UTF8ITER_HPP
#define MISC_UTF8ITER_HPP

#include <cstdint>
#include <tuple>
#include <utility>

/// Forward-only reader that decodes UTF-8 code points from a byte range
/// [begin, end). The range is borrowed, not owned: the caller must keep the
/// underlying bytes alive for as long as the stream is used.
///
/// The decoder accepts the legacy forms with up to five continuation octets
/// and does not reject overlong encodings or surrogate code points; it only
/// checks lead-byte shape, continuation marks and truncation.
class utf8_stream
{
public:

    typedef std::uint32_t unicode_char;
    typedef unsigned char const * point;

    /// Value reported for a malformed or truncated sequence, and for any
    /// read attempted at the end of the range.
    static constexpr unicode_char sBadChar = 0xFFFFFFFF;

    /// Creates a stream over the bytes in [begin, end).
    utf8_stream (point begin, point end) :
        cur (begin), nxt (begin), end (end), val (sBadChar)
    {
    }

    /// Creates a stream over the bytes in [range.first, range.second).
    utf8_stream (std::pair <point, point> range) :
        cur (range.first), nxt (range.first), end (range.second), val (sBadChar)
    {
    }

    /// True once every byte of the range has been consumed.
    bool eof () const
    {
        return cur == end;
    }

    /// Position of the next character that has not been consumed yet.
    point current () const
    {
        return cur;
    }

    /// Returns the next character without consuming it.
    /// Yields sBadChar when the stream is at eof.
    unicode_char peek ()
    {
        // cur == nxt means nothing is decoded ahead of the read position.
        if (cur == nxt)
            next ();
        return val;
    }

    /// Returns the next character and advances past it.
    /// Yields sBadChar (without moving) when the stream is at eof.
    unicode_char consume ()
    {
        if (cur == nxt)
            next ();
        cur = nxt;
        return val;
    }

    /// Decodes a single character starting at cur, never reading at or
    /// beyond end.
    ///
    /// @return the decoded character paired with the position to resume from.
    ///         On failure the character is sBadChar and the position is the
    ///         first byte that was not accepted: just past the lead byte for
    ///         a bad lead or a truncated sequence, at the offending byte for
    ///         a bad continuation, or cur itself when cur == end.
    static std::pair <unicode_char, point> decode (point cur, point end)
    {
        // Nothing to read: report failure instead of dereferencing end.
        if (cur == end)
            return bad (cur);

        // Single-octet (ASCII) fast path.
        if ((*cur & 0x80) == 0)
        {
            unicode_char chr = *cur++;

            return std::make_pair (chr, cur);
        }

        int octets;
        unicode_char chr;

        std::tie (octets, chr) = octet_count (*cur++);

        // Not a valid lead byte (stray continuation byte, 0xFE or 0xFF).
        if (octets > 5)
            return bad (cur);

        // Truncated sequence. Compare distances rather than forming
        // cur + octets, which could point beyond one-past-the-end.
        if (end - cur < octets)
            return bad (cur);

        point const eoc = cur + octets;

        while (cur != eoc)
        {
            if ((*cur & 0xC0) != 0x80) // check continuation mark
                return bad (cur);

            chr = (chr << 6) | unicode_char ((*cur++) & 0x3F);
        }

        return std::make_pair (chr, cur);
    }

private:

    /// Builds the failure result. sBadChar is copied into a temporary first
    /// so that it is never bound to a reference (which would be an odr-use
    /// requiring an out-of-class definition before C++17).
    static std::pair <unicode_char, point> bad (point where)
    {
        return std::make_pair (static_cast <unicode_char> (sBadChar), where);
    }

    /// Classifies a lead byte (one with the high bit set).
    ///
    /// @return the number of continuation octets that must follow (1..5)
    ///         paired with the payload bits carried by the lead byte. A count
    ///         of 6 means the byte is not a valid lead byte.
    static std::pair <int, unicode_char> octet_count (unsigned char octet)
    {
        int octets;

        unsigned char mark = 0xC0; // 110xxxxx: lead byte of a 2-octet form
        unsigned char mask = 0xE0;

        for (octets = 1; octets <= 5; ++octets)
        {
            if ((octet & mask) == mark)
                break;

            // Extend both patterns by one more leading 1 bit.
            mark = static_cast <unsigned char> ((mark >> 1) | 0x80);
            mask = static_cast <unsigned char> ((mask >> 1) | 0x80);
        }

        unsigned char const payload = static_cast <unsigned char> (~mask);

        return std::make_pair (octets, static_cast <unicode_char> (octet & payload));
    }

    /// Decodes the character at nxt into val and moves nxt past it.
    void next ()
    {
        std::tie (val, nxt) = decode (nxt, end);
    }

    point cur;          // next unconsumed position
    point nxt;          // position after the decoded-ahead character, or == cur
    point end;          // one past the last byte of the range
    unicode_char val;   // most recently decoded character
};

#endif