#ifndef MISC_UTF8ITER_HPP
#define MISC_UTF8ITER_HPP

#include <cstring>
#include <tuple>

class Utf8Stream
{
public:

    typedef uint32_t UnicodeChar;
    typedef unsigned char const * Point;

    //static const unicode_char sBadChar = 0xFFFFFFFF; gcc can't handle this
    static UnicodeChar sBadChar () { return UnicodeChar (0xFFFFFFFF); }

    Utf8Stream (Point begin, Point end) :
        cur (begin), nxt (begin), end (end), val(Utf8Stream::sBadChar())
    {
    }

    Utf8Stream (const char * str) :
        cur ((unsigned char*) str), nxt ((unsigned char*) str), end ((unsigned char*) str + strlen(str)), val(Utf8Stream::sBadChar())
    {
    }

    Utf8Stream (std::pair <Point, Point> range) :
        cur (range.first), nxt (range.first), end (range.second), val(Utf8Stream::sBadChar())
    {
    }

    bool eof () const
    {
        return cur == end;
    }

    Point current () const
    {
        return cur;
    }

    UnicodeChar peek ()
    {
        if (cur == nxt)
            next ();
        return val;
    }

    UnicodeChar consume ()
    {
        if (cur == nxt)
            next ();
        cur = nxt;
        return val;
    }

    static std::pair <UnicodeChar, Point> decode (Point cur, Point end)
    {
        if ((*cur & 0x80) == 0)
        {
            UnicodeChar chr = *cur++;

            return std::make_pair (chr, cur);
        }

        int octets;
        UnicodeChar chr;

        std::tie (octets, chr) = octet_count (*cur++);

        if (octets > 5)
            return std::make_pair (sBadChar(), cur);

        Point eoc = cur + octets;

        if (eoc > end)
            return std::make_pair (sBadChar(), cur);

        while (cur != eoc)
        {
            if ((*cur & 0xC0) != 0x80) // check continuation mark
                return std::make_pair (sBadChar(), cur);

            chr = (chr << 6) | UnicodeChar ((*cur++) & 0x3F);
        }

        return std::make_pair (chr, cur);
    }

private:

    static std::pair <int, UnicodeChar> octet_count (unsigned char octet)
    {
        int octets;

        unsigned char mark = 0xC0;
        unsigned char mask = 0xE0;

        for (octets = 1; octets <= 5; ++octets)
        {
            if ((octet & mask) == mark)
                break;

            mark = (mark >> 1) | 0x80;
            mask = (mask >> 1) | 0x80;
        }

        return std::make_pair (octets, octet & ~mask);
    }

    void next ()
    {
        std::tie (val, nxt) = decode (nxt, end);
    }

    Point cur;
    Point nxt;
    Point end;
    UnicodeChar val;
};

#endif