#include <set>
#include "SBuf.h"

/** 
 * Efficiently converts raw input into a stream of basic tokens.
 * Custom token boundary/separation rules are supported via caller-provided,
 * pre-computed character sets. The caller (a parser of some kind) defines
 * the input grammar by using an appropriate sequence of token(), prefix(),
 * and skip() calls, with the right parameters restricting token composition.
 */
class Tokenizer {
public:
    /// A group of distinct characters describing what a token may (or may
    /// not) be made of. TODO: support negation, merging
    typedef std::set<char> CharacterSet; // TODO: optimize using a bool array

    /// Creates a tokenizer for the given input buffer (defined out of line).
    explicit Tokenizer(const SBuf &inBuf);

    /// Whether the unparsed input is empty, i.e. has zero length.
    bool atEnd() const
    {
        return buf_.length() == 0;
    }

    /// Read-only access to the input that has not been parsed yet.
    const SBuf &remaining() const
    {
        return buf_;
    }

    /// Replaces the unparsed input with newBuf.
    void reset(const SBuf &newBuf)
    {
        buf_ = newBuf;
    }

    /* Parsing methods. Each one examines the input buffer from its start.
     * On finding a non-empty token, a method consumes the parsed characters
     * and returns true. In all other cases, it returns false and leaves
     * the tokenizer untouched. */

    /** strtok(3)-like extraction, performed in three steps:
     *  1. leading delimiters, if present, are skipped;
     *  2. the characters before the next delimiter are collected as the token;
     *  3. trailing delimiters, if present, are skipped.
     *  Callers that need the delimiters themselves should issue three
     *  prefix() calls instead.
     */
    bool token(SBuf &token, const CharacterSet &whitespace);

    /// Collects the leading run of characters that belong to tokenChars
    /// (a token).
    bool prefix(SBuf &token, const CharacterSet &tokenChars);

    /// Discards the leading run of characters that belong to tokenChars
    /// (a token).
    bool skip(const CharacterSet &tokenChars);

    /// Discards the given token.
    bool skip(const SBuf &token);

    /// Discards the given single character (a token).
    bool skip(const char token);

private:
    SBuf buf_; ///< input that is still waiting to be parsed
};


