From 69b8f140b414712379761cc2f8ed5114b229dba3 Mon Sep 17 00:00:00 2001 From: Dan Goodliffe Date: Mon, 2 May 2016 19:47:29 +0100 Subject: Add a flexible extensible lexer --- gentoobrowse-api/service/utils/lexer.cpp | 137 +++++++++++++++++++++++++++++++ gentoobrowse-api/service/utils/lexer.h | 62 ++++++++++++++ 2 files changed, 199 insertions(+) create mode 100644 gentoobrowse-api/service/utils/lexer.cpp create mode 100644 gentoobrowse-api/service/utils/lexer.h diff --git a/gentoobrowse-api/service/utils/lexer.cpp b/gentoobrowse-api/service/utils/lexer.cpp new file mode 100644 index 0000000..8ed83b8 --- /dev/null +++ b/gentoobrowse-api/service/utils/lexer.cpp @@ -0,0 +1,137 @@ +#include "lexer.h" + +namespace Gentoo { + namespace Utils { + const Lexer::State Lexer::InitialState = ""; + + class Regex : public Lexer::Pattern { + public: + Regex(const Glib::ustring & pattern, GRegexCompileFlags compile, GRegexMatchFlags match) : + err(nullptr), + regex(g_regex_new(pattern.c_str(), compile, match, &err)), + info(nullptr) + { + if (!regex) { + std::runtime_error e(std::string("Failed to create GRegex: ") + err->message); + g_error_free(err); + throw e; + } + } + + ~Regex() + { + if (err) { + g_error_free(err); + } + if (info) { + g_match_info_free(info); + } + g_regex_unref(regex); + } + + bool matches(const gchar * string, size_t length, size_t position) const override + { + if (info) { + g_match_info_free(info); + } + g_regex_match_full(regex, string, length, position, G_REGEX_MATCH_ANCHORED, &info, &err); + if (err) { + std::runtime_error e(std::string("Failed to execute regex: ") + err->message); + g_error_free(err); + throw e; + } + str = string; + return g_match_info_matches(info); + } + + size_t matchedLength() const override + { + gint start, end; + g_match_info_fetch_pos(info, 0, &start, &end); + return end - start; + } + + boost::optional match(int n) const override + { + gint start, end; + if (g_match_info_fetch_pos(info, n, &start, &end)) { + if (start == -1 
&& end == -1) { + return boost::optional(); + } + return Glib::ustring(str + start, end - start); + } + return boost::optional(); + } + + private: + mutable GError * err; + GRegex * regex; + mutable GMatchInfo * info; + mutable const gchar * str; + }; + + Lexer::PatternPtr + Lexer::regex(const Glib::ustring & pattern, GRegexCompileFlags compile, GRegexMatchFlags match) + { + return PatternPtr(new Regex(pattern, compile, match)); + } + + void + Lexer::extract(const gchar * string, size_t length) const + { + ExecuteState es; + while (es.position < length) { + const Rule * selected = nullptr; + for (const auto & r : rules) { + const auto & s = boost::get<0>(r); + if (s.find(es.getState()) == s.end()) { + continue; + } + const auto & p = boost::get<1>(r); + if (p->matches(string, length, es.position)) { + selected = &r; + break; + } + } + if (!selected) { + throw std::runtime_error(std::string("Unexpected input at ") + (string + es.position)); + } + es.pattern = boost::get<1>(*selected); + const auto & h = boost::get<2>(*selected); + h(&es); + es.position += es.pattern->matchedLength(); + } + + } + + Lexer::ExecuteState::ExecuteState() : + position(0) + { + stateStack.push_back(InitialState); + } + + void + Lexer::ExecuteState::setState(const State & s) + { + stateStack.back() = s; + } + + void + Lexer::ExecuteState::pushState(const State & s) + { + stateStack.push_back(s); + } + + void + Lexer::ExecuteState::popState() + { + stateStack.pop_back(); + } + + const Lexer::State & + Lexer::ExecuteState::getState() const + { + return stateStack.back(); + } + } +} diff --git a/gentoobrowse-api/service/utils/lexer.h b/gentoobrowse-api/service/utils/lexer.h new file mode 100644 index 0000000..44d3d57 --- /dev/null +++ b/gentoobrowse-api/service/utils/lexer.h @@ -0,0 +1,62 @@ +#ifndef GENTOOBROWSE_SERVICE_UTILS_LEXER_H +#define GENTOOBROWSE_SERVICE_UTILS_LEXER_H + +#include +#include +#include +#include +#include +#include +#include + +namespace Gentoo { + namespace Utils { + 
class Lexer { // Rule-driven lexer: extract() walks a buffer and calls the handler of the first matching rule in `rules`.
+			public:
+				class Pattern { // Interface for a matcher that can be tried at an offset within a buffer.
+					public:
+						virtual ~Pattern() = default;
+						// NOTE(review): template argument lists (on boost::optional, boost::shared_ptr, std::set, std::vector, boost::function, boost::tuple) appear to have been lost from this copy of the patch - confirm against the repository.
+						virtual bool matches(const gchar *, size_t, size_t) const = 0; // Arguments are (buffer, buffer length, start offset); returns whether the pattern matched there.
+						virtual size_t matchedLength() const = 0; // Length of the most recent match; extract() advances its position by this amount.
+						virtual boost::optional match(int) const = 0; // Text of the numbered capture group of the most recent match; the Regex implementation returns an empty optional for a group that did not match.
+				};
+				typedef boost::shared_ptr PatternPtr; // Presumably a shared pointer to Pattern - TODO confirm.
+
+				typedef std::string State; // A lexer state is identified by name.
+				typedef std::set States; // The states in which a rule is enabled; extract() looks the current state up in it.
+
+				class ExecuteState { // Mutable state of one extract() run; handlers receive a pointer to it.
+					public:
+						ExecuteState(); // Starts at position 0 with InitialState as the only stack entry.
+
+						void pushState(const State &); // Enters a state, keeping the previous one underneath.
+						void popState(); // Removes the top state.
+						void setState(const State &); // Replaces the top state.
+						const State & getState() const; // Returns the top state.
+
+						size_t position; // Offset into the buffer at which the next match is attempted.
+						PatternPtr pattern; // Pattern of the selected rule; set by extract() before the handler is called.
+
+					private:
+						std::vector stateStack; // back() is the current state.
+				};
+
+				typedef boost::function Handler; // Callback that extract() invokes with a pointer to the ExecuteState.
+				typedef boost::tuple Rule; // extract() reads element 0 as the States, element 1 as the PatternPtr and element 2 as the Handler.
+				typedef std::vector Rules; // Rules are tried in order; the first enabled rule whose pattern matches is used.
+
+				static const State InitialState; // Defined as the empty string in lexer.cpp.
+				Rules rules; // The rule list consulted by extract().
+
+				static PatternPtr regex(const Glib::ustring &, GRegexCompileFlags compile = (GRegexCompileFlags)0, GRegexMatchFlags match = (GRegexMatchFlags)0); // Creates a GRegex-backed Pattern; both flag arguments default to 0.
+
+			public:
+				void extract(const gchar * string, size_t length) const; // Lexes the first `length` bytes of `string`; throws std::runtime_error when no rule matches at the current position.
+		};
+
+	}
+}
+
+#endif
+
-- 
cgit v1.2.3