From 7567f3f24c17dde282a09235a9c43bfc979d17c4 Mon Sep 17 00:00:00 2001 From: Dan Goodliffe Date: Fri, 14 Oct 2016 01:58:19 +0100 Subject: Migrate extensible lexer from gentoobrowse-api --- libadhocutil/Jamfile.jam | 1 + libadhocutil/lexer-regex.cpp | 78 +++++++++++++++++++++++++++++++++++ libadhocutil/lexer-regex.h | 13 ++++++ libadhocutil/lexer.cpp | 78 +++++++++++++++++++++++++++++++++++ libadhocutil/lexer.h | 61 +++++++++++++++++++++++++++ libadhocutil/unittests/Jamfile.jam | 11 +++++ libadhocutil/unittests/testLexer.cpp | 80 ++++++++++++++++++++++++++++++++++++ 7 files changed, 322 insertions(+) create mode 100644 libadhocutil/lexer-regex.cpp create mode 100644 libadhocutil/lexer-regex.h create mode 100644 libadhocutil/lexer.cpp create mode 100644 libadhocutil/lexer.h create mode 100644 libadhocutil/unittests/testLexer.cpp diff --git a/libadhocutil/Jamfile.jam b/libadhocutil/Jamfile.jam index e0504d2..5793a8d 100644 --- a/libadhocutil/Jamfile.jam +++ b/libadhocutil/Jamfile.jam @@ -21,6 +21,7 @@ lib adhocutil : boost_filesystem boost_thread curl + ..//glibmm dl : : . 
diff --git a/libadhocutil/lexer-regex.cpp b/libadhocutil/lexer-regex.cpp
new file mode 100644
index 0000000..9ffee8b
--- /dev/null
+++ b/libadhocutil/lexer-regex.cpp
@@ -0,0 +1,106 @@
+#include "lexer-regex.h"
+
+namespace AdHoc {
+    namespace LexerMatchers {
+        /// Lexer pattern implemented with a compiled GLib GRegex.
+        ///
+        /// The result of the latest matches() call is cached in mutable members
+        /// so that matchedLength() and match() can report on it.
+        class Regex : public Lexer::Pattern {
+            public:
+                /// Compile pattern with the given GLib compile and match flags.
+                /// Throws std::runtime_error when g_regex_new() fails.
+                Regex(const Glib::ustring & pattern, GRegexCompileFlags compile, GRegexMatchFlags match) :
+                    err(nullptr),
+                    regex(g_regex_new(pattern.c_str(), compile, match, &err)),
+                    info(nullptr),
+                    str(nullptr)
+                {
+                    if (!regex) {
+                        throwError("Failed to create GRegex: ");
+                    }
+                }
+
+                // The raw GLib handles are owned by exactly one object.
+                Regex(const Regex &) = delete;
+                Regex & operator=(const Regex &) = delete;
+
+                ~Regex()
+                {
+                    g_clear_error(&err);
+                    if (info) {
+                        g_match_info_free(info);
+                    }
+                    g_regex_unref(regex);
+                }
+
+                /// Try to match, anchored, at byte offset position of string.
+                /// Returns whether the pattern matched; throws std::runtime_error
+                /// when GLib reports a matching error.
+                bool matches(const gchar * string, size_t length, size_t position) const override
+                {
+                    if (info) {
+                        g_match_info_free(info);
+                        info = nullptr;
+                    }
+                    g_regex_match_full(regex, string, length, position, G_REGEX_MATCH_ANCHORED, &info, &err);
+                    if (err) {
+                        throwError("Failed to execute regex: ");
+                    }
+                    str = string;
+                    return g_match_info_matches(info);
+                }
+
+                /// Length in bytes of the whole latest match, or 0 when there is
+                /// no match whose position can be fetched.
+                size_t matchedLength() const override
+                {
+                    gint start = 0, end = 0;
+                    if (!info || !g_match_info_fetch_pos(info, 0, &start, &end)) {
+                        return 0;
+                    }
+                    return end - start;
+                }
+
+                /// Text of capture group n of the latest match; an empty optional
+                /// when the position cannot be fetched or is reported as (-1, -1).
+                boost::optional<Glib::ustring> match(int n) const override
+                {
+                    gint start = -1, end = -1;
+                    if (!info || !g_match_info_fetch_pos(info, n, &start, &end)) {
+                        return boost::optional<Glib::ustring>();
+                    }
+                    if (start == -1 && end == -1) {
+                        return boost::optional<Glib::ustring>();
+                    }
+                    // Offsets are bytes; Glib::ustring(const char *, n) would
+                    // count n in UTF-8 characters, so copy the bytes first.
+                    return Glib::ustring(std::string(str + start, str + end));
+                }
+
+            private:
+                /// Throw std::runtime_error from prefix plus the pending GError,
+                /// clearing err first so it is never freed twice nor passed
+                /// back to GLib while still set.
+                [[noreturn]] void throwError(const char * prefix) const
+                {
+                    std::runtime_error e(std::string(prefix) + (err ? err->message : "unknown error"));
+                    g_clear_error(&err);
+                    throw e;
+                }
+
+                mutable GError * err;
+                GRegex * regex;
+                mutable GMatchInfo * info;
+                mutable const gchar * str;
+        };
+
+        /// Create a shared regex-backed pattern for use in Lexer rules.
+        Lexer::PatternPtr
+        regex(const Glib::ustring & pattern, GRegexCompileFlags compile, GRegexMatchFlags match)
+        {
+            return Lexer::PatternPtr(new Regex(pattern, compile, match));
+        }
+    }
+}
+
diff --git a/libadhocutil/lexer-regex.h b/libadhocutil/lexer-regex.h
new file mode 100644
index 0000000..2397071
--- /dev/null
+++ b/libadhocutil/lexer-regex.h
@@ -0,0 +1,15 @@
+#ifndef ADHOCUTIL_LEXER_REGEX_H
+#define ADHOCUTIL_LEXER_REGEX_H
+
+#include "lexer.h"
+
+namespace AdHoc {
+    namespace LexerMatchers {
+        /// Create a Lexer pattern from a regular expression and optional GLib
+        /// compile/match flags; both flag sets default to 0.
+        DLL_PUBLIC Lexer::PatternPtr regex(const Glib::ustring &, GRegexCompileFlags compile = static_cast<GRegexCompileFlags>(0), GRegexMatchFlags match = static_cast<GRegexMatchFlags>(0));
+    }
+}
+
+#endif
+
diff --git a/libadhocutil/lexer.cpp b/libadhocutil/lexer.cpp
new file mode 100644
index 0000000..95e079b
--- /dev/null
+++ b/libadhocutil/lexer.cpp
@@ -0,0 +1,104 @@
+#include "lexer.h"
+
+namespace AdHoc {
+    const Lexer::State Lexer::InitialState = "";
+
+    Lexer::Lexer()
+    {
+    }
+
+    Lexer::Lexer(const Rules & r) : rules(r)
+    {
+    }
+
+    /// Run the rules over the first length bytes of string.
+    ///
+    /// At each position the first rule, in order, that lists the current state
+    /// and whose pattern matches is selected; its handler is called and the
+    /// position then advances by the matched length.
+    /// Throws std::runtime_error when no rule matches at some position.
+    void
+    Lexer::extract(const gchar * string, size_t length) const
+    {
+        ExecuteState es;
+        while (es.position < length) {
+            const Rule * selected = nullptr;
+            for (const auto & r : rules) {
+                const auto & s = boost::get<0>(r);
+                if (s.find(es.getState()) == s.end()) {
+                    continue;
+                }
+                const auto & p = boost::get<1>(r);
+                if (p->matches(string, length, es.position)) {
+                    selected = &r;
+                    break;
+                }
+            }
+            if (!selected) {
+                // The input is delimited by length, so bound the excerpt by
+                // it instead of relying on a NUL terminator.
+                throw std::runtime_error("Unexpected input in state (" + es.getState() + ") at "
+                        + std::string(string + es.position, length - es.position));
+            }
+            es.pattern = boost::get<1>(*selected);
+            const auto & h = boost::get<2>(*selected);
+            h(&es);
+            es.position += es.pattern->matchedLength();
+        }
+    }
+
+    /// Start at position 0 with InitialState as the only stacked state.
+    Lexer::ExecuteState::ExecuteState() :
+        position(0)
+    {
+        stateStack.push_back(InitialState);
+    }
+
+    /// Replace the current (top) state.
+    /// Throws std::runtime_error if every state has been popped.
+    void
+    Lexer::ExecuteState::setState(const State & s)
+    {
+        if (stateStack.empty()) {
+            throw std::runtime_error("Lexer state stack is empty");
+        }
+        stateStack.back() = s;
+    }
+
+    /// Enter a nested state.
+    void
+    Lexer::ExecuteState::pushState(const State & s)
+    {
+        stateStack.push_back(s);
+    }
+
+    /// Leave the current state, returning to the one below it.
+    /// Throws std::runtime_error if every state has been popped.
+    void
+    Lexer::ExecuteState::popState()
+    {
+        if (stateStack.empty()) {
+            throw std::runtime_error("Lexer state stack is empty");
+        }
+        stateStack.pop_back();
+    }
+
+    /// The current (top) state.
+    /// Throws std::runtime_error if every state has been popped.
+    const Lexer::State &
+    Lexer::ExecuteState::getState() const
+    {
+        if (stateStack.empty()) {
+            throw std::runtime_error("Lexer state stack is empty");
+        }
+        return stateStack.back();
+    }
+
+    /// Number of states currently stacked.
+    size_t
+    Lexer::ExecuteState::depth() const
+    {
+        return stateStack.size();
+    }
+}
+
diff 
--git a/libadhocutil/lexer.h b/libadhocutil/lexer.h
new file mode 100644
index 0000000..fdc4507
--- /dev/null
+++ b/libadhocutil/lexer.h
@@ -0,0 +1,69 @@
+#ifndef ADHOCUTIL_LEXER_H
+#define ADHOCUTIL_LEXER_H
+
+#include <vector>
+#include <set>
+#include <glibmm/ustring.h>
+#include <boost/tuple/tuple.hpp>
+#include <boost/function.hpp>
+#include <boost/optional.hpp>
+#include <boost/shared_ptr.hpp>
+#include "visibility.h"
+
+namespace AdHoc {
+    /// Rule-driven lexer: each rule pairs a set of states and a pattern with
+    /// a handler that receives the execution state.
+    class DLL_PUBLIC Lexer {
+        public:
+            /// Interface for something that can be matched against the input.
+            class Pattern {
+                public:
+                    virtual ~Pattern() = default;
+
+                    /// Test the input (buffer, length, position).
+                    virtual bool matches(const gchar *, size_t, size_t) const = 0;
+                    /// Length of the latest match.
+                    virtual size_t matchedLength() const = 0;
+                    /// Text of the numbered group of the latest match, if any.
+                    virtual boost::optional<Glib::ustring> match(int) const = 0;
+            };
+            typedef boost::shared_ptr<Pattern> PatternPtr;
+
+            typedef std::string State;
+            typedef std::set<State> States;
+
+            /// Position, selected pattern and state stack of one extraction,
+            /// passed to every handler.
+            class ExecuteState {
+                public:
+                    ExecuteState();
+
+                    void pushState(const State &);
+                    void popState();
+                    void setState(const State &);
+                    const State & getState() const;
+                    size_t depth() const;
+
+                    size_t position;
+                    PatternPtr pattern;
+
+                private:
+                    std::vector<State> stateStack;
+            };
+
+            typedef boost::function<void(ExecuteState *)> Handler;
+            typedef boost::tuple<States, PatternPtr, Handler> Rule;
+            typedef std::vector<Rule> Rules;
+
+            static const State InitialState;
+            Lexer();
+            Lexer(const Rules &);
+
+            Rules rules;
+
+            void extract(const gchar * string, size_t length) const;
+    };
+}
+
+#endif
+
diff --git a/libadhocutil/unittests/Jamfile.jam b/libadhocutil/unittests/Jamfile.jam
index 36250dc..b27f1d6 100644
--- a/libadhocutil/unittests/Jamfile.jam
+++ b/libadhocutil/unittests/Jamfile.jam
@@ -249,3 +249,14 @@ run
     <library>boost_filesystem
     ;
 
+run
+    testLexer.cpp
+    : : :
+    <define>BOOST_TEST_DYN_LINK
+    <library>..//adhocutil
+    <library>boost_utf
+    <define>ROOT=\"$(me)\"
+    <library>boost_system
+    <library>boost_filesystem
+    ;
+
diff --git a/libadhocutil/unittests/testLexer.cpp b/libadhocutil/unittests/testLexer.cpp
new file mode 100644
index 0000000..c0b973b
--- /dev/null
+++ b/libadhocutil/unittests/testLexer.cpp
@@ -0,0 +1,84 @@
+#define BOOST_TEST_MODULE Lexer
+#include <boost/test/unit_test.hpp>
+
+#include <lexer.h>
+#include <lexer-regex.h>
+
+using namespace AdHoc;
+using namespace AdHoc::LexerMatchers;
+
+// Rules can be appended to a default-constructed lexer.
+BOOST_AUTO_TEST_CASE( defaultConstructor )
+{
+    AdHoc::Lexer l;
+    l.rules.push_back({ { AdHoc::Lexer::InitialState }, regex("a"), [](auto) { } });
+}
+
+// One rule consumes every 'a'; input that no rule matches makes extract() throw.
+BOOST_AUTO_TEST_CASE( simple )
+{
+    int m = 0;
+    AdHoc::Lexer l({
+        { { AdHoc::Lexer::InitialState }, regex("a"), [&](auto) { m += 1; } }
+    });
+    BOOST_REQUIRE_EQUAL(0, m);
+    l.extract("aaaa", 4);
+    BOOST_REQUIRE_EQUAL(4, m);
+    BOOST_REQUIRE_THROW({
+        l.extract("abcd", 4);
+    }, std::runtime_error);
+}
+
+// Handlers drive the state stack: "" -> "2" -> "3"; the last two 'a's are handled in "3".
+BOOST_AUTO_TEST_CASE( state )
+{
+    int m = 0;
+    std::string s;
+    AdHoc::Lexer l({
+        { { AdHoc::Lexer::InitialState }, regex("a"), [&](auto es)
+            {
+                m += 1;
+                BOOST_REQUIRE_EQUAL(1, es->depth());
+                es->pushState("2");
+                BOOST_REQUIRE_EQUAL(2, es->depth());
+            } },
+        { { "2" }, regex("a"), [&](auto es)
+            {
+                m += 2;
+                BOOST_REQUIRE_EQUAL("2", es->getState());
+                BOOST_REQUIRE_EQUAL(2, es->depth());
+                es->pushState("3");
+                BOOST_REQUIRE_EQUAL("3", es->getState());
+                BOOST_REQUIRE_EQUAL(3, es->depth());
+            } },
+        { { "3" }, regex("a"), [&](auto es)
+            {
+                m += 3;
+                s += *es->pattern->match(0);
+                BOOST_REQUIRE_EQUAL(3, es->depth());
+                es->setState("4");
+                BOOST_REQUIRE_EQUAL(3, es->depth());
+                BOOST_REQUIRE_EQUAL("4", es->getState());
+                BOOST_REQUIRE_EQUAL(3, es->depth());
+                BOOST_REQUIRE(!es->pattern->match(1));
+                BOOST_REQUIRE(!es->pattern->match(2));
+                es->popState();
+                BOOST_REQUIRE_EQUAL(2, es->depth());
+                BOOST_REQUIRE_EQUAL("2", es->getState());
+                es->pushState("3");
+                BOOST_REQUIRE_EQUAL(3, es->depth());
+                BOOST_REQUIRE_EQUAL("3", es->getState());
+            } }
+    });
+    BOOST_REQUIRE_EQUAL(0, m);
+    l.extract("aaaa", 4);
+    BOOST_REQUIRE_EQUAL(9, m);
+    BOOST_REQUIRE_EQUAL("aa", s);
+}
+
+// An invalid pattern is rejected when the regex is created.
+BOOST_AUTO_TEST_CASE( badre )
+{
+    BOOST_REQUIRE_THROW(regex("["), std::runtime_error);
+}
+
-- cgit v1.2.3