From 69b8f140b414712379761cc2f8ed5114b229dba3 Mon Sep 17 00:00:00 2001
From: Dan Goodliffe <dan@randomdan.homeip.net>
Date: Mon, 2 May 2016 19:47:29 +0100
Subject: Add a flexible extensible lexer

---
 gentoobrowse-api/service/utils/lexer.cpp | 137 +++++++++++++++++++++++++++++++
 gentoobrowse-api/service/utils/lexer.h   |  62 ++++++++++++++
 2 files changed, 199 insertions(+)
 create mode 100644 gentoobrowse-api/service/utils/lexer.cpp
 create mode 100644 gentoobrowse-api/service/utils/lexer.h

diff --git a/gentoobrowse-api/service/utils/lexer.cpp b/gentoobrowse-api/service/utils/lexer.cpp
new file mode 100644
index 0000000..8ed83b8
--- /dev/null
+++ b/gentoobrowse-api/service/utils/lexer.cpp
@@ -0,0 +1,137 @@
+#include "lexer.h"
+
+namespace Gentoo {
+	namespace Utils {
+		const Lexer::State Lexer::InitialState = ""; // the state a freshly constructed ExecuteState starts in
+
+		// Lexer::Pattern implemented with a GLib GRegex. The latest match is cached in mutable members.
+		class Regex : public Lexer::Pattern {
+			public:
+				// Compiles pattern with the given flags; throws std::runtime_error if g_regex_new fails.
+				Regex(const Glib::ustring & pattern, GRegexCompileFlags compile, GRegexMatchFlags match) :
+					err(nullptr),
+					regex(g_regex_new(pattern.c_str(), compile, match, &err)),
+					info(nullptr),
+					str(nullptr)
+				{
+					if (!regex) {
+						std::runtime_error e(std::string("Failed to create GRegex: ") + err->message);
+						g_error_free(err);
+						throw e;
+					}
+				}
+
+				~Regex()
+				{
+					g_clear_error(&err); // no-op when err is null
+					g_match_info_free(info); // accepts null
+					g_regex_unref(regex);
+				}
+
+				// Tries an anchored match at byte offset position; throws std::runtime_error if GLib reports an error.
+				bool matches(const gchar * string, size_t length, size_t position) const override
+				{
+					// Drop the previous result before it is replaced (g_match_info_free accepts null).
+					g_match_info_free(info);
+					info = nullptr;
+					g_regex_match_full(regex, string, length, position, G_REGEX_MATCH_ANCHORED, &info, &err);
+					if (err) {
+						std::runtime_error e(std::string("Failed to execute regex: ") + err->message);
+						g_clear_error(&err); // frees and nulls err, so later calls and the destructor stay valid
+						throw e;
+					}
+					str = string;
+					return g_match_info_matches(info);
+				}
+
+				// Length in bytes of the whole of the last match; 0 when there is no match to measure.
+				size_t matchedLength() const override
+				{
+					gint start = 0, end = 0;
+					g_match_info_fetch_pos(info, 0, &start, &end);
+					return end - start;
+				}
+
+				// Text of capture group n from the last match, or an empty optional if the group is absent or unset.
+				boost::optional<Glib::ustring> match(int n) const override
+				{
+					gint start = -1, end = -1;
+					if (!g_match_info_fetch_pos(info, n, &start, &end) || (start == -1 && end == -1)) {
+						return boost::optional<Glib::ustring>();
+					}
+					// fetch_pos yields byte offsets, whereas ustring(const char *, n) counts characters: copy bytes.
+					return Glib::ustring(std::string(str + start, str + end));
+				}
+
+			private:
+				mutable GError * err;
+				GRegex * regex;
+				mutable GMatchInfo * info;
+				mutable const gchar * str; // subject of the last matches() call (borrowed, not owned)
+		};
+
+		// Factory: builds a Regex from the pattern and flags and returns it as a shared Pattern.
+		Lexer::PatternPtr Lexer::regex(const Glib::ustring & pattern, GRegexCompileFlags compile, GRegexMatchFlags match)
+		{
+			return PatternPtr(new Regex(pattern, compile, match));
+		}
+
+		// Run the rules over string[0, length). At each offset the first rule (in order) that lists the current
+		// state and whose pattern matches there is selected; its handler is called and the offset then advances
+		// by the matched length. Throws std::runtime_error when no rule applies at the current offset.
+		void Lexer::extract(const gchar * string, size_t length) const
+		{
+			ExecuteState es;
+			while (es.position < length) {
+				const Rule * selected = nullptr;
+				for (const auto & rule : rules) {
+					const auto & states = boost::get<0>(rule);
+					if (states.count(es.getState()) && boost::get<1>(rule)->matches(string, length, es.position)) {
+						selected = &rule;
+						break;
+					}
+				}
+				if (!selected) {
+					// Only length bytes are known to be readable, so bound the excerpt rather than rely on a NUL.
+					const std::string rest(string + es.position, length - es.position);
+					throw std::runtime_error("Unexpected input at " + rest);
+				}
+				es.pattern = boost::get<1>(*selected);
+				// A zero-length match leaves the offset unchanged, so its handler must change state to progress.
+				const auto & handler = boost::get<2>(*selected);
+				handler(&es);
+				es.position += es.pattern->matchedLength();
+			}
+		}
+
+		Lexer::ExecuteState::ExecuteState() :
+			position(0), stateStack(1, InitialState)
+		{
+			// Start at offset 0 with InitialState as the only entry on the state stack.
+		}
+
+		// Overwrite the state on top of the stack; the stack depth does not change.
+		void Lexer::ExecuteState::setState(const State & replacement)
+		{
+			stateStack.back() = replacement;
+		}
+
+		// Enter a nested state by pushing it above the current one.
+		void Lexer::ExecuteState::pushState(const State & nested)
+		{
+			stateStack.push_back(nested);
+		}
+
+		// Remove the top state, exposing the one beneath it; no check is made for an empty stack.
+		void Lexer::ExecuteState::popState()
+		{
+			stateStack.pop_back();
+		}
+
+		// The state currently on top of the stack.
+		const Lexer::State & Lexer::ExecuteState::getState() const
+		{
+			return stateStack.back();
+		}
+	}
+}
diff --git a/gentoobrowse-api/service/utils/lexer.h b/gentoobrowse-api/service/utils/lexer.h
new file mode 100644
index 0000000..44d3d57
--- /dev/null
+++ b/gentoobrowse-api/service/utils/lexer.h
@@ -0,0 +1,62 @@
+#ifndef GENTOOBROWSE_SERVICE_UTILS_LEXER_H
+#define GENTOOBROWSE_SERVICE_UTILS_LEXER_H
+
+#include <vector>
+#include <glibmm/ustring.h>
+#include <set>
+#include <boost/tuple/tuple.hpp>
+#include <boost/function.hpp>
+#include <boost/shared_ptr.hpp>
+#include <boost/optional.hpp>
+
+namespace Gentoo {
+	namespace Utils {
+		// Rule-driven lexer: a rule is (states it is active in, pattern to try, handler to call on a match).
+		class Lexer {
+			public:
+				// Interface for something that can be tried against the input at a given offset.
+				class Pattern {
+					public:
+						virtual ~Pattern() = default;
+						// Arguments: input buffer, its length, offset at which to attempt the match.
+						virtual bool matches(const gchar *, size_t, size_t) const = 0;
+						virtual size_t matchedLength() const = 0;
+						virtual boost::optional<Glib::ustring> match(int) const = 0;
+				};
+				using PatternPtr = boost::shared_ptr<Pattern>;
+
+				using State = std::string;
+				using States = std::set<State>;
+				// Cursor passed to handlers by extract(): input offset, selected pattern and a stack of states.
+				class ExecuteState {
+					public:
+						ExecuteState();
+
+						void pushState(const State &);
+						void popState();
+						void setState(const State &);
+						const State & getState() const;
+
+						size_t position;
+						PatternPtr pattern;
+
+					private:
+						std::vector<State> stateStack;
+				};
+
+				using Handler = boost::function<void(ExecuteState *)>;
+				using Rule = boost::tuple<States, PatternPtr, Handler>;
+				using Rules = std::vector<Rule>;
+
+				static const State InitialState;
+				Rules rules;
+
+				static PatternPtr regex(const Glib::ustring &, GRegexCompileFlags compile = static_cast<GRegexCompileFlags>(0), GRegexMatchFlags match = static_cast<GRegexMatchFlags>(0));
+				void extract(const gchar * string, size_t length) const;
+		};
+
+	}
+}
+
+#endif
+
-- 
cgit v1.2.3