From b2416925f8845b70ed25fb4ec7cde8ef11e8c239 Mon Sep 17 00:00:00 2001 From: Dan Goodliffe Date: Thu, 21 Aug 2025 20:39:52 +0100 Subject: Initial commit; basic Apache log parsing --- .gitignore | 1 + .gitmodules | 3 + Jamroot.jam | 29 ++++++++ src/Jamfile.jam | 7 ++ src/ingestor.cpp | 40 +++++++++++ src/ingestor.hpp | 23 ++++++ src/logTypes.cpp | 96 +++++++++++++++++++++++++ src/logTypes.hpp | 42 +++++++++++ src/webstat_logger_main.cpp | 7 ++ test/Jamfile.jam | 14 ++++ test/test-ingest.cpp | 166 ++++++++++++++++++++++++++++++++++++++++++++ thirdparty/Jamfile.jam | 17 +++++ thirdparty/scnlib | 1 + 13 files changed, 446 insertions(+) create mode 100644 .gitignore create mode 100644 .gitmodules create mode 100644 Jamroot.jam create mode 100644 src/Jamfile.jam create mode 100644 src/ingestor.cpp create mode 100644 src/ingestor.hpp create mode 100644 src/logTypes.cpp create mode 100644 src/logTypes.hpp create mode 100644 src/webstat_logger_main.cpp create mode 100644 test/Jamfile.jam create mode 100644 test/test-ingest.cpp create mode 100644 thirdparty/Jamfile.jam create mode 160000 thirdparty/scnlib diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..ba077a4 --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +bin diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000..66abcf4 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "thirdparty/scnlib"] + path = thirdparty/scnlib + url = https://github.com/eliaskosunen/scnlib diff --git a/Jamroot.jam b/Jamroot.jam new file mode 100644 index 0000000..c2f41d7 --- /dev/null +++ b/Jamroot.jam @@ -0,0 +1,29 @@ +import testing ; + +build-project src ; +build-project test ; + +project webstat : requirements + 26 + hidden + static + gcc,debug:pedantic + clang,debug:extra + clang:-Wno-c23-extensions + debug:on + debug:-Wnon-virtual-dtor + debug:-Wcast-align + debug:-Wunused + debug:-Woverloaded-virtual + debug:-Wconversion + debug:-Wsign-conversion + debug:-Wnull-dereference + debug:-Wdouble-promotion + debug:-Wformat=2 + gcc,debug:-Wold-style-cast + gcc,debug:-Wduplicated-cond + gcc,debug:-Wduplicated-branches + gcc,debug:-Wlogical-op + gcc,debug:-Wuseless-cast + release:on-thin + ; diff --git a/src/Jamfile.jam b/src/Jamfile.jam new file mode 100644 index 0000000..40ec28e --- /dev/null +++ b/src/Jamfile.jam @@ -0,0 +1,7 @@ +lib webstat : ingestor.cpp logTypes.cpp : + . + ../thirdparty//scn + : : + . + ; +exe webstat_logger : webstat_logger_main.cpp : webstat ; diff --git a/src/ingestor.cpp b/src/ingestor.cpp new file mode 100644 index 0000000..5724b33 --- /dev/null +++ b/src/ingestor.cpp @@ -0,0 +1,40 @@ +#include "ingestor.hpp" +#include +#include + +namespace WebStat { + Ingestor::ScanResult + Ingestor::scanLogLine(std::string_view input) + { + return scn::scan< // Field : Apache format specifier : example + std::string_view, // virtual_host : %v : some.host.name + std::string_view, // remoteip : %a : 1.2.3.4 (or ipv6) + uint64_t, // request_time : %{usec}t : 123456790 + std::string_view, // method : %m : GET + QuotedString, // URL : "%u" : "/foo/bar" + QueryString, // query_string : "%q" : "?query=string" or "" + std::string_view, // protocol : %r : HTTPS/2.0 + unsigned short, // status : %>s : 200 + unsigned int, // size : %B : 1234 + unsigned int, // duration : %D : 1234 + CLFString, // referrer : "%{Referer}i" : "https://google.com/whatever" or "-" + CLFString // user_agent : "%{User-agent}i" : "Chromium v123.4" or "-" + >(input, R"({} {} {} {:[A-Z]} {} {} {} {} {} {} {} {})"); + } + + void + Ingestor::ingestLog(std::FILE * input) + { + while (auto line = scn::scan(input, "{:[^\n]}\n")) { + linesRead++; + if (auto result = scanLogLine(line->value())) { + linesParsed++; + std::ignore = result->values(); + } + else { + syslog(LOG_WARNING, "Discarded line: [%s]", line->value().c_str()); + linesDiscarded++; + } + } + } +} diff --git a/src/ingestor.hpp b/src/ingestor.hpp new file mode 100644 index 0000000..97ce9f9 --- /dev/null +++ b/src/ingestor.hpp @@ -0,0 +1,23 @@ +#pragma once + +#include "logTypes.hpp" +#include +#include + +namespace WebStat { + class Ingestor { + public: + using ScanResult = decltype(scn::scan(std::declval(), "")); + + [[nodiscard]] static ScanResult scanLogLine(std::string_view); + + void ingestLog(std::FILE *); + + protected: + size_t linesRead = 0; + size_t linesParsed = 0; + size_t linesDiscarded = 0; + }; +} diff --git a/src/logTypes.cpp b/src/logTypes.cpp new file mode 100644 index 0000000..42f0979 --- /dev/null +++ b/src/logTypes.cpp @@ -0,0 +1,96 @@ +#include "logTypes.hpp" + +namespace scn { + scan_expected + scanner::scan(WebStat::QuotedString & value, ContextType & ctx) + { + if (auto empty = scn::scan<>(ctx.range(), R"("")")) { + return empty->begin(); + } + + auto result = scn::scan(ctx.range(), R"("{:[^"]}")"); + if (!result) { + return unexpected(result.error()); + } + value = result->value(); + return result->begin(); + } + + scan_expected + scanner::scan(WebStat::QueryString & value, ContextType & ctx) + { + if (auto null = scn::scan<>(ctx.range(), R"("")")) { + return null->begin(); + } + + if (auto empty = scn::scan<>(ctx.range(), R"("?")")) { + value.emplace(); + return empty->begin(); + } + + auto result = scn::scan(ctx.range(), R"("?{:[^"]}")"); + if (!result) { + return unexpected(result.error()); + } + value = result->value(); + return result->begin(); + } + + scan_expected + scanner::scan(WebStat::CLFString & value, ContextType & ctx) + { + if (auto empty = scn::scan<>(ctx.range(), R"("")")) { + value.emplace(); + return empty->begin(); + } + + if (auto null = scn::scan<>(ctx.range(), R"("-")")) { + return null->begin(); + } + + auto result = scn::scan(ctx.range(), R"("{:[^"]}")"); + if (!result) { + return unexpected(result.error()); + } + value = result->value(); + decode(*value); + return result->begin(); + } + + void + scanner::decode(std::string & value) + { + static constexpr auto BS_MAP = []() { + std::array map {}; + map['f'] = '\f'; + map['n'] = '\n'; + map['r'] = '\r'; + map['t'] = '\t'; + map['v'] = '\v'; + map['"'] = '"'; + map['\\'] = '\\'; + return map; + }(); + + if (auto src = std::ranges::find(value, '\\'); src != value.end()) { + auto dest = src; + while (src != value.cend()) { + if (*src == '\\') { + const std::string_view escaped {++src, value.end()}; + if (auto chr = BS_MAP[static_cast(*src)]) { + *dest++ = chr; + src++; + } + else if (auto hex = scn::scan(escaped, R"(x{:.2x})")) { + *dest++ = static_cast(hex->value()); + src += 3; + } + } + else { + *dest++ = *src++; + } + } + value.erase(dest, value.end()); + } + } +} diff --git a/src/logTypes.hpp b/src/logTypes.hpp new file mode 100644 index 0000000..d4f1b7b --- /dev/null +++ b/src/logTypes.hpp @@ -0,0 +1,42 @@ +#pragma once + +#include +#include +#include + +namespace WebStat { + struct QuotedString : std::string { + using std::string::string; + using std::string::operator=; + }; + + struct QueryString : std::optional { + using std::optional::optional; + using std::optional::operator=; + bool operator<=>(const QueryString &) const = default; + }; + + struct CLFString : std::optional { + using std::optional::optional; + using std::optional::operator=; + bool operator<=>(const CLFString &) const = default; + }; +} + +namespace scn { + using ContextType = scn::v4::basic_scan_context; + + template<> struct scanner : scanner { + static scan_expected scan(WebStat::QuotedString & value, ContextType & ctx); + }; + + template<> struct scanner : scanner { + static scan_expected scan(WebStat::QueryString & value, ContextType & ctx); + }; + + template<> struct scanner : scanner { + static scan_expected scan(WebStat::CLFString & value, ContextType & ctx); + + static void decode(std::string &); + }; +} diff --git a/src/webstat_logger_main.cpp b/src/webstat_logger_main.cpp new file mode 100644 index 0000000..c4d31d6 --- /dev/null +++ b/src/webstat_logger_main.cpp @@ -0,0 +1,7 @@ +#include "ingestor.hpp" + +int +main(int, char **) +{ + WebStat::Ingestor {}.ingestLog(stdin); +} diff --git a/test/Jamfile.jam b/test/Jamfile.jam new file mode 100644 index 0000000..a008606 --- /dev/null +++ b/test/Jamfile.jam @@ -0,0 +1,14 @@ +lib boost_unit_test_framework : : shared ; + +path-constant src : ../src ; +path-constant test : . ; + +run test-ingest.cpp : + -- : + : + BOOST_TEST_DYN_LINK + SRC=\"$(src)\" + TEST=\"$(test)\" + $(src)//webstat + boost_unit_test_framework + ; diff --git a/test/test-ingest.cpp b/test/test-ingest.cpp new file mode 100644 index 0000000..f7eab19 --- /dev/null +++ b/test/test-ingest.cpp @@ -0,0 +1,166 @@ +#define BOOST_TEST_MODULE ingest +#include +#include + +#include + +using ScanValues = std::remove_cvref_t()->values())>; +template using ParseData = std::tuple; +template +using DeleteWith = decltype([](auto obj) { + return Deleter(obj); +}); +using FilePtr = std::unique_ptr>; + +namespace std { + template + ostream & + operator<<(ostream & strm, const std::optional & value) + { + if (value) { + strm << *value; + } + return strm; + } + + template + ostream & + operator<<(ostream & strm, const std::tuple & values) + { + return std::apply( + [&strm](auto &&... elems) -> decltype(auto) { + return ((strm << elems << '\n'), ...); + }, + values); + } +} + +BOOST_DATA_TEST_CASE(QuotedStringsGood, + boost::unit_test::data::make>({ + {R"("")", ""}, + {R"("-")", "-"}, + {R"(".")", "."}, + {R"("/url/path")", "/url/path"}, + }), + input, expected) +{ + const auto result = scn::scan(input, "{}"); + BOOST_REQUIRE(result); + BOOST_CHECK_EQUAL(result->value(), expected); +} + +BOOST_DATA_TEST_CASE(QuotedStringsBad, + boost::unit_test::data::make({ + R"()", + R"(-)", + R"(word)", + R"(/url/path)", + }), + input) +{ + BOOST_REQUIRE(!scn::scan(input, "{}")); +} + +BOOST_DATA_TEST_CASE(QueryStringsGood, + boost::unit_test::data::make>({ + {R"("")", std::nullopt}, + {R"("?")", ""}, + {R"("?something")", "something"}, + {R"("?some=thing")", "some=thing"}, + {R"("?some=thing&other=thing")", "some=thing&other=thing"}, + }), + input, expected) +{ + const auto result = scn::scan(input, "{}"); + BOOST_REQUIRE(result); + BOOST_CHECK_EQUAL(result->value(), expected); +} + +BOOST_DATA_TEST_CASE(QueryStringsBad, + boost::unit_test::data::make({ + R"()", + R"("-")", + R"(".")", + R"(-)", + R"(word)", + R"(/url/path)", + }), + input) +{ + BOOST_REQUIRE(!scn::scan(input, "{}")); +} + +BOOST_TEST_DECORATOR(*boost::unit_test::timeout(1)) + +BOOST_DATA_TEST_CASE(CLFStringsDecode, + boost::unit_test::data::make>({ + {"", ""}, + {"plain", "plain"}, + {R"(hex\x41)", "hexA"}, + {R"(hex\x4141)", "hexA41"}, + {R"(hex\x41\x41)", "hexAA"}, + {R"(hex\t\x41)", "hex\tA"}, + }), + input, expected) +{ + std::string value {input}; + scn::scanner::decode(value); + BOOST_CHECK_EQUAL(value, expected); +} + +BOOST_TEST_DECORATOR(*boost::unit_test::depends_on("CLFStringsDecode")) + +BOOST_DATA_TEST_CASE(CLFStringsGood, + boost::unit_test::data::make>({ + {R"("")", ""}, + {R"("-")", std::nullopt}, + {R"("?")", "?"}, + {R"(".")", "."}, + {R"("something")", "something"}, + {R"("https://google.com")", "https://google.com"}, + {R"("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36")", + R"(Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36)"}, + }), + input, expected) +{ + const auto result = scn::scan(input, "{}"); + BOOST_REQUIRE(result); + BOOST_CHECK_EQUAL(result->value(), expected); +} + +BOOST_DATA_TEST_CASE(CLFStringsBad, + boost::unit_test::data::make({ + R"()", + R"(-)", + R"(word)", + R"(/url/path)", + }), + input) +{ + BOOST_REQUIRE(!scn::scan(input, "{}")); +} + +BOOST_TEST_DECORATOR(*boost::unit_test::depends_on("QuotedStringsGood")) +BOOST_TEST_DECORATOR(*boost::unit_test::depends_on("QueryStringsGood")) +BOOST_TEST_DECORATOR(*boost::unit_test::depends_on("CLFStringsGood")) + +BOOST_DATA_TEST_CASE(ExtractFields, + boost::unit_test::data::make>({ + {R"LOG(git.randomdan.homeip.net 98.82.40.168 1755561576768318 GET "/repo/gentoobrowse-api/commit/gentoobrowse-api/unittests/fixtures/756569aa764177340726dd3d40b41d89b11b20c7/app-crypt/pdfcrack/Manifest" "?h=gentoobrowse-api-0.9.1&id=a2ed3fd30333721accd4b697bfcb6cc4165c7714" HTTP/1.1 200 1884 107791 "-" "Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; Amazonbot/0.1; +https://developer.amazon.com/support/amazonbot) Chrome/119.0.6045.214 Safari/537.36")LOG", + {"git.randomdan.homeip.net", "98.82.40.168", 1755561576768318, "GET", + R"(/repo/gentoobrowse-api/commit/gentoobrowse-api/unittests/fixtures/756569aa764177340726dd3d40b41d89b11b20c7/app-crypt/pdfcrack/Manifest)", + R"(h=gentoobrowse-api-0.9.1&id=a2ed3fd30333721accd4b697bfcb6cc4165c7714)", "HTTP/1.1", + 200, 1884, 107791, std::nullopt, + R"(Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; Amazonbot/0.1; +https://developer.amazon.com/support/amazonbot) Chrome/119.0.6045.214 Safari/537.36)"}}, + {R"LOG(www.randomdan.homeip.net 43.128.84.166 1755561575973204 GET "/app-dicts/myspell-et/Manifest" "" HTTP/1.1 200 312 10369 "https://google.com" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36")LOG", + {"www.randomdan.homeip.net", "43.128.84.166", 1755561575973204, "GET", + "/app-dicts/myspell-et/Manifest", std::nullopt, "HTTP/1.1", 200, 312, 10369, + "https://google.com", + R"(Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36)"}}, + }), + input, expected) +{ + const auto result = WebStat::Ingestor::scanLogLine(input); + BOOST_REQUIRE(result); + BOOST_CHECK_EQUAL(result->values(), expected); +} diff --git a/thirdparty/Jamfile.jam b/thirdparty/Jamfile.jam new file mode 100644 index 0000000..95481b6 --- /dev/null +++ b/thirdparty/Jamfile.jam @@ -0,0 +1,17 @@ +path-constant inc : scnlib/include ; + +lib scn : + scnlib/src/scn/impl.cpp + : + scnlib/src + SCN_DISABLE_FAST_FLOAT + SCN_DISABLE_REGEX + off + -debug\:on + off + -isystem\ $(inc) + : : + SCN_DISABLE_FAST_FLOAT + SCN_DISABLE_REGEX + -isystem\ $(inc) + ; diff --git a/thirdparty/scnlib b/thirdparty/scnlib new file mode 160000 index 0000000..e937be1 --- /dev/null +++ b/thirdparty/scnlib @@ -0,0 +1 @@ +Subproject commit e937be1a52588621b406d58ce8614f96bb5de747 -- cgit v1.2.3