diff options
author | Dan Goodliffe <dan@randomdan.homeip.net> | 2025-08-21 20:39:52 +0100 |
---|---|---|
committer | Dan Goodliffe <dan@randomdan.homeip.net> | 2025-08-25 16:00:59 +0100 |
commit | b2416925f8845b70ed25fb4ec7cde8ef11e8c239 (patch) | |
tree | 9ed898937ddceca6bcf0e2a6d6dfda3754dceefe | |
download | webstat-b2416925f8845b70ed25fb4ec7cde8ef11e8c239.tar.bz2 webstat-b2416925f8845b70ed25fb4ec7cde8ef11e8c239.tar.xz webstat-b2416925f8845b70ed25fb4ec7cde8ef11e8c239.zip |
Initial commit; basic Apache log parsing
-rw-r--r-- | .gitignore | 1 | ||||
-rw-r--r-- | .gitmodules | 3 | ||||
-rw-r--r-- | Jamroot.jam | 29 | ||||
-rw-r--r-- | src/Jamfile.jam | 7 | ||||
-rw-r--r-- | src/ingestor.cpp | 40 | ||||
-rw-r--r-- | src/ingestor.hpp | 23 | ||||
-rw-r--r-- | src/logTypes.cpp | 108 | ||||
-rw-r--r-- | src/logTypes.hpp | 42 | ||||
-rw-r--r-- | src/webstat_logger_main.cpp | 7 | ||||
-rw-r--r-- | test/Jamfile.jam | 14 | ||||
-rw-r--r-- | test/test-ingest.cpp | 170 | ||||
-rw-r--r-- | thirdparty/Jamfile.jam | 17 | ||||
m--------- | thirdparty/scnlib | 0 |
13 files changed, 461 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..ba077a4
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+bin
diff --git a/.gitmodules b/.gitmodules
new file mode 100644
index 0000000..66abcf4
--- /dev/null
+++ b/.gitmodules
@@ -0,0 +1,3 @@
+[submodule "thirdparty/scnlib"]
+    path = thirdparty/scnlib
+    url = https://github.com/eliaskosunen/scnlib
diff --git a/Jamroot.jam b/Jamroot.jam
new file mode 100644
index 0000000..c2f41d7
--- /dev/null
+++ b/Jamroot.jam
@@ -0,0 +1,29 @@
+import testing ;
+
+build-project src ;
+build-project test ;
+
+project webstat : requirements
+    <cxxstd>26
+    <visibility>hidden
+    <link>static
+    <toolset>gcc,<variant>debug:<warnings>pedantic
+    <toolset>clang,<variant>debug:<warnings>extra
+    <toolset>clang:<cflags>-Wno-c23-extensions
+    <variant>debug:<warnings-as-errors>on
+    <variant>debug:<cflags>-Wnon-virtual-dtor
+    <variant>debug:<cflags>-Wcast-align
+    <variant>debug:<cflags>-Wunused
+    <variant>debug:<cflags>-Woverloaded-virtual
+    <variant>debug:<cflags>-Wconversion
+    <variant>debug:<cflags>-Wsign-conversion
+    <variant>debug:<cflags>-Wnull-dereference
+    <variant>debug:<cflags>-Wdouble-promotion
+    <variant>debug:<cflags>-Wformat=2
+    <toolset>gcc,<variant>debug:<cflags>-Wold-style-cast
+    <toolset>gcc,<variant>debug:<cflags>-Wduplicated-cond
+    <toolset>gcc,<variant>debug:<cflags>-Wduplicated-branches
+    <toolset>gcc,<variant>debug:<cflags>-Wlogical-op
+    <toolset>gcc,<variant>debug:<cflags>-Wuseless-cast
+    <variant>release:<lto>on-thin
+    ;
diff --git a/src/Jamfile.jam b/src/Jamfile.jam
new file mode 100644
index 0000000..40ec28e
--- /dev/null
+++ b/src/Jamfile.jam
@@ -0,0 +1,7 @@
+lib webstat : ingestor.cpp logTypes.cpp :
+    <include>.
+    <library>../thirdparty//scn
+    : :
+    <include>.
+    ;
+exe webstat_logger : webstat_logger_main.cpp : <library>webstat ;
diff --git a/src/ingestor.cpp b/src/ingestor.cpp
new file mode 100644
index 0000000..5724b33
--- /dev/null
+++ b/src/ingestor.cpp
@@ -0,0 +1,40 @@
+#include "ingestor.hpp"
+#include <scn/scan.h>
+#include <syslog.h>
+
+namespace WebStat {
+    Ingestor::ScanResult
+    Ingestor::scanLogLine(std::string_view input)
+    {
+        return scn::scan< // Field : Apache format specifier : example
+                std::string_view, // virtual_host : %v : some.host.name
+                std::string_view, // remoteip : %a : 1.2.3.4 (or ipv6)
+                uint64_t, // request_time : %{usec}t : 123456790
+                std::string_view, // method : %m : GET
+                QuotedString, // URL : "%u" : "/foo/bar"
+                QueryString, // query_string : "%q" : "?query=string" or ""
+                std::string_view, // protocol : %r : HTTPS/2.0
+                unsigned short, // status : %>s : 200
+                unsigned int, // size : %B : 1234
+                unsigned int, // duration : %D : 1234
+                CLFString, // referrer : "%{Referer}i" : "https://google.com/whatever" or "-"
+                CLFString // user_agent : "%{User-agent}i" : "Chromium v123.4" or "-"
+                >(input, R"({} {} {} {:[A-Z]} {} {} {} {} {} {} {} {})");
+    }
+
+    void
+    Ingestor::ingestLog(std::FILE * input)
+    {
+        while (auto line = scn::scan<std::string>(input, "{:[^\n]}\n")) {
+            linesRead++;
+            if (auto result = scanLogLine(line->value())) {
+                linesParsed++;
+                std::ignore = result->values();
+            }
+            else {
+                syslog(LOG_WARNING, "Discarded line: [%s]", line->value().c_str());
+                linesDiscarded++;
+            }
+        }
+    }
+}
diff --git a/src/ingestor.hpp b/src/ingestor.hpp
new file mode 100644
index 0000000..97ce9f9
--- /dev/null
+++ b/src/ingestor.hpp
@@ -0,0 +1,23 @@
+#pragma once
+
+#include "logTypes.hpp"
+#include <cstdio>
+#include <scn/scan.h>
+
+namespace WebStat {
+    class Ingestor {
+    public:
+        using ScanResult = decltype(scn::scan<std::string_view, std::string_view, uint64_t, std::string_view,
+                QuotedString, QueryString, std::string_view, unsigned short, unsigned int, unsigned int, CLFString,
+                CLFString>(std::declval<std::string_view>(), ""));
+
+        [[nodiscard]] static ScanResult scanLogLine(std::string_view);
+
+        void ingestLog(std::FILE *);
+
+    protected:
+        size_t linesRead = 0;
+        size_t linesParsed = 0;
+        size_t linesDiscarded = 0;
+    };
+}
diff --git a/src/logTypes.cpp b/src/logTypes.cpp
new file mode 100644
index 0000000..42f0979
--- /dev/null
+++ b/src/logTypes.cpp
@@ -0,0 +1,108 @@
+#include "logTypes.hpp"
+
+namespace scn {
+    scan_expected<typename ContextType::iterator>
+    scanner<WebStat::QuotedString>::scan(WebStat::QuotedString & value, ContextType & ctx)
+    {
+        if (auto empty = scn::scan<>(ctx.range(), R"("")")) {
+            return empty->begin();
+        }
+
+        auto result = scn::scan<std::string>(ctx.range(), R"("{:[^"]}")");
+        if (!result) {
+            return unexpected(result.error());
+        }
+        value = result->value();
+        return result->begin();
+    }
+
+    scan_expected<typename ContextType::iterator>
+    scanner<WebStat::QueryString>::scan(WebStat::QueryString & value, ContextType & ctx)
+    {
+        if (auto null = scn::scan<>(ctx.range(), R"("")")) {
+            return null->begin();
+        }
+
+        if (auto empty = scn::scan<>(ctx.range(), R"("?")")) {
+            value.emplace();
+            return empty->begin();
+        }
+
+        auto result = scn::scan<std::string>(ctx.range(), R"("?{:[^"]}")");
+        if (!result) {
+            return unexpected(result.error());
+        }
+        value = result->value();
+        return result->begin();
+    }
+
+    scan_expected<typename ContextType::iterator>
+    scanner<WebStat::CLFString>::scan(WebStat::CLFString & value, ContextType & ctx)
+    {
+        if (auto empty = scn::scan<>(ctx.range(), R"("")")) {
+            value.emplace();
+            return empty->begin();
+        }
+
+        if (auto null = scn::scan<>(ctx.range(), R"("-")")) {
+            return null->begin();
+        }
+
+        auto result = scn::scan<std::string>(ctx.range(), R"("{:[^"]}")");
+        if (!result) {
+            return unexpected(result.error());
+        }
+        value = result->value();
+        decode(*value);
+        return result->begin();
+    }
+
+    /// Decode the backslash escapes \f \n \r \t \v \" \\ and \xhh of a log string in place.
+    ///
+    /// Unrecognised escapes lose their backslash and keep the following character; a
+    /// lone trailing backslash is dropped. Strings without a backslash are left untouched.
+    void
+    scanner<WebStat::CLFString>::decode(std::string & value)
+    {
+        static constexpr auto BS_MAP = []() {
+            std::array<char, 128> map {};
+            map['f'] = '\f';
+            map['n'] = '\n';
+            map['r'] = '\r';
+            map['t'] = '\t';
+            map['v'] = '\v';
+            map['"'] = '"';
+            map['\\'] = '\\';
+            return map;
+        }();
+
+        if (auto src = std::ranges::find(value, '\\'); src != value.end()) {
+            // Compact in place: dest trails src, so decoded output never overtakes the input.
+            auto dest = src;
+            while (src != value.cend()) {
+                if (*src != '\\') {
+                    *dest++ = *src++;
+                    continue;
+                }
+                if (++src == value.end()) {
+                    // Lone trailing backslash (e.g. the scanner stopped at an escaped quote):
+                    // there is no escape character left to read.
+                    break;
+                }
+                const std::string_view escaped {src, value.end()};
+                // BS_MAP only covers 7-bit characters; bytes >= 128 must not index it.
+                const auto key = static_cast<unsigned char>(*src);
+                if (const auto chr = key < BS_MAP.size() ? BS_MAP[key] : '\0') {
+                    *dest++ = chr;
+                    ++src;
+                }
+                else if (auto hex = scn::scan<unsigned char>(escaped, R"(x{:.2x})")) {
+                    *dest++ = static_cast<char>(hex->value());
+                    // Advance by what was actually consumed: "\x4" has only one hex digit.
+                    src += hex->begin() - escaped.begin();
+                }
+            }
+            value.erase(dest, value.end());
+        }
+    }
+}
diff --git a/src/logTypes.hpp b/src/logTypes.hpp
new file mode 100644
index 0000000..d4f1b7b
--- /dev/null
+++ b/src/logTypes.hpp
@@ -0,0 +1,42 @@
+#pragma once
+
+#include <optional>
+#include <scn/scan.h>
+#include <string>
+
+namespace WebStat {
+    struct QuotedString : std::string {
+        using std::string::string;
+        using std::string::operator=;
+    };
+
+    struct QueryString : std::optional<std::string> {
+        using std::optional<std::string>::optional;
+        using std::optional<std::string>::operator=;
+        bool operator<=>(const QueryString &) const = default;
+    };
+
+    struct CLFString : std::optional<std::string> {
+        using std::optional<std::string>::optional;
+        using std::optional<std::string>::operator=;
+        bool operator<=>(const CLFString &) const = default;
+    };
+}
+
+namespace scn {
+    using ContextType = scn::v4::basic_scan_context<scn::v4::detail::buffer_range_tag, char>;
+
+    template<> struct scanner<WebStat::QuotedString> : scanner<std::string, char> {
+        static scan_expected<typename ContextType::iterator> scan(WebStat::QuotedString & value, ContextType & ctx);
+    };
+
+    template<> struct scanner<WebStat::QueryString> : scanner<std::string, char> {
+        static scan_expected<typename ContextType::iterator> scan(WebStat::QueryString & value, ContextType & ctx);
+    };
+
+    template<> struct scanner<WebStat::CLFString> : scanner<std::string, char> {
+        static scan_expected<typename ContextType::iterator> scan(WebStat::CLFString & value, ContextType & ctx);
+
+        static void decode(std::string &);
+    };
+}
diff --git a/src/webstat_logger_main.cpp b/src/webstat_logger_main.cpp
new file mode 100644
index 0000000..c4d31d6
--- /dev/null
+++ b/src/webstat_logger_main.cpp
@@ -0,0 +1,7 @@
+#include "ingestor.hpp"
+
+int
+main(int, char **)
+{
+    WebStat::Ingestor {}.ingestLog(stdin);
+}
diff --git a/test/Jamfile.jam b/test/Jamfile.jam
new file mode 100644
index 0000000..a008606
--- /dev/null
+++ b/test/Jamfile.jam
@@ -0,0 +1,14 @@
+lib boost_unit_test_framework : : <link>shared ;
+
+path-constant src : ../src ;
+path-constant test : . ;
+
+run test-ingest.cpp :
+    -- :
+    :
+    <define>BOOST_TEST_DYN_LINK
+    <define>SRC=\"$(src)\"
+    <define>TEST=\"$(test)\"
+    <library>$(src)//webstat
+    <library>boost_unit_test_framework
+    ;
diff --git a/test/test-ingest.cpp b/test/test-ingest.cpp
new file mode 100644
index 0000000..f7eab19
--- /dev/null
+++ b/test/test-ingest.cpp
@@ -0,0 +1,170 @@
+#define BOOST_TEST_MODULE ingest
+#include <boost/test/data/test_case.hpp>
+#include <boost/test/unit_test.hpp>
+
+#include <ingestor.hpp>
+
+using ScanValues = std::remove_cvref_t<decltype(std::declval<WebStat::Ingestor::ScanResult>()->values())>;
+template<typename Out> using ParseData = std::tuple<std::string_view, Out>;
+template<auto Deleter>
+using DeleteWith = decltype([](auto obj) {
+    return Deleter(obj);
+});
+using FilePtr = std::unique_ptr<std::FILE, DeleteWith<&fclose>>;
+
+namespace std {
+    template<typename T>
+    ostream &
+    operator<<(ostream & strm, const std::optional<T> & value)
+    {
+        if (value) {
+            strm << *value;
+        }
+        return strm;
+    }
+
+    template<typename... T>
+    ostream &
+    operator<<(ostream & strm, const std::tuple<T...> & values)
+    {
+        return std::apply(
+                [&strm](auto &&... elems) -> decltype(auto) {
+                    return ((strm << elems << '\n'), ...);
+                },
+                values);
+    }
+}
+
+BOOST_DATA_TEST_CASE(QuotedStringsGood,
+        boost::unit_test::data::make<ParseData<WebStat::QuotedString>>({
+                {R"("")", ""},
+                {R"("-")", "-"},
+                {R"(".")", "."},
+                {R"("/url/path")", "/url/path"},
+        }),
+        input, expected)
+{
+    const auto result = scn::scan<WebStat::QuotedString>(input, "{}");
+    BOOST_REQUIRE(result);
+    BOOST_CHECK_EQUAL(result->value(), expected);
+}
+
+BOOST_DATA_TEST_CASE(QuotedStringsBad,
+        boost::unit_test::data::make<std::string_view>({
+                R"()",
+                R"(-)",
+                R"(word)",
+                R"(/url/path)",
+        }),
+        input)
+{
+    BOOST_REQUIRE(!scn::scan<WebStat::QuotedString>(input, "{}"));
+}
+
+BOOST_DATA_TEST_CASE(QueryStringsGood,
+        boost::unit_test::data::make<ParseData<WebStat::QueryString>>({
+                {R"("")", std::nullopt},
+                {R"("?")", ""},
+                {R"("?something")", "something"},
+                {R"("?some=thing")", "some=thing"},
+                {R"("?some=thing&other=thing")", "some=thing&other=thing"},
+        }),
+        input, expected)
+{
+    const auto result = scn::scan<WebStat::QueryString>(input, "{}");
+    BOOST_REQUIRE(result);
+    BOOST_CHECK_EQUAL(result->value(), expected);
+}
+
+BOOST_DATA_TEST_CASE(QueryStringsBad,
+        boost::unit_test::data::make<std::string_view>({
+                R"()",
+                R"("-")",
+                R"(".")",
+                R"(-)",
+                R"(word)",
+                R"(/url/path)",
+        }),
+        input)
+{
+    BOOST_REQUIRE(!scn::scan<WebStat::QueryString>(input, "{}"));
+}
+
+BOOST_TEST_DECORATOR(*boost::unit_test::timeout(1))
+
+BOOST_DATA_TEST_CASE(CLFStringsDecode,
+        boost::unit_test::data::make<ParseData<std::string>>({
+                {"", ""},
+                {"plain", "plain"},
+                {R"(hex\x41)", "hexA"},
+                {R"(hex\x4141)", "hexA41"},
+                {R"(hex\x41\x41)", "hexAA"},
+                {R"(hex\t\x41)", "hex\tA"},
+                {R"(unknown\q)", "unknownq"},
+                {R"(trailing\)", "trailing"},
+                {R"(short\x4)", "short\x04"},
+                {"high\\\xe9", "high\xe9"},
+        }),
+        input, expected)
+{
+    std::string value {input};
+    scn::scanner<WebStat::CLFString>::decode(value);
+    BOOST_CHECK_EQUAL(value, expected);
+}
+
+BOOST_TEST_DECORATOR(*boost::unit_test::depends_on("CLFStringsDecode"))
+
+BOOST_DATA_TEST_CASE(CLFStringsGood,
+        boost::unit_test::data::make<ParseData<WebStat::CLFString>>({
+                {R"("")", ""},
+                {R"("-")", std::nullopt},
+                {R"("?")", "?"},
+                {R"(".")", "."},
+                {R"("something")", "something"},
+                {R"("https://google.com")", "https://google.com"},
+                {R"("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36")",
+                        R"(Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36)"},
+        }),
+        input, expected)
+{
+    const auto result = scn::scan<WebStat::CLFString>(input, "{}");
+    BOOST_REQUIRE(result);
+    BOOST_CHECK_EQUAL(result->value(), expected);
+}
+
+BOOST_DATA_TEST_CASE(CLFStringsBad,
+        boost::unit_test::data::make<std::string_view>({
+                R"()",
+                R"(-)",
+                R"(word)",
+                R"(/url/path)",
+        }),
+        input)
+{
+    BOOST_REQUIRE(!scn::scan<WebStat::CLFString>(input, "{}"));
+}
+
+BOOST_TEST_DECORATOR(*boost::unit_test::depends_on("QuotedStringsGood"))
+BOOST_TEST_DECORATOR(*boost::unit_test::depends_on("QueryStringsGood"))
+BOOST_TEST_DECORATOR(*boost::unit_test::depends_on("CLFStringsGood"))
+
+BOOST_DATA_TEST_CASE(ExtractFields,
+        boost::unit_test::data::make<ParseData<ScanValues>>({
+                {R"LOG(git.randomdan.homeip.net 98.82.40.168 1755561576768318 GET "/repo/gentoobrowse-api/commit/gentoobrowse-api/unittests/fixtures/756569aa764177340726dd3d40b41d89b11b20c7/app-crypt/pdfcrack/Manifest" "?h=gentoobrowse-api-0.9.1&id=a2ed3fd30333721accd4b697bfcb6cc4165c7714" HTTP/1.1 200 1884 107791 "-" "Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; Amazonbot/0.1; +https://developer.amazon.com/support/amazonbot) Chrome/119.0.6045.214 Safari/537.36")LOG",
+                        {"git.randomdan.homeip.net", "98.82.40.168", 1755561576768318, "GET",
+                                R"(/repo/gentoobrowse-api/commit/gentoobrowse-api/unittests/fixtures/756569aa764177340726dd3d40b41d89b11b20c7/app-crypt/pdfcrack/Manifest)",
+                                R"(h=gentoobrowse-api-0.9.1&id=a2ed3fd30333721accd4b697bfcb6cc4165c7714)", "HTTP/1.1",
+                                200, 1884, 107791, std::nullopt,
+                                R"(Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; Amazonbot/0.1; +https://developer.amazon.com/support/amazonbot) Chrome/119.0.6045.214 Safari/537.36)"}},
+                {R"LOG(www.randomdan.homeip.net 43.128.84.166 1755561575973204 GET "/app-dicts/myspell-et/Manifest" "" HTTP/1.1 200 312 10369 "https://google.com" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36")LOG",
+                        {"www.randomdan.homeip.net", "43.128.84.166", 1755561575973204, "GET",
+                                "/app-dicts/myspell-et/Manifest", std::nullopt, "HTTP/1.1", 200, 312, 10369,
+                                "https://google.com",
+                                R"(Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36)"}},
+        }),
+        input, expected)
+{
+    const auto result = WebStat::Ingestor::scanLogLine(input);
+    BOOST_REQUIRE(result);
+    BOOST_CHECK_EQUAL(result->values(), expected);
+}
diff --git a/thirdparty/Jamfile.jam b/thirdparty/Jamfile.jam
new file mode 100644
index 0000000..95481b6
--- /dev/null
+++ b/thirdparty/Jamfile.jam
@@ -0,0 +1,17 @@
+path-constant inc : scnlib/include ;
+
+lib scn :
+    scnlib/src/scn/impl.cpp
+    :
+    <include>scnlib/src
+    <define>SCN_DISABLE_FAST_FLOAT
+    <define>SCN_DISABLE_REGEX
+    <warnings>off
+    -<variant>debug\:<warnings-as-errors>on
+    <warnings-as-errors>off
+    <cflags>-isystem\ $(inc)
+    : :
+    <define>SCN_DISABLE_FAST_FLOAT
+    <define>SCN_DISABLE_REGEX
+    <cflags>-isystem\ $(inc)
+    ;
diff --git a/thirdparty/scnlib b/thirdparty/scnlib
new file mode 160000
+Subproject e937be1a52588621b406d58ce8614f96bb5de74 |