summary refs log tree commit diff
diff options
context:
space:
mode:
author Dan Goodliffe <dan@randomdan.homeip.net> 2025-08-21 20:39:52 +0100
committer Dan Goodliffe <dan@randomdan.homeip.net> 2025-08-25 16:00:59 +0100
commit b2416925f8845b70ed25fb4ec7cde8ef11e8c239 (patch)
tree 9ed898937ddceca6bcf0e2a6d6dfda3754dceefe
download webstat-b2416925f8845b70ed25fb4ec7cde8ef11e8c239.tar.bz2
webstat-b2416925f8845b70ed25fb4ec7cde8ef11e8c239.tar.xz
webstat-b2416925f8845b70ed25fb4ec7cde8ef11e8c239.zip
Initial commit; basic Apache log parsing
-rw-r--r-- .gitignore 1
-rw-r--r-- .gitmodules 3
-rw-r--r-- Jamroot.jam 29
-rw-r--r-- src/Jamfile.jam 7
-rw-r--r-- src/ingestor.cpp 40
-rw-r--r-- src/ingestor.hpp 23
-rw-r--r-- src/logTypes.cpp 96
-rw-r--r-- src/logTypes.hpp 42
-rw-r--r-- src/webstat_logger_main.cpp 7
-rw-r--r-- test/Jamfile.jam 14
-rw-r--r-- test/test-ingest.cpp 166
-rw-r--r-- thirdparty/Jamfile.jam 17
m--------- thirdparty/scnlib 0
13 files changed, 445 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..ba077a4
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+bin
diff --git a/.gitmodules b/.gitmodules
new file mode 100644
index 0000000..66abcf4
--- /dev/null
+++ b/.gitmodules
@@ -0,0 +1,3 @@
+[submodule "thirdparty/scnlib"]
+ path = thirdparty/scnlib
+ url = https://github.com/eliaskosunen/scnlib
diff --git a/Jamroot.jam b/Jamroot.jam
new file mode 100644
index 0000000..c2f41d7
--- /dev/null
+++ b/Jamroot.jam
@@ -0,0 +1,29 @@
+import testing ;
+
+build-project src ; # build the library and executable
+build-project test ; # build and run the unit tests
+
+project webstat : requirements # requirements shared by the targets of this project
+ <cxxstd>26
+ <visibility>hidden
+ <link>static
+ <toolset>gcc,<variant>debug:<warnings>pedantic
+ <toolset>clang,<variant>debug:<warnings>extra
+ <toolset>clang:<cflags>-Wno-c23-extensions
+ <variant>debug:<warnings-as-errors>on # debug builds fail on any warning
+ <variant>debug:<cflags>-Wnon-virtual-dtor # additional diagnostics, debug variant only
+ <variant>debug:<cflags>-Wcast-align
+ <variant>debug:<cflags>-Wunused
+ <variant>debug:<cflags>-Woverloaded-virtual
+ <variant>debug:<cflags>-Wconversion
+ <variant>debug:<cflags>-Wsign-conversion
+ <variant>debug:<cflags>-Wnull-dereference
+ <variant>debug:<cflags>-Wdouble-promotion
+ <variant>debug:<cflags>-Wformat=2
+ <toolset>gcc,<variant>debug:<cflags>-Wold-style-cast # diagnostics passed to GCC only
+ <toolset>gcc,<variant>debug:<cflags>-Wduplicated-cond
+ <toolset>gcc,<variant>debug:<cflags>-Wduplicated-branches
+ <toolset>gcc,<variant>debug:<cflags>-Wlogical-op
+ <toolset>gcc,<variant>debug:<cflags>-Wuseless-cast
+ <variant>release:<lto>on-thin # thin LTO for release builds
+ ;
diff --git a/src/Jamfile.jam b/src/Jamfile.jam
new file mode 100644
index 0000000..40ec28e
--- /dev/null
+++ b/src/Jamfile.jam
@@ -0,0 +1,7 @@
+lib webstat : ingestor.cpp logTypes.cpp : # library: log ingestion plus the custom field scanners
+ <include>.
+ <library>../thirdparty//scn
+ : : # empty default-build, then usage requirements for targets that use webstat
+ <include>.
+ ;
+exe webstat_logger : webstat_logger_main.cpp : <library>webstat ; # executable linked against the webstat library
diff --git a/src/ingestor.cpp b/src/ingestor.cpp
new file mode 100644
index 0000000..5724b33
--- /dev/null
+++ b/src/ingestor.cpp
@@ -0,0 +1,40 @@
+#include "ingestor.hpp"
+#include <scn/scan.h>
+#include <syslog.h>
+
+namespace WebStat {
+ Ingestor::ScanResult // Scan one access-log line into twelve typed fields; the result is falsy when the line does not match.
+ Ingestor::scanLogLine(std::string_view input) // NOTE(review): std::string_view fields presumably refer into `input` - keep it alive while the result is used
+ {
+ return scn::scan< // Field : Apache format specifier : example
+ std::string_view, // virtual_host : %v : some.host.name
+ std::string_view, // remoteip : %a : 1.2.3.4 (or ipv6)
+ uint64_t, // request_time : %{usec}t : 123456790
+ std::string_view, // method : %m : GET
+ QuotedString, // URL : "%U" : "/foo/bar" (URL path; lowercase %u would be the remote user)
+ QueryString, // query_string : "%q" : "?query=string" or ""
+ std::string_view, // protocol : %H : HTTP/2.0 (request protocol; %r would be the whole request line)
+ unsigned short, // status : %>s : 200
+ unsigned int, // size : %B : 1234
+ unsigned int, // duration : %D : 1234
+ CLFString, // referrer : "%{Referer}i" : "https://google.com/whatever" or "-"
+ CLFString // user_agent : "%{User-agent}i" : "Chromium v123.4" or "-"
+ >(input, R"({} {} {} {:[A-Z]} {} {} {} {} {} {} {} {})"); // one placeholder per field above; the method is restricted to upper-case letters
+ }
+
+    // Consume `input` one line at a time until a line can no longer be scanned.
+    // Every line read bumps linesRead and then exactly one of linesParsed or
+    // linesDiscarded; discarded lines are reported through syslog.
+    // NOTE(review): a scan failure of any kind ends the loop, not only end of
+    // input - confirm how a completely empty line is treated by the line format.
+    void
+    Ingestor::ingestLog(std::FILE * input)
+    {
+        while (auto nextLine = scn::scan<std::string>(input, "{:[^\n]}\n")) {
+            const std::string & text = nextLine->value();
+            ++linesRead;
+
+            const auto fields = scanLogLine(text);
+            if (!fields) {
+                syslog(LOG_WARNING, "Discarded line: [%s]", text.c_str());
+                ++linesDiscarded;
+                continue;
+            }
+
+            ++linesParsed;
+            std::ignore = fields->values(); // parsed values are not consumed yet
+        }
+    }
+}
diff --git a/src/ingestor.hpp b/src/ingestor.hpp
new file mode 100644
index 0000000..97ce9f9
--- /dev/null
+++ b/src/ingestor.hpp
@@ -0,0 +1,23 @@
+#pragma once
+
+#include "logTypes.hpp"
+#include <cstdio>
+#include <scn/scan.h>
+
+namespace WebStat {
+ class Ingestor { // Parses access-log lines and counts how many were read, parsed and discarded.
+ public:
+ using ScanResult = decltype(scn::scan<std::string_view, std::string_view, uint64_t, std::string_view, // result type of the twelve-field scan
+ QuotedString, QueryString, std::string_view, unsigned short, unsigned int, unsigned int, CLFString,
+ CLFString>(std::declval<std::string_view>(), "")); // type list has to stay identical to the one scanLogLine passes to scn::scan
+
+ [[nodiscard]] static ScanResult scanLogLine(std::string_view); // scan a single log line into its fields
+
+ void ingestLog(std::FILE *); // read lines from the stream, scan each, and update the counters below
+
+ protected:
+ size_t linesRead = 0; // incremented for every line read by ingestLog
+ size_t linesParsed = 0; // incremented when scanLogLine succeeds
+ size_t linesDiscarded = 0; // incremented when scanLogLine fails
+ };
+}
diff --git a/src/logTypes.cpp b/src/logTypes.cpp
new file mode 100644
index 0000000..42f0979
--- /dev/null
+++ b/src/logTypes.cpp
@@ -0,0 +1,96 @@
+#include "logTypes.hpp"
+
+namespace scn {
+    // Scan a double-quoted string from the context.
+    // `""` is accepted and leaves `value` exactly as it was passed in; otherwise
+    // the text between the quotes is assigned to `value`.
+    // Returns the start of the unscanned remainder, or the scan error.
+    scan_expected<typename ContextType::iterator>
+    scanner<WebStat::QuotedString>::scan(WebStat::QuotedString & value, ContextType & ctx)
+    {
+        if (auto emptyQuotes = scn::scan<>(ctx.range(), R"("")")) {
+            return emptyQuotes->begin();
+        }
+
+        if (auto quoted = scn::scan<std::string>(ctx.range(), R"("{:[^"]}")"); quoted) {
+            value = quoted->value();
+            return quoted->begin();
+        }
+        else {
+            return unexpected(quoted.error());
+        }
+    }
+
+    // Scan a quoted query string from the context. Three forms are accepted:
+    //   ""        -> `value` is left as passed in (no query string)
+    //   "?"       -> `value` holds an empty string
+    //   "?text"   -> `value` holds `text` (the leading '?' is not stored)
+    // Returns the start of the unscanned remainder, or the scan error.
+    scan_expected<typename ContextType::iterator>
+    scanner<WebStat::QueryString>::scan(WebStat::QueryString & value, ContextType & ctx)
+    {
+        if (auto absent = scn::scan<>(ctx.range(), R"("")")) {
+            return absent->begin();
+        }
+
+        if (auto bareMark = scn::scan<>(ctx.range(), R"("?")")) {
+            value.emplace();
+            return bareMark->begin();
+        }
+
+        auto query = scn::scan<std::string>(ctx.range(), R"("?{:[^"]}")");
+        if (query) {
+            value = query->value();
+            return query->begin();
+        }
+        return unexpected(query.error());
+    }
+
+    // Scan a quoted, backslash-escaped field from the context. Accepted forms:
+    //   ""        -> `value` holds an empty string
+    //   "-"       -> `value` is left as passed in (field not present)
+    //   "text"    -> `value` holds `text` with its escapes decoded
+    // Returns the start of the unscanned remainder, or the scan error.
+    scan_expected<typename ContextType::iterator>
+    scanner<WebStat::CLFString>::scan(WebStat::CLFString & value, ContextType & ctx)
+    {
+        if (auto emptyQuotes = scn::scan<>(ctx.range(), R"("")")) {
+            value.emplace();
+            return emptyQuotes->begin();
+        }
+
+        if (auto dash = scn::scan<>(ctx.range(), R"("-")")) {
+            return dash->begin();
+        }
+
+        auto quoted = scn::scan<std::string>(ctx.range(), R"("{:[^"]}")");
+        if (quoted) {
+            // Store the raw text first, then decode it where it lives.
+            decode(value.emplace(quoted->value()));
+            return quoted->begin();
+        }
+        return unexpected(quoted.error());
+    }
+
+    // Decode, in place, the backslash escapes of an escaped log field:
+    // \f \n \r \t \v \" \\ and \xHH (hexadecimal byte value).
+    // An escape is never shorter than the single character it produces, so the
+    // decoded text is written over the front of `value` and the unused tail is
+    // erased at the end. A backslash that does not start a recognised escape is
+    // dropped and the character after it is kept unchanged.
+    void
+    scanner<WebStat::CLFString>::decode(std::string & value)
+    {
+        // Indexed by the character following a backslash (as unsigned char);
+        // zero means "not a simple escape". The table covers every unsigned char
+        // value so that a byte >= 0x80 after a backslash cannot index out of
+        // bounds.
+        static constexpr auto BS_MAP = []() {
+            std::array<char, 256> map {};
+            map['f'] = '\f';
+            map['n'] = '\n';
+            map['r'] = '\r';
+            map['t'] = '\t';
+            map['v'] = '\v';
+            map['"'] = '"';
+            map['\\'] = '\\';
+            return map;
+        }();
+
+        auto src = std::ranges::find(value, '\\');
+        if (src == value.end()) {
+            return; // no backslash at all: nothing to decode
+        }
+
+        auto dest = src;
+        while (src != value.end()) {
+            if (*src != '\\') {
+                *dest++ = *src++;
+                continue;
+            }
+            if (++src == value.end()) {
+                break; // lone trailing backslash: dropped, and end() is never dereferenced
+            }
+            if (const auto chr = BS_MAP[static_cast<unsigned char>(*src)]) {
+                *dest++ = chr;
+                ++src;
+                continue;
+            }
+            const std::string_view escaped {src, value.end()};
+            if (auto hex = scn::scan<unsigned char>(escaped, R"(x{:.2x})")) {
+                *dest++ = static_cast<char>(hex->value());
+                // Advance by what the scan really consumed instead of a fixed
+                // three characters: a short escape such as "\x4" at the end of
+                // the string must not move src past end().
+                src += hex->begin() - escaped.begin();
+            }
+            // Unrecognised escape: the backslash is dropped and the character
+            // at src is copied by the next iteration.
+        }
+        value.erase(dest, value.end());
+    }
+}
diff --git a/src/logTypes.hpp b/src/logTypes.hpp
new file mode 100644
index 0000000..d4f1b7b
--- /dev/null
+++ b/src/logTypes.hpp
@@ -0,0 +1,42 @@
+#pragma once
+
+#include <optional>
+#include <scn/scan.h>
+#include <string>
+
+namespace WebStat {
+    // A double-quoted log field held as a plain string.
+    struct QuotedString : std::string {
+        using std::string::string;
+        using std::string::operator=;
+    };
+
+    // A quoted query-string field; no value means the request had no query string.
+    struct QueryString : std::optional<std::string> {
+        using std::optional<std::string>::optional;
+        using std::optional<std::string>::operator=;
+        // Declared `auto`: a defaulted operator<=> returning `bool` is defined as
+        // deleted, because the base-class comparison result is not convertible
+        // to bool. The defaulted operator== that accompanies it is unaffected.
+        auto operator<=>(const QueryString &) const = default;
+    };
+
+    // A quoted, backslash-escaped log field; no value means the field was logged as "-".
+    struct CLFString : std::optional<std::string> {
+        using std::optional<std::string>::optional;
+        using std::optional<std::string>::operator=;
+        // Declared `auto` for the same reason as QueryString::operator<=>.
+        auto operator<=>(const CLFString &) const = default;
+    };
+}
+
+namespace scn { // scnlib scanner specialisations for the WebStat field types
+ using ContextType = scn::v4::basic_scan_context<scn::v4::detail::buffer_range_tag, char>; // NOTE(review): names scnlib's versioned `detail` namespace - re-check when the submodule is updated
+
+ template<> struct scanner<WebStat::QuotedString> : scanner<std::string, char> { // everything except scan() comes from the std::string scanner
+ static scan_expected<typename ContextType::iterator> scan(WebStat::QuotedString & value, ContextType & ctx); // defined in logTypes.cpp
+ };
+
+ template<> struct scanner<WebStat::QueryString> : scanner<std::string, char> { // everything except scan() comes from the std::string scanner
+ static scan_expected<typename ContextType::iterator> scan(WebStat::QueryString & value, ContextType & ctx); // defined in logTypes.cpp
+ };
+
+ template<> struct scanner<WebStat::CLFString> : scanner<std::string, char> { // everything except scan() and decode() comes from the std::string scanner
+ static scan_expected<typename ContextType::iterator> scan(WebStat::CLFString & value, ContextType & ctx); // defined in logTypes.cpp
+
+ static void decode(std::string &); // decodes backslash escapes in place; called by scan()
+ };
+}
diff --git a/src/webstat_logger_main.cpp b/src/webstat_logger_main.cpp
new file mode 100644
index 0000000..c4d31d6
--- /dev/null
+++ b/src/webstat_logger_main.cpp
@@ -0,0 +1,7 @@
+#include "ingestor.hpp"
+
+// Entry point: ingest access-log lines from standard input.
+int
+main(int, char **)
+{
+    WebStat::Ingestor ingestor;
+    ingestor.ingestLog(stdin);
+    return 0;
+}
diff --git a/test/Jamfile.jam b/test/Jamfile.jam
new file mode 100644
index 0000000..a008606
--- /dev/null
+++ b/test/Jamfile.jam
@@ -0,0 +1,14 @@
+lib boost_unit_test_framework : : <link>shared ; # Boost.Test library, declared without sources and linked shared
+
+path-constant src : ../src ; # location of the library sources
+path-constant test : . ; # location of this test directory
+
+run test-ingest.cpp : # build test-ingest.cpp and run it as a test
+ -- :
+ :
+ <define>BOOST_TEST_DYN_LINK
+ <define>SRC=\"$(src)\" # the two paths above, passed to the test as string macros
+ <define>TEST=\"$(test)\"
+ <library>$(src)//webstat
+ <library>boost_unit_test_framework
+ ;
diff --git a/test/test-ingest.cpp b/test/test-ingest.cpp
new file mode 100644
index 0000000..f7eab19
--- /dev/null
+++ b/test/test-ingest.cpp
@@ -0,0 +1,166 @@
+#define BOOST_TEST_MODULE ingest
+#include <boost/test/data/test_case.hpp>
+#include <boost/test/unit_test.hpp>
+
+#include <ingestor.hpp>
+
+using ScanValues = std::remove_cvref_t<decltype(std::declval<WebStat::Ingestor::ScanResult>()->values())>; // type returned by values() on a successful log-line scan
+template<typename Out> using ParseData = std::tuple<std::string_view, Out>; // one test row: input text and the expected parsed value
+template<auto Deleter>
+using DeleteWith = decltype([](auto obj) { // callable type that forwards its argument to Deleter
+ return Deleter(obj);
+});
+using FilePtr = std::unique_ptr<std::FILE, DeleteWith<&fclose>>; // std::FILE owner that calls fclose on destruction
+
+namespace std { // NOTE(review): overloads are added to namespace std, presumably so the test macros can find them by ADL; the standard does not sanction this
+ template<typename T>
+ ostream &
+ operator<<(ostream & strm, const std::optional<T> & value) // prints the contained value, or nothing when empty
+ {
+ if (value) {
+ strm << *value;
+ }
+ return strm;
+ }
+
+ template<typename... T>
+ ostream &
+ operator<<(ostream & strm, const std::tuple<T...> & values) // prints each element followed by a newline
+ {
+ return std::apply(
+ [&strm](auto &&... elems) -> decltype(auto) {
+ return ((strm << elems << '\n'), ...); // comma fold: yields the stream returned by the last insertion
+ },
+ values);
+ }
+}
+
+BOOST_DATA_TEST_CASE(QuotedStringsGood, // quoted inputs that must scan, paired with the expected string
+ boost::unit_test::data::make<ParseData<WebStat::QuotedString>>({
+ {R"("")", ""},
+ {R"("-")", "-"},
+ {R"(".")", "."},
+ {R"("/url/path")", "/url/path"},
+ }),
+ input, expected)
+{
+ const auto result = scn::scan<WebStat::QuotedString>(input, "{}");
+ BOOST_REQUIRE(result);
+ BOOST_CHECK_EQUAL(result->value(), expected);
+}
+
+BOOST_DATA_TEST_CASE(QuotedStringsBad, // unquoted or empty inputs that must be rejected
+ boost::unit_test::data::make<std::string_view>({
+ R"()",
+ R"(-)",
+ R"(word)",
+ R"(/url/path)",
+ }),
+ input)
+{
+ BOOST_REQUIRE(!scn::scan<WebStat::QuotedString>(input, "{}"));
+}
+
+BOOST_DATA_TEST_CASE(QueryStringsGood, // "" scans to no value; "?..." scans to the text after the '?'
+ boost::unit_test::data::make<ParseData<WebStat::QueryString>>({
+ {R"("")", std::nullopt},
+ {R"("?")", ""},
+ {R"("?something")", "something"},
+ {R"("?some=thing")", "some=thing"},
+ {R"("?some=thing&other=thing")", "some=thing&other=thing"},
+ }),
+ input, expected)
+{
+ const auto result = scn::scan<WebStat::QueryString>(input, "{}");
+ BOOST_REQUIRE(result);
+ BOOST_CHECK_EQUAL(result->value(), expected);
+}
+
+BOOST_DATA_TEST_CASE(QueryStringsBad, // unquoted inputs, and quoted text without a leading '?', must be rejected
+ boost::unit_test::data::make<std::string_view>({
+ R"()",
+ R"("-")",
+ R"(".")",
+ R"(-)",
+ R"(word)",
+ R"(/url/path)",
+ }),
+ input)
+{
+ BOOST_REQUIRE(!scn::scan<WebStat::QueryString>(input, "{}"));
+}
+
+BOOST_TEST_DECORATOR(*boost::unit_test::timeout(1)) // time limit for the test case that follows
+
+BOOST_DATA_TEST_CASE(CLFStringsDecode, // escaped text paired with the expected result of decode()
+ boost::unit_test::data::make<ParseData<std::string>>({
+ {"", ""},
+ {"plain", "plain"},
+ {R"(hex\x41)", "hexA"},
+ {R"(hex\x4141)", "hexA41"},
+ {R"(hex\x41\x41)", "hexAA"},
+ {R"(hex\t\x41)", "hex\tA"},
+ }),
+ input, expected)
+{
+ std::string value {input};
+ scn::scanner<WebStat::CLFString>::decode(value); // decodes in place
+ BOOST_CHECK_EQUAL(value, expected);
+}
+
+BOOST_TEST_DECORATOR(*boost::unit_test::depends_on("CLFStringsDecode")) // scanning relies on decode(), so that case is declared as a dependency
+
+BOOST_DATA_TEST_CASE(CLFStringsGood, // "-" scans to no value; any other quoted text scans to its content
+ boost::unit_test::data::make<ParseData<WebStat::CLFString>>({
+ {R"("")", ""},
+ {R"("-")", std::nullopt},
+ {R"("?")", "?"},
+ {R"(".")", "."},
+ {R"("something")", "something"},
+ {R"("https://google.com")", "https://google.com"},
+ {R"("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36")",
+ R"(Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36)"},
+ }),
+ input, expected)
+{
+ const auto result = scn::scan<WebStat::CLFString>(input, "{}");
+ BOOST_REQUIRE(result);
+ BOOST_CHECK_EQUAL(result->value(), expected);
+}
+
+BOOST_DATA_TEST_CASE(CLFStringsBad, // unquoted or empty inputs that must be rejected
+ boost::unit_test::data::make<std::string_view>({
+ R"()",
+ R"(-)",
+ R"(word)",
+ R"(/url/path)",
+ }),
+ input)
+{
+ BOOST_REQUIRE(!scn::scan<WebStat::CLFString>(input, "{}"));
+}
+
+BOOST_TEST_DECORATOR(*boost::unit_test::depends_on("QuotedStringsGood")) // whole-line scanning builds on the three field scanners
+BOOST_TEST_DECORATOR(*boost::unit_test::depends_on("QueryStringsGood"))
+BOOST_TEST_DECORATOR(*boost::unit_test::depends_on("CLFStringsGood"))
+
+BOOST_DATA_TEST_CASE(ExtractFields, // complete log lines paired with the expected field values
+ boost::unit_test::data::make<ParseData<ScanValues>>({
+ {R"LOG(git.randomdan.homeip.net 98.82.40.168 1755561576768318 GET "/repo/gentoobrowse-api/commit/gentoobrowse-api/unittests/fixtures/756569aa764177340726dd3d40b41d89b11b20c7/app-crypt/pdfcrack/Manifest" "?h=gentoobrowse-api-0.9.1&id=a2ed3fd30333721accd4b697bfcb6cc4165c7714" HTTP/1.1 200 1884 107791 "-" "Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; Amazonbot/0.1; +https://developer.amazon.com/support/amazonbot) Chrome/119.0.6045.214 Safari/537.36")LOG",
+ {"git.randomdan.homeip.net", "98.82.40.168", 1755561576768318, "GET",
+ R"(/repo/gentoobrowse-api/commit/gentoobrowse-api/unittests/fixtures/756569aa764177340726dd3d40b41d89b11b20c7/app-crypt/pdfcrack/Manifest)",
+ R"(h=gentoobrowse-api-0.9.1&id=a2ed3fd30333721accd4b697bfcb6cc4165c7714)", "HTTP/1.1",
+ 200, 1884, 107791, std::nullopt,
+ R"(Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; Amazonbot/0.1; +https://developer.amazon.com/support/amazonbot) Chrome/119.0.6045.214 Safari/537.36)"}},
+ {R"LOG(www.randomdan.homeip.net 43.128.84.166 1755561575973204 GET "/app-dicts/myspell-et/Manifest" "" HTTP/1.1 200 312 10369 "https://google.com" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36")LOG",
+ {"www.randomdan.homeip.net", "43.128.84.166", 1755561575973204, "GET",
+ "/app-dicts/myspell-et/Manifest", std::nullopt, "HTTP/1.1", 200, 312, 10369,
+ "https://google.com",
+ R"(Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36)"}},
+ }),
+ input, expected)
+{
+ const auto result = WebStat::Ingestor::scanLogLine(input);
+ BOOST_REQUIRE(result);
+ BOOST_CHECK_EQUAL(result->values(), expected);
+}
diff --git a/thirdparty/Jamfile.jam b/thirdparty/Jamfile.jam
new file mode 100644
index 0000000..95481b6
--- /dev/null
+++ b/thirdparty/Jamfile.jam
@@ -0,0 +1,17 @@
+path-constant inc : scnlib/include ; # scnlib public headers
+
+lib scn : # scnlib built from its single implementation source
+ scnlib/src/scn/impl.cpp
+ :
+ <include>scnlib/src
+ <define>SCN_DISABLE_FAST_FLOAT
+ <define>SCN_DISABLE_REGEX
+ <warnings>off # third-party code: compiler warnings are switched off
+ -<variant>debug\:<warnings-as-errors>on # removes the debug warnings-as-errors requirement for this target
+ <warnings-as-errors>off
+ <cflags>-isystem\ $(inc)
+ : : # empty default-build, then usage requirements for targets that use scn
+ <define>SCN_DISABLE_FAST_FLOAT
+ <define>SCN_DISABLE_REGEX
+ <cflags>-isystem\ $(inc) # users see the scnlib headers as system headers
+ ;
diff --git a/thirdparty/scnlib b/thirdparty/scnlib
new file mode 160000
+Subproject e937be1a52588621b406d58ce8614f96bb5de74