diff options
author | Dan Goodliffe <dan@randomdan.homeip.net> | 2025-08-21 20:39:52 +0100 |
---|---|---|
committer | Dan Goodliffe <dan@randomdan.homeip.net> | 2025-08-25 16:00:59 +0100 |
commit | b2416925f8845b70ed25fb4ec7cde8ef11e8c239 (patch) | |
tree | 9ed898937ddceca6bcf0e2a6d6dfda3754dceefe /src | |
download | webstat-b2416925f8845b70ed25fb4ec7cde8ef11e8c239.tar.bz2 webstat-b2416925f8845b70ed25fb4ec7cde8ef11e8c239.tar.xz webstat-b2416925f8845b70ed25fb4ec7cde8ef11e8c239.zip |
Initial commit; basic Apache log parsing
Diffstat (limited to 'src')
-rw-r--r-- | src/Jamfile.jam | 7 | ||||
-rw-r--r-- | src/ingestor.cpp | 58 | ||||
-rw-r--r-- | src/ingestor.hpp | 31 | ||||
-rw-r--r-- | src/logTypes.cpp | 138 | ||||
-rw-r--r-- | src/logTypes.hpp | 54 | ||||
-rw-r--r-- | src/webstat_logger_main.cpp | 9 |
6 files changed, 297 insertions, 0 deletions
diff --git a/src/Jamfile.jam b/src/Jamfile.jam
new file mode 100644
index 0000000..40ec28e
--- /dev/null
+++ b/src/Jamfile.jam
@@ -0,0 +1,7 @@
+lib webstat : ingestor.cpp logTypes.cpp :
+    <include>.
+    <library>../thirdparty//scn
+    : :
+    <include>.
+    ;
+exe webstat_logger : webstat_logger_main.cpp : <library>webstat ;
diff --git a/src/ingestor.cpp b/src/ingestor.cpp
new file mode 100644
index 0000000..5724b33
--- /dev/null
+++ b/src/ingestor.cpp
@@ -0,0 +1,58 @@
+#include "ingestor.hpp"
+#include <cstdint>
+#include <scn/scan.h>
+#include <string>
+#include <string_view>
+#include <syslog.h>
+#include <tuple>
+
+namespace WebStat {
+    /// Parse one access-log line into its twelve typed fields.
+    ///
+    /// The expected layout is given by the format string below; the per-field
+    /// comments name the Apache LogFormat directive each field is written with.
+    /// @param input the text of a single log line.
+    /// @return the scn result holding the field values, or the scan error.
+    Ingestor::ScanResult
+    Ingestor::scanLogLine(std::string_view input)
+    {
+        return scn::scan< // Field : Apache format specifier : example
+                std::string_view, // virtual_host : %v : some.host.name
+                std::string_view, // remoteip : %a : 1.2.3.4 (or ipv6)
+                uint64_t, // request_time : %{usec}t : 123456790
+                std::string_view, // method : %m : GET
+                QuotedString, // URL : "%U" : "/foo/bar"
+                QueryString, // query_string : "%q" : "?query=string" or ""
+                std::string_view, // protocol : %H : HTTP/2.0
+                unsigned short, // status : %>s : 200
+                unsigned int, // size : %B : 1234
+                unsigned int, // duration : %D : 1234
+                CLFString, // referrer : "%{Referer}i" : "https://google.com/whatever" or "-"
+                CLFString // user_agent : "%{User-agent}i" : "Chromium v123.4" or "-"
+                >(input, R"({} {} {} {:[A-Z]} {} {} {} {} {} {} {} {})");
+    }
+
+    /// Read newline-terminated lines from input and parse each one.
+    ///
+    /// Every line read bumps linesRead; it then bumps linesParsed when it
+    /// matches the expected format, otherwise it is reported through syslog
+    /// and bumps linesDiscarded. The loop ends when a line can no longer be read.
+    /// @param input the stream to consume.
+    void
+    Ingestor::ingestLog(std::FILE * input)
+    {
+        // NOTE(review): a blank line may fail the `[^\n]` scan and end the loop early - confirm.
+        while (auto line = scn::scan<std::string>(input, "{:[^\n]}\n")) {
+            ++linesRead;
+            if (auto result = scanLogLine(line->value())) {
+                ++linesParsed;
+                // Nothing consumes the parsed fields yet; discard them explicitly.
+                std::ignore = result->values();
+            }
+            else {
+                syslog(LOG_WARNING, "Discarded line: [%s]", line->value().c_str());
+                ++linesDiscarded;
+            }
+        }
+    }
+}
diff --git a/src/ingestor.hpp b/src/ingestor.hpp
new file mode 100644
index 0000000..97ce9f9
--- /dev/null
+++ b/src/ingestor.hpp
@@ -0,0 +1,31 @@
+#pragma once
+
+#include "logTypes.hpp"
+#include <cstddef>
+#include <cstdint>
+#include <cstdio>
+#include <scn/scan.h>
+#include <string_view>
+#include <utility>
+
+namespace WebStat {
+    /// Reads access-log lines from a stream, parses them and counts the outcomes.
+    class Ingestor {
+    public:
+        /// What scanning one log line yields: the twelve typed fields, or a scan error.
+        using ScanResult = decltype(scn::scan<std::string_view, std::string_view, uint64_t, std::string_view,
+                QuotedString, QueryString, std::string_view, unsigned short, unsigned int, unsigned int, CLFString,
+                CLFString>(std::declval<std::string_view>(), ""));
+
+        /// Parse the text of a single log line.
+        [[nodiscard]] static ScanResult scanLogLine(std::string_view);
+
+        /// Read lines from the stream, parse each and update the counters below.
+        void ingestLog(std::FILE *);
+
+    protected:
+        size_t linesRead = 0; ///< lines read from the stream
+        size_t linesParsed = 0; ///< lines that matched the expected format
+        size_t linesDiscarded = 0; ///< lines that did not match and were reported
+    };
+}
diff --git a/src/logTypes.cpp b/src/logTypes.cpp
new file mode 100644
index 0000000..42f0979
--- /dev/null
+++ b/src/logTypes.cpp
@@ -0,0 +1,138 @@
+#include "logTypes.hpp"
+#include <algorithm>
+#include <array>
+#include <charconv>
+#include <cstddef>
+#include <string>
+#include <system_error>
+
+namespace scn {
+    /// Scan a double-quoted string into value.
+    ///
+    /// `""` is accepted and leaves value as it was passed in; otherwise the
+    /// text between the quotes, up to the next `"`, is stored.
+    scan_expected<typename ContextType::iterator>
+    scanner<WebStat::QuotedString>::scan(WebStat::QuotedString & value, ContextType & ctx)
+    {
+        if (auto empty = scn::scan<>(ctx.range(), R"("")")) {
+            return empty->begin();
+        }
+
+        auto result = scn::scan<std::string>(ctx.range(), R"("{:[^"]}")");
+        if (!result) {
+            return unexpected(result.error());
+        }
+        value = result->value();
+        return result->begin();
+    }
+
+    /// Scan a double-quoted query string into value.
+    ///
+    /// `""` leaves value as it was passed in, `"?"` stores an empty string and
+    /// `"?text"` stores text without the leading question mark.
+    scan_expected<typename ContextType::iterator>
+    scanner<WebStat::QueryString>::scan(WebStat::QueryString & value, ContextType & ctx)
+    {
+        if (auto null = scn::scan<>(ctx.range(), R"("")")) {
+            return null->begin();
+        }
+
+        if (auto empty = scn::scan<>(ctx.range(), R"("?")")) {
+            value.emplace();
+            return empty->begin();
+        }
+
+        auto result = scn::scan<std::string>(ctx.range(), R"("?{:[^"]}")");
+        if (!result) {
+            return unexpected(result.error());
+        }
+        value = result->value();
+        return result->begin();
+    }
+
+    /// Scan a double-quoted, backslash-escaped string into value.
+    ///
+    /// `""` stores an empty string and `"-"` leaves value as it was passed in;
+    /// any other content is stored after being run through decode().
+    scan_expected<typename ContextType::iterator>
+    scanner<WebStat::CLFString>::scan(WebStat::CLFString & value, ContextType & ctx)
+    {
+        if (auto empty = scn::scan<>(ctx.range(), R"("")")) {
+            value.emplace();
+            return empty->begin();
+        }
+
+        if (auto null = scn::scan<>(ctx.range(), R"("-")")) {
+            return null->begin();
+        }
+
+        // NOTE(review): `[^"]` stops at the first quote, so an escaped `\"` inside the
+        // value would presumably end the field early - confirm against real log data.
+        auto result = scn::scan<std::string>(ctx.range(), R"("{:[^"]}")");
+        if (!result) {
+            return unexpected(result.error());
+        }
+        value = result->value();
+        decode(*value);
+        return result->begin();
+    }
+
+    /// Replace backslash escape sequences in value, in place.
+    ///
+    /// Handles `\f`, `\n`, `\r`, `\t`, `\v`, `\"`, `\\` and `\xHH` with exactly
+    /// two hex digits. For any other escape the backslash is dropped and the
+    /// character after it is kept; a lone trailing backslash is dropped.
+    void
+    scanner<WebStat::CLFString>::decode(std::string & value)
+    {
+        // One slot per unsigned char value, so bytes >= 0x80 cannot index out of bounds.
+        static constexpr auto BS_MAP = []() {
+            std::array<char, 256> map {};
+            map['f'] = '\f';
+            map['n'] = '\n';
+            map['r'] = '\r';
+            map['t'] = '\t';
+            map['v'] = '\v';
+            map['"'] = '"';
+            map['\\'] = '\\';
+            return map;
+        }();
+        static constexpr std::ptrdiff_t HEX_DIGITS = 2;
+
+        auto src = std::ranges::find(value, '\\');
+        if (src == value.end()) {
+            return; // no escapes, nothing to rewrite
+        }
+
+        // Decoded text is never longer than the input, so compact towards the front.
+        auto dest = src;
+        while (src != value.end()) {
+            if (*src != '\\') {
+                *dest++ = *src++;
+                continue;
+            }
+            if (++src == value.end()) {
+                break; // the backslash was the last character; do not read past the end
+            }
+            if (const auto chr = BS_MAP[static_cast<unsigned char>(*src)]) {
+                *dest++ = chr;
+                ++src;
+                continue;
+            }
+            // `\xHH`: needs the 'x' and both digits to be present before anything is consumed.
+            if (*src == 'x' && value.end() - src > HEX_DIGITS) {
+                unsigned int byte {};
+                const char * const first = &src[1];
+                const char * const last = first + HEX_DIGITS;
+                if (const auto [ptr, err] = std::from_chars(first, last, byte, 16);
+                        err == std::errc {} && ptr == last) {
+                    *dest++ = static_cast<char>(byte);
+                    src += 1 + HEX_DIGITS;
+                }
+            }
+            // Anything else is an unknown escape: the backslash is already skipped and
+            // the character at src is copied by the next iteration.
+        }
+        value.erase(dest, value.end());
+    }
+}
diff --git a/src/logTypes.hpp b/src/logTypes.hpp
new file mode 100644
index 0000000..d4f1b7b
--- /dev/null
+++ b/src/logTypes.hpp
@@ -0,0 +1,54 @@
+#pragma once
+
+#include <compare>
+#include <optional>
+#include <scn/scan.h>
+#include <string>
+
+namespace WebStat {
+    /// A log field that is always written inside double quotes.
+    struct QuotedString : std::string {
+        using std::string::string;
+        using std::string::operator=;
+    };
+
+    /// A quoted query string; holds no value when the log shows `""`.
+    struct QueryString : std::optional<std::string> {
+        using std::optional<std::string>::optional;
+        using std::optional<std::string>::operator=;
+        // A defaulted <=> has to return auto or a comparison category type;
+        // declared as returning bool it is defined as deleted.
+        auto operator<=>(const QueryString &) const = default;
+    };
+
+    /// A quoted, backslash-escaped field; holds no value when the log shows `"-"`.
+    struct CLFString : std::optional<std::string> {
+        using std::optional<std::string>::optional;
+        using std::optional<std::string>::operator=;
+        // As for QueryString: bool is not a usable return type for a defaulted <=>.
+        auto operator<=>(const CLFString &) const = default;
+    };
+}
+
+namespace scn {
+    /// Context type the scanners below are written against.
+    using ContextType = scn::v4::basic_scan_context<scn::v4::detail::buffer_range_tag, char>;
+
+    /// Scanner for WebStat::QuotedString; derives from the std::string scanner and supplies its own scan().
+    template<> struct scanner<WebStat::QuotedString> : scanner<std::string, char> {
+        static scan_expected<typename ContextType::iterator> scan(WebStat::QuotedString & value, ContextType & ctx);
+    };
+
+    /// Scanner for WebStat::QueryString.
+    template<> struct scanner<WebStat::QueryString> : scanner<std::string, char> {
+        static scan_expected<typename ContextType::iterator> scan(WebStat::QueryString & value, ContextType & ctx);
+    };
+
+    /// Scanner for WebStat::CLFString.
+    template<> struct scanner<WebStat::CLFString> : scanner<std::string, char> {
+        static scan_expected<typename ContextType::iterator> scan(WebStat::CLFString & value, ContextType & ctx);
+
+        /// Decode backslash escapes in place.
+        static void decode(std::string &);
+    };
+}
diff --git a/src/webstat_logger_main.cpp b/src/webstat_logger_main.cpp
new file mode 100644
index 0000000..c4d31d6
--- /dev/null
+++ b/src/webstat_logger_main.cpp
@@ -0,0 +1,9 @@
+#include "ingestor.hpp"
+#include <cstdio>
+
+/// Entry point: parse the access log arriving on standard input.
+int
+main(int, char **)
+{
+    WebStat::Ingestor {}.ingestLog(stdin);
+}