From a7a082d8d471bc38f9aa1160b0e9f8daf3d35ba3 Mon Sep 17 00:00:00 2001 From: Dan Goodliffe Date: Thu, 9 Oct 2025 14:26:40 +0100 Subject: Fix up QuotedString/CLFString parsing Refactors CLFString in terms of QuotedString, but with the optional of being null (nullopt) Moves the whole decode function into QuotedString's parser, fixing support for escaping of " which would otherwise prematurely end the string in the middle. --- src/logTypes.cpp | 98 +++++++++++++++++++++++----------------------------- src/logTypes.hpp | 8 ++--- test/test-ingest.cpp | 40 +++++++++++---------- 3 files changed, 68 insertions(+), 78 deletions(-) diff --git a/src/logTypes.cpp b/src/logTypes.cpp index 42f0979..85c5f4b 100644 --- a/src/logTypes.cpp +++ b/src/logTypes.cpp @@ -4,16 +4,52 @@ namespace scn { scan_expected scanner::scan(WebStat::QuotedString & value, ContextType & ctx) { + static constexpr auto BS_MAP = []() { + std::array map {}; + map['f'] = '\f'; + map['n'] = '\n'; + map['r'] = '\r'; + map['t'] = '\t'; + map['v'] = '\v'; + map['"'] = '"'; + map['\\'] = '\\'; + return map; + }(); + if (auto empty = scn::scan<>(ctx.range(), R"("")")) { return empty->begin(); } - auto result = scn::scan(ctx.range(), R"("{:[^"]}")"); - if (!result) { - return unexpected(result.error()); + auto simple = scn::scan(ctx.range(), R"("{:[^\"]}")"); + if (simple) { + value = std::move(simple->value()); + return simple->begin(); } - value = result->value(); - return result->begin(); + + if (auto openQuote = scn::scan<>(ctx.range(), R"(")")) { + ctx.advance_to(openQuote->begin()); + while (true) { + if (auto closeQuote = scn::scan<>(ctx.range(), R"(")")) { + return closeQuote->begin(); + } + if (auto plain = scn::scan(ctx.range(), R"({:[^\"]})")) { + value.append(plain->value()); + ctx.advance_to(plain->begin()); + } + else if (auto hex = scn::scan(ctx.range(), R"HEX(\x{:.2x})HEX")) { + value.append(1, static_cast(hex->value())); + ctx.advance_to(hex->begin()); + } + else if (auto escaped = scn::scan(ctx.range(), R"ESC(\{:.1[fnrtv"\]})ESC")) { + value.append(1, BS_MAP[static_cast(escaped->value().front())]); + ctx.advance_to(escaped->begin()); + } + else { + return unexpected(simple.error()); + } + } + } + return unexpected(simple.error()); } scan_expected @@ -32,65 +68,17 @@ namespace scn { if (!result) { return unexpected(result.error()); } - value = result->value(); + value = std::move(result->value()); return result->begin(); } scan_expected scanner::scan(WebStat::CLFString & value, ContextType & ctx) { - if (auto empty = scn::scan<>(ctx.range(), R"("")")) { - value.emplace(); - return empty->begin(); - } - if (auto null = scn::scan<>(ctx.range(), R"("-")")) { return null->begin(); } - auto result = scn::scan(ctx.range(), R"("{:[^"]}")"); - if (!result) { - return unexpected(result.error()); - } - value = result->value(); - decode(*value); - return result->begin(); - } - - void - scanner::decode(std::string & value) - { - static constexpr auto BS_MAP = []() { - std::array map {}; - map['f'] = '\f'; - map['n'] = '\n'; - map['r'] = '\r'; - map['t'] = '\t'; - map['v'] = '\v'; - map['"'] = '"'; - map['\\'] = '\\'; - return map; - }(); - - if (auto src = std::ranges::find(value, '\\'); src != value.end()) { - auto dest = src; - while (src != value.cend()) { - if (*src == '\\') { - const std::string_view escaped {++src, value.end()}; - if (auto chr = BS_MAP[static_cast(*src)]) { - *dest++ = chr; - src++; - } - else if (auto hex = scn::scan(escaped, R"(x{:.2x})")) { - *dest++ = static_cast(hex->value()); - src += 3; - } - } - else { - *dest++ = *src++; - } - } - value.erase(dest, value.end()); - } + return scn::scanner {}.scan(value.emplace(), ctx); } } diff --git a/src/logTypes.hpp b/src/logTypes.hpp index 7a78cc1..0262060 100644 --- a/src/logTypes.hpp +++ b/src/logTypes.hpp @@ -16,9 +16,9 @@ namespace WebStat { bool operator<=>(const QueryString &) const = default; }; - struct CLFString : std::optional { - using std::optional::optional; - using std::optional::operator=; + struct CLFString : std::optional { + using std::optional::optional; + using std::optional::operator=; bool operator<=>(const CLFString &) const = default; }; @@ -49,7 +49,5 @@ namespace scn { template<> struct scanner : scanner { static scan_expected scan(WebStat::CLFString & value, ContextType & ctx); - - static void decode(std::string &); }; } diff --git a/test/test-ingest.cpp b/test/test-ingest.cpp index 388c440..9e401b3 100644 --- a/test/test-ingest.cpp +++ b/test/test-ingest.cpp @@ -63,6 +63,12 @@ BOOST_DATA_TEST_CASE(QuotedStringsGood, {R"("-")", "-"}, {R"(".")", "."}, {R"("/url/path")", "/url/path"}, + {R"("hex\x41")", "hexA"}, + {R"("hex\x4141")", "hexA41"}, + {R"("hex\x41\x41")", "hexAA"}, + {R"("hex\t\x41")", "hex\tA"}, + {R"("/packages/dev-php/pecl-uploadprogress(),')\"((,,")", + R"LOG(/packages/dev-php/pecl-uploadprogress(),')"((,,)LOG"}, }), input, expected) { @@ -114,24 +120,6 @@ BOOST_DATA_TEST_CASE(QueryStringsBad, BOOST_TEST_DECORATOR(*boost::unit_test::timeout(1)) -BOOST_DATA_TEST_CASE(CLFStringsDecode, - boost::unit_test::data::make>({ - {"", ""}, - {"plain", "plain"}, - {R"(hex\x41)", "hexA"}, - {R"(hex\x4141)", "hexA41"}, - {R"(hex\x41\x41)", "hexAA"}, - {R"(hex\t\x41)", "hex\tA"}, - }), - input, expected) -{ - std::string value {input}; - scn::scanner::decode(value); - BOOST_CHECK_EQUAL(value, expected); -} - -BOOST_TEST_DECORATOR(*boost::unit_test::depends_on("CLFStringsDecode")) - BOOST_DATA_TEST_CASE(CLFStringsGood, boost::unit_test::data::make>({ {R"("")", ""}, @@ -192,6 +180,22 @@ BOOST_DATA_TEST_CASE(ExtractFields, BOOST_CHECK_EQUAL(result->values(), expected); } +BOOST_AUTO_TEST_CASE(ExtractFieldsEdgeCasesUnparsable3580673700) +{ + const auto result = WebStat::Ingestor::scanLogLine( + R"LOG(gentoobrowse.randomdan.homeip.net 5.183.129.58 1759960912510520 GET "/packages/dev-php/pecl-uploadprogress(),')\"((,," "" HTTP/1.1 404 0 10051 "-" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36")LOG"); + BOOST_REQUIRE(result); + BOOST_CHECK_EQUAL(std::get<4>(result->values()), R"LOG(/packages/dev-php/pecl-uploadprogress(),')"((,,)LOG"); +} + +BOOST_AUTO_TEST_CASE(ExtractFieldsEdgeCasesUnparsable3603068405) +{ + const auto result = WebStat::Ingestor::scanLogLine( + R"LOG(gentoobrowse.randomdan.homeip.net 5.183.129.58 1759960912705682 GET "/packages/dev-php/pecl-uploadprogress'yqFSRA<'\">yuezhx" "" HTTP/1.1 404 0 19143 "-" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36")LOG"); + BOOST_REQUIRE(result); + BOOST_CHECK_EQUAL(std::get<4>(result->values()), R"LOG(/packages/dev-php/pecl-uploadprogress'yqFSRA<'">yuezhx)LOG"); +} + class TestIngestor : public WebStat::Ingestor { public: TestIngestor() : -- cgit v1.2.3