diff options
-rw-r--r-- | src/logTypes.cpp | 98 | ||||
-rw-r--r-- | src/logTypes.hpp | 8 | ||||
-rw-r--r-- | test/test-ingest.cpp | 40 |
3 files changed, 68 insertions, 78 deletions
diff --git a/src/logTypes.cpp b/src/logTypes.cpp index 42f0979..85c5f4b 100644 --- a/src/logTypes.cpp +++ b/src/logTypes.cpp @@ -4,16 +4,52 @@ namespace scn { scan_expected<typename ContextType::iterator> scanner<WebStat::QuotedString>::scan(WebStat::QuotedString & value, ContextType & ctx) { + static constexpr auto BS_MAP = []() { + std::array<char, 128> map {}; + map['f'] = '\f'; + map['n'] = '\n'; + map['r'] = '\r'; + map['t'] = '\t'; + map['v'] = '\v'; + map['"'] = '"'; + map['\\'] = '\\'; + return map; + }(); + if (auto empty = scn::scan<>(ctx.range(), R"("")")) { return empty->begin(); } - auto result = scn::scan<std::string>(ctx.range(), R"("{:[^"]}")"); - if (!result) { - return unexpected(result.error()); + auto simple = scn::scan<std::string>(ctx.range(), R"("{:[^\"]}")"); + if (simple) { + value = std::move(simple->value()); + return simple->begin(); } - value = result->value(); - return result->begin(); + + if (auto openQuote = scn::scan<>(ctx.range(), R"(")")) { + ctx.advance_to(openQuote->begin()); + while (true) { + if (auto closeQuote = scn::scan<>(ctx.range(), R"(")")) { + return closeQuote->begin(); + } + if (auto plain = scn::scan<std::string>(ctx.range(), R"({:[^\"]})")) { + value.append(plain->value()); + ctx.advance_to(plain->begin()); + } + else if (auto hex = scn::scan<unsigned char>(ctx.range(), R"HEX(\x{:.2x})HEX")) { + value.append(1, static_cast<char>(hex->value())); + ctx.advance_to(hex->begin()); + } + else if (auto escaped = scn::scan<std::string>(ctx.range(), R"ESC(\{:.1[fnrtv"\]})ESC")) { + value.append(1, BS_MAP[static_cast<unsigned char>(escaped->value().front())]); + ctx.advance_to(escaped->begin()); + } + else { + return unexpected(simple.error()); + } + } + } + return unexpected(simple.error()); } scan_expected<typename ContextType::iterator> @@ -32,65 +68,17 @@ namespace scn { if (!result) { return unexpected(result.error()); } - value = result->value(); + value = std::move(result->value()); return result->begin(); } scan_expected<typename ContextType::iterator> scanner<WebStat::CLFString>::scan(WebStat::CLFString & value, ContextType & ctx) { - if (auto empty = scn::scan<>(ctx.range(), R"("")")) { - value.emplace(); - return empty->begin(); - } - if (auto null = scn::scan<>(ctx.range(), R"("-")")) { return null->begin(); } - auto result = scn::scan<std::string>(ctx.range(), R"("{:[^"]}")"); - if (!result) { - return unexpected(result.error()); - } - value = result->value(); - decode(*value); - return result->begin(); - } - - void - scanner<WebStat::CLFString>::decode(std::string & value) - { - static constexpr auto BS_MAP = []() { - std::array<char, 128> map {}; - map['f'] = '\f'; - map['n'] = '\n'; - map['r'] = '\r'; - map['t'] = '\t'; - map['v'] = '\v'; - map['"'] = '"'; - map['\\'] = '\\'; - return map; - }(); - - if (auto src = std::ranges::find(value, '\\'); src != value.end()) { - auto dest = src; - while (src != value.cend()) { - if (*src == '\\') { - const std::string_view escaped {++src, value.end()}; - if (auto chr = BS_MAP[static_cast<unsigned char>(*src)]) { - *dest++ = chr; - src++; - } - else if (auto hex = scn::scan<unsigned char>(escaped, R"(x{:.2x})")) { - *dest++ = static_cast<char>(hex->value()); - src += 3; - } - } - else { - *dest++ = *src++; - } - } - value.erase(dest, value.end()); - } + return scn::scanner<WebStat::QuotedString> {}.scan(value.emplace(), ctx); } } diff --git a/src/logTypes.hpp b/src/logTypes.hpp index 7a78cc1..0262060 100644 --- a/src/logTypes.hpp +++ b/src/logTypes.hpp @@ -16,9 +16,9 @@ namespace WebStat { bool operator<=>(const QueryString &) const = default; }; - struct CLFString : std::optional<std::string> { - using std::optional<std::string>::optional; - using std::optional<std::string>::operator=; + struct CLFString : std::optional<QuotedString> { + using std::optional<QuotedString>::optional; + using std::optional<QuotedString>::operator=; bool operator<=>(const CLFString &) const = default; }; @@ -49,7 +49,5 @@ namespace scn { template<> struct scanner<WebStat::CLFString> : scanner<std::string, char> { static scan_expected<typename ContextType::iterator> scan(WebStat::CLFString & value, ContextType & ctx); - - static void decode(std::string &); }; } diff --git a/test/test-ingest.cpp b/test/test-ingest.cpp index 388c440..9e401b3 100644 --- a/test/test-ingest.cpp +++ b/test/test-ingest.cpp @@ -63,6 +63,12 @@ BOOST_DATA_TEST_CASE(QuotedStringsGood, {R"("-")", "-"}, {R"(".")", "."}, {R"("/url/path")", "/url/path"}, + {R"("hex\x41")", "hexA"}, + {R"("hex\x4141")", "hexA41"}, + {R"("hex\x41\x41")", "hexAA"}, + {R"("hex\t\x41")", "hex\tA"}, + {R"("/packages/dev-php/pecl-uploadprogress(),')\"((,,")", + R"LOG(/packages/dev-php/pecl-uploadprogress(),')"((,,)LOG"}, }), input, expected) { @@ -114,24 +120,6 @@ BOOST_DATA_TEST_CASE(QueryStringsBad, BOOST_TEST_DECORATOR(*boost::unit_test::timeout(1)) -BOOST_DATA_TEST_CASE(CLFStringsDecode, - boost::unit_test::data::make<WebStat::ParseData<std::string>>({ - {"", ""}, - {"plain", "plain"}, - {R"(hex\x41)", "hexA"}, - {R"(hex\x4141)", "hexA41"}, - {R"(hex\x41\x41)", "hexAA"}, - {R"(hex\t\x41)", "hex\tA"}, - }), - input, expected) -{ - std::string value {input}; - scn::scanner<WebStat::CLFString>::decode(value); - BOOST_CHECK_EQUAL(value, expected); -} - -BOOST_TEST_DECORATOR(*boost::unit_test::depends_on("CLFStringsDecode")) - BOOST_DATA_TEST_CASE(CLFStringsGood, boost::unit_test::data::make<WebStat::ParseData<WebStat::CLFString>>({ {R"("")", ""}, @@ -192,6 +180,22 @@ BOOST_DATA_TEST_CASE(ExtractFields, BOOST_CHECK_EQUAL(result->values(), expected); } +BOOST_AUTO_TEST_CASE(ExtractFieldsEdgeCasesUnparsable3580673700) +{ + const auto result = WebStat::Ingestor::scanLogLine( + R"LOG(gentoobrowse.randomdan.homeip.net 5.183.129.58 1759960912510520 GET "/packages/dev-php/pecl-uploadprogress(),')\"((,," "" HTTP/1.1 404 0 10051 "-" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36")LOG"); + BOOST_REQUIRE(result); + BOOST_CHECK_EQUAL(std::get<4>(result->values()), R"LOG(/packages/dev-php/pecl-uploadprogress(),')"((,,)LOG"); +} + +BOOST_AUTO_TEST_CASE(ExtractFieldsEdgeCasesUnparsable3603068405) +{ + const auto result = WebStat::Ingestor::scanLogLine( + R"LOG(gentoobrowse.randomdan.homeip.net 5.183.129.58 1759960912705682 GET "/packages/dev-php/pecl-uploadprogress'yqFSRA<'\">yuezhx" "" HTTP/1.1 404 0 19143 "-" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36")LOG"); + BOOST_REQUIRE(result); + BOOST_CHECK_EQUAL(std::get<4>(result->values()), R"LOG(/packages/dev-php/pecl-uploadprogress'yqFSRA<'">yuezhx)LOG"); +} + class TestIngestor : public WebStat::Ingestor { public: TestIngestor() : |