diff options
| -rw-r--r-- | src/ingestor.cpp | 11 | ||||
| -rw-r--r-- | src/ingestor.hpp | 2 | ||||
| -rw-r--r-- | src/logTypes.hpp | 1 | ||||
| -rw-r--r-- | src/schema.sql | 14 | ||||
| -rw-r--r-- | src/sql/accessLogInsert.sql | 4 | ||||
| -rw-r--r-- | test/test-ingest.cpp | 28 | ||||
| -rw-r--r-- | test/testing-util.cpp | 7 |
7 files changed, 42 insertions, 25 deletions
diff --git a/src/ingestor.cpp b/src/ingestor.cpp index 81642be..da39c59 100644 --- a/src/ingestor.cpp +++ b/src/ingestor.cpp @@ -54,7 +54,8 @@ namespace WebStat { { static constexpr std::tuple<ToEntity<EntityType::VirtualHost>, std::identity, std::identity, std::identity, ToEntity<EntityType::Path>, ToEntity<EntityType::QueryString>, std::identity, std::identity, - std::identity, std::identity, ToEntity<EntityType::Referrer>, ToEntity<EntityType::UserAgent>> + std::identity, std::identity, ToEntity<EntityType::Referrer>, ToEntity<EntityType::UserAgent>, + ToEntity<EntityType::ContentType>> ENTITY_TYPE_MAP; static constexpr size_t VALUE_COUNT = std::tuple_size_v<Ingestor::ScanValues>; static_assert(VALUE_COUNT == std::tuple_size_v<decltype(ENTITY_TYPE_MAP)>); @@ -99,8 +100,9 @@ namespace WebStat { unsigned int, // size : %B : 1234 unsigned int, // duration : %D : 1234 CLFString, // referrer : "%{Referer}i" : "https://google.com/whatever" or "-" - CLFString // user_agent : "%{User-agent}i" : "Chromium v123.4" or "-" - >(input, R"({} {} {} {:[A-Z]} {} {} {} {} {} {} {} {})"); + CLFString, // user_agent : "%{User-agent}i" : "Chromium v123.4" or "-" + CLFString // content_type : "%{Content-type}o" : "test/plain" or "-" + >(input, R"({} {} {} {:[A-Z]} {} {} {} {} {} {} {} {} {})"); } void @@ -321,7 +323,7 @@ namespace WebStat { Ingestor::NewEntityIds Ingestor::storeEntities(DB::Connection * dbconn, const std::span<const std::optional<Entity>> values) const { - static constexpr std::array<std::pair<std::string_view, void (Ingestor::*)(const Entity &) const>, 8> + static constexpr std::array<std::pair<std::string_view, void (Ingestor::*)(const Entity &) const>, 9> ENTITY_TYPE_VALUES {{ {"host", nullptr}, {"virtual_host", nullptr}, @@ -331,6 +333,7 @@ namespace WebStat { {"user_agent", &Ingestor::onNewUserAgent}, {"unparsable_line", nullptr}, {"uninsertable_line", nullptr}, + {"content_type", nullptr}, }}; auto insert = dbconn->modify(SQL::ENTITY_INSERT, 
SQL::ENTITY_INSERT_OPTS); diff --git a/src/ingestor.hpp b/src/ingestor.hpp index a19c8ec..67a7a15 100644 --- a/src/ingestor.hpp +++ b/src/ingestor.hpp @@ -43,7 +43,7 @@ namespace WebStat { using ScanResult = decltype(scn::scan<std::string_view, std::string_view, uint64_t, std::string_view, QuotedString, QueryString, std::string_view, unsigned short, unsigned int, unsigned int, CLFString, - CLFString>(std::declval<std::string_view>(), "")); + CLFString, CLFString>(std::declval<std::string_view>(), "")); using ScanValues = std::remove_cvref_t<decltype(std::declval<WebStat::Ingestor::ScanResult>()->values())>; [[nodiscard]] static ScanResult scanLogLine(std::string_view); diff --git a/src/logTypes.hpp b/src/logTypes.hpp index f9395d1..71393b2 100644 --- a/src/logTypes.hpp +++ b/src/logTypes.hpp @@ -31,6 +31,7 @@ namespace WebStat { UserAgent, UnparsableLine, UninsertableLine, + ContentType, }; using Crc32Value = uint32_t; diff --git a/src/schema.sql b/src/schema.sql index 7648b79..8008b3c 100644 --- a/src/schema.sql +++ b/src/schema.sql @@ -28,7 +28,8 @@ CREATE TYPE entity AS ENUM( 'referrer', 'user_agent', 'unparsable_line', - 'uninsertable_line' + 'uninsertable_line', + 'content_type' ); CREATE TABLE entities( @@ -55,13 +56,15 @@ CREATE TABLE access_log( duration interval second(6) NOT NULL, referrer oid, user_agent oid, + content_type oid, CONSTRAINT pk_access_log PRIMARY KEY (id), CONSTRAINT fk_access_log_hostname FOREIGN KEY (hostname) REFERENCES entities(id), CONSTRAINT fk_access_log_virtualhost FOREIGN KEY (virtual_host) REFERENCES entities(id), CONSTRAINT fk_access_log_path FOREIGN KEY (path) REFERENCES entities(id), CONSTRAINT fk_access_log_query_string FOREIGN KEY (query_string) REFERENCES entities(id), CONSTRAINT fk_access_log_referrer FOREIGN KEY (referrer) REFERENCES entities(id), - CONSTRAINT fk_access_log_user_agent FOREIGN KEY (user_agent) REFERENCES entities(id) + CONSTRAINT fk_access_log_user_agent FOREIGN KEY (user_agent) REFERENCES entities(id), + 
CONSTRAINT fk_access_log_content_type FOREIGN KEY (content_type) REFERENCES entities(id) ); CREATE OR REPLACE VIEW access_log_view AS @@ -85,7 +88,9 @@ SELECT r.id referrer_id, r.value referrer, u.id user_agent_id, - u.value user_agent + u.value user_agent, + c.id content_type_id, + c.value content_type FROM access_log l LEFT OUTER JOIN entities h ON l.hostname = h.id @@ -93,4 +98,5 @@ FROM LEFT OUTER JOIN entities p ON l.path = p.id LEFT OUTER JOIN entities q ON l.query_string = q.id LEFT OUTER JOIN entities r ON l.referrer = r.id - LEFT OUTER JOIN entities u ON l.user_agent = u.id; + LEFT OUTER JOIN entities u ON l.user_agent = u.id + LEFT OUTER JOIN entities c ON l.content_type = c.id; diff --git a/src/sql/accessLogInsert.sql b/src/sql/accessLogInsert.sql index 42f809b..518045e 100644 --- a/src/sql/accessLogInsert.sql +++ b/src/sql/accessLogInsert.sql @@ -1,3 +1,3 @@ INSERT INTO access_log(hostname, virtual_host, remoteip, request_time, method, path, query_string, protocol, status, - size, duration, referrer, user_agent) - VALUES (?, ?, ?, TO_TIMESTAMP(? / 1000000.0) at time zone 'utc', ?, ?, ?, ?, ?, ?, ? * '1us'::interval, ?, ?) + size, duration, referrer, user_agent, content_type) + VALUES (?, ?, ?, TO_TIMESTAMP(? / 1000000.0) at time zone 'utc', ?, ?, ?, ?, ?, ?, ? * '1us'::interval, ?, ?, ?) 
diff --git a/test/test-ingest.cpp b/test/test-ingest.cpp index efc7bc9..d523aab 100644 --- a/test/test-ingest.cpp +++ b/test/test-ingest.cpp @@ -151,9 +151,10 @@ BOOST_DATA_TEST_CASE(CLFStringsBad, } constexpr std::string_view LOGLINE1 - = R"LOG(git.randomdan.homeip.net 98.82.40.168 1755561576768318 GET "/repo/gentoobrowse-api/commit/gentoobrowse-api/unittests/fixtures/756569aa764177340726dd3d40b41d89b11b20c7/app-crypt/pdfcrack/Manifest" "?h=gentoobrowse-api-0.9.1&id=a2ed3fd30333721accd4b697bfcb6cc4165c7714" HTTP/1.1 200 1884 107791 "-" "Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; Amazonbot/0.1; +https://developer.amazon.com/support/amazonbot) Chrome/119.0.6045.214 Safari/537.36")LOG"; + = R"LOG(git.randomdan.homeip.net 98.82.40.168 1755561576768318 GET "/repo/gentoobrowse-api/commit/gentoobrowse-api/unittests/fixtures/756569aa764177340726dd3d40b41d89b11b20c7/app-crypt/pdfcrack/Manifest" "?h=gentoobrowse-api-0.9.1&id=a2ed3fd30333721accd4b697bfcb6cc4165c7714" HTTP/1.1 200 1884 107791 "-" "Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; Amazonbot/0.1; +https://developer.amazon.com/support/amazonbot) Chrome/119.0.6045.214 Safari/537.36" "test/plain")LOG"; +constexpr std::string_view LOGLINE1_PARKED = "parked-237093379.log"; constexpr std::string_view LOGLINE2 - = R"LOG(www.randomdan.homeip.net 43.128.84.166 1755561575973204 GET "/app-dicts/myspell-et/Manifest" "" HTTP/1.1 200 312 10369 "https://google.com" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36")LOG"; + = R"LOG(www.randomdan.homeip.net 43.128.84.166 1755561575973204 GET "/app-dicts/myspell-et/Manifest" "" HTTP/1.1 200 312 10369 "https://google.com" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36" "image/png")LOG"; BOOST_TEST_DECORATOR(*boost::unit_test::depends_on("QuotedStringsGood")) 
BOOST_TEST_DECORATOR(*boost::unit_test::depends_on("QueryStringsGood")) @@ -166,12 +167,14 @@ BOOST_DATA_TEST_CASE(ExtractFields, R"(/repo/gentoobrowse-api/commit/gentoobrowse-api/unittests/fixtures/756569aa764177340726dd3d40b41d89b11b20c7/app-crypt/pdfcrack/Manifest)", R"(h=gentoobrowse-api-0.9.1&id=a2ed3fd30333721accd4b697bfcb6cc4165c7714)", "HTTP/1.1", 200, 1884, 107791, std::nullopt, - R"(Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; Amazonbot/0.1; +https://developer.amazon.com/support/amazonbot) Chrome/119.0.6045.214 Safari/537.36)"}}, + R"(Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; Amazonbot/0.1; +https://developer.amazon.com/support/amazonbot) Chrome/119.0.6045.214 Safari/537.36)", + "test/plain"}}, {LOGLINE2, {"www.randomdan.homeip.net", "43.128.84.166", 1755561575973204, "GET", "/app-dicts/myspell-et/Manifest", std::nullopt, "HTTP/1.1", 200, 312, 10369, "https://google.com", - R"(Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36)"}}, + R"(Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36)", + "image/png"}}, }), input, expected) { @@ -183,7 +186,7 @@ BOOST_DATA_TEST_CASE(ExtractFields, BOOST_AUTO_TEST_CASE(ExtractFieldsEdgeCasesUnparsable3580673700) { const auto result = WebStat::Ingestor::scanLogLine( - R"LOG(gentoobrowse.randomdan.homeip.net 5.183.129.58 1759960912510520 GET "/packages/dev-php/pecl-uploadprogress(),')\"((,," "" HTTP/1.1 404 0 10051 "-" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36")LOG"); + R"LOG(gentoobrowse.randomdan.homeip.net 5.183.129.58 1759960912510520 GET "/packages/dev-php/pecl-uploadprogress(),')\"((,," "" HTTP/1.1 404 0 10051 "-" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36" "-")LOG"); BOOST_REQUIRE(result); 
BOOST_CHECK_EQUAL(std::get<4>(result->values()), R"LOG(/packages/dev-php/pecl-uploadprogress(),')"((,,)LOG"); } @@ -191,7 +194,7 @@ BOOST_AUTO_TEST_CASE(ExtractFieldsEdgeCasesUnparsable3580673700) BOOST_AUTO_TEST_CASE(ExtractFieldsEdgeCasesUnparsable3603068405) { const auto result = WebStat::Ingestor::scanLogLine( - R"LOG(gentoobrowse.randomdan.homeip.net 5.183.129.58 1759960912705682 GET "/packages/dev-php/pecl-uploadprogress'yqFSRA<'\">yuezhx" "" HTTP/1.1 404 0 19143 "-" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36")LOG"); + R"LOG(gentoobrowse.randomdan.homeip.net 5.183.129.58 1759960912705682 GET "/packages/dev-php/pecl-uploadprogress'yqFSRA<'\">yuezhx" "" HTTP/1.1 404 0 19143 "-" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36" "-")LOG"); BOOST_REQUIRE(result); BOOST_CHECK_EQUAL(std::get<4>(result->values()), R"LOG(/packages/dev-php/pecl-uploadprogress'yqFSRA<'">yuezhx)LOG"); } @@ -231,7 +234,8 @@ BOOST_DATA_TEST_CASE(StoreLogLine, BOOST_CHECK_EQUAL(linesRead, 0); BOOST_CHECK_EQUAL(linesParsed, 1); BOOST_CHECK_EQUAL(linesDiscarded, 0); - BOOST_CHECK_EQUAL(existingEntities.size(), 4); + BOOST_CHECK_EQUAL(linesParked, 0); + BOOST_CHECK_EQUAL(existingEntities.size(), 5); } BOOST_AUTO_TEST_CASE(StoreLog, *boost::unit_test::depends_on("I/StoreLogLine")) @@ -249,7 +253,7 @@ BOOST_AUTO_TEST_CASE(ParkLogLine) { parkLogLine(LOGLINE1); BOOST_CHECK_EQUAL(linesParked, 1); - const auto path = settings.fallbackDir / "parked-3377916038.log"; + const auto path = settings.fallbackDir / LOGLINE1_PARKED; BOOST_TEST_INFO(path); BOOST_REQUIRE(std::filesystem::exists(path)); BOOST_CHECK_EQUAL(std::filesystem::file_size(path), LOGLINE1.length()); @@ -275,7 +279,7 @@ BOOST_AUTO_TEST_CASE(IngestParked, *boost::unit_test::depends_on("I/ParkLogLine" jobIngestParkedLines(); BOOST_CHECK_EQUAL(linesParsed, 1); BOOST_CHECK_EQUAL(linesDiscarded, 0); - 
BOOST_CHECK(!std::filesystem::exists(settings.fallbackDir / "parked-3377916038.log")); + BOOST_CHECK(!std::filesystem::exists(settings.fallbackDir / LOGLINE1_PARKED)); } BOOST_AUTO_TEST_CASE(IngestParkedJob, *boost::unit_test::depends_on("I/IngestParked")) @@ -299,7 +303,7 @@ BOOST_AUTO_TEST_CASE(IngestParkedJob, *boost::unit_test::depends_on("I/IngestPar BOOST_CHECK_EQUAL(linesParsed, 1); BOOST_CHECK_EQUAL(linesDiscarded, 0); BOOST_CHECK_GE(lastRunIngestParkedLines, now); - BOOST_CHECK(!std::filesystem::exists(settings.fallbackDir / "parked-3377916038.log")); + BOOST_CHECK(!std::filesystem::exists(settings.fallbackDir / LOGLINE1_PARKED)); } BOOST_AUTO_TEST_CASE(JobErrorRescheduler, *boost::unit_test::depends_on("I/IngestParkedJob")) @@ -307,9 +311,9 @@ BOOST_AUTO_TEST_CASE(JobErrorRescheduler, *boost::unit_test::depends_on("I/Inges const auto now = JobLastRunTime::clock::now(); lastRunIngestParkedLines = now - settings.freqIngestParkedLines - 1s; parkLogLine(LOGLINE1); - std::filesystem::permissions(settings.fallbackDir / "parked-3377916038.log", std::filesystem::perms::owner_write); + std::filesystem::permissions(settings.fallbackDir / LOGLINE1_PARKED, std::filesystem::perms::owner_write); runJobsIdle(); - BOOST_CHECK(std::filesystem::exists(settings.fallbackDir / "parked-3377916038.log")); + BOOST_CHECK(std::filesystem::exists(settings.fallbackDir / LOGLINE1_PARKED)); BOOST_CHECK_GE(lastRunIngestParkedLines, now - (settings.freqIngestParkedLines / 2) - 1s); BOOST_CHECK_LE(lastRunIngestParkedLines, now - (settings.freqIngestParkedLines / 2) + 1s); } diff --git a/test/testing-util.cpp b/test/testing-util.cpp index 010b2c6..6e75354 100644 --- a/test/testing-util.cpp +++ b/test/testing-util.cpp @@ -34,6 +34,7 @@ namespace WebStat { std::vector<std::string> qss; std::vector<std::string> refs; std::vector<std::string> uas; + std::vector<std::string> ct; }; Strings strings; @@ -65,6 +66,7 @@ namespace WebStat { {strings.qss, 100, getStrGen(1, 50)}, {strings.refs, 50, 
getStrGen(10, 50)}, {strings.uas, 10, getStrGen(50, 70)}, + {strings.ct, 10, getStrGen(10, 20)}, }) { std::generate_n(std::back_inserter(out), count, stringGenerator); } @@ -86,10 +88,11 @@ namespace WebStat { std::ofstream logfile {path}; for (size_t line = 0; line < entries; ++line) { - std::println(logfile, R"LOG({} {} {} GET "/{}" "?{}" HTTP/1.1 200 {} {} "{}" "{}")LOG", + std::println(logfile, R"LOG({} {} {} GET "/{}" "?{}" HTTP/1.1 200 {} {} "{}" "{}" "{}")LOG", randomString(strings.vhosts), randomString(strings.ips), tick += tickDistrib(generator), randomString(strings.paths), randomString(strings.qss), sizeDistrib(generator), - durationDistrib(generator), randomString(strings.refs), randomString(strings.uas)); + durationDistrib(generator), randomString(strings.refs), randomString(strings.uas), + randomString(strings.ct)); } } |
