summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorDan Goodliffe <dan@randomdan.homeip.net>2026-01-18 01:36:06 +0000
committerDan Goodliffe <dan@randomdan.homeip.net>2026-01-18 01:36:06 +0000
commit04acfa679fd846ac829ded5562093b3766c85154 (patch)
treea9625eb96c8c4ed246b1f4d172c06d85df577dfe
parent34051da2f27ffa40d0b6d20ae891a497fe73bfe5 (diff)
downloadwebstat-04acfa679fd846ac829ded5562093b3766c85154.tar.bz2
webstat-04acfa679fd846ac829ded5562093b3766c85154.tar.xz
webstat-04acfa679fd846ac829ded5562093b3766c85154.zip
Process new field, content-type, in input streamHEADmain
-rw-r--r--src/ingestor.cpp11
-rw-r--r--src/ingestor.hpp2
-rw-r--r--src/logTypes.hpp1
-rw-r--r--src/schema.sql14
-rw-r--r--src/sql/accessLogInsert.sql4
-rw-r--r--test/test-ingest.cpp28
-rw-r--r--test/testing-util.cpp7
7 files changed, 42 insertions, 25 deletions
diff --git a/src/ingestor.cpp b/src/ingestor.cpp
index 81642be..da39c59 100644
--- a/src/ingestor.cpp
+++ b/src/ingestor.cpp
@@ -54,7 +54,8 @@ namespace WebStat {
{
static constexpr std::tuple<ToEntity<EntityType::VirtualHost>, std::identity, std::identity, std::identity,
ToEntity<EntityType::Path>, ToEntity<EntityType::QueryString>, std::identity, std::identity,
- std::identity, std::identity, ToEntity<EntityType::Referrer>, ToEntity<EntityType::UserAgent>>
+ std::identity, std::identity, ToEntity<EntityType::Referrer>, ToEntity<EntityType::UserAgent>,
+ ToEntity<EntityType::ContentType>>
ENTITY_TYPE_MAP;
static constexpr size_t VALUE_COUNT = std::tuple_size_v<Ingestor::ScanValues>;
static_assert(VALUE_COUNT == std::tuple_size_v<decltype(ENTITY_TYPE_MAP)>);
@@ -99,8 +100,9 @@ namespace WebStat {
unsigned int, // size : %B : 1234
unsigned int, // duration : %D : 1234
CLFString, // referrer : "%{Referer}i" : "https://google.com/whatever" or "-"
- CLFString // user_agent : "%{User-agent}i" : "Chromium v123.4" or "-"
- >(input, R"({} {} {} {:[A-Z]} {} {} {} {} {} {} {} {})");
+ CLFString, // user_agent : "%{User-agent}i" : "Chromium v123.4" or "-"
+ CLFString // content_type : "%{Content-type}o" : "text/plain" or "-"
+ >(input, R"({} {} {} {:[A-Z]} {} {} {} {} {} {} {} {} {})");
}
void
@@ -321,7 +323,7 @@ namespace WebStat {
Ingestor::NewEntityIds
Ingestor::storeEntities(DB::Connection * dbconn, const std::span<const std::optional<Entity>> values) const
{
- static constexpr std::array<std::pair<std::string_view, void (Ingestor::*)(const Entity &) const>, 8>
+ static constexpr std::array<std::pair<std::string_view, void (Ingestor::*)(const Entity &) const>, 9>
ENTITY_TYPE_VALUES {{
{"host", nullptr},
{"virtual_host", nullptr},
@@ -331,6 +333,7 @@ namespace WebStat {
{"user_agent", &Ingestor::onNewUserAgent},
{"unparsable_line", nullptr},
{"uninsertable_line", nullptr},
+ {"content_type", nullptr},
}};
auto insert = dbconn->modify(SQL::ENTITY_INSERT, SQL::ENTITY_INSERT_OPTS);
diff --git a/src/ingestor.hpp b/src/ingestor.hpp
index a19c8ec..67a7a15 100644
--- a/src/ingestor.hpp
+++ b/src/ingestor.hpp
@@ -43,7 +43,7 @@ namespace WebStat {
using ScanResult = decltype(scn::scan<std::string_view, std::string_view, uint64_t, std::string_view,
QuotedString, QueryString, std::string_view, unsigned short, unsigned int, unsigned int, CLFString,
- CLFString>(std::declval<std::string_view>(), ""));
+ CLFString, CLFString>(std::declval<std::string_view>(), ""));
using ScanValues = std::remove_cvref_t<decltype(std::declval<WebStat::Ingestor::ScanResult>()->values())>;
[[nodiscard]] static ScanResult scanLogLine(std::string_view);
diff --git a/src/logTypes.hpp b/src/logTypes.hpp
index f9395d1..71393b2 100644
--- a/src/logTypes.hpp
+++ b/src/logTypes.hpp
@@ -31,6 +31,7 @@ namespace WebStat {
UserAgent,
UnparsableLine,
UninsertableLine,
+ ContentType,
};
using Crc32Value = uint32_t;
diff --git a/src/schema.sql b/src/schema.sql
index 7648b79..8008b3c 100644
--- a/src/schema.sql
+++ b/src/schema.sql
@@ -28,7 +28,8 @@ CREATE TYPE entity AS ENUM(
'referrer',
'user_agent',
'unparsable_line',
- 'uninsertable_line'
+ 'uninsertable_line',
+ 'content_type'
);
CREATE TABLE entities(
@@ -55,13 +56,15 @@ CREATE TABLE access_log(
duration interval second(6) NOT NULL,
referrer oid,
user_agent oid,
+ content_type oid,
CONSTRAINT pk_access_log PRIMARY KEY (id),
CONSTRAINT fk_access_log_hostname FOREIGN KEY (hostname) REFERENCES entities(id),
CONSTRAINT fk_access_log_virtualhost FOREIGN KEY (virtual_host) REFERENCES entities(id),
CONSTRAINT fk_access_log_path FOREIGN KEY (path) REFERENCES entities(id),
CONSTRAINT fk_access_log_query_string FOREIGN KEY (query_string) REFERENCES entities(id),
CONSTRAINT fk_access_log_referrer FOREIGN KEY (referrer) REFERENCES entities(id),
- CONSTRAINT fk_access_log_user_agent FOREIGN KEY (user_agent) REFERENCES entities(id)
+ CONSTRAINT fk_access_log_user_agent FOREIGN KEY (user_agent) REFERENCES entities(id),
+ CONSTRAINT fk_access_log_content_type FOREIGN KEY (content_type) REFERENCES entities(id)
);
CREATE OR REPLACE VIEW access_log_view AS
@@ -85,7 +88,9 @@ SELECT
r.id referrer_id,
r.value referrer,
u.id user_agent_id,
- u.value user_agent
+ u.value user_agent,
+ c.id content_type_id,
+ c.value content_type
FROM
access_log l
LEFT OUTER JOIN entities h ON l.hostname = h.id
@@ -93,4 +98,5 @@ FROM
LEFT OUTER JOIN entities p ON l.path = p.id
LEFT OUTER JOIN entities q ON l.query_string = q.id
LEFT OUTER JOIN entities r ON l.referrer = r.id
- LEFT OUTER JOIN entities u ON l.user_agent = u.id;
+ LEFT OUTER JOIN entities u ON l.user_agent = u.id
+ LEFT OUTER JOIN entities c ON l.content_type = c.id;
diff --git a/src/sql/accessLogInsert.sql b/src/sql/accessLogInsert.sql
index 42f809b..518045e 100644
--- a/src/sql/accessLogInsert.sql
+++ b/src/sql/accessLogInsert.sql
@@ -1,3 +1,3 @@
INSERT INTO access_log(hostname, virtual_host, remoteip, request_time, method, path, query_string, protocol, status,
- size, duration, referrer, user_agent)
- VALUES (?, ?, ?, TO_TIMESTAMP(? / 1000000.0) at time zone 'utc', ?, ?, ?, ?, ?, ?, ? * '1us'::interval, ?, ?)
+ size, duration, referrer, user_agent, content_type)
+ VALUES (?, ?, ?, TO_TIMESTAMP(? / 1000000.0) at time zone 'utc', ?, ?, ?, ?, ?, ?, ? * '1us'::interval, ?, ?, ?)
diff --git a/test/test-ingest.cpp b/test/test-ingest.cpp
index efc7bc9..d523aab 100644
--- a/test/test-ingest.cpp
+++ b/test/test-ingest.cpp
@@ -151,9 +151,10 @@ BOOST_DATA_TEST_CASE(CLFStringsBad,
}
constexpr std::string_view LOGLINE1
- = R"LOG(git.randomdan.homeip.net 98.82.40.168 1755561576768318 GET "/repo/gentoobrowse-api/commit/gentoobrowse-api/unittests/fixtures/756569aa764177340726dd3d40b41d89b11b20c7/app-crypt/pdfcrack/Manifest" "?h=gentoobrowse-api-0.9.1&id=a2ed3fd30333721accd4b697bfcb6cc4165c7714" HTTP/1.1 200 1884 107791 "-" "Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; Amazonbot/0.1; +https://developer.amazon.com/support/amazonbot) Chrome/119.0.6045.214 Safari/537.36")LOG";
+ = R"LOG(git.randomdan.homeip.net 98.82.40.168 1755561576768318 GET "/repo/gentoobrowse-api/commit/gentoobrowse-api/unittests/fixtures/756569aa764177340726dd3d40b41d89b11b20c7/app-crypt/pdfcrack/Manifest" "?h=gentoobrowse-api-0.9.1&id=a2ed3fd30333721accd4b697bfcb6cc4165c7714" HTTP/1.1 200 1884 107791 "-" "Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; Amazonbot/0.1; +https://developer.amazon.com/support/amazonbot) Chrome/119.0.6045.214 Safari/537.36" "test/plain")LOG";
+constexpr std::string_view LOGLINE1_PARKED = "parked-237093379.log";
constexpr std::string_view LOGLINE2
- = R"LOG(www.randomdan.homeip.net 43.128.84.166 1755561575973204 GET "/app-dicts/myspell-et/Manifest" "" HTTP/1.1 200 312 10369 "https://google.com" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36")LOG";
+ = R"LOG(www.randomdan.homeip.net 43.128.84.166 1755561575973204 GET "/app-dicts/myspell-et/Manifest" "" HTTP/1.1 200 312 10369 "https://google.com" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36" "image/png")LOG";
BOOST_TEST_DECORATOR(*boost::unit_test::depends_on("QuotedStringsGood"))
BOOST_TEST_DECORATOR(*boost::unit_test::depends_on("QueryStringsGood"))
@@ -166,12 +167,14 @@ BOOST_DATA_TEST_CASE(ExtractFields,
R"(/repo/gentoobrowse-api/commit/gentoobrowse-api/unittests/fixtures/756569aa764177340726dd3d40b41d89b11b20c7/app-crypt/pdfcrack/Manifest)",
R"(h=gentoobrowse-api-0.9.1&id=a2ed3fd30333721accd4b697bfcb6cc4165c7714)", "HTTP/1.1",
200, 1884, 107791, std::nullopt,
- R"(Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; Amazonbot/0.1; +https://developer.amazon.com/support/amazonbot) Chrome/119.0.6045.214 Safari/537.36)"}},
+ R"(Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; Amazonbot/0.1; +https://developer.amazon.com/support/amazonbot) Chrome/119.0.6045.214 Safari/537.36)",
+ "test/plain"}},
{LOGLINE2,
{"www.randomdan.homeip.net", "43.128.84.166", 1755561575973204, "GET",
"/app-dicts/myspell-et/Manifest", std::nullopt, "HTTP/1.1", 200, 312, 10369,
"https://google.com",
- R"(Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36)"}},
+ R"(Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36)",
+ "image/png"}},
}),
input, expected)
{
@@ -183,7 +186,7 @@ BOOST_DATA_TEST_CASE(ExtractFields,
BOOST_AUTO_TEST_CASE(ExtractFieldsEdgeCasesUnparsable3580673700)
{
const auto result = WebStat::Ingestor::scanLogLine(
- R"LOG(gentoobrowse.randomdan.homeip.net 5.183.129.58 1759960912510520 GET "/packages/dev-php/pecl-uploadprogress(),')\"((,," "" HTTP/1.1 404 0 10051 "-" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36")LOG");
+ R"LOG(gentoobrowse.randomdan.homeip.net 5.183.129.58 1759960912510520 GET "/packages/dev-php/pecl-uploadprogress(),')\"((,," "" HTTP/1.1 404 0 10051 "-" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36" "-")LOG");
BOOST_REQUIRE(result);
BOOST_CHECK_EQUAL(std::get<4>(result->values()), R"LOG(/packages/dev-php/pecl-uploadprogress(),')"((,,)LOG");
}
@@ -191,7 +194,7 @@ BOOST_AUTO_TEST_CASE(ExtractFieldsEdgeCasesUnparsable3580673700)
BOOST_AUTO_TEST_CASE(ExtractFieldsEdgeCasesUnparsable3603068405)
{
const auto result = WebStat::Ingestor::scanLogLine(
- R"LOG(gentoobrowse.randomdan.homeip.net 5.183.129.58 1759960912705682 GET "/packages/dev-php/pecl-uploadprogress'yqFSRA<'\">yuezhx" "" HTTP/1.1 404 0 19143 "-" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36")LOG");
+ R"LOG(gentoobrowse.randomdan.homeip.net 5.183.129.58 1759960912705682 GET "/packages/dev-php/pecl-uploadprogress'yqFSRA<'\">yuezhx" "" HTTP/1.1 404 0 19143 "-" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36" "-")LOG");
BOOST_REQUIRE(result);
BOOST_CHECK_EQUAL(std::get<4>(result->values()), R"LOG(/packages/dev-php/pecl-uploadprogress'yqFSRA<'">yuezhx)LOG");
}
@@ -231,7 +234,8 @@ BOOST_DATA_TEST_CASE(StoreLogLine,
BOOST_CHECK_EQUAL(linesRead, 0);
BOOST_CHECK_EQUAL(linesParsed, 1);
BOOST_CHECK_EQUAL(linesDiscarded, 0);
- BOOST_CHECK_EQUAL(existingEntities.size(), 4);
+ BOOST_CHECK_EQUAL(linesParked, 0);
+ BOOST_CHECK_EQUAL(existingEntities.size(), 5);
}
BOOST_AUTO_TEST_CASE(StoreLog, *boost::unit_test::depends_on("I/StoreLogLine"))
@@ -249,7 +253,7 @@ BOOST_AUTO_TEST_CASE(ParkLogLine)
{
parkLogLine(LOGLINE1);
BOOST_CHECK_EQUAL(linesParked, 1);
- const auto path = settings.fallbackDir / "parked-3377916038.log";
+ const auto path = settings.fallbackDir / LOGLINE1_PARKED;
BOOST_TEST_INFO(path);
BOOST_REQUIRE(std::filesystem::exists(path));
BOOST_CHECK_EQUAL(std::filesystem::file_size(path), LOGLINE1.length());
@@ -275,7 +279,7 @@ BOOST_AUTO_TEST_CASE(IngestParked, *boost::unit_test::depends_on("I/ParkLogLine"
jobIngestParkedLines();
BOOST_CHECK_EQUAL(linesParsed, 1);
BOOST_CHECK_EQUAL(linesDiscarded, 0);
- BOOST_CHECK(!std::filesystem::exists(settings.fallbackDir / "parked-3377916038.log"));
+ BOOST_CHECK(!std::filesystem::exists(settings.fallbackDir / LOGLINE1_PARKED));
}
BOOST_AUTO_TEST_CASE(IngestParkedJob, *boost::unit_test::depends_on("I/IngestParked"))
@@ -299,7 +303,7 @@ BOOST_AUTO_TEST_CASE(IngestParkedJob, *boost::unit_test::depends_on("I/IngestPar
BOOST_CHECK_EQUAL(linesParsed, 1);
BOOST_CHECK_EQUAL(linesDiscarded, 0);
BOOST_CHECK_GE(lastRunIngestParkedLines, now);
- BOOST_CHECK(!std::filesystem::exists(settings.fallbackDir / "parked-3377916038.log"));
+ BOOST_CHECK(!std::filesystem::exists(settings.fallbackDir / LOGLINE1_PARKED));
}
BOOST_AUTO_TEST_CASE(JobErrorRescheduler, *boost::unit_test::depends_on("I/IngestParkedJob"))
@@ -307,9 +311,9 @@ BOOST_AUTO_TEST_CASE(JobErrorRescheduler, *boost::unit_test::depends_on("I/Inges
const auto now = JobLastRunTime::clock::now();
lastRunIngestParkedLines = now - settings.freqIngestParkedLines - 1s;
parkLogLine(LOGLINE1);
- std::filesystem::permissions(settings.fallbackDir / "parked-3377916038.log", std::filesystem::perms::owner_write);
+ std::filesystem::permissions(settings.fallbackDir / LOGLINE1_PARKED, std::filesystem::perms::owner_write);
runJobsIdle();
- BOOST_CHECK(std::filesystem::exists(settings.fallbackDir / "parked-3377916038.log"));
+ BOOST_CHECK(std::filesystem::exists(settings.fallbackDir / LOGLINE1_PARKED));
BOOST_CHECK_GE(lastRunIngestParkedLines, now - (settings.freqIngestParkedLines / 2) - 1s);
BOOST_CHECK_LE(lastRunIngestParkedLines, now - (settings.freqIngestParkedLines / 2) + 1s);
}
diff --git a/test/testing-util.cpp b/test/testing-util.cpp
index 010b2c6..6e75354 100644
--- a/test/testing-util.cpp
+++ b/test/testing-util.cpp
@@ -34,6 +34,7 @@ namespace WebStat {
std::vector<std::string> qss;
std::vector<std::string> refs;
std::vector<std::string> uas;
+ std::vector<std::string> ct;
};
Strings strings;
@@ -65,6 +66,7 @@ namespace WebStat {
{strings.qss, 100, getStrGen(1, 50)},
{strings.refs, 50, getStrGen(10, 50)},
{strings.uas, 10, getStrGen(50, 70)},
+ {strings.ct, 10, getStrGen(10, 20)},
}) {
std::generate_n(std::back_inserter(out), count, stringGenerator);
}
@@ -86,10 +88,11 @@ namespace WebStat {
std::ofstream logfile {path};
for (size_t line = 0; line < entries; ++line) {
- std::println(logfile, R"LOG({} {} {} GET "/{}" "?{}" HTTP/1.1 200 {} {} "{}" "{}")LOG",
+ std::println(logfile, R"LOG({} {} {} GET "/{}" "?{}" HTTP/1.1 200 {} {} "{}" "{}" "{}")LOG",
randomString(strings.vhosts), randomString(strings.ips), tick += tickDistrib(generator),
randomString(strings.paths), randomString(strings.qss), sizeDistrib(generator),
- durationDistrib(generator), randomString(strings.refs), randomString(strings.uas));
+ durationDistrib(generator), randomString(strings.refs), randomString(strings.uas),
+ randomString(strings.ct));
}
}