From 30a9d45f5322a63c2adf1de7a248ba3a9a0c3903 Mon Sep 17 00:00:00 2001 From: Dan Goodliffe Date: Sat, 20 Dec 2025 15:16:25 +0000 Subject: Add job for purging old access log entries from the database --- src/ingestor.cpp | 21 +++++++++++++++++++++ src/ingestor.hpp | 9 +++++++++ src/sql.cpp | 4 ++++ src/sql.hpp | 1 + src/sql/accessLogPurgeOld.sql | 11 +++++++++++ src/webstat_logger_main.cpp | 10 ++++++++++ test/test-ingest.cpp | 5 +++++ 7 files changed, 61 insertions(+) create mode 100644 src/sql/accessLogPurgeOld.sql diff --git a/src/ingestor.cpp b/src/ingestor.cpp index 0c9ce4e..44107ce 100644 --- a/src/ingestor.cpp +++ b/src/ingestor.cpp @@ -221,6 +221,7 @@ namespace WebStat { } }; runJobAsNeeded(&Ingestor::jobIngestParkedLines, lastRunIngestParkedLines, settings.freqIngestParkedLines); + runJobAsNeeded(&Ingestor::jobPurgeOldLogs, lastRunPurgeOldLogs, settings.freqPurgeOldLogs); } void @@ -260,6 +261,26 @@ namespace WebStat { std::filesystem::remove(path); } + unsigned int + Ingestor::jobPurgeOldLogs() + { + auto dbconn = dbpool->get(); + const auto stopAt = JobLastRunTime::clock::now() + settings.purgeDeleteMaxTime; + const auto purge = dbconn->modify(SQL::ACCESS_LOG_PURGE_OLD, SQL::ACCESS_LOG_PURGE_OLD_OPTS); + purge->bindParam(0, settings.purgeDeleteMax); + purge->bindParam(1, std::format("{} days", settings.purgeDaysToKeep)); + unsigned int purgedTotal {}; + while (stopAt > JobLastRunTime::clock::now()) { + const auto purged = purge->execute(); + purgedTotal += purged; + if (purged < settings.purgeDeleteMax) { + break; + } + std::this_thread::sleep_for(settings.purgeDeletePause); + } + return purgedTotal; + } + template Ingestor::NewEntities Ingestor::newEntities(const std::tuple & values) const diff --git a/src/ingestor.hpp b/src/ingestor.hpp index 3e25938..a19c8ec 100644 --- a/src/ingestor.hpp +++ b/src/ingestor.hpp @@ -17,6 +17,7 @@ namespace WebStat { using namespace std::chrono_literals; struct IngestorSettings : Settings { + // 
NOLINTBEGIN(readability-magic-numbers) std::string dbConnStr = "dbname=webstat user=webstat"; std::string userAgentAPI = "https://useragentstring.com"; std::filesystem::path fallbackDir = "/var/log/webstat"; @@ -24,6 +25,12 @@ namespace WebStat { unsigned int dbKeep = 2; int idleJobsAfter = duration_cast(1min).count(); minutes freqIngestParkedLines = 30min; + minutes freqPurgeOldLogs = 6h; + unsigned int purgeDaysToKeep = 61; // ~2 months + unsigned int purgeDeleteMax = 10'000; + minutes purgeDeleteMaxTime = 5min; + seconds purgeDeletePause = 3s; + // NOLINTEND(readability-magic-numbers) }; class Ingestor { @@ -48,6 +55,7 @@ namespace WebStat { void runJobsIdle(); void jobIngestParkedLines(); + unsigned int jobPurgeOldLogs(); template void storeLogLine(DB::Connection *, const std::tuple &) const; @@ -64,6 +72,7 @@ namespace WebStat { using JobLastRunTime = std::chrono::system_clock::time_point; JobLastRunTime lastRunIngestParkedLines; + JobLastRunTime lastRunPurgeOldLogs; private: static constexpr size_t MAX_NEW_ENTITIES = 6; diff --git a/src/sql.cpp b/src/sql.cpp index 9c0d992..da95f18 100644 --- a/src/sql.cpp +++ b/src/sql.cpp @@ -8,6 +8,9 @@ namespace WebStat::SQL { const std::string ACCESS_LOG_INSERT { #embed "sql/accessLogInsert.sql" + }; + const std::string ACCESS_LOG_PURGE_OLD { +#embed "sql/accessLogPurgeOld.sql" }; const std::string ENTITY_INSERT { #embed "sql/entityInsert.sql" @@ -21,6 +24,7 @@ namespace WebStat::SQL { #define HASH_OPTS(VAR) \ const DB::CommandOptionsPtr VAR##_OPTS = std::make_shared(std::hash {}(VAR)) HASH_OPTS(ACCESS_LOG_INSERT); + HASH_OPTS(ACCESS_LOG_PURGE_OLD); HASH_OPTS(ENTITY_INSERT); HASH_OPTS(ENTITY_UPDATE_DETAIL); HASH_OPTS(HOST_UPSERT); diff --git a/src/sql.hpp b/src/sql.hpp index f0dfb05..1a12823 100644 --- a/src/sql.hpp +++ b/src/sql.hpp @@ -9,6 +9,7 @@ namespace WebStat::SQL { extern const DB::CommandOptionsPtr Name##_OPTS EMBED_DECLARE(ACCESS_LOG_INSERT); + EMBED_DECLARE(ACCESS_LOG_PURGE_OLD); EMBED_DECLARE(ENTITY_INSERT); 
EMBED_DECLARE(ENTITY_UPDATE_DETAIL); EMBED_DECLARE(HOST_UPSERT); diff --git a/src/sql/accessLogPurgeOld.sql b/src/sql/accessLogPurgeOld.sql new file mode 100644 index 0000000..8379018 --- /dev/null +++ b/src/sql/accessLogPurgeOld.sql @@ -0,0 +1,11 @@ +WITH scope AS ( + SELECT id + FROM access_log + ORDER BY id + LIMIT ? +), scoperange AS ( + SELECT min(id) minid, max(id) maxid + FROM scope) +DELETE FROM access_log USING scoperange +WHERE request_time < CURRENT_DATE - ?::interval + AND access_log.id BETWEEN scoperange.minid AND scoperange.maxid diff --git a/src/webstat_logger_main.cpp b/src/webstat_logger_main.cpp index c859abf..7f4d9b4 100644 --- a/src/webstat_logger_main.cpp +++ b/src/webstat_logger_main.cpp @@ -59,6 +59,16 @@ main(int argc, char ** argv) "Run idle when there's no activity for this period (ms)") ("job.parked.freq", po::value(&settings.freqIngestParkedLines)->default_value(settings.freqIngestParkedLines), "How often to check for and import parked log lines") + ("job.purge.freq", po::value(&settings.freqPurgeOldLogs)->default_value(settings.freqPurgeOldLogs), + "How often to purge old access log entries from the database") + ("job.purge.days", po::value(&settings.purgeDaysToKeep)->default_value(settings.purgeDaysToKeep), + "How many days of access log entries to keep") + ("job.purge.max", po::value(&settings.purgeDeleteMax)->default_value(settings.purgeDeleteMax), + "Maximum number of access log entries to delete in a single operation") + ("job.purge.time", po::value(&settings.purgeDeleteMaxTime)->default_value(settings.purgeDeleteMaxTime), + "Maximum amount of time to spend purging old access log entries before continuing to ingest") + ("job.purge.pause", po::value(&settings.purgeDeletePause)->default_value(settings.purgeDeletePause), + "Time to pause for between repeated executions of a delete operation") ; // clang-format on po::variables_map optVars; diff --git a/test/test-ingest.cpp b/test/test-ingest.cpp index 7692234..9b77567 100644 --- 
a/test/test-ingest.cpp +++ b/test/test-ingest.cpp @@ -341,6 +341,11 @@ BOOST_AUTO_TEST_CASE(DiscardUnparsable) BOOST_CHECK_EQUAL_COLLECTIONS(rows.begin(), rows.end(), EXPECTED.begin(), EXPECTED.end()); } +BOOST_AUTO_TEST_CASE(PurgeOldJob) +{ + BOOST_CHECK_EQUAL(2, jobPurgeOldLogs()); +} + BOOST_AUTO_TEST_SUITE_END(); BOOST_AUTO_TEST_CASE(FetchRealUserAgentDetail, *boost::unit_test::disabled()) -- cgit v1.2.3