From 9652aed5c4711581917230c5dbcd38968cffc7e4 Mon Sep 17 00:00:00 2001 From: Dan Goodliffe Date: Fri, 7 Apr 2023 17:09:39 +0100 Subject: Validate sitemap and atom pages against XSDs --- gentoobrowse/src/Jamfile.jam | 2 +- gentoobrowse/src/atom.xsd | 426 +++++++++++++++++++++++++++++++++++++++ gentoobrowse/src/sitemap.xsd | 106 ++++++++++ gentoobrowse/src/test.cpp | 54 +++-- gentoobrowse/src/xml.xsd | 46 +++++ gentoobrowse/xslt/home-atom.xslt | 8 +- gentoobrowse/xslt/news-atom.xslt | 8 +- gentoobrowse/xslt/user-atom.xslt | 8 +- 8 files changed, 628 insertions(+), 30 deletions(-) create mode 100644 gentoobrowse/src/atom.xsd create mode 100644 gentoobrowse/src/sitemap.xsd create mode 100644 gentoobrowse/src/xml.xsd diff --git a/gentoobrowse/src/Jamfile.jam b/gentoobrowse/src/Jamfile.jam index c39ee70..af99f1e 100644 --- a/gentoobrowse/src/Jamfile.jam +++ b/gentoobrowse/src/Jamfile.jam @@ -83,7 +83,7 @@ path-constant me : . ; run test.cpp : -- : - [ sequence.insertion-sort [ glob ../xslt/*.xslt ] ] + [ sequence.insertion-sort [ glob ../xslt/*.xslt ] [ glob *.xsd ] ] : BOOST_TEST_DYN_LINK ROOT=\"$(me)\" diff --git a/gentoobrowse/src/atom.xsd b/gentoobrowse/src/atom.xsd new file mode 100644 index 0000000..f59dfeb --- /dev/null +++ b/gentoobrowse/src/atom.xsd @@ -0,0 +1,426 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + The "atom:link" element defines a reference from an + entry or feed to a Web resource. This specification + assigns no + meaning to the content (if any) of this + element. + + + + + + + + + + + + + + + + + + + + + + + + + atom:source is used to preserve metadata of a feed + when + an entry is copied from a feed to another feed. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + The "atom:title" element is a Text construct that + conveys a human- readable title for an entry or feed. + atomTitle = + element atom:title { atomTextConstruct }. + + + + + + + + The "atom:updated" element is a Date construct + indicating the most recent instant in time when an entry + or feed was + modified in a way the publisher considers + significant. Therefore, not + all modifications + necessarily result in a changed atom:updated value. + atomUpdated = element atom:updated { atomDateConstruct + }. Publishers + MAY change the value of this element over + time. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/gentoobrowse/src/sitemap.xsd b/gentoobrowse/src/sitemap.xsd new file mode 100644 index 0000000..36d7a95 --- /dev/null +++ b/gentoobrowse/src/sitemap.xsd @@ -0,0 +1,106 @@ + + + + + XML Schema for Sitemap files. + Last Modifed 2008-03-26 + + + + + + Container for a set of up to 50,000 document elements. + This is the root element of the XML file. + + + + + + + + + + + + + Container for the data needed to describe a document to crawl. + + + + + + + + + + + + + + REQUIRED: The location URI of a document. + The URI must conform to RFC 2396 (http://www.ietf.org/rfc/rfc2396.txt). + + + + + + + + + + + OPTIONAL: The date the document was last modified. The date must conform + to the W3C DATETIME format (http://www.w3.org/TR/NOTE-datetime). + Example: 2005-05-10 + Lastmod may also contain a timestamp. + Example: 2005-05-10T17:33:30+08:00 + + + + + + + + + + + + + + + OPTIONAL: Indicates how frequently the content at a particular URL is + likely to change. The value "always" should be used to describe + documents that change each time they are accessed. The value "never" + should be used to describe archived URLs. Please note that web + crawlers may not necessarily crawl pages marked "always" more often. + Consider this element as a friendly suggestion and not a command. + + + + + + + + + + + + + + + + OPTIONAL: The priority of a particular URL relative to other pages + on the same site. The value for this element is a number between + 0.0 and 1.0 where 0.0 identifies the lowest priority page(s). + The default priority of a page is 0.5. Priority is used to select + between pages on your site. Setting a priority of 1.0 for all URLs + will not help you, as the relative priority of pages on your site + is what will be considered. + + + + + + + + diff --git a/gentoobrowse/src/test.cpp b/gentoobrowse/src/test.cpp index fd4166b..01ffa76 100644 --- a/gentoobrowse/src/test.cpp +++ b/gentoobrowse/src/test.cpp @@ -6,6 +6,7 @@ #include #include #include +#include #include #include @@ -36,13 +37,16 @@ public: hdr["Accept-Encoding"] = "gzip, deflate, sdch"; hdr["Accept-Language"] = "en-GB,en;q=0.8"; } +}; +template class ChromiumRequestT : public ChromiumRequest { + using ChromiumRequest::ChromiumRequest; - virtual void standardAssertions(const char * name) = 0; + virtual T standardAssertions(const char * name) = 0; }; -class ChromiumRequestHtml : public ChromiumRequest { +class ChromiumRequestHtml : public ChromiumRequestT<> { public: - ChromiumRequestHtml(const Core * c, HttpMethod m, const std::string & p) : ChromiumRequest(c, m, p) + ChromiumRequestHtml(const Core * c, HttpMethod m, const std::string & p) : ChromiumRequestT(c, m, p) { hdr["Accept"] = "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"; } @@ -74,9 +78,9 @@ public: } }; -class ChromiumRequest404 : public ChromiumRequest { +class ChromiumRequest404 : public ChromiumRequestT<> { public: - ChromiumRequest404(const Core * c, HttpMethod m, const std::string & p) : ChromiumRequest(c, m, p) { } + ChromiumRequest404(const Core * c, HttpMethod m, const std::string & p) : ChromiumRequestT(c, m, p) { } virtual void standardAssertions(const char *) override @@ -86,11 +90,11 @@ public: } }; -class ChromiumRequestDefaultXml : public ChromiumRequest { +class ChromiumRequestDefaultXml : public ChromiumRequestT> { public: - using ChromiumRequest::ChromiumRequest; + using ChromiumRequestT::ChromiumRequestT; - virtual void + virtual std::unique_ptr standardAssertions(const char *) override { auto h = getResponseHeaders(); @@ -98,8 +102,17 @@ public: BOOST_REQUIRE_EQUAL(h["Content-Type"], "application/xml"); BOOST_TEST_INFO(this->output.view()); BOOST_REQUIRE_NE(this->output.view().find("output); + auto p = std::make_unique(); + p->parse_stream(this->output); + return p; + } + + void + xsdAssertions(const char * name, const std::filesystem::path & xsd) + { + auto doc = standardAssertions(name); + xmlpp::XsdValidator validator {xsd}; + validator.validate(doc->get_document()); } }; @@ -111,14 +124,14 @@ public: } }; -class ChromiumRequestAtom : public ChromiumRequest { +class ChromiumRequestAtom : public ChromiumRequestXml { public: - ChromiumRequestAtom(const Core * c, HttpMethod m, const std::string & p) : ChromiumRequest(c, m, p) + ChromiumRequestAtom(const Core * c, HttpMethod m, const std::string & p) : ChromiumRequestXml(c, m, p) { hdr["Accept"] = "application/atom+xml"; } - virtual void + virtual std::unique_ptr standardAssertions(const char *) override { auto h = getResponseHeaders(); @@ -126,8 +139,9 @@ public: BOOST_REQUIRE_EQUAL(h["Content-Type"], "application/atom+xml"); BOOST_TEST_INFO(this->output.view()); BOOST_REQUIRE_NE(this->output.view().find("output); + auto p = std::make_unique(); + p->parse_stream(this->output); + return p; } }; @@ -261,35 +275,35 @@ BOOST_AUTO_TEST_CASE(search_sitemap) { ChromiumRequestXml request(this, HttpMethod::GET, "/sitemap.xml"); process(&request); - request.standardAssertions(typeid(*this).name()); + request.xsdAssertions(typeid(*this).name(), rootDir / "sitemap.xsd"); } BOOST_AUTO_TEST_CASE(search_sitemap_dfl) { ChromiumRequestDefaultXml request(this, HttpMethod::GET, "/sitemap.xml"); process(&request); - request.standardAssertions(typeid(*this).name()); + request.xsdAssertions(typeid(*this).name(), rootDir / "sitemap.xsd"); } BOOST_AUTO_TEST_CASE(home_atom) { ChromiumRequestAtom request(this, HttpMethod::GET, "/"); process(&request); - request.standardAssertions(typeid(*this).name()); + request.xsdAssertions(typeid(*this).name(), rootDir / "atom.xsd"); } BOOST_AUTO_TEST_CASE(news_atom) { ChromiumRequestAtom request(this, HttpMethod::GET, "/news"); process(&request); - request.standardAssertions(typeid(*this).name()); + request.xsdAssertions(typeid(*this).name(), rootDir / "atom.xsd"); } BOOST_AUTO_TEST_CASE(user_atom) { ChromiumRequestAtom request(this, HttpMethod::GET, "/atom/randomdan"); process(&request); - request.standardAssertions(typeid(*this).name()); + request.xsdAssertions(typeid(*this).name(), rootDir / "atom.xsd"); } BOOST_AUTO_TEST_SUITE_END(); diff --git a/gentoobrowse/src/xml.xsd b/gentoobrowse/src/xml.xsd new file mode 100644 index 0000000..0ca0876 --- /dev/null +++ b/gentoobrowse/src/xml.xsd @@ -0,0 +1,46 @@ + + + + + + + + + + + + + + + + + + + + + + + + + See http://www.w3.org/TR/xmlbase/ for + information about this attribute. + + + + + See http://www.w3.org/TR/xml-id/ for + information about this attribute. + + + + + + + + + diff --git a/gentoobrowse/xslt/home-atom.xslt b/gentoobrowse/xslt/home-atom.xslt index 5cd854a..1f80744 100644 --- a/gentoobrowse/xslt/home-atom.xslt +++ b/gentoobrowse/xslt/home-atom.xslt @@ -57,9 +57,11 @@ - - - + + + + + diff --git a/gentoobrowse/xslt/news-atom.xslt b/gentoobrowse/xslt/news-atom.xslt index bf4350d..11c2006 100644 --- a/gentoobrowse/xslt/news-atom.xslt +++ b/gentoobrowse/xslt/news-atom.xslt @@ -69,9 +69,11 @@ - - - + + + + + diff --git a/gentoobrowse/xslt/user-atom.xslt b/gentoobrowse/xslt/user-atom.xslt index 32b36e9..b52dc78 100644 --- a/gentoobrowse/xslt/user-atom.xslt +++ b/gentoobrowse/xslt/user-atom.xslt @@ -60,9 +60,11 @@ - - - + + + + + -- cgit v1.2.3