diff options
author | Dan Goodliffe <dan@randomdan.homeip.net> | 2018-03-07 19:33:36 +0000 |
---|---|---|
committer | Dan Goodliffe <dan@randomdan.homeip.net> | 2018-03-07 19:33:36 +0000 |
commit | 6eafb8d4bbc9bfdd225723dbd1ed902d722c7f71 (patch) | |
tree | 2b11754eede3d672d47310645db9f023131497a5 | |
parent | Store parser flags in service config (diff) | |
download | mirrorsearch-6eafb8d4bbc9bfdd225723dbd1ed902d722c7f71.tar.bz2 mirrorsearch-6eafb8d4bbc9bfdd225723dbd1ed902d722c7f71.tar.xz mirrorsearch-6eafb8d4bbc9bfdd225723dbd1ed902d722c7f71.zip |
Use curl instead of libxml2's networking as we need more control over HTTP requests
-rw-r--r-- | service/Jamfile.jam | 2 | ||||
-rw-r--r-- | service/apiImpl.cpp | 62 | ||||
-rw-r--r-- | service/main.cpp | 12 |
3 files changed, 70 insertions, 6 deletions
diff --git a/service/Jamfile.jam b/service/Jamfile.jam
index e7bfed9..8f15002 100644
--- a/service/Jamfile.jam
+++ b/service/Jamfile.jam
@@ -5,11 +5,13 @@ import testing ;
 lib boost_utf : : <name>boost_unit_test_framework ;
 lib dbpp-postgresql : : : : <include>/usr/include/dbpp-postgresql ;
 lib dryice : : : : <include>/usr/include/icetray ;
+lib curl ;
 
 lib mirrorsearch :
 	[ glob *.cpp *.ice sql/*.sql : test.cpp ]
 	:
 	<slicer>yes
+	<library>curl
 	<library>..//adhocutil
 	<library>..//dbppcore
 	<library>..//boost_system
diff --git a/service/apiImpl.cpp b/service/apiImpl.cpp
index 28d88b4..ab86d98 100644
--- a/service/apiImpl.cpp
+++ b/service/apiImpl.cpp
@@ -9,6 +9,10 @@
 #include <libxml/xpathInternals.h>
 #include <libxml/HTMLparser.h>
 #include <libxml/HTMLtree.h>
+#include <curl/curl.h>
+
+#define CESSO(curl, opt, expr) \
+	BOOST_VERIFY_MSG(CURLE_OK == curl_easy_setopt(curl.get(), opt, expr), "Failed setting option " #opt);
 
 namespace MirrorSearch {
 	SearchImpl::SearchImpl(IceTray::DatabasePoolPtr db) :
@@ -34,14 +38,62 @@ namespace MirrorSearch {
 		return std::bind(&libxmlErrorHandler<Fmt, P...>, std::placeholders::_1, p...);
 	}
 
+	template<typename Fmt, typename ... P>
+	void
+	curlErrorHandler(const std::string & fn, const char * errbuf, const P & ... p)
+	{
+		throw XmlError(Fmt::get(fn, errbuf, p...));
+	}
+	template<typename Fmt, typename ... P>
+	auto cEHB(const char * errbuf, const P & ... p)
+	{
+		return std::bind(&curlErrorHandler<Fmt, P...>, std::placeholders::_1, errbuf, p...);
+	}
+
 	typedef UPtr<xmlDoc> xmlDocSPtr;
 	typedef UPtr<xmlXPathContext> xmlXPathContextSPtr;
 	typedef UPtr<xmlXPathObject> xmlXPathObjectSPtr;
+	typedef UPtr<xmlParserCtxt> xmlParserCtxtSPtr;
+
+	typedef std::function<size_t(const char *, size_t)> CurlWriteCallback;
 
-	AdHocFormatter(Read, "Failed to read in %? (%?) \n[%?, %?]");
-	static auto getDoc(const ::std::string & url, int flags)
+	static size_t write_callback(char * ptr, size_t size, size_t nmemb, void * userdata)
 	{
-		return make_unique(htmlReadFile, xmlFreeDoc, lEHB<Read>(url, flags), url.c_str(), (const char*)NULL, flags);
+		return (*(MirrorSearch::CurlWriteCallback *)(userdata))(ptr, size * nmemb);
+	}
+
+	AdHocFormatter(Read, "Failed to read in %? (%?) [%?]");
+	UPtr<xmlDoc> getDoc(const SearchServicePtr & ss, const std::string & fn) {
+		auto fmt = AdHoc::Buffer::getFormat(ss->baseurl);
+		auto url = (*fmt % fn).str();
+		char errbuf[CURL_ERROR_SIZE] = "";
+
+		xmlParserCtxtSPtr ctx { nullptr, nullptr };
+
+		auto curl = make_unique(curl_easy_init, curl_easy_cleanup, cEHB<Read>(errbuf, url));
+		BOOST_ASSERT(curl);
+		CESSO(curl, CURLOPT_URL, url.c_str());
+		CESSO(curl, CURLOPT_WRITEFUNCTION, write_callback);
+		CurlWriteCallback cb = [&ctx, &url, &ss](auto data, auto size) {
+			if (!ctx) {
+				ctx = make_unique(htmlCreatePushParserCtxt, htmlFreeParserCtxt, lEHB<Read>(url),
+						(xmlSAXHandlerPtr)NULL, (void*)NULL, data, size, url.c_str(), XML_CHAR_ENCODING_NONE);
+				htmlCtxtUseOptions(ctx.get(), ss->parserflags);
+			}
+			else {
+				htmlParseChunk(ctx.get(), data, size, 0);
+			}
+			return size;
+		};
+		CESSO(curl, CURLOPT_WRITEDATA, &cb);
+		if (curl_easy_perform(curl.get()) != CURLE_OK) {
+			curlErrorHandler<Read>(failingFunction((void*)&curl_easy_perform), errbuf, url);
+		}
+		BOOST_VERIFY_MSG(ctx, "No ctx and no previous error should never happen.");
+		htmlParseChunk(ctx.get(), "", 0, 1);
+
+		UPtr<xmlDoc> doc = { ctx->myDoc, xmlFreeDoc };
+		return doc;
 	}
 
 	AdHocFormatter(XPathCtx, "Failed to create xpath context in %? \n(%?)");
@@ -62,9 +114,7 @@ namespace MirrorSearch {
 
 	void SearchImpl::callService(const ::std::string & fn, const SearchServicePtr & s, SearchHits & sh) const
 	{
-		auto fmt = AdHoc::Buffer::getFormat(s->baseurl);
-		auto url = (*fmt % fn).str();
-		auto doc = getDoc(url, s->parserflags);
+		auto doc = getDoc(s, fn);
 		auto xpathCtx = getXPathCxt(doc);
 		auto xpathObj = getXPathObj(s->listxpath, xpathCtx, xmlXPathObjectType::XPATH_NODESET);
 		log->messagebf(LOG::INFO, "%d nodes matched %s", xpathObj->nodesetval->nodeNr, s->listxpath);
diff --git a/service/main.cpp b/service/main.cpp
index 97bc89e..00563f4 100644
--- a/service/main.cpp
+++ b/service/main.cpp
@@ -2,10 +2,22 @@
 #include <Ice/ObjectAdapter.h>
 #include <icetrayService.h>
 #include "apiImpl.h"
+#include <curl/curl.h>
+#include <libxml/parser.h>
 
 namespace MirrorSearch {
 	class Api : public IceTray::Service {
 		public:
+			Api()
+			{
+				xmlInitParser();
+				curl_global_init(0);
+			}
+			~Api()
+			{
+				xmlCleanupParser();
+				curl_global_cleanup();
+			}
 			void addObjects(const std::string &, const Ice::CommunicatorPtr & ic, const Ice::StringSeq &, const Ice::ObjectAdapterPtr & adp) override
 			{
 				auto dbpool = getConnectionPool(ic, "postgresql", "MirrorSearch");