From a4f38b0d7c7b2e465d8840abd508a88eb59d8784 Mon Sep 17 00:00:00 2001
From: randomdan
Date: Tue, 10 Aug 2010 23:19:58 +0000
Subject: Minor fixes to text parsers Add a URL base row implementation

---
Review notes (not part of the commit message):
 - urlRows: release the g_convert() buffer with g_free() and handle a failed conversion
 - urlRows: buffer a multi-byte character split across two CURL chunks
 - urlRows: keep the last header field as a column; blank lines no longer emit a row
 - urlRows: addColumn() no longer reuses an iterator invalidated by erase()

 project2/Jamfile.jam  |   3 +
 project2/fileRows.cpp |   5 +-
 project2/iterate.cpp  |   3 +
 project2/urlRows.cpp  | 214 ++++++++++++++++++++++++++++++++++++++++++++++++++
 project2/urlRows.h    |  68 ++++++++++++++++
 project2/view.cpp     |   2 +
 6 files changed, 293 insertions(+), 2 deletions(-)
 create mode 100644 project2/urlRows.cpp
 create mode 100644 project2/urlRows.h

diff --git a/project2/Jamfile.jam b/project2/Jamfile.jam
index e5633a9..ac82e15 100644
--- a/project2/Jamfile.jam
+++ b/project2/Jamfile.jam
@@ -14,6 +14,7 @@ lib boost_regex : : <name>boost_regex ;
 lib boost_filesystem : : <name>boost_filesystem ;
 lib cgicc : : <name>cgicc ;
 lib esmtp : : <name>esmtp ;
+lib curl : : <name>curl ;
 
 exe p2web :
 	libxmlpp
@@ -28,6 +29,7 @@ exe p2web :
 	odbc
 	esmtp
 	cgicc
+	curl
 	fcgi++
 	fcgi ;
 
@@ -43,4 +45,5 @@ exe p2console :
 	boost_regex
 	boost_filesystem
 	odbc
+	curl
 	esmtp ;
diff --git a/project2/fileRows.cpp b/project2/fileRows.cpp
index 955767c..d9c26bc 100644
--- a/project2/fileRows.cpp
+++ b/project2/fileRows.cpp
@@ -51,10 +51,11 @@ _FileRows::execute() const
 					if (prevWasQuote) {
 						tok += c;
 						prevWasQuote = false;
+						inQuotes = !inQuotes;
 					}
 					else {
+						prevWasQuote = inQuotes;
 						inQuotes = !inQuotes;
-						prevWasQuote = true;
 					}
 				}
 				else if ((!inQuotes) && (c == fieldSep)) {
@@ -82,7 +83,7 @@ _FileRows::execute() const
 					curCol++;
 				}
 			}
-			if (!mkCols) {
+			if (!mkCols && !values.empty()) {
 				while (values.size() < columns.size()) {
 					values.push_back(ValPtr(new Glib::ustring()));
 					curCol++;
diff --git a/project2/iterate.cpp b/project2/iterate.cpp
index f6d4820..f29328b 100644
--- a/project2/iterate.cpp
+++ b/project2/iterate.cpp
@@ -4,6 +4,7 @@
 #include "xmlObjectLoader.h"
 #include "sqlRows.h"
 #include "fileRows.h"
+#include "urlRows.h"
 #include "procRows.h"
 #include "task.h"
 
@@ -27,6 +28,7 @@ _Iterate::AddLoaders(Loaders & l, NoOutputExecutes & iterates)
 	l.insert(LoadersVT("sqliterate", _LoaderBase::Make<_SqlIterate, _NoOutputExecute, unsigned int, _SourceObject, &_SourceObject::order>(&iterates)));
 	l.insert(LoadersVT("fileiterate", _LoaderBase::Make<_FileIterate, _NoOutputExecute, unsigned int, _SourceObject, &_SourceObject::order>(&iterates)));
 	l.insert(LoadersVT("prociterate", _LoaderBase::Make<_ProcIterate, _NoOutputExecute, unsigned int, _SourceObject, &_SourceObject::order>(&iterates)));
+	l.insert(LoadersVT("urliterate", _LoaderBase::Make<_UrlIterate, _NoOutputExecute, unsigned int, _SourceObject, &_SourceObject::order>(&iterates)));
 }
 
 void
@@ -35,6 +37,7 @@ _Iterate::AddLoaders(Loaders & l, Iterates & iterates)
 	l.insert(LoadersVT("sqliterate", _LoaderBase::Make<_SqlIterate, _Iterate, std::string, _SourceObject, &_SourceObject::name>(&iterates)));
 	l.insert(LoadersVT("fileiterate", _LoaderBase::Make<_FileIterate, _Iterate, std::string, _SourceObject, &_SourceObject::name>(&iterates)));
 	l.insert(LoadersVT("prociterate", _LoaderBase::Make<_ProcIterate, _Iterate, std::string, _SourceObject, &_SourceObject::name>(&iterates)));
+	l.insert(LoadersVT("urliterate", _LoaderBase::Make<_UrlIterate, _Iterate, std::string, _SourceObject, &_SourceObject::name>(&iterates)));
 }
 
 void
diff --git a/project2/urlRows.cpp b/project2/urlRows.cpp
new file mode 100644
index 0000000..ddae45c
--- /dev/null
+++ b/project2/urlRows.cpp
@@ -0,0 +1,214 @@
+#include "urlRows.h"
+#include "../libmisc/curlsup.h"
+#include <boost/foreach.hpp>
+#include <glib.h>
+#include <ctype.h>
+#include <stdlib.h>
+
+// Reads the parser settings from the element's attributes and any explicit
+// column names from its columns/column children. When none are listed, the
+// column names are taken from the first row that is not skipped.
+_UrlRows::_UrlRows(const xmlpp::Element * p) :
+	url(p->get_attribute_value("url")),
+	fieldSep(p->get_attribute_value("fieldSep")[0]),
+	quoteChar(p->get_attribute_value("quoteChar")[0]),
+	newline(p->get_attribute_value("newline")[0]),
+	encoding(p->get_attribute_value("encoding")),
+	skipheader(atoi(p->get_attribute_value("skipheader").c_str())),
+	inQuotes(false),
+	prevWasQuote(false)
+{
+	BOOST_FOREACH(const xmlpp::Node * node, p->find("columns/column")) {
+		const xmlpp::Element * elem = dynamic_cast<const xmlpp::Element *>(node);
+		if (elem) {
+			columns.push_back(elem->get_child_text()->get_content());
+		}
+	}
+	mkCols = columns.empty();
+}
+
+_UrlRows::~_UrlRows()
+{
+}
+
+unsigned int
+_UrlRows::columnCount() const
+{
+	return columns.size();
+}
+
+const Glib::ustring &
+_UrlRows::getColumnName(unsigned int col) const
+{
+	return columns[col];
+}
+
+const Glib::ustring &
+_UrlRows::getCurrentValue(unsigned int col) const
+{
+	return *values[col];
+}
+
+// Looks the value up by column name; throws FieldDoesNotExist when no column
+// of the current row has that name.
+const Glib::ustring &
+_UrlRows::getCurrentValue(const Glib::ustring & id) const
+{
+	Values::const_iterator v = values.begin();
+	for (Columns::const_iterator i = columns.begin(); i != columns.end() && v != values.end(); ++i, ++v) {
+		if (*i == id) {
+			return **v;
+		}
+	}
+	throw PerRowValues::FieldDoesNotExist();
+}
+
+// CURL write callback; stream is the _UrlRows passed as CURLOPT_WRITEDATA.
+size_t
+_UrlRows::_handleData(const char * ptr, size_t size, size_t nmemb, void * stream)
+{
+	return static_cast<const _UrlRows *>(stream)->handleData(ptr, size * nmemb);
+}
+
+// Converts one chunk of the response to UTF-8 and feeds it through the
+// separator/quote state machine, calling rowReady() for each completed row.
+// Returns bytesLen when the chunk was accepted and 0 when it could not be
+// converted; CURL treats a return value other than the chunk size as an error.
+size_t
+_UrlRows::handleData(const char * bytes, size_t bytesLen) const
+{
+	// A chunk may end part way through a multi-byte character, so carry the
+	// unconverted tail over to the next call instead of reporting a short write.
+	pending.append(bytes, bytesLen);
+	gsize used = 0;
+	gchar * utf8 = g_convert(pending.data(), pending.length(), "utf-8", encoding.c_str(), &used, NULL, NULL);
+	if (!utf8) {
+		pending.clear();
+		return 0;
+	}
+	Glib::ustring str(utf8);
+	g_free(utf8);
+	pending.erase(0, used);
+
+	BOOST_FOREACH(gunichar c, str) {
+		if (c == newline) {
+			if (skipheader) {
+				skipheader -= 1;
+			}
+			else if (mkCols) {
+				// The last header field has no trailing separator, so it is still in tok.
+				if (!tok.empty()) {
+					addColumn(tok);
+				}
+				mkCols = false;
+			}
+			else {
+				if (!tok.empty()) {
+					values.push_back(ValPtr(new Glib::ustring(tok)));
+				}
+				// As in _FileRows::execute, a blank line does not produce a row.
+				if (!values.empty()) {
+					while (values.size() < columns.size()) {
+						values.push_back(ValPtr(new Glib::ustring()));
+					}
+					rowReady();
+				}
+			}
+			values.clear();
+			tok.clear();
+		}
+		else if (c == quoteChar) {
+			if (prevWasQuote) {
+				// A doubled quote inside a quoted field is a literal quote.
+				tok += c;
+				prevWasQuote = false;
+				inQuotes = !inQuotes;
+			}
+			else {
+				prevWasQuote = inQuotes;
+				inQuotes = !inQuotes;
+			}
+		}
+		else if ((!inQuotes) && (c == fieldSep)) {
+			prevWasQuote = false;
+			if (skipheader == 0) {
+				if (mkCols) {
+					addColumn(tok);
+				}
+				else {
+					values.push_back(ValPtr(new Glib::ustring(tok)));
+				}
+			}
+			tok.clear();
+		}
+		else {
+			prevWasQuote = false;
+			tok += c;
+		}
+	}
+
+	return bytesLen;
+}
+
+// Adds a column named after rawtok, keeping only its alphanumeric ASCII
+// characters.
+void
+_UrlRows::addColumn(const Glib::ustring & rawtok) const
+{
+	Glib::ustring name;
+	BOOST_FOREACH(gunichar c, rawtok) {
+		// isalnum() is only defined for unsigned char values, so never hand
+		// it a code point outside the ASCII range.
+		if (c < 0x80 && isalnum(c)) {
+			name += c;
+		}
+	}
+	columns.push_back(name);
+}
+
+// Fetches url, parsing the body as it arrives (rowReady() is called once per
+// row), then flushes a final row that was not terminated by a newline.
+void
+_UrlRows::execute() const
+{
+	// Start every fetch with a clean tokeniser state.
+	pending.clear();
+	tok.clear();
+	values.clear();
+	inQuotes = false;
+	prevWasQuote = false;
+
+	CurlHandle::Ptr c = new CurlHandle();
+	c->setopt(CURLOPT_URL, url.c_str());
+	//c->setopt(CURLOPT_PROXY, proxy.c_str());
+	c->setopt(CURLOPT_FOLLOWLOCATION, 1);
+	//c->setopt(CURLOPT_COOKIEFILE, (std::string(cacheRoot) + "/ytfs.cookies").c_str());
+	//c->setopt(CURLOPT_COOKIEJAR, (std::string(cacheRoot) + "/ytfs.cookies").c_str());
+	c->setopt(CURLOPT_ENCODING, "deflate, gzip");
+	c->setopt(CURLOPT_USERAGENT, "project2/0.3");
+	c->setopt(CURLOPT_WRITEDATA, this);
+	c->setopt(CURLOPT_WRITEFUNCTION, &_handleData);
+	c->perform();
+	if (!tok.empty() && skipheader == 0) {
+		if (mkCols) {
+			addColumn(tok);
+		}
+		else {
+			values.push_back(ValPtr(new Glib::ustring(tok)));
+		}
+	}
+	if (!values.empty()) {
+		while (values.size() < columns.size()) {
+			values.push_back(ValPtr(new Glib::ustring()));
+		}
+		rowReady();
+	}
+	values.clear();
+	tok.clear();
+}
+
+#include "view.hpp"
+template class _GenericView<_UrlRows>;
+#include "iterate.hpp"
+template class _GenericIterate<_UrlRows>;
+
diff --git a/project2/urlRows.h b/project2/urlRows.h
new file mode 100644
index 0000000..f94e149
--- /dev/null
+++ b/project2/urlRows.h
@@ -0,0 +1,68 @@
+#ifndef URLROWS_H
+#define URLROWS_H
+
+#include <libxml++/nodes/element.h>
+#include <boost/shared_ptr.hpp>
+#include <map>
+#include <set>
+#include <string>
+#include <vector>
+#include "view.h"
+#include "iterate.h"
+
+// Row source that downloads delimited text from a URL and presents each
+// parsed line as a row of named columns; rowReady() is called per row.
+class _UrlRows : public PerRowValues {
+	public:
+		_UrlRows(const xmlpp::Element * p);
+		~_UrlRows();
+
+		void execute() const;
+		unsigned int columnCount() const;
+		const Glib::ustring & getColumnName(unsigned int col) const;
+		const Glib::ustring & getCurrentValue(const Glib::ustring & id) const;
+		const Glib::ustring & getCurrentValue(unsigned int col) const;
+		virtual void rowReady() const = 0;
+
+		typedef std::set<gunichar> CharSet;
+		const Glib::ustring url;
+
+	protected:
+		void addColumn(const Glib::ustring & rawtok) const;
+		typedef boost::shared_ptr<Glib::ustring> ValPtr;
+		typedef std::vector<ValPtr> Values;
+		mutable Values values;
+
+	private:
+		static size_t _handleData(const char * ptr, size_t size, size_t nmemb, void * stream);
+		size_t handleData(const char * bytes, size_t bytesLen) const;
+		gunichar fieldSep;
+		gunichar quoteChar;
+		gunichar newline;
+		std::string encoding;
+		mutable size_t skipheader;
+		typedef std::vector<Glib::ustring> Columns;
+		mutable Columns columns;
+
+		// Used in CURL callback
+		mutable bool mkCols;
+		mutable bool inQuotes;
+		mutable bool prevWasQuote;
+		mutable Glib::ustring tok;
+		// Bytes of an incomplete multi-byte sequence left over from the previous chunk
+		mutable std::string pending;
+};
+typedef boost::shared_ptr<_UrlRows> UrlRows;
+
+typedef _GenericView<_UrlRows> _UrlView;
+typedef boost::shared_ptr<_UrlView> UrlView;
+typedef std::map<std::string, UrlView> UrlViews;
+
+typedef _GenericIterate<_UrlRows> _UrlIterate;
+typedef boost::shared_ptr<_UrlIterate> UrlIterate;
+typedef std::map<std::string, UrlIterate> UrlIterates;
+
+#endif
+
+
+
diff --git a/project2/view.cpp b/project2/view.cpp
index 530d7b5..217424f 100644
--- a/project2/view.cpp
+++ b/project2/view.cpp
@@ -3,6 +3,7 @@
 #include "xmlObjectLoader.h"
 #include "rawView.h"
 #include "fileRows.h"
+#include "urlRows.h"
 #include "sqlRows.h"
 #include "procRows.h"
 
@@ -26,6 +27,7 @@ _View::AddLoaders(Loaders & l, Views & views)
 	l.insert(LoadersVT("rawview", _LoaderBase::Make<_RawView, _View, std::string, _SourceObject, &_SourceObject::name>(&views)));
 	l.insert(LoadersVT("fileview", _LoaderBase::Make<_FileView, _View, std::string, _SourceObject, &_SourceObject::name>(&views)));
 	l.insert(LoadersVT("procview", _LoaderBase::Make<_ProcView, _View, std::string, _SourceObject, &_SourceObject::name>(&views)));
+	l.insert(LoadersVT("urlview", _LoaderBase::Make<_UrlView, _View, std::string, _SourceObject, &_SourceObject::name>(&views)));
 }
 
 void
-- 
cgit v1.2.3