diff options
author | randomdan <randomdan@localhost> | 2011-02-14 23:30:52 +0000 |
---|---|---|
committer | randomdan <randomdan@localhost> | 2011-02-14 23:30:52 +0000 |
commit | bc0c102ee7746d6a7b52a5fa4a4a65709823762f (patch) | |
tree | 14517e8efce4ecdf7cb6eb6c9387f087872a245a | |
parent | Convert dumpTask into a generic purpose file* writer (diff) | |
download | project2-bc0c102ee7746d6a7b52a5fa4a4a65709823762f.tar.bz2 project2-bc0c102ee7746d6a7b52a5fa4a4a65709823762f.tar.xz project2-bc0c102ee7746d6a7b52a5fa4a4a65709823762f.zip |
Merge fileRows and urlRows content parser into a shared base; streamRows
-rw-r--r-- | project2/Jamfile.jam | 1 | ||||
-rw-r--r-- | project2/fileRows.cpp | 138 | ||||
-rw-r--r-- | project2/fileRows.h | 31 | ||||
-rw-r--r-- | project2/streamRows.cpp | 192 | ||||
-rw-r--r-- | project2/streamRows.h | 69 | ||||
-rw-r--r-- | project2/urlRows.cpp | 164 | ||||
-rw-r--r-- | project2/urlRows.h | 32 |
7 files changed, 291 insertions, 336 deletions
diff --git a/project2/Jamfile.jam b/project2/Jamfile.jam index 96a0797..594bbd1 100644 --- a/project2/Jamfile.jam +++ b/project2/Jamfile.jam @@ -79,6 +79,7 @@ lib p2processes : lib p2files : fsRows.cpp fileRows.cpp + streamRows.cpp : <library>../libmisc//misc <library>libxmlpp diff --git a/project2/fileRows.cpp b/project2/fileRows.cpp index e990f56..11a71ef 100644 --- a/project2/fileRows.cpp +++ b/project2/fileRows.cpp @@ -8,21 +8,9 @@ DECLARE_LOADER("filerows", FileRows); FileRows::FileRows(const xmlpp::Element * p) : SourceObject(p), - RowSet(p), - path(p, "path"), - fieldSep(p->get_attribute_value("fieldSep")[0]), - quoteChar(p->get_attribute_value("quoteChar")[0]), - keepBlankRows(p->get_attribute_value("keepBlankRows") == "true"), - countBlankRows(p->get_attribute_value("keepBlankRows") == "count"), - newline(p->get_attribute_value("newline")), - encoding(p->get_attribute_value("encoding")) + StreamRows(p), + path(p, "path") { - BOOST_FOREACH(const xmlpp::Node * node, p->find("columns/column")) { - const xmlpp::Element * elem = dynamic_cast<const xmlpp::Element *>(node); - if (elem) { - columns.push_back(boost::shared_ptr<Glib::ustring>(new Glib::ustring(elem->get_child_text()->get_content()))); - } - } } FileRows::~FileRows() @@ -40,131 +28,15 @@ FileRows::setFilter(const Glib::ustring &) throw NotSupported(__PRETTY_FUNCTION__); } -unsigned int -FileRows::columnCount() const -{ - return columns.size(); -} - -const Glib::ustring & -FileRows::getColumnName(unsigned int col) const -{ - return *columns[col]; -} - void FileRows::execute(const RowProcessor * rp) const { rowNum = 1; FileStarChannel c(doOpen()); c.set_encoding(encoding); - c.set_line_term(newline); - Glib::ustring line; - while (c.read_line(line) == Glib::IO_STATUS_NORMAL) { - if (boost::algorithm::ends_with(line, newline)) { - line.erase(line.length() - newline.length()); - } - Columns::const_iterator curCol = columns.begin(); - bool mkCols = columns.empty(); - bool inQuotes = false; - bool 
prevWasQuote = false; - typedef boost::shared_ptr<Glib::ustring> StringPtr; - StringPtr tok(new Glib::ustring()); - BOOST_FOREACH(gunichar c, line) { - if (c == quoteChar) { - if (prevWasQuote) { - *tok += c; - prevWasQuote = false; - inQuotes = !inQuotes; - } - else { - prevWasQuote = inQuotes; - inQuotes = !inQuotes; - } - } - else if ((!inQuotes) && (c == fieldSep)) { - prevWasQuote = false; - if (mkCols) { - addColumn(tok); - } - else { - values.push_back(tok); - curCol++; - } - tok = StringPtr(new Glib::ustring()); - } - else { - prevWasQuote = false; - *tok += c; - } - } - if (tok->length()) { - if (mkCols) { - addColumn(tok); - } - else { - values.push_back(tok); - curCol++; - } - } - if (!mkCols) { - if (keepBlankRows || !values.empty()) { - while (values.size() < columns.size()) { - values.push_back(VariableType()); - curCol++; - } - rp->rowReady(); - rowNum += 1; - } - else if (countBlankRows) { - rowNum +=1; - } - } - values.clear(); - } -} - -VariableType -FileRows::getCurrentValue(unsigned int col) const -{ - return values[col]; -} - -bool -FileRows::isNull(unsigned int) const -{ - return false; -} - -bool -FileRows::isNull(const Glib::ustring &) const -{ - return false; -} - -VariableType -FileRows::getCurrentValue(const Glib::ustring & id) const -{ - Values::const_iterator v = values.begin(); - for (Columns::const_iterator i = columns.begin(); i != columns.end(); i++, v++) { - if (**i == id) { - return *v; - } - } - throw RowSet::FieldDoesNotExist(); -} - -void -FileRows::addColumn(boost::shared_ptr<Glib::ustring> tok) const -{ - columns.push_back(tok); - for (Glib::ustring::iterator i = tok->begin(); i != tok->end(); ) { - if (!isalnum(*i)) { - tok->erase(i); - } - else { - i++; - } + gunichar ch; + while (c.read(ch) == Glib::IO_STATUS_NORMAL) { + this->pushChar(ch, rp); } } diff --git a/project2/fileRows.h b/project2/fileRows.h index 1a4b7ed..0fcc394 100644 --- a/project2/fileRows.h +++ b/project2/fileRows.h @@ -1,17 +1,12 @@ #ifndef FILEROWS_H 
#define FILEROWS_H -#include <libxml++/nodes/element.h> -#include <boost/intrusive_ptr.hpp> -#include <boost/shared_ptr.hpp> -#include <map> -#include "variables.h" -#include "rowSet.h" +#include "streamRows.h" #include "fileStarGlibIoChannel.h" class CommonObjects; -class FileRows : public RowSet { +class FileRows : public StreamRows { public: FileRows(const xmlpp::Element * p); ~FileRows(); @@ -19,31 +14,11 @@ class FileRows : public RowSet { void execute(const RowProcessor *) const; virtual void loadComplete(const CommonObjects *); virtual void setFilter(const Glib::ustring &); - unsigned int columnCount() const; - const Glib::ustring & getColumnName(unsigned int col) const; - VariableType getCurrentValue(const Glib::ustring & id) const; - VariableType getCurrentValue(unsigned int col) const; - bool isNull(unsigned int col) const; - bool isNull(const Glib::ustring & id) const; - - typedef std::set<gunichar> CharSet; + const Variable path; protected: virtual FileStarChannel doOpen() const; - void addColumn(boost::shared_ptr<Glib::ustring> rawtok) const; - typedef std::vector<VariableType> Values; - mutable Values values; - - private: - gunichar fieldSep; - gunichar quoteChar; - bool keepBlankRows; - bool countBlankRows; - std::string newline; - std::string encoding; - typedef std::vector<boost::shared_ptr<Glib::ustring> > Columns; - mutable Columns columns; }; #endif diff --git a/project2/streamRows.cpp b/project2/streamRows.cpp new file mode 100644 index 0000000..9160040 --- /dev/null +++ b/project2/streamRows.cpp @@ -0,0 +1,192 @@ +#include "streamRows.h" +#include "rowProcessor.h" +#include <boost/foreach.hpp> +#include <libxml++/nodes/textnode.h> + +StreamRows::StreamRows(const xmlpp::Element * p) : + SourceObject(p), + RowSet(p), + fieldSep(p->get_attribute_value("fieldSep")[0]), + quoteChar(p->get_attribute_value("quoteChar")[0]), + keepBlankRows(p->get_attribute_value("keepBlankRows") == "true"), + countBlankRows(p->get_attribute_value("keepBlankRows") == 
"count"), + newline(p->get_attribute_value("newline")), + newlin(newline, 0, newline.length() - 1), + encoding(p->get_attribute_value("encoding")), + skipheader(atoi(p->get_attribute_value("skipheader").c_str())), + inQuotes(false), + prevWasQuote(false) +{ + unsigned int colNo = 0; + BOOST_FOREACH(const xmlpp::Node * node, p->find("columns/column")) { + const xmlpp::Element * elem = dynamic_cast<const xmlpp::Element *>(node); + if (elem) { + columns.insert(Column(colNo++, elem->get_child_text()->get_content())); + } + } + mkCols = columns.empty(); +} + +StreamRows::~StreamRows() +{ +} + +const Glib::ustring & +StreamRows::getColumnName(unsigned int col) const +{ + Columns::index<byColIdx>::type::iterator i = columns.get<byColIdx>().find(col); + if (i != columns.get<byColIdx>().end()) { + return i->col; + } + throw RowSet::FieldDoesNotExist(); +} + +unsigned int +StreamRows::columnCount() const +{ + return columns.size(); +} + +VariableType +StreamRows::getCurrentValue(unsigned int col) const +{ + Columns::index<byColIdx>::type::iterator i = columns.get<byColIdx>().find(col); + if (i != columns.get<byColIdx>().end()) { + return i->value; + } + throw RowSet::FieldDoesNotExist(); +} + +bool +StreamRows::isNull(unsigned int col) const +{ + return (columns.get<byColIdx>().find(col) == columns.get<byColIdx>().end()); +} + +bool +StreamRows::isNull(const Glib::ustring & col) const +{ + return (columns.get<byColName>().find(col) == columns.get<byColName>().end()); +} + +VariableType +StreamRows::getCurrentValue(const Glib::ustring & col) const +{ + Columns::const_iterator i = columns.get<byColName>().find(col); + if (i != columns.end()) { + return i->value; + } + throw RowSet::FieldDoesNotExist(); +} + +void +StreamRows::addColumn(Glib::ustring & tok) const +{ + for (Glib::ustring::iterator i = tok.begin(); i != tok.end(); ) { + if (!isalnum(*i)) { + tok.erase(i); + } + else { + i++; + } + } + columns.insert(Column(columns.size(), tok)); +} + 
+StreamRows::Column::Column(unsigned int i, const Glib::ustring & c) : + idx(i), + col(c) +{ +} + +void +StreamRows::Column::operator=(const VariableType & v) const +{ + value = v; +} + +void +StreamRows::begin() const +{ + curCol = columns.get<byColIdx>().begin(); + tok = StringPtr(new Glib::ustring()); +} + +void +StreamRows::pushChar(gunichar c, const RowProcessor * rp) const +{ + if ((!inQuotes) && (c == *newline.rbegin()) && (*tok == newlin)) { + if (skipheader) { + skipheader -= 1; + } + else { + tok->erase(tok->length() - newlin.length()); + if (!mkCols) { + if (!tok->empty()) { + *curCol++ = VariableType(tok); + } + while (curCol != columns.get<byColIdx>().end()) { + *curCol++ = VariableType(); + } + rp->rowReady(); + rowNum += 1; + curCol = columns.get<byColIdx>().begin(); + } + else { + mkCols = false; + } + } + tok = StringPtr(new Glib::ustring()); + } + else if (c == quoteChar) { + if (prevWasQuote) { + *tok += c; + prevWasQuote = false; + inQuotes = !inQuotes; + } + else { + prevWasQuote = inQuotes; + inQuotes = !inQuotes; + } + } + else if ((!inQuotes) && (c == fieldSep)) { + prevWasQuote = false; + if (skipheader == 0) { + if (mkCols) { + addColumn(*tok); + } + else { + *curCol++ = VariableType(tok); + } + } + tok = StringPtr(new Glib::ustring()); + } + else { + prevWasQuote = false; + *tok += c; + } +} + +void +StreamRows::end(const RowProcessor * rp) const +{ + if (!tok->empty()) { + if (skipheader == 0) { + if (mkCols) { + addColumn(*tok); + } + else { + *curCol++ = VariableType(tok); + } + } + } + if (curCol != columns.get<byColIdx>().begin()) { + while (curCol != columns.get<byColIdx>().end()) { + *curCol++ = VariableType(); + } + rp->rowReady(); + rowNum += 1; + } + tok = StringPtr(new Glib::ustring()); +} + diff --git a/project2/streamRows.h b/project2/streamRows.h new file mode 100644 index 0000000..473a0d8 --- /dev/null +++ b/project2/streamRows.h @@ -0,0 +1,69 @@ +#ifndef STREAMROWS_H +#define STREAMROWS_H + +#include "rowSet.h" +#include 
<boost/multi_index_container.hpp> +#include <boost/multi_index/member.hpp> +#include <boost/multi_index/ordered_index.hpp> + +class StreamRows : public RowSet { + public: + StreamRows(const xmlpp::Element * p); + ~StreamRows(); + + unsigned int columnCount() const; + const Glib::ustring & getColumnName(unsigned int col) const; + VariableType getCurrentValue(const Glib::ustring & id) const; + VariableType getCurrentValue(unsigned int col) const; + bool isNull(unsigned int col) const; + bool isNull(const Glib::ustring & id) const; + + protected: + void begin() const; + void pushChar(gunichar ch, const RowProcessor *) const; + void end(const RowProcessor *) const; + + private: + void addColumn(Glib::ustring & rawtok) const; + class Column { + public: + Column(unsigned int idx, const Glib::ustring &); + + void operator=(const VariableType &) const; + + const unsigned int idx; + const Glib::ustring col; + mutable VariableType value; + }; + struct byColIdx {}; + struct byColName {}; + typedef boost::multi_index::multi_index_container< + Column, + boost::multi_index::indexed_by< + boost::multi_index::ordered_unique< + boost::multi_index::tag<byColName>, BOOST_MULTI_INDEX_MEMBER(Column, const Glib::ustring, col)>, + boost::multi_index::ordered_unique< + boost::multi_index::tag<byColIdx>, BOOST_MULTI_INDEX_MEMBER(Column, const unsigned int, idx)> + > > Columns; + mutable Columns columns; + + public: + const gunichar fieldSep; + const gunichar quoteChar; + const bool keepBlankRows; + const bool countBlankRows; + const Glib::ustring newline; + const Glib::ustring newlin; + const std::string encoding; + // Used in callback + mutable size_t skipheader; + mutable bool mkCols; + mutable bool inQuotes; + mutable bool prevWasQuote; + typedef boost::shared_ptr<Glib::ustring> StringPtr; + mutable StringPtr tok; + mutable Columns::index<byColIdx>::type::iterator curCol; +}; + +#endif + diff --git a/project2/urlRows.cpp b/project2/urlRows.cpp index cb97b7f..447b661 100644 --- 
a/project2/urlRows.cpp +++ b/project2/urlRows.cpp @@ -10,23 +10,10 @@ DECLARE_LOADER("urlrows", UrlRows); UrlRows::UrlRows(const xmlpp::Element * p) : SourceObject(p), - RowSet(p), - url(p->get_attribute_value("url")), - fieldSep(p->get_attribute_value("fieldSep")[0]), - quoteChar(p->get_attribute_value("quoteChar")[0]), - newline(p->get_attribute_value("newline")[0]), - encoding(p->get_attribute_value("encoding")), - skipheader(atoi(p->get_attribute_value("skipheader").c_str())), - inQuotes(false), - prevWasQuote(false) + StreamRows(p), + url(p, "url"), + convertRequired(encoding != "utf-8") { - BOOST_FOREACH(const xmlpp::Node * node, p->find("columns/column")) { - const xmlpp::Element * elem = dynamic_cast<const xmlpp::Element *>(node); - if (elem) { - columns.push_back(elem->get_child_text()->get_content()); - } - } - mkCols = columns.empty(); } UrlRows::~UrlRows() @@ -44,48 +31,6 @@ UrlRows::setFilter(const Glib::ustring &) throw NotSupported(__PRETTY_FUNCTION__); } -unsigned int -UrlRows::columnCount() const -{ - return columns.size(); -} - -const Glib::ustring & -UrlRows::getColumnName(unsigned int col) const -{ - return columns[col]; -} - -VariableType -UrlRows::getCurrentValue(unsigned int col) const -{ - return *values[col]; -} - -bool -UrlRows::isNull(unsigned int) const -{ - return false; -} - -bool -UrlRows::isNull(const Glib::ustring &) const -{ - return false; -} - -VariableType -UrlRows::getCurrentValue(const Glib::ustring & id) const -{ - Values::const_iterator v = values.begin(); - for (Columns::const_iterator i = columns.begin(); i != columns.end(); i++, v++) { - if (*i == id) { - return **v; - } - } - throw RowSet::FieldDoesNotExist(); -} - size_t UrlRows::handleDataHelper(const char * ptr, size_t size, size_t nmemb, void *stream) { @@ -98,77 +43,17 @@ size_t UrlRows::handleData(const RowProcessor * rp, const char * bytes, size_t bytesLen) const { size_t used = 0, len = 0; - char * utf8 = g_convert(bytes, bytesLen, "utf-8", encoding.c_str(), 
&used, &len, NULL); - Glib::ustring str(utf8); - free(utf8); - - BOOST_FOREACH(gunichar c, str) { - if (c == newline) { - if (skipheader) { - skipheader -= 1; - } - else { - if (!mkCols) { - if (!tok.empty()) { - values.push_back(ValPtr(new Glib::ustring(tok))); - } - while (values.size() < columns.size()) { - values.push_back(ValPtr(new Glib::ustring())); - } - rp->rowReady(); - rowNum += 1; - } - else { - mkCols = false; - } - } - values.clear(); - tok.clear(); - } - else if (c == quoteChar) { - if (prevWasQuote) { - tok += c; - prevWasQuote = false; - inQuotes = !inQuotes; - } - else { - prevWasQuote = inQuotes; - inQuotes = !inQuotes; - } - } - else if ((!inQuotes) && (c == fieldSep)) { - prevWasQuote = false; - if (skipheader == 0) { - if (mkCols) { - addColumn(tok); - } - else { - values.push_back(ValPtr(new Glib::ustring(tok))); - } - } - tok.clear(); - } - else { - prevWasQuote = false; - tok += c; - } + const gchar * utf8 = convertRequired ? g_convert(bytes, bytesLen, "utf-8", encoding.c_str(), &used, &len, NULL) : bytes; + for (const gchar * iter = utf8; *iter; iter = g_utf8_next_char(iter)) { + this->pushChar(*iter, rp); } - - return used; -} - -void -UrlRows::addColumn(const Glib::ustring & rawtok) const -{ - columns.push_back(rawtok); - Glib::ustring & tok(columns.back()); - for (Glib::ustring::iterator i = tok.begin(); i != tok.end(); ) { - if (!isalnum(*i)) { - tok.erase(i); - } - else { - i++; - } + if (convertRequired) { + // We allocated it.. sooo.... 
+ free(const_cast<gchar *>(utf8)); + return used; + } + else { + return bytesLen; } } @@ -176,8 +61,9 @@ void UrlRows::execute(const RowProcessor * rp) const { rowNum = 1; + begin(); CurlHandle::Ptr c = new CurlHandle(); - c->setopt(CURLOPT_URL, url.c_str()); + c->setopt(CURLOPT_URL, (const char *)url()); //c->setopt(CURLOPT_PROXY, proxy.c_str()); c->setopt(CURLOPT_FOLLOWLOCATION, 1); //c->setopt(CURLOPT_COOKIEFILE, (std::string(cacheRoot) + "/ytfs.cookies").c_str()); @@ -188,25 +74,7 @@ UrlRows::execute(const RowProcessor * rp) const c->setopt(CURLOPT_WRITEDATA, &cb); c->setopt(CURLOPT_WRITEFUNCTION, &handleDataHelper); c->perform(); - if (!tok.empty()) { - if (skipheader == 0) { - if (mkCols) { - addColumn(tok); - } - else { - values.push_back(ValPtr(new Glib::ustring(tok))); - } - } - } - if (!values.empty()) { - while (values.size() < columns.size()) { - values.push_back(ValPtr(new Glib::ustring())); - } - rp->rowReady(); - rowNum += 1; - values.clear(); - } - values.clear(); + end(rp); } UrlRows::callback::callback(const UrlRows * u, const RowProcessor * r) : diff --git a/project2/urlRows.h b/project2/urlRows.h index 7b60400..fe63a3f 100644 --- a/project2/urlRows.h +++ b/project2/urlRows.h @@ -5,9 +5,9 @@ #include <boost/intrusive_ptr.hpp> #include <boost/shared_ptr.hpp> #include <map> -#include "rowSet.h" +#include "streamRows.h" -class UrlRows : public RowSet { +class UrlRows : public StreamRows { public: UrlRows(const xmlpp::Element * p); ~UrlRows(); @@ -15,21 +15,10 @@ class UrlRows : public RowSet { virtual void loadComplete(const CommonObjects *); void execute(const RowProcessor *) const; virtual void setFilter(const Glib::ustring &); - unsigned int columnCount() const; - const Glib::ustring & getColumnName(unsigned int col) const; - VariableType getCurrentValue(const Glib::ustring & id) const; - VariableType getCurrentValue(unsigned int col) const; - bool isNull(unsigned int col) const; - bool isNull(const Glib::ustring & id) const; - typedef 
std::set<gunichar> CharSet; - const Glib::ustring url; + const Variable url; protected: - void addColumn(const Glib::ustring & rawtok) const; - typedef boost::shared_ptr<Glib::ustring> ValPtr; - typedef std::vector<ValPtr> Values; - mutable Values values; private: struct callback { @@ -39,19 +28,8 @@ class UrlRows : public RowSet { }; static size_t handleDataHelper(const char * ptr, size_t size, size_t nmemb, void * stream); size_t handleData(const RowProcessor * rp, const char * bytes, size_t bytesLen) const; - gunichar fieldSep; - gunichar quoteChar; - gunichar newline; - std::string encoding; - mutable size_t skipheader; - typedef std::vector<Glib::ustring> Columns; - mutable Columns columns; - - // Used in CURL callback - mutable bool mkCols; - mutable bool inQuotes; - mutable bool prevWasQuote; - mutable Glib::ustring tok; + bool convertRequired; + }; #endif |