summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorrandomdan <randomdan@localhost>2011-02-14 23:30:52 +0000
committerrandomdan <randomdan@localhost>2011-02-14 23:30:52 +0000
commitbc0c102ee7746d6a7b52a5fa4a4a65709823762f (patch)
tree14517e8efce4ecdf7cb6eb6c9387f087872a245a
parentConvert dumpTask into a generic purpose file* writer (diff)
downloadproject2-bc0c102ee7746d6a7b52a5fa4a4a65709823762f.tar.bz2
project2-bc0c102ee7746d6a7b52a5fa4a4a65709823762f.tar.xz
project2-bc0c102ee7746d6a7b52a5fa4a4a65709823762f.zip
Merge fileRows and urlRows content parsers into a shared base: streamRows
-rw-r--r--project2/Jamfile.jam1
-rw-r--r--project2/fileRows.cpp138
-rw-r--r--project2/fileRows.h31
-rw-r--r--project2/streamRows.cpp192
-rw-r--r--project2/streamRows.h69
-rw-r--r--project2/urlRows.cpp164
-rw-r--r--project2/urlRows.h32
7 files changed, 291 insertions, 336 deletions
diff --git a/project2/Jamfile.jam b/project2/Jamfile.jam
index 96a0797..594bbd1 100644
--- a/project2/Jamfile.jam
+++ b/project2/Jamfile.jam
@@ -79,6 +79,7 @@ lib p2processes :
lib p2files :
fsRows.cpp
fileRows.cpp
+ streamRows.cpp
:
<library>../libmisc//misc
<library>libxmlpp
diff --git a/project2/fileRows.cpp b/project2/fileRows.cpp
index e990f56..11a71ef 100644
--- a/project2/fileRows.cpp
+++ b/project2/fileRows.cpp
@@ -8,21 +8,9 @@ DECLARE_LOADER("filerows", FileRows);
FileRows::FileRows(const xmlpp::Element * p) :
SourceObject(p),
- RowSet(p),
- path(p, "path"),
- fieldSep(p->get_attribute_value("fieldSep")[0]),
- quoteChar(p->get_attribute_value("quoteChar")[0]),
- keepBlankRows(p->get_attribute_value("keepBlankRows") == "true"),
- countBlankRows(p->get_attribute_value("keepBlankRows") == "count"),
- newline(p->get_attribute_value("newline")),
- encoding(p->get_attribute_value("encoding"))
+ StreamRows(p),
+ path(p, "path")
{
- BOOST_FOREACH(const xmlpp::Node * node, p->find("columns/column")) {
- const xmlpp::Element * elem = dynamic_cast<const xmlpp::Element *>(node);
- if (elem) {
- columns.push_back(boost::shared_ptr<Glib::ustring>(new Glib::ustring(elem->get_child_text()->get_content())));
- }
- }
}
FileRows::~FileRows()
@@ -40,131 +28,15 @@ FileRows::setFilter(const Glib::ustring &)
throw NotSupported(__PRETTY_FUNCTION__);
}
-unsigned int
-FileRows::columnCount() const
-{
- return columns.size();
-}
-
-const Glib::ustring &
-FileRows::getColumnName(unsigned int col) const
-{
- return *columns[col];
-}
-
void
FileRows::execute(const RowProcessor * rp) const
{
rowNum = 1;
FileStarChannel c(doOpen());
c.set_encoding(encoding);
- c.set_line_term(newline);
- Glib::ustring line;
- while (c.read_line(line) == Glib::IO_STATUS_NORMAL) {
- if (boost::algorithm::ends_with(line, newline)) {
- line.erase(line.length() - newline.length());
- }
- Columns::const_iterator curCol = columns.begin();
- bool mkCols = columns.empty();
- bool inQuotes = false;
- bool prevWasQuote = false;
- typedef boost::shared_ptr<Glib::ustring> StringPtr;
- StringPtr tok(new Glib::ustring());
- BOOST_FOREACH(gunichar c, line) {
- if (c == quoteChar) {
- if (prevWasQuote) {
- *tok += c;
- prevWasQuote = false;
- inQuotes = !inQuotes;
- }
- else {
- prevWasQuote = inQuotes;
- inQuotes = !inQuotes;
- }
- }
- else if ((!inQuotes) && (c == fieldSep)) {
- prevWasQuote = false;
- if (mkCols) {
- addColumn(tok);
- }
- else {
- values.push_back(tok);
- curCol++;
- }
- tok = StringPtr(new Glib::ustring());
- }
- else {
- prevWasQuote = false;
- *tok += c;
- }
- }
- if (tok->length()) {
- if (mkCols) {
- addColumn(tok);
- }
- else {
- values.push_back(tok);
- curCol++;
- }
- }
- if (!mkCols) {
- if (keepBlankRows || !values.empty()) {
- while (values.size() < columns.size()) {
- values.push_back(VariableType());
- curCol++;
- }
- rp->rowReady();
- rowNum += 1;
- }
- else if (countBlankRows) {
- rowNum +=1;
- }
- }
- values.clear();
- }
-}
-
-VariableType
-FileRows::getCurrentValue(unsigned int col) const
-{
- return values[col];
-}
-
-bool
-FileRows::isNull(unsigned int) const
-{
- return false;
-}
-
-bool
-FileRows::isNull(const Glib::ustring &) const
-{
- return false;
-}
-
-VariableType
-FileRows::getCurrentValue(const Glib::ustring & id) const
-{
- Values::const_iterator v = values.begin();
- for (Columns::const_iterator i = columns.begin(); i != columns.end(); i++, v++) {
- if (**i == id) {
- return *v;
- }
- }
- throw RowSet::FieldDoesNotExist();
-}
-
-void
-FileRows::addColumn(boost::shared_ptr<Glib::ustring> tok) const
-{
- columns.push_back(tok);
- for (Glib::ustring::iterator i = tok->begin(); i != tok->end(); ) {
- if (!isalnum(*i)) {
- tok->erase(i);
- }
- else {
- i++;
- }
+ gunichar ch;
+ while (c.read(ch) == Glib::IO_STATUS_NORMAL) {
+ this->pushChar(ch, rp);
}
}
diff --git a/project2/fileRows.h b/project2/fileRows.h
index 1a4b7ed..0fcc394 100644
--- a/project2/fileRows.h
+++ b/project2/fileRows.h
@@ -1,17 +1,12 @@
#ifndef FILEROWS_H
#define FILEROWS_H
-#include <libxml++/nodes/element.h>
-#include <boost/intrusive_ptr.hpp>
-#include <boost/shared_ptr.hpp>
-#include <map>
-#include "variables.h"
-#include "rowSet.h"
+#include "streamRows.h"
#include "fileStarGlibIoChannel.h"
class CommonObjects;
-class FileRows : public RowSet {
+class FileRows : public StreamRows {
public:
FileRows(const xmlpp::Element * p);
~FileRows();
@@ -19,31 +14,11 @@ class FileRows : public RowSet {
void execute(const RowProcessor *) const;
virtual void loadComplete(const CommonObjects *);
virtual void setFilter(const Glib::ustring &);
- unsigned int columnCount() const;
- const Glib::ustring & getColumnName(unsigned int col) const;
- VariableType getCurrentValue(const Glib::ustring & id) const;
- VariableType getCurrentValue(unsigned int col) const;
- bool isNull(unsigned int col) const;
- bool isNull(const Glib::ustring & id) const;
-
- typedef std::set<gunichar> CharSet;
+
const Variable path;
protected:
virtual FileStarChannel doOpen() const;
- void addColumn(boost::shared_ptr<Glib::ustring> rawtok) const;
- typedef std::vector<VariableType> Values;
- mutable Values values;
-
- private:
- gunichar fieldSep;
- gunichar quoteChar;
- bool keepBlankRows;
- bool countBlankRows;
- std::string newline;
- std::string encoding;
- typedef std::vector<boost::shared_ptr<Glib::ustring> > Columns;
- mutable Columns columns;
};
#endif
diff --git a/project2/streamRows.cpp b/project2/streamRows.cpp
new file mode 100644
index 0000000..9160040
--- /dev/null
+++ b/project2/streamRows.cpp
@@ -0,0 +1,192 @@
+#include "streamRows.h"
+#include "rowProcessor.h"
+#include <boost/foreach.hpp>
+#include <libxml++/nodes/textnode.h>
+
+StreamRows::StreamRows(const xmlpp::Element * p) :
+ SourceObject(p),
+ RowSet(p),
+ fieldSep(p->get_attribute_value("fieldSep")[0]),
+ quoteChar(p->get_attribute_value("quoteChar")[0]),
+ keepBlankRows(p->get_attribute_value("keepBlankRows") == "true"),
+ countBlankRows(p->get_attribute_value("keepBlankRows") == "count"),
+ newline(p->get_attribute_value("newline")),
+ newlin(newline, 0, newline.length() - 1),
+ encoding(p->get_attribute_value("encoding")),
+ skipheader(atoi(p->get_attribute_value("skipheader").c_str())),
+ inQuotes(false),
+ prevWasQuote(false)
+{
+ unsigned int colNo = 0;
+ BOOST_FOREACH(const xmlpp::Node * node, p->find("columns/column")) {
+ const xmlpp::Element * elem = dynamic_cast<const xmlpp::Element *>(node);
+ if (elem) {
+ columns.insert(Column(colNo++, elem->get_child_text()->get_content()));
+ }
+ }
+ mkCols = columns.empty();
+}
+
+StreamRows::~StreamRows()
+{
+}
+
+const Glib::ustring &
+StreamRows::getColumnName(unsigned int col) const
+{
+ Columns::index<byColIdx>::type::iterator i = columns.get<byColIdx>().find(col);
+ if (i != columns.get<byColIdx>().end()) {
+ return i->col;
+ }
+ throw RowSet::FieldDoesNotExist();
+}
+
+unsigned int
+StreamRows::columnCount() const
+{
+ return columns.size();
+}
+
+VariableType
+StreamRows::getCurrentValue(unsigned int col) const
+{
+ Columns::index<byColIdx>::type::iterator i = columns.get<byColIdx>().find(col);
+ if (i != columns.get<byColIdx>().end()) {
+ return i->value;
+ }
+ throw RowSet::FieldDoesNotExist();
+}
+
+bool
+StreamRows::isNull(unsigned int col) const
+{
+ return (columns.get<byColIdx>().find(col) == columns.get<byColIdx>().end());
+}
+
+bool
+StreamRows::isNull(const Glib::ustring & col) const
+{
+ return (columns.get<byColName>().find(col) == columns.get<byColName>().end());
+}
+
+VariableType
+StreamRows::getCurrentValue(const Glib::ustring & col) const
+{
+ Columns::const_iterator i = columns.get<byColName>().find(col);
+ if (i != columns.end()) {
+ return i->value;
+ }
+ throw RowSet::FieldDoesNotExist();
+}
+
+void
+StreamRows::addColumn(Glib::ustring & tok) const
+{
+ for (Glib::ustring::iterator i = tok.begin(); i != tok.end(); ) {
+ if (!isalnum(*i)) {
+ tok.erase(i);
+ }
+ else {
+ i++;
+ }
+ }
+ columns.insert(Column(columns.size(), tok));
+}
+
+StreamRows::Column::Column(unsigned int i, const Glib::ustring & c) :
+ idx(i),
+ col(c)
+{
+}
+
+void
+StreamRows::Column::operator=(const VariableType & v) const
+{
+ value = v;
+}
+
+void
+StreamRows::begin() const
+{
+ curCol = columns.get<byColIdx>().begin();
+ tok = StringPtr(new Glib::ustring());
+}
+
+void
+StreamRows::pushChar(gunichar c, const RowProcessor * rp) const
+{
+ if ((!inQuotes) && (c == *newline.rbegin()) && (*tok == newlin)) {
+ if (skipheader) {
+ skipheader -= 1;
+ }
+ else {
+ tok->erase(tok->length() - newlin.length());
+ if (!mkCols) {
+ if (!tok->empty()) {
+ *curCol++ = VariableType(tok);
+ }
+ while (curCol != columns.get<byColIdx>().end()) {
+ *curCol++ = VariableType();
+ }
+ rp->rowReady();
+ rowNum += 1;
+ curCol = columns.get<byColIdx>().begin();
+ }
+ else {
+ mkCols = false;
+ }
+ }
+ tok = StringPtr(new Glib::ustring());
+ }
+ else if (c == quoteChar) {
+ if (prevWasQuote) {
+ *tok += c;
+ prevWasQuote = false;
+ inQuotes = !inQuotes;
+ }
+ else {
+ prevWasQuote = inQuotes;
+ inQuotes = !inQuotes;
+ }
+ }
+ else if ((!inQuotes) && (c == fieldSep)) {
+ prevWasQuote = false;
+ if (skipheader == 0) {
+ if (mkCols) {
+ addColumn(*tok);
+ }
+ else {
+ *curCol++ = VariableType(tok);
+ }
+ }
+ tok = StringPtr(new Glib::ustring());
+ }
+ else {
+ prevWasQuote = false;
+ *tok += c;
+ }
+}
+
+void
+StreamRows::end(const RowProcessor * rp) const
+{
+ if (!tok->empty()) {
+ if (skipheader == 0) {
+ if (mkCols) {
+ addColumn(*tok);
+ }
+ else {
+ *curCol++ = VariableType(tok);
+ }
+ }
+ }
+ if (curCol != columns.get<byColIdx>().begin()) {
+ while (curCol != columns.get<byColIdx>().end()) {
+ *curCol++ = VariableType();
+ }
+ rp->rowReady();
+ rowNum += 1;
+ }
+ tok = StringPtr(new Glib::ustring());
+}
+
diff --git a/project2/streamRows.h b/project2/streamRows.h
new file mode 100644
index 0000000..473a0d8
--- /dev/null
+++ b/project2/streamRows.h
@@ -0,0 +1,69 @@
+#ifndef STREAMROWS_H
+#define STREAMROWS_H
+
+#include "rowSet.h"
+#include <boost/multi_index_container.hpp>
+#include <boost/multi_index/member.hpp>
+#include <boost/multi_index/ordered_index.hpp>
+
+class StreamRows : public RowSet {
+ public:
+ StreamRows(const xmlpp::Element * p);
+ ~StreamRows();
+
+ unsigned int columnCount() const;
+ const Glib::ustring & getColumnName(unsigned int col) const;
+ VariableType getCurrentValue(const Glib::ustring & id) const;
+ VariableType getCurrentValue(unsigned int col) const;
+ bool isNull(unsigned int col) const;
+ bool isNull(const Glib::ustring & id) const;
+
+ protected:
+ void begin() const;
+ void pushChar(gunichar ch, const RowProcessor *) const;
+ void end(const RowProcessor *) const;
+
+ private:
+ void addColumn(Glib::ustring & rawtok) const;
+ class Column {
+ public:
+ Column(unsigned int idx, const Glib::ustring &);
+
+ void operator=(const VariableType &) const;
+
+ const unsigned int idx;
+ const Glib::ustring col;
+ mutable VariableType value;
+ };
+ struct byColIdx {};
+ struct byColName {};
+ typedef boost::multi_index::multi_index_container<
+ Column,
+ boost::multi_index::indexed_by<
+ boost::multi_index::ordered_unique<
+ boost::multi_index::tag<byColName>, BOOST_MULTI_INDEX_MEMBER(Column, const Glib::ustring, col)>,
+ boost::multi_index::ordered_unique<
+ boost::multi_index::tag<byColIdx>, BOOST_MULTI_INDEX_MEMBER(Column, const unsigned int, idx)>
+ > > Columns;
+ mutable Columns columns;
+
+ public:
+ const gunichar fieldSep;
+ const gunichar quoteChar;
+ const bool keepBlankRows;
+ const bool countBlankRows;
+ const Glib::ustring newline;
+ const Glib::ustring newlin;
+ const std::string encoding;
+ // Used in callback
+ mutable size_t skipheader;
+ mutable bool mkCols;
+ mutable bool inQuotes;
+ mutable bool prevWasQuote;
+ typedef boost::shared_ptr<Glib::ustring> StringPtr;
+ mutable StringPtr tok;
+ mutable Columns::index<byColIdx>::type::iterator curCol;
+};
+
+#endif
+
diff --git a/project2/urlRows.cpp b/project2/urlRows.cpp
index cb97b7f..447b661 100644
--- a/project2/urlRows.cpp
+++ b/project2/urlRows.cpp
@@ -10,23 +10,10 @@ DECLARE_LOADER("urlrows", UrlRows);
UrlRows::UrlRows(const xmlpp::Element * p) :
SourceObject(p),
- RowSet(p),
- url(p->get_attribute_value("url")),
- fieldSep(p->get_attribute_value("fieldSep")[0]),
- quoteChar(p->get_attribute_value("quoteChar")[0]),
- newline(p->get_attribute_value("newline")[0]),
- encoding(p->get_attribute_value("encoding")),
- skipheader(atoi(p->get_attribute_value("skipheader").c_str())),
- inQuotes(false),
- prevWasQuote(false)
+ StreamRows(p),
+ url(p, "url"),
+ convertRequired(encoding != "utf-8")
{
- BOOST_FOREACH(const xmlpp::Node * node, p->find("columns/column")) {
- const xmlpp::Element * elem = dynamic_cast<const xmlpp::Element *>(node);
- if (elem) {
- columns.push_back(elem->get_child_text()->get_content());
- }
- }
- mkCols = columns.empty();
}
UrlRows::~UrlRows()
@@ -44,48 +31,6 @@ UrlRows::setFilter(const Glib::ustring &)
throw NotSupported(__PRETTY_FUNCTION__);
}
-unsigned int
-UrlRows::columnCount() const
-{
- return columns.size();
-}
-
-const Glib::ustring &
-UrlRows::getColumnName(unsigned int col) const
-{
- return columns[col];
-}
-
-VariableType
-UrlRows::getCurrentValue(unsigned int col) const
-{
- return *values[col];
-}
-
-bool
-UrlRows::isNull(unsigned int) const
-{
- return false;
-}
-
-bool
-UrlRows::isNull(const Glib::ustring &) const
-{
- return false;
-}
-
-VariableType
-UrlRows::getCurrentValue(const Glib::ustring & id) const
-{
- Values::const_iterator v = values.begin();
- for (Columns::const_iterator i = columns.begin(); i != columns.end(); i++, v++) {
- if (*i == id) {
- return **v;
- }
- }
- throw RowSet::FieldDoesNotExist();
-}
-
size_t
UrlRows::handleDataHelper(const char * ptr, size_t size, size_t nmemb, void *stream)
{
@@ -98,77 +43,17 @@ size_t
UrlRows::handleData(const RowProcessor * rp, const char * bytes, size_t bytesLen) const
{
size_t used = 0, len = 0;
- char * utf8 = g_convert(bytes, bytesLen, "utf-8", encoding.c_str(), &used, &len, NULL);
- Glib::ustring str(utf8);
- free(utf8);
-
- BOOST_FOREACH(gunichar c, str) {
- if (c == newline) {
- if (skipheader) {
- skipheader -= 1;
- }
- else {
- if (!mkCols) {
- if (!tok.empty()) {
- values.push_back(ValPtr(new Glib::ustring(tok)));
- }
- while (values.size() < columns.size()) {
- values.push_back(ValPtr(new Glib::ustring()));
- }
- rp->rowReady();
- rowNum += 1;
- }
- else {
- mkCols = false;
- }
- }
- values.clear();
- tok.clear();
- }
- else if (c == quoteChar) {
- if (prevWasQuote) {
- tok += c;
- prevWasQuote = false;
- inQuotes = !inQuotes;
- }
- else {
- prevWasQuote = inQuotes;
- inQuotes = !inQuotes;
- }
- }
- else if ((!inQuotes) && (c == fieldSep)) {
- prevWasQuote = false;
- if (skipheader == 0) {
- if (mkCols) {
- addColumn(tok);
- }
- else {
- values.push_back(ValPtr(new Glib::ustring(tok)));
- }
- }
- tok.clear();
- }
- else {
- prevWasQuote = false;
- tok += c;
- }
+ const gchar * utf8 = convertRequired ? g_convert(bytes, bytesLen, "utf-8", encoding.c_str(), &used, &len, NULL) : bytes;
+ for (const gchar * iter = utf8; *iter; iter = g_utf8_next_char(iter)) {
+ this->pushChar(*iter, rp);
}
-
- return used;
-}
-
-void
-UrlRows::addColumn(const Glib::ustring & rawtok) const
-{
- columns.push_back(rawtok);
- Glib::ustring & tok(columns.back());
- for (Glib::ustring::iterator i = tok.begin(); i != tok.end(); ) {
- if (!isalnum(*i)) {
- tok.erase(i);
- }
- else {
- i++;
- }
+ if (convertRequired) {
+ // We allocated it.. sooo....
+ free(const_cast<gchar *>(utf8));
+ return used;
+ }
+ else {
+ return bytesLen;
}
}
@@ -176,8 +61,9 @@ void
UrlRows::execute(const RowProcessor * rp) const
{
rowNum = 1;
+ begin();
CurlHandle::Ptr c = new CurlHandle();
- c->setopt(CURLOPT_URL, url.c_str());
+ c->setopt(CURLOPT_URL, (const char *)url());
//c->setopt(CURLOPT_PROXY, proxy.c_str());
c->setopt(CURLOPT_FOLLOWLOCATION, 1);
//c->setopt(CURLOPT_COOKIEFILE, (std::string(cacheRoot) + "/ytfs.cookies").c_str());
@@ -188,25 +74,7 @@ UrlRows::execute(const RowProcessor * rp) const
c->setopt(CURLOPT_WRITEDATA, &cb);
c->setopt(CURLOPT_WRITEFUNCTION, &handleDataHelper);
c->perform();
- if (!tok.empty()) {
- if (skipheader == 0) {
- if (mkCols) {
- addColumn(tok);
- }
- else {
- values.push_back(ValPtr(new Glib::ustring(tok)));
- }
- }
- }
- if (!values.empty()) {
- while (values.size() < columns.size()) {
- values.push_back(ValPtr(new Glib::ustring()));
- }
- rp->rowReady();
- rowNum += 1;
- values.clear();
- }
- values.clear();
+ end(rp);
}
UrlRows::callback::callback(const UrlRows * u, const RowProcessor * r) :
diff --git a/project2/urlRows.h b/project2/urlRows.h
index 7b60400..fe63a3f 100644
--- a/project2/urlRows.h
+++ b/project2/urlRows.h
@@ -5,9 +5,9 @@
#include <boost/intrusive_ptr.hpp>
#include <boost/shared_ptr.hpp>
#include <map>
-#include "rowSet.h"
+#include "streamRows.h"
-class UrlRows : public RowSet {
+class UrlRows : public StreamRows {
public:
UrlRows(const xmlpp::Element * p);
~UrlRows();
@@ -15,21 +15,10 @@ class UrlRows : public RowSet {
virtual void loadComplete(const CommonObjects *);
void execute(const RowProcessor *) const;
virtual void setFilter(const Glib::ustring &);
- unsigned int columnCount() const;
- const Glib::ustring & getColumnName(unsigned int col) const;
- VariableType getCurrentValue(const Glib::ustring & id) const;
- VariableType getCurrentValue(unsigned int col) const;
- bool isNull(unsigned int col) const;
- bool isNull(const Glib::ustring & id) const;
- typedef std::set<gunichar> CharSet;
- const Glib::ustring url;
+ const Variable url;
protected:
- void addColumn(const Glib::ustring & rawtok) const;
- typedef boost::shared_ptr<Glib::ustring> ValPtr;
- typedef std::vector<ValPtr> Values;
- mutable Values values;
private:
struct callback {
@@ -39,19 +28,8 @@ class UrlRows : public RowSet {
};
static size_t handleDataHelper(const char * ptr, size_t size, size_t nmemb, void * stream);
size_t handleData(const RowProcessor * rp, const char * bytes, size_t bytesLen) const;
- gunichar fieldSep;
- gunichar quoteChar;
- gunichar newline;
- std::string encoding;
- mutable size_t skipheader;
- typedef std::vector<Glib::ustring> Columns;
- mutable Columns columns;
-
- // Used in CURL callback
- mutable bool mkCols;
- mutable bool inQuotes;
- mutable bool prevWasQuote;
- mutable Glib::ustring tok;
+ bool convertRequired;
+
};
#endif