diff options
| author | randomdan <randomdan@localhost> | 2011-02-14 23:30:52 +0000 | 
|---|---|---|
| committer | randomdan <randomdan@localhost> | 2011-02-14 23:30:52 +0000 | 
| commit | bc0c102ee7746d6a7b52a5fa4a4a65709823762f (patch) | |
| tree | 14517e8efce4ecdf7cb6eb6c9387f087872a245a | |
| parent | Convert dumpTask into a generic purpose file* writer (diff) | |
| download | project2-bc0c102ee7746d6a7b52a5fa4a4a65709823762f.tar.bz2 project2-bc0c102ee7746d6a7b52a5fa4a4a65709823762f.tar.xz project2-bc0c102ee7746d6a7b52a5fa4a4a65709823762f.zip  | |
Merge fileRows and urlRows content parser into a shared base; streamRows
| -rw-r--r-- | project2/Jamfile.jam | 1 | ||||
| -rw-r--r-- | project2/fileRows.cpp | 138 | ||||
| -rw-r--r-- | project2/fileRows.h | 31 | ||||
| -rw-r--r-- | project2/streamRows.cpp | 192 | ||||
| -rw-r--r-- | project2/streamRows.h | 69 | ||||
| -rw-r--r-- | project2/urlRows.cpp | 164 | ||||
| -rw-r--r-- | project2/urlRows.h | 32 | 
7 files changed, 291 insertions, 336 deletions
diff --git a/project2/Jamfile.jam b/project2/Jamfile.jam index 96a0797..594bbd1 100644 --- a/project2/Jamfile.jam +++ b/project2/Jamfile.jam @@ -79,6 +79,7 @@ lib p2processes :  lib p2files :  	fsRows.cpp  	fileRows.cpp +	streamRows.cpp  	:  	<library>../libmisc//misc  	<library>libxmlpp diff --git a/project2/fileRows.cpp b/project2/fileRows.cpp index e990f56..11a71ef 100644 --- a/project2/fileRows.cpp +++ b/project2/fileRows.cpp @@ -8,21 +8,9 @@ DECLARE_LOADER("filerows", FileRows);  FileRows::FileRows(const xmlpp::Element * p) :  	SourceObject(p), -	RowSet(p), -	path(p, "path"), -	fieldSep(p->get_attribute_value("fieldSep")[0]), -	quoteChar(p->get_attribute_value("quoteChar")[0]), -	keepBlankRows(p->get_attribute_value("keepBlankRows") == "true"), -	countBlankRows(p->get_attribute_value("keepBlankRows") == "count"), -	newline(p->get_attribute_value("newline")), -	encoding(p->get_attribute_value("encoding")) +	StreamRows(p), +	path(p, "path")  { -	BOOST_FOREACH(const xmlpp::Node * node, p->find("columns/column")) { -		const xmlpp::Element * elem = dynamic_cast<const xmlpp::Element *>(node); -		if (elem) { -			columns.push_back(boost::shared_ptr<Glib::ustring>(new Glib::ustring(elem->get_child_text()->get_content()))); -		} -	}  }  FileRows::~FileRows() @@ -40,131 +28,15 @@ FileRows::setFilter(const Glib::ustring &)  	throw NotSupported(__PRETTY_FUNCTION__);  } -unsigned int -FileRows::columnCount() const -{ -	return columns.size(); -} - -const Glib::ustring & -FileRows::getColumnName(unsigned int col) const -{ -	return *columns[col]; -} -  void  FileRows::execute(const RowProcessor * rp) const  {  	rowNum = 1;  	FileStarChannel c(doOpen());  	c.set_encoding(encoding); -	c.set_line_term(newline); -	Glib::ustring line; -	while (c.read_line(line) == Glib::IO_STATUS_NORMAL) { -		if (boost::algorithm::ends_with(line, newline)) { -			line.erase(line.length() - newline.length()); -		} -		Columns::const_iterator curCol = columns.begin(); -		bool mkCols = columns.empty(); 
-		bool inQuotes = false; -		bool prevWasQuote = false; -		typedef boost::shared_ptr<Glib::ustring> StringPtr; -		StringPtr tok(new Glib::ustring()); -		BOOST_FOREACH(gunichar c, line) { -			if (c == quoteChar) { -				if (prevWasQuote) { -					*tok += c; -					prevWasQuote = false; -					inQuotes = !inQuotes; -				} -				else { -					prevWasQuote = inQuotes; -					inQuotes = !inQuotes; -				} -			} -			else if ((!inQuotes) && (c == fieldSep)) { -				prevWasQuote = false; -				if (mkCols) { -					addColumn(tok); -				} -				else { -					values.push_back(tok); -					curCol++; -				} -				tok = StringPtr(new Glib::ustring()); -			} -			else { -				prevWasQuote = false; -				*tok += c; -			} -		} -		if (tok->length()) { -			if (mkCols) { -				addColumn(tok); -			} -			else { -				values.push_back(tok); -				curCol++; -			} -		} -		if (!mkCols) { -			if (keepBlankRows || !values.empty()) { -				while (values.size() < columns.size()) { -					values.push_back(VariableType()); -					curCol++; -				} -				rp->rowReady(); -				rowNum += 1; -			} -			else if (countBlankRows) { -				rowNum +=1; -			} -		} -		values.clear(); -	} -} - -VariableType -FileRows::getCurrentValue(unsigned int col) const -{ -	return values[col]; -} - -bool -FileRows::isNull(unsigned int) const -{ -	return false; -} - -bool -FileRows::isNull(const Glib::ustring &) const -{ -	return false; -} - -VariableType -FileRows::getCurrentValue(const Glib::ustring & id) const -{ -	Values::const_iterator v = values.begin(); -	for (Columns::const_iterator i = columns.begin(); i != columns.end(); i++, v++) { -		if (**i == id) { -			return *v; -		} -	} -	throw RowSet::FieldDoesNotExist(); -} - -void -FileRows::addColumn(boost::shared_ptr<Glib::ustring> tok) const -{ -	columns.push_back(tok); -	for (Glib::ustring::iterator i = tok->begin(); i != tok->end(); ) { -		if (!isalnum(*i)) { -			tok->erase(i); -		} -		else { -			i++; -		} +	gunichar ch; +	while (c.read(ch) == Glib::IO_STATUS_NORMAL) { +		this->pushChar(ch, 
rp);  	}  } diff --git a/project2/fileRows.h b/project2/fileRows.h index 1a4b7ed..0fcc394 100644 --- a/project2/fileRows.h +++ b/project2/fileRows.h @@ -1,17 +1,12 @@  #ifndef FILEROWS_H  #define FILEROWS_H -#include <libxml++/nodes/element.h> -#include <boost/intrusive_ptr.hpp> -#include <boost/shared_ptr.hpp> -#include <map> -#include "variables.h" -#include "rowSet.h" +#include "streamRows.h"  #include "fileStarGlibIoChannel.h"  class CommonObjects; -class FileRows : public RowSet { +class FileRows : public StreamRows {  	public:  		FileRows(const xmlpp::Element * p);  		~FileRows(); @@ -19,31 +14,11 @@ class FileRows : public RowSet {  		void execute(const RowProcessor *) const;  		virtual void loadComplete(const CommonObjects *);  		virtual void setFilter(const Glib::ustring &); -		unsigned int columnCount() const; -		const Glib::ustring & getColumnName(unsigned int col) const; -		VariableType getCurrentValue(const Glib::ustring & id) const; -		VariableType getCurrentValue(unsigned int col) const; -		bool isNull(unsigned int col) const; -		bool isNull(const Glib::ustring & id) const; - -		typedef std::set<gunichar> CharSet; +  		const Variable path;  	protected:  		virtual FileStarChannel doOpen() const; -		void addColumn(boost::shared_ptr<Glib::ustring> rawtok) const; -		typedef std::vector<VariableType> Values; -		mutable Values values; - -	private: -		gunichar fieldSep; -		gunichar quoteChar; -		bool keepBlankRows; -		bool countBlankRows; -		std::string newline; -		std::string encoding; -		typedef std::vector<boost::shared_ptr<Glib::ustring> > Columns; -		mutable Columns columns;  };  #endif diff --git a/project2/streamRows.cpp b/project2/streamRows.cpp new file mode 100644 index 0000000..9160040 --- /dev/null +++ b/project2/streamRows.cpp @@ -0,0 +1,192 @@ +#include "streamRows.h" +#include "rowProcessor.h" +#include <boost/foreach.hpp> +#include <libxml++/nodes/textnode.h> + +StreamRows::StreamRows(const xmlpp::Element * p) : +	SourceObject(p), +	
RowSet(p), +	fieldSep(p->get_attribute_value("fieldSep")[0]), +	quoteChar(p->get_attribute_value("quoteChar")[0]), +	keepBlankRows(p->get_attribute_value("keepBlankRows") == "true"), +	countBlankRows(p->get_attribute_value("keepBlankRows") == "count"), +	newline(p->get_attribute_value("newline")), +	newlin(newline, 0, newline.length() - 1), +	encoding(p->get_attribute_value("encoding")), +	skipheader(atoi(p->get_attribute_value("skipheader").c_str())), +	inQuotes(false), +	prevWasQuote(false) +{ +	unsigned int colNo = 0; +	BOOST_FOREACH(const xmlpp::Node * node, p->find("columns/column")) { +		const xmlpp::Element * elem = dynamic_cast<const xmlpp::Element *>(node); +		if (elem) { +			columns.insert(Column(colNo++, elem->get_child_text()->get_content())); +		} +	} +	mkCols = columns.empty(); +} + +StreamRows::~StreamRows() +{ +} + +const Glib::ustring & +StreamRows::getColumnName(unsigned int col) const +{ +	Columns::index<byColIdx>::type::iterator i = columns.get<byColIdx>().find(col); +	if (i != columns.get<byColIdx>().end()) { +		return i->col; +	} +	throw RowSet::FieldDoesNotExist(); +} + +unsigned int +StreamRows::columnCount() const +{ +	return columns.size(); +} + +VariableType +StreamRows::getCurrentValue(unsigned int col) const +{ +	Columns::index<byColIdx>::type::iterator i = columns.get<byColIdx>().find(col); +	if (i != columns.get<byColIdx>().end()) { +		return i->value; +	} +	throw RowSet::FieldDoesNotExist(); +} + +bool +StreamRows::isNull(unsigned int col) const +{ +	return (columns.get<byColIdx>().find(col) == columns.get<byColIdx>().end()); +} + +bool +StreamRows::isNull(const Glib::ustring & col) const +{ +	return (columns.get<byColName>().find(col) == columns.get<byColName>().end()); +} + +VariableType +StreamRows::getCurrentValue(const Glib::ustring & col) const +{ +	Columns::const_iterator i = columns.get<byColName>().find(col); +	if (i != columns.end()) { +		return i->value; +	} +	throw RowSet::FieldDoesNotExist(); +} + +void 
+StreamRows::addColumn(Glib::ustring & tok) const +{ +	for (Glib::ustring::iterator i = tok.begin(); i != tok.end(); ) { +		if (!isalnum(*i)) { +			tok.erase(i); +		} +		else { +			i++; +		} +	} +	columns.insert(Column(columns.size(), tok)); +} + +StreamRows::Column::Column(unsigned int i, const Glib::ustring & c) : +	idx(i), +	col(c) +{ +} + +void +StreamRows::Column::operator=(const VariableType & v) const +{ +	value = v; +} + +void +StreamRows::begin() const +{ +	curCol = columns.get<byColIdx>().begin(); +	tok = StringPtr(new Glib::ustring()); +} + +void +StreamRows::pushChar(gunichar c, const RowProcessor * rp) const +{ +	if ((!inQuotes) && (c == *newline.rbegin()) && (*tok == newlin)) { +		if (skipheader) { +			skipheader -= 1; +		} +		else { +			tok->erase(tok->length() - newlin.length()); +			if (!mkCols) { +				if (!tok->empty()) { +					*curCol++ = VariableType(tok); +				} +				while (curCol != columns.get<byColIdx>().end()) { +					*curCol++ = VariableType(); +				} +				rp->rowReady(); +				rowNum += 1; +				curCol = columns.get<byColIdx>().begin(); +			} +			else { +				mkCols = false; +			} +		} +		tok = StringPtr(new Glib::ustring()); +	} +	else if (c == quoteChar) { +		if (prevWasQuote) { +			*tok += c; +			prevWasQuote = false; +			inQuotes = !inQuotes; +		} +		else { +			prevWasQuote = inQuotes; +			inQuotes = !inQuotes; +		} +	} +	else if ((!inQuotes) && (c == fieldSep)) { +		prevWasQuote = false; +		if (skipheader == 0) { +			if (mkCols) { +				addColumn(*tok); +			} +			else { +				*curCol++ = VariableType(tok); +			} +		} +		tok = StringPtr(new Glib::ustring()); +	} +	else { +		prevWasQuote = false; +		*tok += c; +	} +} + +void +StreamRows::end(const RowProcessor * rp) const +{ +	if (!tok->empty()) { +		if (skipheader == 0) { +			if (mkCols) { +				addColumn(*tok); +			} +			else { +				*curCol++ = VariableType(tok); +			} +		} +	} +	if (curCol != columns.get<byColIdx>().begin()) { +		while (curCol != columns.get<byColIdx>().end()) { +			
*curCol++ = VariableType(); +		} +		rp->rowReady(); +		rowNum += 1; +	} +	tok = StringPtr(new Glib::ustring()); +} + diff --git a/project2/streamRows.h b/project2/streamRows.h new file mode 100644 index 0000000..473a0d8 --- /dev/null +++ b/project2/streamRows.h @@ -0,0 +1,69 @@ +#ifndef STREAMROWS_H +#define STREAMROWS_H + +#include "rowSet.h" +#include <boost/multi_index_container.hpp> +#include <boost/multi_index/member.hpp> +#include <boost/multi_index/ordered_index.hpp> + +class StreamRows : public RowSet { +	public: +		StreamRows(const xmlpp::Element * p); +		~StreamRows(); + +		unsigned int columnCount() const; +		const Glib::ustring & getColumnName(unsigned int col) const; +		VariableType getCurrentValue(const Glib::ustring & id) const; +		VariableType getCurrentValue(unsigned int col) const; +		bool isNull(unsigned int col) const; +		bool isNull(const Glib::ustring & id) const; + +	protected: +		void begin() const; +		void pushChar(gunichar ch, const RowProcessor *) const; +		void end(const RowProcessor *) const; + +	private: +		void addColumn(Glib::ustring & rawtok) const; +		class Column { +			public: +				Column(unsigned int idx, const Glib::ustring &); + +				void operator=(const VariableType &) const; + +				const unsigned int idx; +				const Glib::ustring col; +				mutable VariableType value; +		}; +		struct byColIdx {}; +		struct byColName {}; +		typedef boost::multi_index::multi_index_container< +			Column, +			boost::multi_index::indexed_by< +				boost::multi_index::ordered_unique< +				boost::multi_index::tag<byColName>, BOOST_MULTI_INDEX_MEMBER(Column, const Glib::ustring, col)>, +				boost::multi_index::ordered_unique< +				boost::multi_index::tag<byColIdx>,  BOOST_MULTI_INDEX_MEMBER(Column, const unsigned int, idx)> +				> > Columns; +		mutable Columns columns; + +	public: +		const gunichar fieldSep; +		const gunichar quoteChar; +		const bool keepBlankRows; +		const bool countBlankRows; +		const Glib::ustring newline; +		const Glib::ustring 
newlin; +		const std::string encoding; +		// Used in callback +		mutable size_t skipheader; +		mutable bool mkCols; +		mutable bool inQuotes; +		mutable bool prevWasQuote; +		typedef boost::shared_ptr<Glib::ustring> StringPtr; +		mutable StringPtr tok; +		mutable Columns::index<byColIdx>::type::iterator curCol; +}; + +#endif + diff --git a/project2/urlRows.cpp b/project2/urlRows.cpp index cb97b7f..447b661 100644 --- a/project2/urlRows.cpp +++ b/project2/urlRows.cpp @@ -10,23 +10,10 @@ DECLARE_LOADER("urlrows", UrlRows);  UrlRows::UrlRows(const xmlpp::Element * p) :  	SourceObject(p), -	RowSet(p), -	url(p->get_attribute_value("url")), -	fieldSep(p->get_attribute_value("fieldSep")[0]), -	quoteChar(p->get_attribute_value("quoteChar")[0]), -	newline(p->get_attribute_value("newline")[0]), -	encoding(p->get_attribute_value("encoding")), -	skipheader(atoi(p->get_attribute_value("skipheader").c_str())), -	inQuotes(false), -	prevWasQuote(false) +	StreamRows(p), +	url(p, "url"), +	convertRequired(encoding != "utf-8")  { -	BOOST_FOREACH(const xmlpp::Node * node, p->find("columns/column")) { -		const xmlpp::Element * elem = dynamic_cast<const xmlpp::Element *>(node); -		if (elem) { -			columns.push_back(elem->get_child_text()->get_content()); -		} -	} -	mkCols = columns.empty();  }  UrlRows::~UrlRows() @@ -44,48 +31,6 @@ UrlRows::setFilter(const Glib::ustring &)  	throw NotSupported(__PRETTY_FUNCTION__);  } -unsigned int -UrlRows::columnCount() const -{ -	return columns.size(); -} - -const Glib::ustring & -UrlRows::getColumnName(unsigned int col) const -{ -	return columns[col]; -} - -VariableType -UrlRows::getCurrentValue(unsigned int col) const -{ -	return *values[col]; -} - -bool -UrlRows::isNull(unsigned int) const -{ -	return false; -} - -bool -UrlRows::isNull(const Glib::ustring &) const -{ -	return false; -} - -VariableType -UrlRows::getCurrentValue(const Glib::ustring & id) const -{ -	Values::const_iterator v = values.begin(); -	for (Columns::const_iterator i = 
columns.begin(); i != columns.end(); i++, v++) { -		if (*i == id) { -			return **v; -		} -	} -	throw RowSet::FieldDoesNotExist(); -} -  size_t  UrlRows::handleDataHelper(const char * ptr, size_t size, size_t nmemb, void *stream)  { @@ -98,77 +43,17 @@ size_t  UrlRows::handleData(const RowProcessor * rp, const char * bytes, size_t bytesLen) const  {  	size_t used = 0, len = 0; -	char * utf8 = g_convert(bytes, bytesLen, "utf-8", encoding.c_str(), &used, &len, NULL); -	Glib::ustring str(utf8); -	free(utf8); - -	BOOST_FOREACH(gunichar c, str) { -		if (c == newline) { -			if (skipheader) { -				skipheader -= 1; -			} -			else { -				if (!mkCols) { -					if (!tok.empty()) { -						values.push_back(ValPtr(new Glib::ustring(tok))); -					} -					while (values.size() < columns.size()) { -						values.push_back(ValPtr(new Glib::ustring())); -					} -					rp->rowReady(); -					rowNum += 1; -				} -				else { -					mkCols = false; -				} -			} -			values.clear(); -			tok.clear(); -		} -		else if (c == quoteChar) { -			if (prevWasQuote) { -				tok += c; -				prevWasQuote = false; -				inQuotes = !inQuotes; -			} -			else { -				prevWasQuote = inQuotes; -				inQuotes = !inQuotes; -			} -		} -		else if ((!inQuotes) && (c == fieldSep)) { -			prevWasQuote = false; -			if (skipheader == 0) { -				if (mkCols) { -					addColumn(tok); -				} -				else { -					values.push_back(ValPtr(new Glib::ustring(tok))); -				} -			} -			tok.clear(); -		} -		else { -			prevWasQuote = false; -			tok += c; -		} +	const gchar * utf8 = convertRequired ? 
g_convert(bytes, bytesLen, "utf-8", encoding.c_str(), &used, &len, NULL) : bytes; +	for (const gchar * iter = utf8; *iter; iter = g_utf8_next_char(iter)) { +		this->pushChar(*iter, rp);  	} - -	return used; -} - -void -UrlRows::addColumn(const Glib::ustring & rawtok) const -{ -	columns.push_back(rawtok); -	Glib::ustring & tok(columns.back()); -	for (Glib::ustring::iterator i = tok.begin(); i != tok.end(); ) { -		if (!isalnum(*i)) { -			tok.erase(i); -		} -		else { -			i++; -		} +	if (convertRequired) { +		// We allocated it.. sooo.... +		free(const_cast<gchar *>(utf8)); +		return used; +	} +	else { +		return bytesLen;  	}  } @@ -176,8 +61,9 @@ void  UrlRows::execute(const RowProcessor * rp) const  {  	rowNum = 1; +	begin();  	CurlHandle::Ptr c = new CurlHandle(); -	c->setopt(CURLOPT_URL, url.c_str()); +	c->setopt(CURLOPT_URL, (const char *)url());  	//c->setopt(CURLOPT_PROXY, proxy.c_str());  	c->setopt(CURLOPT_FOLLOWLOCATION, 1);  	//c->setopt(CURLOPT_COOKIEFILE, (std::string(cacheRoot) + "/ytfs.cookies").c_str()); @@ -188,25 +74,7 @@ UrlRows::execute(const RowProcessor * rp) const  	c->setopt(CURLOPT_WRITEDATA, &cb);  	c->setopt(CURLOPT_WRITEFUNCTION, &handleDataHelper);  	c->perform(); -	if (!tok.empty()) { -		if (skipheader == 0) { -			if (mkCols) { -				addColumn(tok); -			} -			else { -				values.push_back(ValPtr(new Glib::ustring(tok))); -			} -		} -	} -	if (!values.empty()) { -		while (values.size() < columns.size()) { -			values.push_back(ValPtr(new Glib::ustring())); -		} -		rp->rowReady(); -		rowNum += 1; -		values.clear(); -	} -	values.clear(); +	end(rp);  }  UrlRows::callback::callback(const UrlRows * u, const RowProcessor * r) : diff --git a/project2/urlRows.h b/project2/urlRows.h index 7b60400..fe63a3f 100644 --- a/project2/urlRows.h +++ b/project2/urlRows.h @@ -5,9 +5,9 @@  #include <boost/intrusive_ptr.hpp>  #include <boost/shared_ptr.hpp>  #include <map> -#include "rowSet.h" +#include "streamRows.h" -class UrlRows : public RowSet { +class UrlRows 
: public StreamRows {  	public:  		UrlRows(const xmlpp::Element * p);  		~UrlRows(); @@ -15,21 +15,10 @@ class UrlRows : public RowSet {  		virtual void loadComplete(const CommonObjects *);  		void execute(const RowProcessor *) const;  		virtual void setFilter(const Glib::ustring &); -		unsigned int columnCount() const; -		const Glib::ustring & getColumnName(unsigned int col) const; -		VariableType getCurrentValue(const Glib::ustring & id) const; -		VariableType getCurrentValue(unsigned int col) const; -		bool isNull(unsigned int col) const; -		bool isNull(const Glib::ustring & id) const; -		typedef std::set<gunichar> CharSet; -		const Glib::ustring url; +		const Variable url;  	protected: -		void addColumn(const Glib::ustring & rawtok) const; -		typedef boost::shared_ptr<Glib::ustring> ValPtr; -		typedef std::vector<ValPtr> Values; -		mutable Values values;  	private:  		struct callback { @@ -39,19 +28,8 @@ class UrlRows : public RowSet {  		};  		static size_t handleDataHelper(const char * ptr, size_t size, size_t nmemb, void * stream);  		size_t handleData(const RowProcessor * rp, const char * bytes, size_t bytesLen) const; -		gunichar fieldSep; -		gunichar quoteChar; -		gunichar newline; -		std::string encoding; -		mutable size_t skipheader; -		typedef std::vector<Glib::ustring> Columns; -		mutable Columns columns; - -		// Used in CURL callback -		mutable bool mkCols; -		mutable bool inQuotes; -		mutable bool prevWasQuote; -		mutable Glib::ustring tok; +		bool convertRequired; +  };  #endif  | 
