diff options
| author | randomdan <randomdan@localhost> | 2010-08-10 23:19:58 +0000 | 
|---|---|---|
| committer | randomdan <randomdan@localhost> | 2010-08-10 23:19:58 +0000 | 
| commit | 99abe705086af006aa8f56c1ada08203a4dfcf16 (patch) | |
| tree | 57608cf4cf6434a65602524c49e3103898a77eed | |
| parent | Tidy up hierarchy (diff) | |
| download | project2-99abe705086af006aa8f56c1ada08203a4dfcf16.tar.bz2 project2-99abe705086af006aa8f56c1ada08203a4dfcf16.tar.xz project2-99abe705086af006aa8f56c1ada08203a4dfcf16.zip | |
Minor fixes to text parsers
Add a URL-based row implementation
| -rw-r--r-- | project2/Jamfile.jam | 3 | ||||
| -rw-r--r-- | project2/fileRows.cpp | 5 | ||||
| -rw-r--r-- | project2/iterate.cpp | 3 | ||||
| -rw-r--r-- | project2/urlRows.cpp | 181 | ||||
| -rw-r--r-- | project2/urlRows.h | 61 | ||||
| -rw-r--r-- | project2/view.cpp | 2 | 
6 files changed, 253 insertions, 2 deletions
| diff --git a/project2/Jamfile.jam b/project2/Jamfile.jam index e5633a9..ac82e15 100644 --- a/project2/Jamfile.jam +++ b/project2/Jamfile.jam @@ -14,6 +14,7 @@ lib boost_regex : : <name>boost_regex ;  lib boost_filesystem : : <name>boost_filesystem ;  lib cgicc : : <name>cgicc ;  lib esmtp : : <name>esmtp ; +lib curl : : <name>curl ;  exe p2web :  	libxmlpp @@ -28,6 +29,7 @@ exe p2web :  	<library>odbc  	<library>esmtp  	<library>cgicc +	<library>curl  	<library>fcgi++  	<library>fcgi ; @@ -43,4 +45,5 @@ exe p2console :  	<library>boost_regex  	<library>boost_filesystem  	<library>odbc +	<library>curl  	<library>esmtp ; diff --git a/project2/fileRows.cpp b/project2/fileRows.cpp index 955767c..d9c26bc 100644 --- a/project2/fileRows.cpp +++ b/project2/fileRows.cpp @@ -51,10 +51,11 @@ _FileRows::execute() const  				if (prevWasQuote) {  					tok += c;  					prevWasQuote = false; +					inQuotes = !inQuotes;  				}  				else { +					prevWasQuote = inQuotes;  					inQuotes = !inQuotes; -					prevWasQuote = true;  				}  			}  			else if ((!inQuotes) && (c == fieldSep)) { @@ -82,7 +83,7 @@ _FileRows::execute() const  				curCol++;  			}  		} -		if (!mkCols) { +		if (!mkCols && !values.empty()) {  			while (values.size() < columns.size()) {  				values.push_back(ValPtr(new Glib::ustring()));  				curCol++; diff --git a/project2/iterate.cpp b/project2/iterate.cpp index f6d4820..f29328b 100644 --- a/project2/iterate.cpp +++ b/project2/iterate.cpp @@ -4,6 +4,7 @@  #include "xmlObjectLoader.h"  #include "sqlRows.h"  #include "fileRows.h" +#include "urlRows.h"  #include "procRows.h"  #include "task.h" @@ -27,6 +28,7 @@ _Iterate::AddLoaders(Loaders & l, NoOutputExecutes & iterates)  	l.insert(LoadersVT("sqliterate", _LoaderBase::Make<_SqlIterate, _NoOutputExecute, unsigned int, _SourceObject, &_SourceObject::order>(&iterates)));  	l.insert(LoadersVT("fileiterate", _LoaderBase::Make<_FileIterate, _NoOutputExecute, unsigned int, _SourceObject, &_SourceObject::order>(&iterates))); 
 	l.insert(LoadersVT("prociterate", _LoaderBase::Make<_ProcIterate, _NoOutputExecute, unsigned int, _SourceObject, &_SourceObject::order>(&iterates))); +	l.insert(LoadersVT("urliterate", _LoaderBase::Make<_UrlIterate, _NoOutputExecute, unsigned int, _SourceObject, &_SourceObject::order>(&iterates)));  }  void @@ -35,6 +37,7 @@ _Iterate::AddLoaders(Loaders & l, Iterates & iterates)  	l.insert(LoadersVT("sqliterate", _LoaderBase::Make<_SqlIterate, _Iterate, std::string, _SourceObject, &_SourceObject::name>(&iterates)));  	l.insert(LoadersVT("fileiterate", _LoaderBase::Make<_FileIterate, _Iterate, std::string, _SourceObject, &_SourceObject::name>(&iterates)));  	l.insert(LoadersVT("prociterate", _LoaderBase::Make<_ProcIterate, _Iterate, std::string, _SourceObject, &_SourceObject::name>(&iterates))); +	l.insert(LoadersVT("urliterate", _LoaderBase::Make<_UrlIterate, _Iterate, std::string, _SourceObject, &_SourceObject::name>(&iterates)));  }  void diff --git a/project2/urlRows.cpp b/project2/urlRows.cpp new file mode 100644 index 0000000..ddae45c --- /dev/null +++ b/project2/urlRows.cpp @@ -0,0 +1,181 @@ +#include "urlRows.h" +#include "../libmisc/curlsup.h" +#include <stdexcept> +#include <queue> + +_UrlRows::_UrlRows(const xmlpp::Element * p) : +	url(p->get_attribute_value("url")), +	fieldSep(p->get_attribute_value("fieldSep")[0]), +	quoteChar(p->get_attribute_value("quoteChar")[0]), +	newline(p->get_attribute_value("newline")[0]), +	encoding(p->get_attribute_value("encoding")), +	skipheader(atoi(p->get_attribute_value("skipheader").c_str())), +	inQuotes(false), +	prevWasQuote(false) +{ +	BOOST_FOREACH(const xmlpp::Node * node, p->find("columns/column")) { +		const xmlpp::Element * elem = dynamic_cast<const xmlpp::Element *>(node); +		if (elem) { +			columns.push_back(elem->get_child_text()->get_content()); +		} +	} +	mkCols = columns.empty(); +} + +_UrlRows::~_UrlRows() +{ +} + +unsigned int +_UrlRows::columnCount() const +{ +	return columns.size(); +} + +const 
Glib::ustring & +_UrlRows::getColumnName(unsigned int col) const +{ +	return columns[col]; +} + +const Glib::ustring & +_UrlRows::getCurrentValue(unsigned int col) const +{ +	return *values[col]; +} + +const Glib::ustring & +_UrlRows::getCurrentValue(const Glib::ustring & id) const +{ +	Values::const_iterator v = values.begin(); +	for (Columns::const_iterator i = columns.begin(); i != columns.end(); i++, v++) { +		if (*i == id) { +			return **v; +		} +	} +	throw PerRowValues::FieldDoesNotExist(); +} + +size_t +_UrlRows::_handleData(const char * ptr, size_t size, size_t nmemb, void *stream) +{ +	size_t used = static_cast<const _UrlRows *>(stream)->handleData(ptr, size * nmemb); +	return used; +} + +size_t +_UrlRows::handleData(const char * bytes, size_t bytesLen) const +{ +	size_t used = 0, len = 0; +	char * utf8 = g_convert(bytes, bytesLen, "utf-8", encoding.c_str(), &used, &len, NULL); +	Glib::ustring str(utf8); +	free(utf8); + +	BOOST_FOREACH(gunichar c, str) { +		if (c == newline) { +			if (skipheader) { +				skipheader -= 1; +			} +			else { +				if (!mkCols) { +					if (!tok.empty()) { +						values.push_back(ValPtr(new Glib::ustring(tok))); +					} +					while (values.size() < columns.size()) { +						values.push_back(ValPtr(new Glib::ustring())); +					} +					rowReady(); +				} +				else { +					mkCols = false; +				} +			} +			values.clear(); +			tok.clear(); +		} +		else if (c == quoteChar) { +			if (prevWasQuote) { +				tok += c; +				prevWasQuote = false; +				inQuotes = !inQuotes; +			} +			else { +				prevWasQuote = inQuotes; +				inQuotes = !inQuotes; +			} +		} +		else if ((!inQuotes) && (c == fieldSep)) { +			prevWasQuote = false; +			if (skipheader == 0) { +				if (mkCols) { +					addColumn(tok); +				} +				else { +					values.push_back(ValPtr(new Glib::ustring(tok))); +				} +			} +			tok.clear(); +		} +		else { +			prevWasQuote = false; +			tok += c; +		} +	} + +	return used; +} + +void +_UrlRows::addColumn(const Glib::ustring & rawtok) const 
+{ +	columns.push_back(rawtok); +	Glib::ustring & tok(columns.back()); +	for (Glib::ustring::iterator i = tok.begin(); i != tok.end(); ) { +		if (!isalnum(*i)) { +			tok.erase(i); +		} +		else { +			i++; +		} +	} +} + +void +_UrlRows::execute() const +{ +	CurlHandle::Ptr c = new CurlHandle(); +	c->setopt(CURLOPT_URL, url.c_str()); +	//c->setopt(CURLOPT_PROXY, proxy.c_str()); +	c->setopt(CURLOPT_FOLLOWLOCATION, 1); +	//c->setopt(CURLOPT_COOKIEFILE, (std::string(cacheRoot) + "/ytfs.cookies").c_str()); +	//c->setopt(CURLOPT_COOKIEJAR, (std::string(cacheRoot) + "/ytfs.cookies").c_str()); +	c->setopt(CURLOPT_ENCODING, "deflate, gzip"); +	c->setopt(CURLOPT_USERAGENT, "project2/0.3"); +	c->setopt(CURLOPT_WRITEDATA, this); +	c->setopt(CURLOPT_WRITEFUNCTION, &_handleData); +	c->perform(); +	if (!tok.empty()) { +		if (skipheader == 0) { +			if (mkCols) { +				addColumn(tok); +			} +			else { +				values.push_back(ValPtr(new Glib::ustring(tok))); +			} +		} +	} +	if (!values.empty()) { +		while (values.size() < columns.size()) { +			values.push_back(ValPtr(new Glib::ustring())); +		} +		rowReady(); +		values.clear(); +	} +	values.clear(); +} + +#include "view.hpp" +template class _GenericView<_UrlRows>; +#include "iterate.hpp" +template class _GenericIterate<_UrlRows>; + diff --git a/project2/urlRows.h b/project2/urlRows.h new file mode 100644 index 0000000..f94e149 --- /dev/null +++ b/project2/urlRows.h @@ -0,0 +1,61 @@ +#ifndef URLROWS_H +#define URLROWS_H + +#include <libxml++/nodes/element.h> +#include <boost/shared_ptr.hpp> +#include <map> +#include "view.h" +#include "iterate.h" + +class _UrlRows : public PerRowValues { +	public: +		_UrlRows(const xmlpp::Element * p); +		~_UrlRows(); + +		void execute() const; +		unsigned int columnCount() const; +		const Glib::ustring & getColumnName(unsigned int col) const; +		const Glib::ustring & getCurrentValue(const Glib::ustring & id) const; +		const Glib::ustring & getCurrentValue(unsigned int col) const; +		virtual void 
rowReady() const = 0; + +		typedef std::set<gunichar> CharSet; +		const Glib::ustring url; + +	protected: +		void addColumn(const Glib::ustring & rawtok) const; +		typedef boost::shared_ptr<Glib::ustring> ValPtr; +		typedef std::vector<ValPtr> Values; +		mutable Values values; + +	private: +		static size_t _handleData(const char * ptr, size_t size, size_t nmemb, void * stream); +		size_t handleData(const char * bytes, size_t bytesLen) const; +		gunichar fieldSep; +		gunichar quoteChar; +		gunichar newline; +		std::string encoding; +		mutable size_t skipheader; +		typedef std::vector<Glib::ustring> Columns; +		mutable Columns columns; + +		// Used in CURL callback +		mutable bool mkCols; +		mutable bool inQuotes; +		mutable bool prevWasQuote; +		mutable Glib::ustring tok; +}; +typedef boost::shared_ptr<_UrlRows> UrlRows; + +typedef _GenericView<_UrlRows> _UrlView; +typedef boost::shared_ptr<_UrlView> UrlView; +typedef std::map<std::string, UrlView> UrlViews; + +typedef _GenericIterate<_UrlRows> _UrlIterate; +typedef boost::shared_ptr<_UrlIterate> UrlIterate; +typedef std::map<std::string, UrlIterate> UrlIterates; + +#endif + + + diff --git a/project2/view.cpp b/project2/view.cpp index 530d7b5..217424f 100644 --- a/project2/view.cpp +++ b/project2/view.cpp @@ -3,6 +3,7 @@  #include "xmlObjectLoader.h"  #include "rawView.h"  #include "fileRows.h" +#include "urlRows.h"  #include "sqlRows.h"  #include "procRows.h" @@ -26,6 +27,7 @@ _View::AddLoaders(Loaders & l, Views & views)  	l.insert(LoadersVT("rawview", _LoaderBase::Make<_RawView, _View, std::string, _SourceObject, &_SourceObject::name>(&views)));  	l.insert(LoadersVT("fileview", _LoaderBase::Make<_FileView, _View, std::string, _SourceObject, &_SourceObject::name>(&views)));  	l.insert(LoadersVT("procview", _LoaderBase::Make<_ProcView, _View, std::string, _SourceObject, &_SourceObject::name>(&views))); +	l.insert(LoadersVT("urlview", _LoaderBase::Make<_UrlView, _View, std::string, _SourceObject, 
&_SourceObject::name>(&views)));  }  void | 
