/*
								+---------------------------------+
								|                                 |
								| *** GigaWord dataset parser *** |
								|                                 |
								|  Copyright   -tHE SWINe- 2010  |
								|                                 |
								|            Main.cpp             |
								|                                 |
								+---------------------------------+
*/

// configuration flags - enable exactly one of them
#define STORAGE_CLUCENE /**< use CLucene */
//#define STORAGE_RAW /**< use raw storage */
//#define STORAGE_NONE /**< no data output, just lists empty documents */

// below are debug flags, those are by default defined
#define PARSE_SGML /**< do parse SGML, we need documents contents (undef only when debugging/optimizing zlib-related stuff) */
#define PROCESS_SGML /**< do pass parsed articles to storage engine (CLucene / raw) */

#include "../UberLame_src/NewFix.h"
#include "../UberLame_src/CallStack.h"
#include <string>
#include <vector>
#include <algorithm>
#include <string.h>
#include <stdio.h>
#include "../UberLame_src/MinMax.h"
#include "../UberLame_src/StlUtils.h"
#include "../UberLame_src/Unused.h"
#include "../UberLame_src/Dir.h"
#include "../UberLame_src/Timer.h"
#ifdef MULTITHREADED
#include "../UberLame_src/Thread.h"
#endif //MULTITHREADED
#include "SGML_Parser.h"
#include "GZipFile.h"
#if defined(MULTITHREADED) && !defined(DRIVE_ACCESS_CONTROL)
#error("MULTITHREADED is defined, please enable DRIVE_ACCESS_CONTROL in GZipFile.h as well")
#endif //MULTITHREADED && !DRIVE_ACCESS_CONTROL

#ifdef STORAGE_CLUCENE
#include <CLucene.h>
#endif //STORAGE_CLUCENE

#if defined(_MSC_VER) && !defined(__MWERKS__)
#define for if(0) {} else for
#endif
// msvc 'for' scoping hack

/**
 *	@brief simple utility class, getting (filtered) file list from directory traversal
 *
 *	Instances are function objects: each call receives a single directory entry
 *	and appends the names of the accepted files to the destination list.
 */
class CFileFilter {
protected:
	std::vector<std::string> &m_r_dest_list; /**< @brief destination list for file names (referenced, not owned) */
	const char *m_p_s_ext_list; /**< @brief null (accept all the files), or the list of accepted extensions */

public:
	/**
	 *	@brief default constructor
	 *	@param[in] r_dest_list is destination list for file paths (only referenced, must outlive this object)
	 *	@param[in] p_s_ext_list is either null (get list of all the files), or list of file extensions,
	 *		separated by a null character and terminated by an extra null character (such as "txt\0" or "zip\0gz\0bz2\0")
	 */
	inline CFileFilter(std::vector<std::string> &r_dest_list, const char *p_s_ext_list)
		:m_r_dest_list(r_dest_list), m_p_s_ext_list(p_s_ext_list)
	{}

	/**
	 *	@brief adds a file to the list, if it passes the extension filter
	 *	@param[in] r_t_file is file information
	 *	@return Returns true on success (also for skipped directories and files
	 *		rejected by the filter), false on failure (not enough memory).
	 */
	inline bool operator ()(const TFileInfo &r_t_file)
	{
		if(r_t_file.b_directory)
			return true;
		// skip directories

		if(m_p_s_ext_list) {
			const char *p_s_ext = r_t_file.p_s_Extension();
			// NOTE(review): the extension is assumed to come in the same form
			// as the filter entries (ie. without the leading dot) - confirm against TFileInfo

			bool b_matched = false;
			if(p_s_ext) {
				for(const char *p_s_filter = m_p_s_ext_list; *p_s_filter;
				   p_s_filter += strlen(p_s_filter) + 1) { // + 1 skips the null separator, otherwise only the first entry is ever tested
#if defined(_WIN32) || defined(_WIN64)
					if(!_stricmp(p_s_filter, p_s_ext)) { // windows is case-insensitive
#else // _WIN32 || _WIN64
					if(!strcmp(p_s_filter, p_s_ext)) { // unix is case sensitive
#endif // _WIN32 || _WIN64
						b_matched = true;
						break;
					}
				}
			}
			// try to find the extension in the list

			if(!b_matched)
				return true;
			// the file is not wanted; that is not an error
		}
		// apply the filter (if we have a list)

		try {
			m_r_dest_list.push_back(r_t_file.s_filename);
		} catch(std::bad_alloc&) {
			return false;
		}
		// add file to the list

		return true;
	}
};

/**
 *	@brief hands out work-units from a fixed list, one at a time
 *
 *	When built with MULTITHREADED, handing out the work-units is guarded by a mutex,
 *	so several worker threads may pull from the same instance.
 *
 *	@param[in] TWorkUnit is work unit data type, this might be for instance work unit filename
 */
template <class TWorkUnit>
class CTask {
protected:
	const std::vector<TWorkUnit> &m_r_task_list; /**< @brief all the work-units (referenced only) */
	size_t m_n_cur_task; /**< @brief index of the next work-unit to be handed out */
#ifdef MULTITHREADED
	CMutex m_mutex; /**< @brief guards m_n_cur_task inside p_Get_WorkUnit() */
#endif //MULTITHREADED

public:
	/**
	 *	@brief default constructor
	 *	@param[in] r_task_list is list of tasks to be distributed to worker threads
	 *	@note r_task_list is only referenced, so it needs to stay allocated and
	 *		unmodified for as long as this object hands out pointers into it.
	 */
	CTask(const std::vector<TWorkUnit> &r_task_list)
		:m_r_task_list(r_task_list), m_n_cur_task(0)
	{}

	/**
	 *	@brief determines whether every work-unit has already been handed out
	 *	@return Returns true if there is nothing left to distribute (the units
	 *		handed out earlier may still be in processing), otherwise returns false.
	 */
	bool b_Finished() const
	{
		if(m_n_cur_task != m_r_task_list.size())
			return false;
		// there is something left

		return true;
	}

	/**
	 *	@brief gets the next work-unit
	 *	@param[out] r_b_result is set to true if the call succeeded (even if
	 *		there was no work-unit left to return), or to false on failure (mutex lock failed)
	 *	@return Returns pointer to the next work-unit, or 0 if there are no more
	 *		work-units left (or on failure).
	 */
	const TWorkUnit *p_Get_WorkUnit(bool &r_b_result)
	{
#ifdef MULTITHREADED
		if(!m_mutex.Lock()) {
			r_b_result = false;
			return 0;
		}
		// enter the critical section (or fail)
#endif //MULTITHREADED

		const TWorkUnit *p_unit = 0;
		if(m_n_cur_task != m_r_task_list.size()) {
			p_unit = &m_r_task_list[m_n_cur_task];
			++ m_n_cur_task;
		}
		// take the next unit, if there is any

#ifdef MULTITHREADED
		m_mutex.Unlock();
		// leave the critical section
#endif //MULTITHREADED

		r_b_result = true;
		return p_unit;
		// success (possibly with no work)
	}
};

/**
 *	@brief converts an iso-8859-1 string to unicode (utf-16)
 *
 *	The output is always null-terminated (unless the destination has no room
 *	for even a single character); the input is truncated if it does not fit.
 *
 *	@param[out] p_s_wide_dest is destination buffer where the unicode string is written
 *	@param[in] n_space_bytes is size of p_s_wide_dest, in bytes
 *	@param[in] p_s_us_english_src is the null-terminated string to be copied
 */
void strcpy_s_AtoW(TCHAR *p_s_wide_dest, size_t n_space_bytes, const char *p_s_us_english_src)
{
	size_t n_space_chars = n_space_bytes / sizeof(TCHAR);
	if(!n_space_chars)
		return;
	// no room, not even for the terminating zero

	size_t n_chars_to_copy = n_space_chars - 1; // make sure there's space for term zero
	for(; n_chars_to_copy && *p_s_us_english_src; -- n_chars_to_copy, ++ p_s_wide_dest, ++ p_s_us_english_src) {
		const unsigned char c = (unsigned char)*p_s_us_english_src;
		// read as unsigned; plain char may be signed, and characters above 0x7f would
		// then sign-extend to 0xffxx instead of mapping to the same unicode code point

#ifdef US_ASCII
		if(c >= 0x80)
			fprintf(stderr, "error: invalid us-english character(s)\n"); // ...
		*p_s_wide_dest = (c == 0x27)? 0x2019 : (c == 0x60)? 0x2018 : (TCHAR)c; // us-ascii
#else //US_ASCII
		*p_s_wide_dest = (TCHAR)c; // iso-8859-1 code points equal the first 256 unicode code points
#endif //US_ASCII
	}
	*p_s_wide_dest = 0;
	// always terminate
}

/**
 *	@brief function object for directory traversal, calling CLucene on found documents
 */
class CDocumentIndexer {
protected:
#if defined(STORAGE_CLUCENE)
	lucene::index::IndexWriter &m_r_writer; /**< @brief reference to CLucene index writer, specified in constructor */
#elif defined(STORAGE_RAW)
	FILE *m_p_binary_index;
#endif //STORAGE_RAW
#ifdef MULTITHREADED
	CMutex m_mutex;
#endif //MULTITHREADED

	size_t m_n_doc;
	bool m_b_verbose;

	CTimer m_timer;

public:
#if defined(STORAGE_CLUCENE)
	/**
	 *	@brief default constructor (CLucene storage)
	 *
	 *	@param[in] r_writer is CLucene index writer, which will process found documents
	 *		(only referenced, it must outlive this object)
	 *	@param[in] b_verbose is verbose flag (if set, progress is printed to stdout)
	 */
	CDocumentIndexer(lucene::index::IndexWriter &r_writer, bool b_verbose = false)
		:m_r_writer(r_writer), m_n_doc(0), m_b_verbose(b_verbose)
	{}
#elif defined(STORAGE_RAW)
	/**
	 *	@brief default constructor (raw storage)
	 *
	 *	@param[in] p_binary_index is output file where raw document data is stored
	 *		(the file is not closed by this object)
	 *	@param[in] b_verbose is verbose flag (if set, progress is printed to stdout)
	 */
	CDocumentIndexer(FILE *p_binary_index, bool b_verbose = false)
		:m_p_binary_index(p_binary_index), m_n_doc(0), m_b_verbose(b_verbose)
	{}
#else //STORAGE_RAW
	/**
	 *	@brief default constructor (no storage)
	 *
	 *	@param[in] b_verbose is verbose flag (if set, progress is printed to stdout)
	 */
	CDocumentIndexer(bool b_verbose = false)
		:m_n_doc(0), m_b_verbose(b_verbose)
	{}
#endif //STORAGE_RAW

#ifdef STORAGE_CLUCENE
	/**
	 *	@brief directory traversal callback function
	 *
	 *	Indexes a plain file as a single document with the "path", "modified"
	 *	and "contents" fields. Directories are ignored.
	 *
	 *	@param[in] r_t_file is file information, in case it is document file, it is indexed here
	 *
	 *	@return Returns true on success, false on failure (the file could not be
	 *		opened, or its date could not be formatted).
	 *	@note Exceptions thrown by the index writer are passed on to the caller
	 *		(the document instance is released first).
	 */
	bool operator ()(const TFileInfo &r_t_file)
	{
		if(r_t_file.b_directory)
			return true;
		// ignore directories

		lucene::document::Document *p_docu = _CLNEW lucene::document::Document();
		// create a new document instance

		if(m_b_verbose)
			printf("\t%s   \r", r_t_file.p_s_Path());

		{
			TCHAR p_s_filename_w[CL_MAX_DIR];
			STRCPY_AtoT(p_s_filename_w, r_t_file.p_s_Path(), CL_MAX_DIR);
			p_docu->add(*_CLNEW lucene::document::Field(_T("path"), p_s_filename_w,
				lucene::document::Field::STORE_YES | lucene::document::Field::INDEX_UNTOKENIZED));
		}
		// specify document path

		{
			std::string s_date;
			if(!stl_ut::Format(s_date, "%04d-%02d-%02dT%02d:%02d:%02d",
			   r_t_file.p_time[TFileInfo::time_LastWrite].n_year,
			   r_t_file.p_time[TFileInfo::time_LastWrite].n_month,
			   r_t_file.p_time[TFileInfo::time_LastWrite].n_day,
			   r_t_file.p_time[TFileInfo::time_LastWrite].n_hour,
			   r_t_file.p_time[TFileInfo::time_LastWrite].n_minute,
			   r_t_file.p_time[TFileInfo::time_LastWrite].n_second)) {
				_CLDELETE(p_docu);
				return false;
			}
			// format the date (can fail)

			TCHAR p_s_date_w[32];
			strcpy_s_AtoW(p_s_date_w, sizeof(p_s_date_w), s_date.c_str());
			// bounded copy; an unexpectedly long date string is truncated rather than overflowing the buffer

			p_docu->add(*_CLNEW lucene::document::Field(_T("modified"), p_s_date_w,
				lucene::document::Field::STORE_YES | lucene::document::Field::INDEX_UNTOKENIZED));
		}
		// specify date modified

		{
			FILE *p_fr;
#if defined(_MSC_VER) && !defined(__MWERKS__) && _MSC_VER >= 1400
			if(fopen_s(&p_fr, r_t_file.p_s_Path(), "r")) {
#else //_MSC_VER && !__MWERKS__ && _MSC_VER >= 1400
			if(!(p_fr = fopen(r_t_file.p_s_Path(), "r"))) {
#endif //_MSC_VER && !__MWERKS__ && _MSC_VER >= 1400
				_CLDELETE(p_docu); // don't leak the document if the file can't be opened
				return false;
			}

			lucene::util::StringBuffer str;
			fseek(p_fr, 0, SEEK_END);
			long n_file_size = ftell(p_fr);
			if(n_file_size > 0)
				str.reserve(n_file_size); // ftell() returns -1 on error, that must not be used as a size
			fseek(p_fr, 0, SEEK_SET);
			// preallocate the buffer (just an optimization)

			char p_s_field[1024];
			TCHAR p_s_field_w[1024];
			for(;;) {
				size_t r = fread(p_s_field, 1, 1023, p_fr);
				if(!r)
					break;
				p_s_field[r] = 0;
				STRCPY_AtoT(p_s_field_w, p_s_field, r);
				p_s_field_w[r] = 0;
				str.append(p_s_field_w);
			}
			fclose(p_fr);
			// read the file in chunks, convert to wide characters

			try {
				lucene::document::Field *pf = _CLNEW lucene::document::Field(_T("contents"), str.getBuffer(),
					lucene::document::Field::STORE_NO | lucene::document::Field::INDEX_TOKENIZED |
					lucene::document::Field::TERMVECTOR_YES | lucene::document::Field::TERMVECTOR_WITH_POSITIONS);
				p_docu->add(*pf);
			} catch(CLuceneError &err) {
				printf("lucene error: %s\n", err.what());
			}
		}
		// specify document contents

		try {
			m_r_writer.addDocument(p_docu);
		} catch(CLuceneError&) {
			_CLDELETE(p_docu); // release the document, let the caller handle the error as before
			throw;
		}
		_CLDELETE(p_docu);
		// add the document to the index, delete document instance

		return true;
	}
#endif //STORAGE_CLUCENE

	/**
	 *	@brief simple function, used to determine whether will string contents be recognized by CLucene tokenizer as words
	 *	@param[in] p_s_text is null-terminated input text (null is treated as empty text)
	 *	@return Returns true if the text contains at least five consecutive
	 *		alphabetic characters, otherwise returns false.
	 *	@note This prevents a bug in CLucene IndexReader, which is triggered by an empty tokenized fields.
	 */
	static bool b_ContainsWords(const char *p_s_text)
	{
		if(!p_s_text)
			return false;
		// no text, no words

		size_t n_run_length = 0;
		for(const unsigned char *p_char = (const unsigned char*)p_s_text; *p_char; ++ p_char) {
			// the characters are read as unsigned char: passing a negative value
			// (ie. a character above 0x7f stored in a signed char) to isalpha() is undefined

			if(isalpha(*p_char)) {
				if(++ n_run_length > 4)
					return true;
				// found a long enough word
			} else
				n_run_length = 0; // whitespace, digits and punctuation all break the run
		}
		// try to find long enough word

		return false;
	}

	/**
	 *	@brief adds a document to the storage (CLucene index or raw file, depending on the build flags)
	 *
	 *	With STORAGE_CLUCENE or STORAGE_NONE, documents without any words (see b_ContainsWords())
	 *	are not stored; a warning with their whitespace-condensed text is written to stderr
	 *	instead and the function still succeeds.
	 *
	 *	@param[in] p_s_filename is document filename (stored as "path")
	 *	@param[in] p_s_file_date is document file time in the "yyyy-MM-ddThh:mm:ss.ss" format (stored as "modified")
	 *	@param[in] p_s_id is gigaword document id (stored as "gw-id")
	 *	@param[in] p_s_type is gigaword document type (stored as "gw-type")
	 *	@param[in] p_s_headline gigaword is document headline (stored as "gw-headline")
	 *	@param[in] p_s_dateline is gigaword document dateline (stored as "gw-dateline")
	 *	@param[in] p_s_text is document contents (stored as "contents")
	 *
	 *	@return Returns true on success, false on failure.
	 *	@note Field names are chosen to match field names used by SemanticVectors, with exception of the ones with "gw-" prefix.
	 *	@note This method is not protected by a mutex. If calling from multiple threads at once, it is necessary to call Lock() and Unlock() to prevent race conditions.
	 */
	bool AddDocument(const char *p_s_filename, const char *p_s_file_date, const char *p_s_id, const char *p_s_type,
		const char *p_s_headline, const char *p_s_dateline, const char *p_s_text)
	{
#if defined(STORAGE_CLUCENE) || defined(STORAGE_NONE)
		if(!b_ContainsWords(p_s_text)) {
			std::string s_text;
			bool b_have_text = true;
			for(const char *p_s_ptr = p_s_text; b_have_text && *p_s_ptr; ++ p_s_ptr) {
				if(isspace((unsigned char)*p_s_ptr)) { // cast: negative char values are undefined for isspace()
					if(!s_text.empty() && !isspace((unsigned char)*(s_text.end() - 1))) // don't add whitespace to the beginning
						b_have_text = stl_ut::Resize_Add_1More(s_text, ' ');
				} else
					b_have_text = stl_ut::Resize_Add_1More(s_text, *p_s_ptr);
			}
			// condense whitespace (stops on the first allocation failure)

			if(b_have_text && !s_text.empty() && isspace((unsigned char)*(s_text.end() - 1)))
				s_text.erase(s_text.end() - 1);
			// trim whitespace from the end

			fprintf(stderr, "warning: document %lu (\'%s\'): doesn't contain terms (length: %lu, text: \'%s\')\n",
				(unsigned long)m_n_doc, p_s_id, (unsigned long)strlen(p_s_text),
				(b_have_text)? s_text.c_str() : "<not enough memory>");
			// size_t values are cast and printed with %lu (passing size_t to %d is undefined)

			return b_have_text;
			// skipping the document is not an error, running out of memory is
		}
		// no empty documents, damnit! (triggers CLucene IndexReader bug)
#endif //STORAGE_CLUCENE || STORAGE_NONE

#if defined(STORAGE_CLUCENE)
		/*if(m_n_doc < 8000) {
			++ m_n_doc;
			return true;
		}
		if(m_n_doc == 8794) {
			printf("document %d: \'%s\'\n", m_n_doc, p_s_id);
			++ m_n_doc;
			return true;
		}*/
		// debug (tracking documents with problems)

		lucene::document::Document *p_docu = _CLNEW lucene::document::Document();
		// create a new document instance

		{
			TCHAR p_s_filename_w[CL_MAX_DIR];
			STRCPY_AtoT(p_s_filename_w, p_s_filename, CL_MAX_DIR);
			p_docu->add(*_CLNEW lucene::document::Field(_T("path"), p_s_filename_w,
				lucene::document::Field::STORE_YES | lucene::document::Field::INDEX_UNTOKENIZED));
		}
		// specify document path

		{
			TCHAR p_s_date_w[32];
			strcpy_s_AtoW(p_s_date_w, sizeof(p_s_date_w), p_s_file_date);
			// bounded copy; a date longer than the buffer is truncated rather than overflowing it

			p_docu->add(*_CLNEW lucene::document::Field(_T("modified"), p_s_date_w,
				lucene::document::Field::STORE_YES | lucene::document::Field::INDEX_UNTOKENIZED));
		}
		// specify date modified

		const TCHAR *p_field_name_list[] = {_T("gw-id"), _T("gw-type"), _T("gw-headline"), _T("gw-dateline"), _T("contents")};
		const char *p_field_value_list[] = {p_s_id, p_s_type, p_s_headline, p_s_dateline, p_s_text};
		const int n_nonparsed_text_field = lucene::document::Field::STORE_YES | lucene::document::Field::INDEX_UNTOKENIZED;
		const int n_parsed_contents_field = lucene::document::Field::STORE_NO | lucene::document::Field::INDEX_TOKENIZED |
			lucene::document::Field::TERMVECTOR_YES | lucene::document::Field::TERMVECTOR_WITH_POSITIONS;
		const int p_field_flag_list[] = {n_nonparsed_text_field, n_nonparsed_text_field,
			n_nonparsed_text_field, n_nonparsed_text_field, n_parsed_contents_field};
		const size_t n_field_num = sizeof(p_field_name_list) / sizeof(p_field_name_list[0]);
		_ASSERTE(n_field_num == sizeof(p_field_value_list) / sizeof(p_field_value_list[0])); // arrays must have the same length
		_ASSERTE(n_field_num == sizeof(p_field_flag_list) / sizeof(p_field_flag_list[0])); // arrays must have the same length
		for(size_t i = 0; i < n_field_num; ++ i) {
			const char *p_s_field_text = p_field_value_list[i];
			// get field text

			std::basic_string<TCHAR> w_str;
			size_t n_field_length = strlen(p_s_field_text);
			_ASSERTE(n_field_length < SIZE_MAX);
			if(!stl_ut::Resize_To_N(w_str, n_field_length + 1)) {
				_CLDELETE(p_docu); // don't leak the document when running out of memory
				return false;
			}
			// create (possibly long) string buffer

			strcpy_s_AtoW(&w_str[0], (n_field_length + 1) * sizeof(TCHAR), p_s_field_text);
			w_str.resize(n_field_length); // drop the last elem (terminating null)
			// convert from ansi to unicode, put data to string buffer

			try {
				lucene::document::Field *pf = _CLNEW lucene::document::Field(p_field_name_list[i],
					w_str.c_str(), p_field_flag_list[i]);
				p_docu->add(*pf);
			} catch(CLuceneError &err) {
				printf("\nlucene error: on Field::Field(): %s\n", err.what());
				_CLDELETE(p_docu);
				return false;
			}
			// create the field
		}
		// specify document contents

		try {
			m_r_writer.addDocument(p_docu); // @t_odo - this crashes, god knows why ... do something! (it doesn't crash the first time, it just crunches some documents and crashes when doing something. it doesn't depend on wheter running multithreaded, or in a single (the original one) thread)
			_CLDELETE(p_docu);
			// add the document to the database, delete document instance
		} catch(CLuceneError &err) {
			printf("\nlucene error: on addDocument(): %s\n", err.what());
			_CLDELETE(p_docu);
			return false;
		}
#elif defined(STORAGE_RAW)
		{
			TCHAR p_s_filename_w[MAX_PATH];
			strcpy_s_AtoW(p_s_filename_w, sizeof(p_s_filename_w), p_s_filename);

			uint32_t n = uint32_t(wcslen(p_s_filename_w));
			fwrite(&n, sizeof(uint32_t), 1, m_p_binary_index);
			fwrite(p_s_filename_w, sizeof(TCHAR), n, m_p_binary_index);
		}
		// specify document path

		{
			TCHAR p_s_date_w[32];
			strcpy_s_AtoW(p_s_date_w, sizeof(p_s_date_w), p_s_file_date);

			uint32_t n = uint32_t(wcslen(p_s_date_w));
			fwrite(&n, sizeof(uint32_t), 1, m_p_binary_index);
			fwrite(p_s_date_w, sizeof(TCHAR), n, m_p_binary_index);
		}
		// specify date modified

		const TCHAR *p_field_name_list[] = {L"gw-id", L"gw-type", L"gw-headline", L"gw-dateline", L"contents"};
		const char *p_field_value_list[] = {p_s_id, p_s_type, p_s_headline, p_s_dateline, p_s_text};
		const size_t n_field_num = sizeof(p_field_name_list) / sizeof(p_field_name_list[0]);
		_ASSERTE(n_field_num == sizeof(p_field_value_list) / sizeof(p_field_value_list[0])); // arrays must have the same length
		for(size_t i = 0; i < n_field_num; ++ i) {
			const char *p_s_field_text = p_field_value_list[i];
			// get field text

			std::basic_string<TCHAR> w_str;
			size_t n_field_length = strlen(p_s_field_text);
			_ASSERTE(n_field_length < SIZE_MAX);
			if(!stl_ut::Resize_To_N(w_str, n_field_length + 1))
				return false;
			// create (possibly long) string buffer

			strcpy_s_AtoW(&w_str[0], (n_field_length + 1) * sizeof(TCHAR), p_s_field_text);
			w_str.resize(n_field_length); // drop the last elem (terminating null)
			// convert from ansi to unicode, put data to string buffer

			const TCHAR *p_s_str = w_str.c_str();
			_ASSERTE(n_field_length <= UINT32_MAX);
			uint32_t n = uint32_t(n_field_length);

			fwrite(&n, sizeof(uint32_t), 1, m_p_binary_index);
			fwrite(p_s_str, sizeof(TCHAR), n, m_p_binary_index);
		}
		// specify additional document fields (each is written as 32-bit length, followed by the characters)

		if(ferror(m_p_binary_index))
			return false;
		// the individual writes are not checked, the stream error flag is
#endif //STORAGE_RAW

		if(m_b_verbose) {
			if(m_timer.f_Time() > .2) { // refresh approx 5 times a second
				m_timer.ResetTimer();
				printf("\tadding \'%s\' (doc %lu)\r", p_s_id, (unsigned long)m_n_doc);
			}
		}
		// verbose

		++ m_n_doc;
		// count the stored documents only (the skipped ones returned earlier)

		return true;
	}

	/**
	 *	@brief acquires the mutex guarding this object (to be called before AddDocument())
	 *	@return Returns true on success, false on failure (always succeeds in builds without MULTITHREADED).
	 *	@note Locking is left to the caller rather than being done inside AddDocument(),
	 *		so that a whole batch of documents can be added under a single lock;
	 *		that suits the access pattern of the gigaword parser better.
	 */
	bool Lock()
	{
		bool b_locked = true; // there is nothing to lock in single-threaded builds
#ifdef MULTITHREADED
		b_locked = m_mutex.Lock();
#endif //MULTITHREADED
		return b_locked;
	}

	/**
	 *	@brief releases the mutex guarding this object (to be called after AddDocument())
	 *	@return Returns true on success, false on failure (always succeeds in builds without MULTITHREADED).
	 *	@note Unlocking is left to the caller rather than being done inside AddDocument(),
	 *		so that a whole batch of documents can be added under a single lock;
	 *		that suits the access pattern of the gigaword parser better.
	 */
	bool Unlock()
	{
		bool b_unlocked = true; // there is nothing to unlock in single-threaded builds
#ifdef MULTITHREADED
		b_unlocked = m_mutex.Unlock();
#endif //MULTITHREADED
		return b_unlocked;
	}
};

/**
 *	@brief gigaword parser and unpacker class
 */
#ifdef DRIVE_ACCESS_CONTROL
class CGWUnpacker : public CRunable {
#else //DRIVE_ACCESS_CONTROL
class CGWUnpacker {
#endif //DRIVE_ACCESS_CONTROL
protected:
	CDocumentIndexer *m_p_indexer;
	CTask<std::string> *m_p_task;
#ifdef MULTITHREADED
	CMutex *m_p_drive_access_mutex;
	CThread m_thread;
#endif //MULTITHREADED
	uint64_t m_n_read_size;
	uint64_t m_n_unpack_size;
	bool m_b_verbose;
	bool m_b_result;

public:
	/**
	 *	@brief default constructor; creates an idle unpacker with no indexer and no task assigned
	 *	@note The indexer and the task are assigned later, when processing is started.
	 */
	CGWUnpacker()
		:m_p_indexer(0), // was left uninitialized
		m_p_task(0),
#ifdef MULTITHREADED
		m_p_drive_access_mutex(0),
#endif //MULTITHREADED
		m_n_read_size(0), m_n_unpack_size(0),
		m_b_verbose(false), m_b_result(false) // listed in the order of declaration, which is the order of initialization
	{
#ifdef MULTITHREADED
		m_thread.AttachRunable(*this);
		// the thread object will call Run() of this object
#endif //MULTITHREADED
	}

#ifdef MULTITHREADED
	/**
	 *	@brief resets the counters, stores the processing context and starts the worker thread
	 *
	 *	@param[in] r_indexer is reference to initialized document indexer (the storage engine)
	 *	@param[in] r_task is list of work-items to be processed
	 *	@param[in] r_drive_access_mutex is mutex for exclusive access to harddrive when reading files (improves performance)
	 *	@param[in] b_verbose is verbose flag (if set, names of the processed files are printed to stdout)
	 *
	 *	@return Returns true on success (the thread was started), false on failure.
	 *	@note The arguments are only referenced, they must stay valid until the thread finishes.
	 */
	bool Start(CDocumentIndexer &r_indexer, CTask<std::string> &r_task, CMutex &r_drive_access_mutex, bool b_verbose = false)
	{
		m_b_verbose = b_verbose;
		m_b_result = true;
		m_n_unpack_size = 0;
		m_n_read_size = 0;
		// reset the state from the previous run

		m_p_drive_access_mutex = &r_drive_access_mutex;
		m_p_task = &r_task;
		m_p_indexer = &r_indexer;
		// remember what to work on

		const bool b_started = m_thread.Start();
		return b_started;
	}
#endif //MULTITHREADED

#ifdef MULTITHREADED
	/**
	 *	@brief processes all the work-items in the calling thread (no worker thread is started)
	 *
	 *	@param[in] r_indexer is reference to initialized document indexer (the storage engine)
	 *	@param[in] r_task is list of work-items to be processed
	 *	@param[in] r_drive_access_mutex is mutex for exclusive access to harddrive when reading files (improves performance)
	 *	@param[in] b_verbose is verbose flag (if set, names of the processed files are printed to stdout)
	 *
	 *	@return Returns true on success (all tasks finished with no errors), false on failure.
	 */
	bool Run(CDocumentIndexer &r_indexer, CTask<std::string> &r_task, CMutex &r_drive_access_mutex, bool b_verbose = false)
#else //MULTITHREADED
	/**
	 *	@brief processes all the work-items in the calling thread
	 *
	 *	@param[in] r_indexer is reference to initialized document indexer (the storage engine)
	 *	@param[in] r_task is list of work-items to be processed
	 *	@param[in] b_verbose is verbose flag (if set, names of the processed files are printed to stdout)
	 *
	 *	@return Returns true on success (all tasks finished with no errors), false on failure.
	 */
	bool Run(CDocumentIndexer &r_indexer, CTask<std::string> &r_task, bool b_verbose = false)
#endif //MULTITHREADED
	{
		m_b_verbose = b_verbose;
		m_b_result = true;
		m_n_unpack_size = 0;
		m_n_read_size = 0;
		// reset the state from the previous run

#ifdef MULTITHREADED
		m_p_drive_access_mutex = &r_drive_access_mutex;
#endif //MULTITHREADED
		m_p_task = &r_task;
		m_p_indexer = &r_indexer;
		// remember what to work on

		Run();
		// do the work right here; it clears m_b_result on error

		return m_b_result;
	}

#ifdef MULTITHREADED
	/**
	 *	@brief waits for the worker thread to finish (blocks program execution)
	 *	@return Returns true on success (the thread was stopped successfully and
	 *		all tasks finished with no errors), false on failure.
	 *	@note NOTE(review): this relies on CThread::Stop(false) waiting for the thread
	 *		to end on its own rather than terminating it - confirm against Thread.h.
	 */
	bool WaitForFinish()
	{
		return m_thread.Stop(false) && m_b_result; // the result flag is only read after the thread has stopped
	}
#endif //MULTITHREADED

	/**
	 *	@brief gets size of processed input data
	 *	@return Returns size of processed input data (sum of sizes of the archive files
	 *		unpacked so far) in bytes, or UINT64_MAX if there's too much data.
	 *	@note The counter is reset every time the processing is started; it is accumulated
	 *		using saturated addition, so UINT64_MAX means the value is inaccurate.
	 */
	uint64_t n_Processed_Size() const
	{
		return m_n_read_size;
	}

	/**
	 *	@brief gets size of processed output data
	 *	@return Returns size of processed output data in bytes (ie. size of processed input data,
	 *		after unpacking), or UINT64_MAX if there's too much data.
	 *	@note The counter is reset every time the processing is started; it is accumulated
	 *		using saturated addition, so UINT64_MAX means the value is inaccurate.
	 */
	uint64_t n_Unpacked_Size() const
	{
		return m_n_unpack_size;
	}

protected:
	/**
	 *	@brief main processing function
	 *
	 *	Pulls file names from the task until there are none left; each file is unpacked,
	 *	parsed as SGML and every DOC element with both "id" and "type" parameters
	 *	is passed to the indexer (all documents of a file under a single indexer lock).
	 *
	 *	@note Errors are reported to stderr and by clearing m_b_result. Files that can
	 *		not be opened, unpacked or parsed are skipped; the other errors
	 *		(scheduler, file information, memory, indexer and locking failures) end the processing.
	 */
	virtual void Run()
	{
		_ASSERTE(m_p_task);
		while(!m_p_task->b_Finished()) {
			bool b_result;
			const std::string *p_filename = m_p_task->p_Get_WorkUnit(b_result);
			if(!b_result) {
				fprintf(stderr, "error: failed to get work-unit\n"); // task scheduler failed
				m_b_result = false;
				return;
			}
			if(!p_filename) {
				_ASSERTE(m_p_task->b_Finished()); // we got no filename, task is finished
				break;
			}
			const char *p_s_filename = p_filename->c_str();
			// get file to work on

			std::string s_date;
			{
				TFileInfo t_file(p_s_filename);
				if(!t_file.b_Valid() || !stl_ut::Format(s_date, "%04d-%02d-%02dT%02d:%02d:%02d",
				   t_file.p_time[TFileInfo::time_LastWrite].n_year,
				   t_file.p_time[TFileInfo::time_LastWrite].n_month,
				   t_file.p_time[TFileInfo::time_LastWrite].n_day,
				   t_file.p_time[TFileInfo::time_LastWrite].n_hour,
				   t_file.p_time[TFileInfo::time_LastWrite].n_minute,
				   t_file.p_time[TFileInfo::time_LastWrite].n_second)) {
					fprintf(stderr, "error: failed to get file date of \'%s\'\n", p_s_filename); // this used to fail silently
					m_b_result = false;
					return;
				}
			}
			// get file date

			if(m_b_verbose)
				printf("processing \'%s\' ...\n", p_s_filename);
			// verbose

			CSGMLNode *p_document = 0;
			{
				TBuffer t_file_data;
#ifdef MULTITHREADED
				CGZipFile unpacker(p_s_filename, m_p_drive_access_mutex);
#else //MULTITHREADED
				CGZipFile unpacker(p_s_filename);
#endif //MULTITHREADED
				if(!unpacker.b_Status()) {
					fprintf(stderr, "error: failed to open \'%s\'\n", p_s_filename);
					m_b_result = false;
					continue;
				}
				// open archive

#ifdef MULTITHREADED
				if(!unpacker.UnpackFile(0, t_file_data, m_p_drive_access_mutex)) {
#else //MULTITHREADED
				if(!unpacker.UnpackFile(0, t_file_data)) {
#endif //MULTITHREADED
					fprintf(stderr, "error: failed to extract \'%s\'\n", p_s_filename);
					m_b_result = false;
					continue;
				}
				// unpack archive

				uint64_t n_file_size = TFileInfo(p_s_filename).n_Size64();
				if(m_n_read_size <= UINT64_MAX - n_file_size)
					m_n_read_size += n_file_size;
				else
					m_n_read_size = UINT64_MAX;
				if(m_n_unpack_size <= UINT64_MAX - t_file_data.n_Size())
					m_n_unpack_size += t_file_data.n_Size();
				else
					m_n_unpack_size = UINT64_MAX;
				// saturated add (saturation means result is inaccurate)

#ifdef PARSE_SGML
				if(!(p_document = CSGMLNode::p_Parse((const char*)t_file_data.p_Data(), t_file_data.n_Size() / sizeof(char)))) {
					fprintf(stderr, "error: failed to parse \'%s\'\n", p_s_filename);
					m_b_result = false;
					continue;
				}
				// parse SGML

				/*size_t n_approx_tag_num = 0;
				{
					const char *p_s_str = (const char*)t_file_data.p_Data();
					const char *p_s_end = (const char*)t_file_data.p_Data() + t_file_data.n_Size() / sizeof(char);
					while(p_s_str != p_s_end) {
						if(*p_s_str == '<' && (p_s_str + 1 == p_s_end || p_s_str[1] != '/')) // count all the opening tags
							++ n_approx_tag_num;
						++ p_s_str;
					}
				}
				// count elements

				size_t n_nodes = CSGMLNode::n_Population();
				size_t n_mem = p_document->n_MemFootprint();
				size_t n_imem = p_document->n_IdealMemFootprint();
				size_t n_dta = p_document->n_DataSize();
				size_t n_over = n_mem - n_dta;
				size_t n_iover = n_imem - n_dta;
				printf("document size: " PRIsizeB "B, parsed to %d SGML nodes (they take " PRIsizeB "B; there is approx %d elements), data size " PRIsizeB
					"B\nmemory footprint " PRIsizeB "B (overhead " PRIsizeB "B), ideal memory footprint " PRIsizeB "B (overhead " PRIsizeB "B)\n",
					PRIsizeBparams(t_file_data.n_Size()), n_nodes, PRIsizeBparams(n_nodes * sizeof(CSGMLNode)), n_approx_tag_num, PRIsizeBparams(n_dta), PRIsizeBparams(n_mem), PRIsizeBparams(n_over), PRIsizeBparams(n_imem), PRIsizeBparams(n_iover));*/
				// debug
#endif // PARSE_SGML
			}
			// limit lifetime of data buffer

#ifdef PARSE_SGML
			{
				if(!m_p_indexer->Lock()) {
					fprintf(stderr, "error: clucene indexer lock failed\n");
					m_b_result = false;
					delete p_document; // don't leak the parsed tree
					return;
				}

				const std::vector<CSGMLNode*> &r_doc_list = p_document->r_Subnode_List();
				for(size_t i = 0, n = r_doc_list.size(); i < n; ++ i) {
					const CSGMLNode *p_doc = r_doc_list[i];
					if(p_doc->n_Type() == CSGMLNode::node_Element && !_strcmpi(p_doc->s_Name().c_str(), "DOC")) {
						const char *p_s_id = p_doc->p_s_FindParam("id");
						const char *p_s_type = p_doc->p_s_FindParam("type");
						if(!p_s_id || !p_s_type)
							continue; // !!
						// get document id and type

						const char *p_s_headline = p_doc->p_FindChild("HEADLINE")->s_InnerText().c_str();
						const char *p_s_dateline = p_doc->p_FindChild("DATELINE")->s_InnerText().c_str();
						// get document headline and dateline (might be empty)
						// NOTE(review): these pointers are only valid if s_InnerText() returns
						// a reference to a string owned by the node (not a temporary) - confirm against SGML_Parser.h

						const CSGMLNode *p_text = p_doc->p_FindChild("TEXT");
						if(p_text->n_Type() != CSGMLNode::node_Element) {
							_ASSERTE(p_text->n_Type() == CSGMLNode::node_Dummy);
							fprintf(stderr, "warning: strange text element in document \'%s\'\n", p_s_id);
							continue; // !!
						}
						// find text node

						std::string s_text;
						try {
							s_text += p_text->s_InnerText();
							const std::vector<CSGMLNode*> &r_text_elem_list = p_text->r_Subnode_List();
							for(size_t j = 0, m = r_text_elem_list.size(); j < m; ++ j) {
								const CSGMLNode *p_text_elem = r_text_elem_list[j];
								// get text element

								if(p_text_elem->n_Type() == CSGMLNode::node_Element && !_strcmpi(p_text_elem->s_Name().c_str(), "P")) {
									s_text += p_text_elem->s_InnerText();
									s_text += "\n";
									// we got a <P> paragraph
								} else if(p_text_elem->n_Type() == CSGMLNode::node_Text) {
									s_text += p_text_elem->s_InnerText(); // contains the text (usually m will be 1)
									// we got raw text
								} else {
									fprintf(stderr, "warning: unknown paragraph element in document \'%s\'\n", p_s_id);
									break;
								}
							}
						} catch(std::bad_alloc&) {
							fprintf(stderr, "error: not enough memory while processing \'%s\'\n", p_s_filename);
							m_b_result = false;
							m_p_indexer->Unlock(); // !!
							delete p_document; // don't leak the parsed tree
							return;
						}
						// concatenate text, loses formatting

#ifdef PROCESS_SGML
						/*printf("document(id: \'%s\', type: \'%s\', head: \'%s\', date: \'%s\')\n",
							p_s_id, p_s_type, p_s_headline, p_s_dateline);
						printf("contents: {%s}\n\n", s_text.c_str());*/
						// debug

						if(!m_p_indexer->AddDocument(p_s_filename, s_date.c_str(),
						   p_s_id, p_s_type, p_s_headline, p_s_dateline, s_text.c_str())) {
							fprintf(stderr, "error: clucene indexer failed while processing \'%s\'\n", p_s_filename);
							m_b_result = false;
							m_p_indexer->Unlock(); // !!
							delete p_document; // don't leak the parsed tree
							return;
						}
#endif //PROCESS_SGML
					}
					// go trough all the DOC nodes
				}

				if(!m_p_indexer->Unlock()) {
					fprintf(stderr, "error: clucene indexer unlock failed\n");
					m_b_result = false;
					delete p_document; // don't leak the parsed tree
					return;
				}
			}
			// pass the documents to the indexer

			delete p_document;
			// the parsed tree is not needed anymore
#endif // PARSE_SGML
		}
	}
};

/**
 *	@brief main
 *
 *	Parses the commandline, collects all the ".gz" files under the input path,
 *	runs the unpacker(s) over them (feeding the configured storage back-end)
 *	and finally finishes the output (optimizes and closes the CLucene index,
 *	or closes the raw output file).
 *
 *	Recognized arguments:
 *		- "--verbose" / "--no-verbose" turns progress output on / off (off by default)
 *		- "--help" prints usage and exits with 0
 *		- "--in-path <path>" is path to directory, containing gigaword files (.gz)
 *		- "-o <output>" is output index directory (CLucene) or output file (raw storage)
 *
 *	@param[in] n_arg_num is number of commandline arguments
 *	@param[in] p_arg_list is list of commandline arguments; use "--in-path <path to gigaword files (.gz)> -o <output>"
 *
 *	@return Returns 0 on success, -1 on failure. Error messages are written to stderr.
 */
int main(int n_arg_num, const char **p_arg_list)
{
	const char *p_s_gigaword_path = 0;
	// path to gw

#ifdef STORAGE_CLUCENE
	const char *p_s_output_index = 0;
	// clucene index directory
#else //STORAGE_CLUCENE
	const char *p_s_output_file = 0;
	// unicode file, containing all the documents' fields as 32-bit length, followed by utf-16 contents. plain and simple.
	// the fields go in the following order: L"gw-id", L"gw-type", L"gw-headline", L"gw-dateline", L"contents"
#endif //STORAGE_CLUCENE
	// path to output index

	bool b_verbose = false;
	// verbose

	for(int i = 1; i < n_arg_num; ++ i) {
		if(!strcmp(p_arg_list[i], "--verbose"))
			b_verbose = true;
		else if(!strcmp(p_arg_list[i], "--no-verbose"))
			b_verbose = false;
		else if(!strcmp(p_arg_list[i], "--help")) {
			// the option name printed here must match the one parsed below ("--in-path")
#if defined(STORAGE_CLUCENE)
			printf("GigaWord_to_CLucene --in-path <path to directory, containing gigaword files in gz format> -o <output path>\n");
#elif defined(STORAGE_RAW)
			printf("GigaWord_to_CLucene --in-path <path to directory, containing gigaword files in gz format> -o <output file>\n");
#else //STORAGE_RAW
			printf("GigaWord_to_CLucene --in-path <path to directory, containing gigaword files in gz format>\n");
#endif //STORAGE_RAW
			return 0;
		} else if(i + 1 == n_arg_num) {
			// all the arguments below take a value; this is the last argument so the value is missing
			fprintf(stderr, "error: argument \'%s\': unknown argument or needs a value\n", p_arg_list[i]);
			return -1;
		} else if(!strcmp(p_arg_list[i], "--in-path"))
			p_s_gigaword_path = p_arg_list[++ i];
		else if(!strcmp(p_arg_list[i], "-o")) {
#ifdef STORAGE_CLUCENE
			p_s_output_index = p_arg_list[++ i];
#else //STORAGE_CLUCENE
			p_s_output_file = p_arg_list[++ i];
#endif //STORAGE_CLUCENE
		} else {
			fprintf(stderr, "error: argument \'%s\': unknown argument\n", p_arg_list[i]);
			return -1;
		}
	}
	// "parse" commandline

	if(!p_s_gigaword_path) {
		fprintf(stderr, "error: need to specify path to the source data. use --help to see how.\n");
		return -1;
	}
#if defined(STORAGE_CLUCENE)
	if(!p_s_output_index) {
		fprintf(stderr, "error: need to specify output directory. use --help to see how.\n");
		return -1;
	}
#elif defined(STORAGE_RAW)
	if(!p_s_output_file) {
		fprintf(stderr, "error: need to specify output file. use --help to see how.\n");
		return -1;
	}
#else //STORAGE_RAW
	if(p_s_output_file) {
		// NOTE(review): the message says "warning ... ignored" but the program exits
		// with failure; kept as is, confirm which of the two is intended
		fprintf(stderr, "warning: built without STORAGE_CLUCENE or STORAGE_RAW. output file ignored.\n");
		return -1;
	}
#endif //STORAGE_RAW
	// check args

	if(b_verbose) {
		printf("built with the following flags:");
#ifdef STORAGE_CLUCENE
		printf(" STORAGE_CLUCENE");
#endif //STORAGE_CLUCENE
#ifdef STORAGE_RAW
		printf(" STORAGE_RAW");
#endif //STORAGE_RAW
#ifdef STORAGE_NONE
		printf(" STORAGE_NONE");
#endif //STORAGE_NONE
#ifdef MULTITHREADED
		printf(" MULTITHREADED");
#endif //MULTITHREADED
#ifdef DRIVE_ACCESS_CONTROL
		printf(" DRIVE_ACCESS_CONTROL");
#endif //DRIVE_ACCESS_CONTROL
		printf("\n");
	}

	{
#if defined(STORAGE_CLUCENE)
		lucene::analysis::standard::StandardAnalyzer analyzer;
		lucene::index::IndexWriter writer(lucene::store::FSDirectory::getDirectory(p_s_output_index, true),
			&analyzer, true, true);
		writer.setMaxFieldLength(lucene::index::IndexWriter::DEFAULT_MAX_FIELD_LENGTH);
		// prepare CLucene
#elif defined(STORAGE_RAW)
		FILE *p_fw;
		if(fopen_s(&p_fw, p_s_output_file, "wb")) {
			fprintf(stderr, "error: failed to open output file (\'%s\')\n", p_s_output_file);
			return -1; // not "return false", that would be exit code 0 (success)
		}
		// open output file .
#endif //STORAGE_RAW

		std::vector<std::string> file_list;
		if(!CDirTraversal::Traverse2(p_s_gigaword_path, CFileFilter(file_list, "gz\0"))) {
			fprintf(stderr, "error: failed to traverse gigaword directory (\'%s\')\n", p_s_gigaword_path);
#if !defined(STORAGE_CLUCENE) && defined(STORAGE_RAW)
			fclose(p_fw);
			// nothing uses the output file yet, do not leak it
#endif //!STORAGE_CLUCENE && STORAGE_RAW
			return -1;
		}
		// get list of files

		if(b_verbose) {
			printf("got %lu files\n", static_cast<unsigned long>(file_list.size()));
			// size() is size_t, passing it for "%d" is undefined behavior
			printf("reading documents ...\n");
		}

#if defined(STORAGE_CLUCENE)
		CDocumentIndexer indexer(writer, b_verbose);
#elif defined(STORAGE_RAW)
		CDocumentIndexer indexer(p_fw, b_verbose);
#else //STORAGE_RAW
		CDocumentIndexer indexer(b_verbose);
#endif //STORAGE_RAW
		// function object for adding files to CLucene

		uint64_t n_total_size = 0;
		uint64_t n_total_unpacked = 0;
		CTimer timer;
		double f_start_time = timer.f_Time();

		CTask<std::string> task(file_list);
		// task for multithreaded processing

#ifdef MULTITHREADED
		CMutex drive_access_mutex;

		int n_thread_num = CThread::n_CPU_Num();
		CGWUnpacker *p_unpacker;
		if(!(p_unpacker = new(std::nothrow) CGWUnpacker[n_thread_num])) {
			fprintf(stderr, "error: not enough memory\n");
			return -1;
		}
		// NOTE(review): the early returns below leave p_unpacker allocated and may return
		// while other unpacker threads are still running; left as is, the process is exiting
		for(int i = 0; i < n_thread_num; ++ i) {
			if(!p_unpacker[i].Start(indexer, task, drive_access_mutex, b_verbose)) {
				fprintf(stderr, "error: failed to start unpacker thread\n");
				return -1;
			}
		}
		for(int i = 0; i < n_thread_num; ++ i) {
			if(!p_unpacker[i].WaitForFinish()) {
				fprintf(stderr, "error: unpacker thread %d failed\n", i);
				return -1;
			}
			n_total_size += p_unpacker[i].n_Processed_Size();
			n_total_unpacked += p_unpacker[i].n_Unpacked_Size();
		}
		delete[] p_unpacker;
		// run unpacking in parallel
#else //MULTITHREADED
		CGWUnpacker unpacker;
		if(!unpacker.Run(indexer, task, b_verbose)) {
			fprintf(stderr, "error: unpacker thread %d failed\n", 0);
			return -1;
		}
		n_total_size += unpacker.n_Processed_Size();
		n_total_unpacked += unpacker.n_Unpacked_Size();
		// run unpacking in a single thread
#endif //MULTITHREADED

		if(b_verbose) {
			printf("%79s\rdone\n", "");
			// overwrite the progress line with spaces before printing "done"

			double f_total_time = timer.f_Time() - f_start_time;
			printf("read " PRIsizeB "B @ " PRIsizeB "B/sec, unpacked " PRIsizeB "B @ " PRIsizeB "B/sec (it took " PRItime ")\n",
				PRIsizeBparams(n_total_size), PRIsizeBparams(n_total_size / f_total_time),
				PRIsizeBparams(n_total_unpacked), PRIsizeBparams(n_total_unpacked / f_total_time), PRItimeparams(f_total_time));
		}
		// verbose

#if defined(STORAGE_CLUCENE)
		if(b_verbose)
			printf("optimizing index ...\n");

		writer.optimize();

		if(b_verbose)
			printf("closing index ...\n");

		writer.close();
#elif defined(STORAGE_RAW)
		fclose(p_fw);
#endif //STORAGE_RAW

		if(b_verbose)
			printf("finished\n");
	}

	return 0;
}
