/*
								+---------------------------------+
								|                                 |
								|  ***   Simple HTML parser  ***  |
								|                                 |
								|  Copyright   -tHE SWINe- 2008  |
								|                                 |
								|         HTML_Parser.cpp         |
								|                                 |
								+---------------------------------+
*/

#include "../UberLame_src/NewFix.h"
#include "../UberLame_src/CallStack.h"
#include <string>
#include <vector>
#include <algorithm>
#include <string.h>
#include <stdio.h>
#include "../UberLame_src/MinMax.h"
#include "../UberLame_src/StlUtils.h"
//#include "../UberLame_src/XML.h"
#include "SGML_Parser.h"
#if !defined(_WIN32) && !defined(_WIN64)
// _strcmpi() is used below to compare tag and parameter names; on non-Windows
// builds it is only declared here and has to be defined in another translation unit
// (presumably as a case-insensitive strcmp, matching the MSVC function - TODO confirm)
extern int _strcmpi(const char*a, const char *b);
#endif //!WIN32 && !_WIN64

#if defined(_MSC_VER) && !defined(__MWERKS__)
// NOTE(review): presumably the usual workaround for old MSVC versions which leak
// variables declared in a for statement into the enclosing scope - confirm it is still needed
#define for if(0) {} else for
#endif

/*
 *								=== globals ===
 */

/**
 *	@brief locale-independent whitespace test used by the parser
 *
 *	@param[in] c is the character code to test
 *
 *	@return Returns true if c is a space, a tab, a line feed or a carriage return,
 *		false for anything else (including vertical tab and form feed).
 */
bool __isspace(int c)
{
	switch(c) {
	case ' ':
	case '\t':
	case '\n':
	case '\r':
		return true;
	default:
		return false;
	}
}

/*
 *								=== ~globals ===
 */

/*
 *								=== CSGMLNode ===
 */

// shared dummy node; the destructor below deletes it once only a single node is left
// (presumably it is allocated on demand by p_DummyNode(), which is not defined in this file)
CSGMLNode *CSGMLNode::p_dummy = 0;
// number of nodes in existence: incremented by every constructor in this file
// and decremented by the destructor
size_t CSGMLNode::n_ref_count = 0;

/**
 *	@brief constructor; creates an unnamed node with no parameter list
 *
 *	@param[in] _n_type is the node type (node_Root, node_Element and node_Text are used in this file)
 */
CSGMLNode::CSGMLNode(int _n_type)
	:n_type(_n_type), p_param_list(0)
{
	++ n_ref_count; // one more node exists
}

/**
 *	@brief constructor; creates a named node with no parameter list
 *
 *	@param[in] _n_type is the node type
 *	@param[in] p_s_name is the node name, it is copied to s_name
 *
 *	@note A failure to copy the name is not reported; p_Parse() checks whether
 *		s_name is empty after the construction to detect it.
 */
CSGMLNode::CSGMLNode(int _n_type, const char *p_s_name)
	:n_type(_n_type), p_param_list(0)
{
	++ n_ref_count; // one more node exists
	(void)stl_ut::AssignCStr(s_name, p_s_name);
	// the result is deliberately ignored, a constructor has no way of returning it
}

/**
 *	@brief constructor; creates a named node with no parameter list, taking over the name
 *
 *	@param[in] _n_type is the node type
 *	@param[in,out] r_s_name is the node name; it is swapped with the (empty) name
 *		of the new node, so the caller's string is empty when the constructor returns
 */
CSGMLNode::CSGMLNode(int _n_type, std::string &r_s_name) // swaps the string
	:n_type(_n_type), p_param_list(0)
{
	++ n_ref_count; // one more node exists
	s_name.swap(r_s_name); // takes the contents of the string without copying them
}

/**
 *	@brief destructor; deletes the parameter list and all the subnodes
 *
 *	Also deletes the shared dummy node at the moment when only a single
 *	node remains in existence.
 */
CSGMLNode::~CSGMLNode()
{
	delete p_param_list; // deleting a null pointer is a no-op

	-- n_ref_count;
	if(n_ref_count == 1 && p_dummy) { // last node in existence does this (ie. not the root node)
		delete p_dummy;
		p_dummy = 0;
	}
	// the shared dummy is released before the subnodes, same as the counter update

	for(std::vector<CSGMLNode*>::iterator p_child_it = subnode_list.begin(),
	   p_end_it = subnode_list.end(); p_child_it != p_end_it; ++ p_child_it)
		delete *p_child_it;
	// delete the subnodes (recursively, each of them deletes its own subnodes)
}

/**
 *	@brief appends a node to the end of the list of subnodes
 *
 *	@param[in] p_child is the node to be added (the destructor of this node deletes it)
 *
 *	@return Returns true on success, false if the list could not be extended.
 */
bool CSGMLNode::AddChild(CSGMLNode *p_child)
{
	return stl_ut::Resize_Add_1More(subnode_list, p_child);
}

/**
 *	@brief finds the first direct subnode which is an element with the given name
 *
 *	@param[in] p_s_name is the name to look for (compared using _strcmpi())
 *
 *	@return Returns the first matching subnode, or the value of p_DummyNode()
 *		if there is no such subnode.
 */
const CSGMLNode *CSGMLNode::p_FindChild(const char *p_s_name) const
{
	typedef std::vector<CSGMLNode*>::const_iterator CChildIter;

	for(CChildIter p_child_it = subnode_list.begin(), p_end_it = subnode_list.end();
	   p_child_it != p_end_it; ++ p_child_it) {
		const CSGMLNode *p_child = *p_child_it;
		if(p_child->n_type != node_Element)
			continue; // only elements are matched by name
		if(!_strcmpi(p_child->s_name.c_str(), p_s_name))
			return p_child;
	}
	return p_DummyNode();
}

/**
 *	@brief finds the next direct subnode which is an element with the given name
 *
 *	@param[in] p_s_name is the name to look for (compared using _strcmpi())
 *	@param[in] p_prev is the subnode after which the search starts
 *
 *	@return Returns the first matching subnode following p_prev, the value of
 *		p_DummyNode() if there is none, or 0 if p_prev is not a subnode of this node.
 *
 *	@note NOTE(review): the two "not found" results differ (0 versus the dummy node);
 *		confirm that the callers rely on this before unifying them.
 */
const CSGMLNode *CSGMLNode::p_FindNextChild(const char *p_s_name, const CSGMLNode *p_prev) const
{
	typedef std::vector<CSGMLNode*>::const_iterator CChildIter;

	const CChildIter p_end_it = subnode_list.end();
	CChildIter p_child_it = std::find(subnode_list.begin(), p_end_it, p_prev);
	if(p_child_it == p_end_it)
		return 0; // p_prev is not among the subnodes
	// locate the previous node

	for(++ p_child_it; p_child_it != p_end_it; ++ p_child_it) {
		const CSGMLNode *p_child = *p_child_it;
		if(p_child->n_type != node_Element)
			continue; // only elements are matched by name
		if(!_strcmpi(p_child->s_name.c_str(), p_s_name))
			return p_child;
	}
	// search the subnodes that follow it

	return p_DummyNode();
}

const char *CSGMLNode::p_s_FindParam(const char *p_s_name) const
{
	if(!p_param_list)
		return 0;
	for(size_t i = 0, n = p_param_list->size(); i < n; ++ i) {
		if(!_strcmpi((*p_param_list)[i].first.c_str(), p_s_name))
			return (*p_param_list)[i].second.c_str();
	}
	return 0;
}

void CSGMLNode::Dump(int n_level)
{
	if(n_type == node_Element) {
		Indent(n_level);
		printf("<%s", s_name.c_str());
		for(size_t i = 0, n = p_param_list->size(); i < n; ++ i)
			printf(" %s=\"%s\"", (*p_param_list)[i].first.c_str(), (*p_param_list)[i].second.c_str());
		if(subnode_list.empty())
			printf("/>\n");
		else
			printf(">\n");
	} else if(n_type == node_Text) {
		Indent(n_level);
		printf("%s\n", s_name.c_str());
	}
	for(size_t i = 0, n = subnode_list.size(); i < n; ++ i)
		subnode_list[i]->Dump(n_level + 1);
	if(n_type == node_Element && !subnode_list.empty()) {
		Indent(n_level);
		printf("</%s>\n", s_name.c_str());
	}
}

/**
 *	@brief parses a page stored in a string
 *
 *	The length of the string is passed along explicitly, so the whole string
 *	is parsed even if it contains null characters, and the page does not need
 *	to be scanned for the terminating null.
 *
 *	@param[in] r_s_page_source is the source code of the page
 *
 *	@return Returns the root node of the parsed tree, or 0 on failure.
 */
CSGMLNode *CSGMLNode::p_Parse(const std::string &r_s_page_source)
{
	return p_Parse(r_s_page_source.c_str(), r_s_page_source.length());
}

/**
 *	@brief parses a page stored in a null-terminated string
 *
 *	@param[in] p_s_page_source is the null-terminated source code of the page
 *
 *	@return Returns the root node of the parsed tree, or 0 on failure
 *		(which includes p_s_page_source being null).
 */
CSGMLNode *CSGMLNode::p_Parse(const char *p_s_page_source)
{
	if(!p_s_page_source) {
		fprintf(stderr, "error: null page source\n");
		return 0;
	}
	// strlen() must not be called on a null pointer

	return p_Parse(p_s_page_source, strlen(p_s_page_source));
}

/**
 *	@brief copies a range of characters to a string, without letting std::bad_alloc escape
 *
 *	@param[out] r_s_dest is the destination string (its former contents are replaced)
 *	@param[in] p_begin points to the first character to copy
 *	@param[in] p_end points one past the last character to copy
 *
 *	@return Returns true on success, false if there was not enough memory.
 */
static bool SGML_AssignRange(std::string &r_s_dest, const char *p_begin, const char *p_end)
{
	try {
		r_s_dest.assign(p_begin, p_end);
	} catch(std::bad_alloc&) {
		return false;
	}
	return true;
}

/**
 *	@brief decides whether the position b is at the end of a tag
 *
 *	@param[in] p_src is the parsed buffer
 *	@param[in] b is the position to test
 *	@param[in] e is the length of the buffer
 *
 *	@return Returns true if b is at the end of the buffer, at '>' or at "/>".
 */
static inline bool SGML_b_AtTagEnd(const char *p_src, size_t b, size_t e)
{
	return b >= e || p_src[b] == '>' ||
		(b + 1 < e && p_src[b] == '/' && p_src[b + 1] == '>');
}

/**
 *	@brief parses a page stored in a buffer of a given length
 *
 *	The buffer is scanned for tags. Each opening tag creates an element node under the
 *	innermost element which is still open, each closing tag closes the innermost open
 *	element of that name (along with any elements left open inside of it). Closing tags
 *	which match no open element are ignored. Comments are skipped. The whitespace-trimmed
 *	text between the tags is stored in s_text of the enclosing element if it is the first
 *	thing in that element, otherwise it becomes a text subnode.
 *
 *	Parameters are stored as (name, value) pairs. The value is empty if the parameter
 *	has none; the name is empty for a quoted string that is not preceded by a name.
 *	Unquoted values extend to the next whitespace or '>'. Tag names are matched
 *	without regard to case, using _strcmpi().
 *
 *	No byte at or beyond p_page_source + n_length is read; the buffer does not need
 *	to be null-terminated.
 *
 *	@param[in] p_page_source is the source code of the page
 *	@param[in] n_length is the length of the source code, in characters
 *
 *	@return Returns the root node of the parsed tree (the caller deletes it),
 *		or 0 if there was not enough memory.
 */
CSGMLNode *CSGMLNode::p_Parse(const char *p_page_source, size_t n_length)
{
	std::vector<CSGMLNode*> node_stack;
	if(!stl_ut::Reserve_1More(node_stack)) {
		fprintf(stderr, "error: not enough memory\n");
		return 0;
	}
	CSGMLNode *p_root;
	if(!(p_root = new(std::nothrow) CSGMLNode(node_Root))) {
		fprintf(stderr, "error: not enough memory\n");
		return 0;
	}
	node_stack.push_back(p_root);
	// alloc the root node and put it at the bottom of the stack of open elements
	// (everything that gets attached to the tree is deleted along with p_root)

	size_t b = 0, e = n_length;
	while(b < e) {
		while(b < e && __isspace(p_page_source[b]))
			++ b;
		size_t n_text_begin = b;
		while(b < e && p_page_source[b] != '<')
			++ b;
		if(b == e)
			break;
		// NOTE(review): the text that follows the last '<' of the page is discarded
		// here; confirm whether that is intended before changing it
		size_t n_text_end = b;
		while(n_text_begin < n_text_end && __isspace(p_page_source[n_text_end - 1]))
			-- n_text_end;
		++ b;
		// find '<' and skip it, trim the whitespace around the text in front of it

		if(n_text_begin < n_text_end) {
			std::string s_text_run;
			if(!SGML_AssignRange(s_text_run, p_page_source + n_text_begin,
			   p_page_source + n_text_end)) {
				fprintf(stderr, "error: not enough memory\n");
				delete p_root;
				return 0;
			}
			_ASSERTE(!node_stack.empty());
			CSGMLNode *p_parent = node_stack.back();
			if(p_parent->n_type == node_Element && p_parent->subnode_list.empty() &&
			   p_parent->s_text.empty()) {
				p_parent->s_text.swap(s_text_run); // save first block of text directly to the node
			} else {
				// the element already has text or subnodes (a comment or a stray '<'
				// can split its text into several runs); do not overwrite s_text
				CSGMLNode *p_text_node;
				if(!(p_text_node = new(std::nothrow) CSGMLNode(node_Text))) {
					fprintf(stderr, "error: not enough memory\n");
					delete p_root;
					return 0;
				}
				p_text_node->s_text.swap(s_text_run);
				if(!stl_ut::Resize_Add_1More(p_parent->subnode_list, p_text_node)) {
					fprintf(stderr, "error: not enough memory\n");
					delete p_text_node;
					delete p_root;
					return 0;
				}
			}
		}
		// store the text

		if(e - b >= 3 && !memcmp(p_page_source + b, "!--", 3 * sizeof(char))) {
			b += 3;
			while(e - b >= 3 && memcmp(p_page_source + b, "-->", 3 * sizeof(char)))
				++ b;
			if(e - b < 3) {
				fprintf(stderr, "error: comment into the end of file\n");
				break; // the rest of the page is inside the unterminated comment
			}
			b += 3;
			continue;
		}
		// handle comments (never compares memory beyond the end of the buffer)

		bool b_closing_tag = false;
		if(b < e && p_page_source[b] == '/') {
			++ b;
			b_closing_tag = true;
		}
		// detect /

		size_t n_tag_begin = b;
		while(!__isspace(p_page_source[b < e ? b : n_tag_begin]) &&
		   !SGML_b_AtTagEnd(p_page_source, b, e))
			++ b;
		size_t n_tag_end = b;
		if(n_tag_begin == n_tag_end)
			continue; // this is just lost '<' in the text
		std::string s_tag;
		if(!SGML_AssignRange(s_tag, p_page_source + n_tag_begin, p_page_source + n_tag_end)) {
			fprintf(stderr, "error: not enough memory\n");
			delete p_root;
			return 0;
		}
		// read tag name; it ends at whitespace, '>' or "/>" (so "<br/>" is named "br")

		std::vector<std::pair<std::string, std::string> > param_list;
		for(;;) {
			while(b < e && __isspace(p_page_source[b]))
				++ b;
			if(SGML_b_AtTagEnd(p_page_source, b, e))
				break;
			// wait for > or /> (or the end of the buffer)

			if(p_page_source[b] == '\"' || p_page_source[b] == '\'') {
				const char c_quot = p_page_source[b];
				size_t n_value_begin = ++ b;
				while(b < e && p_page_source[b] != c_quot)
					++ b;
				size_t n_value_end = b;
				if(b < e)
					++ b; // skip the closing quote, but never step beyond the end of the buffer

				std::string s_value;
				if(!SGML_AssignRange(s_value, p_page_source + n_value_begin,
				   p_page_source + n_value_end)) {
					fprintf(stderr, "error: not enough memory\n");
					delete p_root;
					return 0;
				}
				try {
					param_list.push_back(std::pair<std::string, std::string>(std::string(), s_value));
				} catch(std::bad_alloc&) {
					fprintf(stderr, "error: not enough memory\n");
					delete p_root;
					return 0;
				}
				// only value

				continue;
			}
			// a quoted string without a name

			size_t n_name_begin = b;
			while(b < e && !__isspace(p_page_source[b]) && p_page_source[b] != '=' &&
			   !SGML_b_AtTagEnd(p_page_source, b, e))
				++ b;
			size_t n_name_end = b;
			std::string s_name;
			if(!SGML_AssignRange(s_name, p_page_source + n_name_begin,
			   p_page_source + n_name_end)) {
				fprintf(stderr, "error: not enough memory\n");
				delete p_root;
				return 0;
			}
			// read param name (empty only if the parameter starts with '=')

			while(b < e && __isspace(p_page_source[b]))
				++ b;
			std::string s_value;
			if(b < e && p_page_source[b] == '=') {
				++ b;
				while(b < e && __isspace(p_page_source[b]))
					++ b;
				// skip =[ \t]*

				if(!SGML_b_AtTagEnd(p_page_source, b, e)) {
					size_t n_value_begin, n_value_end;
					if(p_page_source[b] == '\"' || p_page_source[b] == '\'') {
						const char c_quote = p_page_source[b];
						n_value_begin = ++ b;
						while(b < e && p_page_source[b] != c_quote)
							++ b;
						n_value_end = b;
						if(b < e)
							++ b; // skip the closing quote
					} else {
						n_value_begin = b;
						while(b < e && !__isspace(p_page_source[b]) && p_page_source[b] != '>')
							++ b;
						n_value_end = b;
						// an unquoted value extends to the next whitespace or '>'
					}
					if(!SGML_AssignRange(s_value, p_page_source + n_value_begin,
					   p_page_source + n_value_end)) {
						fprintf(stderr, "error: not enough memory\n");
						delete p_root;
						return 0;
					}
				}
				// read value (it stays empty if the tag ends right after '=')
			}

			try {
				param_list.push_back(std::pair<std::string, std::string>(s_name, s_value));
			} catch(std::bad_alloc&) {
				fprintf(stderr, "error: not enough memory\n");
				delete p_root;
				return 0;
			}
			// store the parameter, even if it is the last one in the tag and has no value;
			// the end of the tag is detected at the beginning of the next iteration
		}
		// read tag parameters (every iteration either breaks or advances b)

		bool b_single_tag = b + 1 < e && p_page_source[b] == '/' && p_page_source[b + 1] == '>';
		// detect single tags

		if(b_single_tag)
			b += 2;
		else if(b < e)
			++ b;
		// skip '>' or '/>'

		static const char *const p_single_tag_list[] = {"br", "img", "input", "option", "frame", "hr",
			"area", "col", "colgroup", "basefont", "meta", "base", "link", "!DOCTYPE"};
		for(size_t i = 0; i < sizeof(p_single_tag_list) / sizeof(p_single_tag_list[0]); ++ i) {
			if(!_strcmpi(s_tag.c_str(), p_single_tag_list[i])) {
				b_single_tag = true;
				break;
			}
		}
		// force single tag on some tags

		// todo - skip the contents of script tags, they are dangerous to this parser

		CSGMLNode *p_node = 0;
		if(!b_closing_tag) {
			if(!(p_node = new(std::nothrow) CSGMLNode(node_Element, s_tag.c_str())) ||
			   p_node->s_name.empty()) { // the constructor leaves no other trace of a failed copy
				fprintf(stderr, "error: not enough memory\n");
				if(p_node)
					delete p_node;
				delete p_root;
				return 0;
			}
			if(!param_list.empty()) {
				if(!(p_node->p_param_list = new(std::nothrow)
				   std::vector<std::pair<std::string, std::string> >)) {
					fprintf(stderr, "error: not enough memory\n");
					delete p_node;
					delete p_root;
					return 0;
				}
				p_node->p_param_list->swap(param_list);
			}
			_ASSERTE(!node_stack.empty());
			CSGMLNode *p_parent = node_stack.back();
			if(!stl_ut::Resize_Add_1More(p_parent->subnode_list, p_node)) {
				fprintf(stderr, "error: not enough memory\n");
				delete p_node;
				delete p_root;
				return 0;
			}
		}
		// create a new node (in case this is not closing tag)

		if(!b_single_tag) {
			if(!b_closing_tag) {
				if(!stl_ut::Resize_Add_1More(node_stack, p_node)) {
					fprintf(stderr, "error: not enough memory\n");
					delete p_root; // p_node is already in the tree
					return 0;
				}
				// add node to the list, it might have some subnodes
			} else if(node_stack.size() >= 2) {
				size_t n_match = node_stack.size();
				while(n_match > 1 &&
				   _strcmpi(node_stack[n_match - 1]->s_name.c_str(), s_tag.c_str()))
					-- n_match;
				// find the innermost open element of this name (the root is never matched)

				if(n_match > 1)
					node_stack.erase(node_stack.begin() + (n_match - 1), node_stack.end());
				// close it, along with the elements left open inside of it; a closing
				// tag of an element which is not open leaves the stack as it is
			}
			// more closing tags than opening tags are silently ignored
		}
		// work with the tag stack
	}
	// parse page code

	return p_root;
}

/**
 *	@brief deletes a node; the destructor passes it to std::for_each() to delete the subnodes
 *
 *	@param[in] p_node is the node to be deleted
 */
inline void CSGMLNode::DeleteNode(CSGMLNode *p_node)
{
	delete p_node;
}

/**
 *	@brief calculates the (one-based) number of the line containing a given position
 *
 *	@param[in] r_str is the text
 *	@param[in] n_position is a zero-based position in the text; positions beyond
 *		the last character are clamped to the last character
 *
 *	@return Returns one plus the number of line feeds in front of the position.
 *		Returns 1 if the text is empty or if the position is not positive.
 */
int CSGMLNode::n_Line(const std::string &r_str, int n_position)
{
	if(r_str.empty() || n_position <= 0)
		return 1;
	// nothing to scan; without this check the end of the scanned
	// range would lie in front of the beginning of the string

	size_t n_scan_length = size_t(n_position);
	if(n_scan_length > r_str.length() - 1)
		n_scan_length = r_str.length() - 1;
	// clamp to the last character

	const char *p_begin = r_str.c_str();
	return 1 + int(std::count(p_begin, p_begin + n_scan_length, '\n'));
}

/**
 *	@brief prints indentation to stdout, four spaces per level
 *
 *	@param[in] n_level is the number of levels to indent by
 *		(nothing is printed if it is zero or negative)
 */
void CSGMLNode::Indent(int n_level)
{
	for(int i = 0; i < n_level; ++ i) // terminates for negative levels as well
		printf("    ");
}

/*
 *								=== ~CSGMLNode ===
 */
