/*
								+----------------------------------+
								|                                  |
								|  ***   Regular expr tools   ***  |
								|                                  |
								|   Copyright  -tHE SWINe- 2007   |
								|                                  |
								|             RegExp.h             |
								|                                  |
								+----------------------------------+
*/

#pragma once
#ifndef __REG_EXP_INCLUDED
#define __REG_EXP_INCLUDED

/**
 *	@file RegExp.h
 *	@author -tHE SWINe-
 *	@date 2007
 *	@brief simple regular expression parser
 *
 *	Regexp syntax:
 *
 *	<code>.		any single character
 *	[]		character range; minus [\- ...], brackets [\] ...], [\[ ...], [\]\[ ...]
 *	[^]		inverse character range
 *	()		subexpression
 *	?		0 - 1 quantifier
 *	*		0 - inf quantifier
 *	+		1 - inf quantifier
 *	{n}		n quantifier
 *	{n,}	n - inf quantifier
 *	{n,m}	n - m quantifier
 *
 *	\x[0-9a-fA-F]+;										escape sequence for characters
 *	\- \( \) *	\[ \] \^ \* \+ \{ \} \. \? \\			other escape sequences</code>
 *
 *	@date 2007-09-24
 *
 *	structures TTransition and TState were moved from CScanner to TScannerDrivingTable
 *
 *	@date 2007-09-26
 *
 *	rearranged members of TScannerDrivingTable::TState so the structure is smaller
 *
 *	@date 2007-10-25
 *
 *	fixed error in CRegExp_DFSM::MakeMinimal causing equal states linking
 *	themselves couldn't be merged
 *
 *	@date 2008-03-23
 *
 *	fixed couple of memory leaks in CRegExp_NFSM::CollapseIdentityTransitions
 *	and in CRegExp_DFSM::MakeMinimal
 *	improved TScannerDrivingTable::PrintSource so it can be directly used to create header files
 *
 *	@date 2008-04-20
 *
 *	optimized TScannerDrivingTable::TTransition so it can be whole packed to 64 bits
 *	(n_char_min and n_state often fit in 16-bit integer) therefore speeding-up lexical
 *	analysis a little bit (experiments on XML parser showed improvement about 9%)
 *
 *	@date 2009-05-23
 *
 *	removed all instances of std::vector::reserve and replaced them by stl_ut::Reserve_*
 *
 *	@date 2009-10-20
 *
 *	fixed some warnings when compiling under VC 2005, implemented "Security
 *	Enhancements in the CRT" for VC 2008. compare against MyProjects_2009-10-19_
 *
 *	@date 2012-06-19
 *
 *	Moved multiple inclusion guard before file documentation comment.
 *
 */

/**
 *	@def REG_EXP_COMPILER_DUMP
 *
 *	@brief if defined, some classes have Print()-like member function compiled
 */
//#define REG_EXP_COMPILER_DUMP

/**
 *	@def REG_EXP_COMPILER_STDERR_OUTPUT
 *
 *	@brief if defined, compiler outputs errors and warnings to stderr (otherwise silent)
 */
#define REG_EXP_COMPILER_STDERR_OUTPUT

/**
 *	@def REG_EXP_MACHINES_DOTTY_CAPABILITY
 *
 *	@brief if defined, [N|D]FSM classes have PrintDotty() member function compiled
 */
#define REG_EXP_MACHINES_DOTTY_CAPABILITY

/**
 *	@brief inclusive bounds of the input character codes
 *
 *	Both constants are only declared here, they are defined outside of this header
 *	(the values in the inline comments are the author's annotation of that definition).
 *	In this file they serve as the default character range of transitions
 *	and as the bounds asserted on character codes stored in parse tree nodes.
 */
extern const int n_first_unicode_char /*= 0*/, /**< min expected input char */
				 n_last_unicode_char /*= 0x10ffff*/; /**< max expected input char */

#include "NewFix.h"
#include "MinMax.h"

/*
 *								--- TInterval ---
 */

/**
 *	@brief simple numeric interval structure
 */
struct TInterval {
	int n_char_min, n_char_max; /**< inclusive lower / upper endpoint of the range */

	/**
	 *	@brief default constructor
	 *
	 *	The interval is closed, both endpoints belong to it. Any pair where
	 *	_n_char_max is less than _n_char_min denotes an empty interval
	 *	(this is also what the default arguments produce).
	 *
	 *	@param[in] _n_char_min is the lowest value in the interval
	 *	@param[in] _n_char_max is the highest value in the interval
	 */
	inline TInterval(int _n_char_min = 0, int _n_char_max = -1)
		:n_char_min(_n_char_min), n_char_max(_n_char_max)
	{}

	/**
	 *	@brief checks whether two intervals overlap
	 *
	 *	@param[in] r_interval is the other interval (first interval being this)
	 *
	 *	@return Returns true in case this interval and r_interval share at least
	 *		one value, otherwise returns false (always false if either one is empty).
	 */
	inline bool b_Conjunct(const TInterval &r_interval) const
	{
		if(b_Disjunct(r_interval))
			return false;
		return true;
	}

	/**
	 *	@brief checks whether two intervals do not overlap
	 *
	 *	@param[in] r_interval is the other interval (first interval being this)
	 *
	 *	@return Returns true in case this interval and r_interval have no value in common
	 *		(that includes the case where either one is empty), otherwise returns false.
	 */
	inline bool b_Disjunct(const TInterval &r_interval) const
	{
		if(b_Empty() || r_interval.b_Empty())
			return true; // an empty interval overlaps nothing
		return r_interval.n_char_min > n_char_max ||
			r_interval.n_char_max < n_char_min; // entirely right of / left of this
	}

	/**
	 *	@brief checks whether the interval contains no values
	 *
	 *	@return Returns true in case the upper endpoint lies below
	 *		the lower endpoint, otherwise returns false.
	 */
	inline bool b_Empty() const
	{
		return n_char_max < n_char_min;
	}

	/**
	 *	@brief checks whether this interval lies inside of the other interval
	 *
	 *	@param[in] r_interval is the other interval (first interval being this)
	 *
	 *	@return Returns true in case both endpoints of this interval lie within
	 *		the endpoints of r_interval, otherwise returns false.
	 *
	 *	@note Only the endpoints are compared, there is no special handling
	 *		of empty intervals.
	 */
	inline bool b_Contained(const TInterval &r_interval) const
	{
		const bool b_left_inside = r_interval.n_char_min <= n_char_min;
		const bool b_right_inside = r_interval.n_char_max >= n_char_max;
		return b_left_inside && b_right_inside;
	}

	/**
	 *	@brief checks whether the other interval lies inside of this interval
	 *
	 *	@param[in] r_interval is the other interval (first interval being this)
	 *
	 *	@return Returns true in case both endpoints of r_interval lie within
	 *		the endpoints of this interval, otherwise returns false.
	 *
	 *	@note Only the endpoints are compared, there is no special handling
	 *		of empty intervals.
	 */
	inline bool b_Contains(const TInterval &r_interval) const
	{
		const bool b_left_inside = n_char_min <= r_interval.n_char_min;
		const bool b_right_inside = n_char_max >= r_interval.n_char_max;
		return b_left_inside && b_right_inside;
	}

	/**
	 *	@brief interval comparison operator
	 *
	 *	@param[in] r_interval is the other interval (first interval being this)
	 *
	 *	@return Returns true in case both endpoints are equal, otherwise returns false
	 *		(two empty intervals with different endpoints compare as different).
	 */
	inline bool operator ==(const TInterval &r_interval) const
	{
		return n_char_min == r_interval.n_char_min &&
			n_char_max == r_interval.n_char_max;
	}

	/**
	 *	@brief calculates intersection of two intervals
	 *
	 *	@param[in] r_interval is the other interval (first interval being this)
	 *
	 *	@return Returns the interval from the greater of the lower endpoints
	 *		to the lesser of the upper endpoints.
	 *
	 *	@note The result is an empty interval in case the intervals do not overlap.
	 */
	inline TInterval t_Intersection(const TInterval &r_interval) const
	{
		const int n_lower = max(n_char_min, r_interval.n_char_min);
		const int n_upper = min(n_char_max, r_interval.n_char_max);
		return TInterval(n_lower, n_upper);
	}

	/**
	 *	@brief calculates union of two intervals
	 *
	 *	@param[in] r_interval is the other interval (first interval being this)
	 *
	 *	@return Returns the interval from the lesser of the lower endpoints
	 *		to the greater of the upper endpoints.
	 *
	 *	@note The intervals must overlap (asserted), otherwise the result
	 *		would also cover the gap between them.
	 */
	inline TInterval t_Union(const TInterval &r_interval) const
	{
		_ASSERTE(b_Conjunct(r_interval));
		const int n_lower = min(n_char_min, r_interval.n_char_min);
		const int n_upper = max(n_char_max, r_interval.n_char_max);
		return TInterval(n_lower, n_upper);
	}

	/**
	 *	@brief gets the part of this interval which lies to the left of the other interval
	 *
	 *	@param[in] r_interval is the other interval (first interval being this)
	 *
	 *	@return Returns the interval starting at the lower endpoint of this and ending
	 *		just below the lower endpoint of r_interval (but not past the upper
	 *		endpoint of this).
	 *
	 *	@note The result is an empty interval if r_interval.n_char_min - 1 < n_char_min,
	 *		ie. also if this interval is contained in r_interval.
	 */
	inline TInterval t_LeftComplement(const TInterval &r_interval) const
	{
		const int n_upper = min(n_char_max, r_interval.n_char_min - 1);
		return TInterval(n_char_min, n_upper);
	}

	/**
	 *	@brief gets the part of this interval which lies to the right of the other interval
	 *
	 *	@param[in] r_interval is the other interval (first interval being this)
	 *
	 *	@return Returns the interval starting just above the upper endpoint of r_interval
	 *		(but not before the lower endpoint of this) and ending at the upper
	 *		endpoint of this.
	 *
	 *	@note The result is an empty interval if r_interval.n_char_max + 1 > n_char_max,
	 *		ie. also if this interval is contained in r_interval.
	 */
	inline TInterval t_RightComplement(const TInterval &r_interval) const
	{
		const int n_lower = max(n_char_min, r_interval.n_char_max + 1);
		return TInterval(n_lower, n_char_max);
	}
};

/*
 *								--- ~TInterval ---
 */

/*
 *								--- CRegExp_Lexer ---
 */

/**
 *	@brief regular expression lexical analyzer
 */
class CRegExp_Lexer {
public:
	/**
	 *	@brief token types
	 */
	enum {
		token_Unknown,		/**< unknown token (error) */
		token_Char,			/**< regular character */
		token_AnyChar,		/**< any character "." */
		token_Option,		/**< option operator "|" */
		token_LeftPar,		/**< left parenthesis "(" */
		token_RightPar,		/**< right parenthesis ")" */
		token_CharRange,	/**< character range "[a-z]" */
		token_Quant			/**< quantification specifier "*", or "{m,n}" */
	};

	/**
	 *	@brief regular expression token structure
	 *
	 *	Plain structure without constructor or destructor; the character range
	 *	array of token_CharRange tokens has to be released explicitly using Free().
	 *	Copying a token copies the pointer only, so just one of the copies may be freed.
	 */
	struct TToken {
		int n_type;		/**< contains token type (one of token_*) */
		int n_column;	/**< column index, for error reporting */

		union {
			int n_character;		/**< character (token_Char) */
			struct TRangeData {		/**< character range structure */
				TInterval *p_range;	/**< list of character intervals */
				int n_range_num;	/**< number of character intervals */
				bool b_inverse;		/**< inverse range? */
			} t_range;				/**< character range (token_CharRange) */
			struct TQuantData {		/**< quantification specifier structure */
				int n_min;			/**< minimum number of repeats */
				int n_max;			/**< maximum number of repeats (-1 = inf) */
			} t_quant;				/**< quantification specifier (token_Quant) */
		} t_data;		/**< token data */

		/**
		 *	@brief deletes potential token data on heap (character ranges array)
		 *
		 *	Does nothing for tokens other than token_CharRange. The range pointer
		 *	is cleared afterwards, so calling this repeatedly on the same token
		 *	object is harmless.
		 *
		 *	@note Must not be called on several copies of the same token,
		 *		  the copies share the same array.
		 */
		inline void Free()
		{
			if(n_type == token_CharRange) {
				delete[] t_data.t_range.p_range; // deleting null pointer is a no-op
				t_data.t_range.p_range = 0; // do not leave a dangling pointer behind
			}
		}
	};

	/**
	 *	@brief simple function object for freeing-up tokens in std::vector 
	 */
	class CFreeToken {
	public:
		/**
		 *	@brief calls r_token.Free()
		 *
		 *	@param[in,out] r_token is token to free
		 */
		inline void operator ()(TToken &r_token) const
		{
			r_token.Free();
		}
	};

#ifdef REG_EXP_COMPILER_DUMP
	/**
	 *	@brief simple function object for printing tokens (various debug dumps)
	 */
	class CPrintToken {
	public:
		/**
		 *	@brief gets token type name
		 *
		 *	@param[in] n_token_type is token type (one of token_*)
		 *
		 *	@return Returns string name for n_token_type,
		 *		if n_token_type is out of token types range, always returns "token_Unknown".
		 */
		static const char *p_s_TokenName(int n_token_type);

		/**
		 *	@brief prints token type and its possible data
		 *		  (such as character codes for char-range tokens, etc)
		 *
		 *	@param[in] r_token is token to print
		 */
		void operator ()(const TToken &r_token) const;
	};
#endif // REG_EXP_COMPILER_DUMP

	/**
	 *	@brief performs lexical analysis on null-terminated string
	 *
	 *	Fills r_token_list with tokens generated from p_s_regexp regular expression.
	 *
	 *	@param[in] p_s_regexp is string to be analyzed
	 *	@param[out] r_token_list is list of found tokens
	 *
	 *	@return Returns true on success, false on failure.
	 *
	 *	@note it prints error messages to stderr.
	 */
	static bool Tokenize(const char *p_s_regexp, std::vector<TToken> &r_token_list);

private:
	/**
	 *	@brief escape sequence parsing helper (defined outside of this header)
	 *
	 *	@param[in,out] p_s_regexp is pointer into the parsed string, passed by reference
	 *		(presumably advanced past the escape sequence - confirm in the implementation)
	 *	@param[out] r_n_value is passed by reference (presumably receives
	 *		the character code of the escape sequence - confirm in the implementation)
	 *
	 *	@return Returns bool (presumably true on success, false on failure,
	 *		as the other functions here do).
	 */
	static bool Parse_EscapeSequence(const char *&p_s_regexp, int &r_n_value);
};

/*
 *								--- ~CRegExp_Lexer ---
 */

/*
 *								--- finite state machinery ---
 */

/**
 *	@brief non-deterministic finite state machine state
 *
 *	Carries a simple id to identify the state
 *	and an expression id which marks accepting states.
 */
struct TNFAState {
	int n_id; /**< state id, assigned from n_id_space in the constructor */
	static int n_id_space; /**< id counter shared by all the states (defined outside of this header) */
	int n_expression_id; /**< -1 in case not accepting, otherwise regular expression id */

	/**
	 *	@brief default constructor
	 *
	 *	Increments the shared id counter and takes the new value as the state id.
	 *	The state is initially not accepting, expression id is to be set later.
	 */
	inline TNFAState()
		:n_id(++ n_id_space), n_expression_id(-1)
	{}
};

/**
 *	@brief template of transition, connecting two states
 *
 *	The input character range accepted by the transition is inherited from TInterval.
 *
 *	@tparam TStateType is type of the connected states
 */
template <class TStateType>
struct TTransition : public TInterval {
	TStateType *p_start; /**< state the transition leaves */
	TStateType *p_end; /**< state the transition leads to */

	/**
	 *	@brief default constructor
	 *
	 *	@param[in] _p_start is start state of the transition
	 *	@param[in] _p_end is end state of the transition
	 *	@param[in] _n_char_min is the lowest accepted character
	 *		(the default range covers all the expected input characters)
	 *	@param[in] _n_char_max is the highest accepted character
	 */
	inline TTransition(TStateType *_p_start, TStateType *_p_end,
		int _n_char_min = n_first_unicode_char,
		int _n_char_max = n_last_unicode_char)
		:TInterval(_n_char_min, _n_char_max),
		p_start(_p_start), p_end(_p_end)
	{}
};

/**
 *	@brief non-deterministic finite automata transition
 *
 *	Extends the generic transition with a type, which specifies whether
 *	it is a regular transition, epsilon transition or identity connection.
 */
struct TNFATransition : public TTransition<TNFAState> {
	/**
	 *	@brief transition types
	 */
	enum {
		tran_Char, /**< transitions defined clearly by char ranges */
		tran_Epsilon, /**< "important" epsilon transitions */
		tran_Identity /**< identity transitions can be removed right away */
	};

	int n_type; /**< transition type (one of tran_*) */

	/**
	 *	@brief default constructor
	 *
	 *	@param[in] _p_start is start state of the transition (must not be null, asserted)
	 *	@param[in] _p_end is end state of the transition (must not be null, asserted)
	 *	@param[in] _n_type is transition type, one of tran_Char, tran_Epsilon or tran_Identity
	 *	@param[in] _n_char_min is the lowest accepted character (required for tran_Char only)
	 *	@param[in] _n_char_max is the highest accepted character (required for tran_Char only)
	 */
	inline TNFATransition(TNFAState *_p_start, TNFAState *_p_end,
		int _n_type = tran_Char,
		int _n_char_min = n_first_unicode_char,
		int _n_char_max = n_last_unicode_char)
		:TTransition<TNFAState>(_p_start, _p_end, _n_char_min, _n_char_max),
		n_type(_n_type)
	{
		_ASSERTE(_p_start && _p_end);
	}
};

/**
 *	@brief deterministic finite automata state
 *
 *	A state is identified by a bit array, marking combination of NFA states (powerset).
 *	Accepting states carry expression id, same as NFA states do.
 */
struct TDFAState {
	CBitArray state_flags; /**< bit array as long as list of states inside finite state machine */
	int n_expression_id; /**< -1 in case not accepting or regexp id in case it's accepting */

	/**
	 *	@brief default constructor; creates cleared bit array and a non-accepting state
	 *
	 *	@param[in] n_bit_array_length is requested length of the bit array
	 *
	 *	@note It's necessary to check whether state_flags.n_Length()
	 *		matches the requested length afterwards.
	 */
	inline TDFAState(int n_bit_array_length = 0)
		:state_flags(n_bit_array_length), n_expression_id(-1)
	{
		state_flags = false; // clear the whole array
	}

	/**
	 *	@brief copy-operator
	 *
	 *	@param[in] r_state is the state to copy from
	 *
	 *	@return Returns true in case of success, false on failure (not enough memory).
	 *
	 *	@note Unlike the usual assignment operators, this returns
	 *		success flag instead of reference to this.
	 */
	inline bool operator =(const TDFAState &r_state)
	{
		n_expression_id = r_state.n_expression_id;
		state_flags = r_state.state_flags; // returns void, as bool could be assigned to CBitArray

		const bool b_same_length =
			state_flags.n_Length() == r_state.state_flags.n_Length();
		return b_same_length; // lengths differ if the array could not be copied
	}

	/**
	 *	@brief comparison operator
	 *
	 *	@param[in] r_t_state is the state to compare with
	 *
	 *	@return Returns true in case the bit arrays are equal, otherwise returns false.
	 *
	 *	@note This compares bit array contents only, not n_expression_id.
	 */
	inline bool operator ==(const TDFAState &r_t_state) const
	{
		if(state_flags == r_t_state.state_flags)
			return true;
		return false;
	}
};

/**
 *	@brief transition for DFA states
 *
 *	This is the generic TTransition template, instantiated for TDFAState;
 *	unlike TNFATransition it carries no transition type.
 */
typedef TTransition<TDFAState> TDFATransition;

/**
 *	@brief finite state machine base class
 *
 *	Stores the initial state by value, the other states as pointers to individually
 *	allocated objects (so their addresses stay constant) and transitions by value.
 *
 *	@tparam TStateType is state type (must be default-constructible)
 *	@tparam TTransitionType is transition type (must be copyable and have b_Empty())
 *
 *	@note NOTE(review): there is no copy-constructor / copy-operator declared, while
 *		the destructor deletes the states; copying a machine which already
 *		has states would therefore delete them twice. Do not copy the machines.
 */
template <class TStateType, class TTransitionType>
class CRegExp_FSM {
protected:
	std::vector<TStateType*> m_state_list; /**< states created by p_GetState() (deleted in destructor) */
	std::vector<TTransitionType> m_transition_list; /**< list of transitions */
	TStateType m_t_initial_state; /**< initial state (not a part of m_state_list) */

public:
	/**
	 *	@brief destructor; deletes the states
	 *		(they are allocated explicitly to have constant address)
	 */
	~CRegExp_FSM()
	{
		std::for_each(m_state_list.begin(), m_state_list.end(), DeleteStates);
	}

	/**
	 *	@brief gets initial state
	 *
	 *	@return Returns pointer to machine initial state (never 0).
	 */
	inline TStateType *p_GetInitState()
	{
		return &m_t_initial_state;
	}

	/**
	 *	@brief creates a new state, owned by the machine
	 *
	 *	@return Returns new unused state or 0 in case there was not enough memory.
	 */
	TStateType *p_GetState()
	{
		if(!stl_ut::Reserve_1More(m_state_list))
			return 0;
		// get space in the list first (presumably so the push_back below
		// can not fail and leak the new state)

		TStateType *p_state;
		if(!(p_state = new(std::nothrow) TStateType))
			return 0;
		m_state_list.push_back(p_state);
		return p_state;
	}

	/**
	 *	@brief adds a new transition
	 *
	 *	@param[in] r_transition is the transition to add (a copy is stored;
	 *		it must not have empty character range, which is asserted)
	 *
	 *	@return Returns true on success, false on failure.
	 *
	 *	@note Transition's pointers should point on states of this machine
	 *		(not checked in this function).
	 *	@note The argument is taken by const reference, so temporary objects
	 *		can be passed in as well (binding them to a non-const reference
	 *		is not allowed by the C++ standard).
	 */
	bool AddTransition(const TTransitionType &r_transition)
	{
		_ASSERTE(!r_transition.b_Empty());
		if(!stl_ut::Reserve_1More(m_transition_list))
			return false;
		m_transition_list.push_back(r_transition);
		return true;
	}

protected:
	/**
	 *	@brief deletes a single state (function for std::for_each in the destructor)
	 *
	 *	@param[in] p_state is pointer to the state to delete
	 */
	static inline void DeleteStates(TStateType *p_state)
	{
		delete p_state;
	}
};

/**
 *	@brief simple scanner driving table (contains finite state machine,
 *		necessary for driving CScanner)
 *
 *	@note There is intentionally no destructor (so there has to be no copy operator),
 *		it's necessary to call Free().
 */
struct TScannerDrivingTable {
	/**
	 *	@brief DFA transition
	 */
	struct TTransition {
		int n_char_max, n_char_min; /**< accepted input character range */
		int n_state; /**< state transition leads to */
	};

	/**
	 *	@brief DFA state
	 */
	struct TState {
		const TTransition *p_transition; /**< transitions */
		int n_regexp_id; /**< zero-based CTokenEmit index for accepting states, -1 for other states */
		int n_transition_num; /**< number of transitions from this state */
	};

	TState *p_state; /**< list of states */
	TTransition *p_transition; /**< list of transitions */
	int n_state_num; /**< number of states */
	int n_transition_num; /**< number of transitions */

	/**
	 *	@brief default constructor; creates an empty table
	 *		(null pointers, zero state and transition counts)
	 */
	inline TScannerDrivingTable()
		:p_state(0), p_transition(0), n_state_num(0), n_transition_num(0)
	{}

	/**
	 *	@brief prints driving table source to a file
	 *
	 *	@param[in] p_stream is the output stream
	 *	@param[in] p_s_type_s is type name of the state array
	 *	@param[in] p_s_type_t is type name of the transition array
	 *	@param[in] p_s_name_s is name of the state array
	 *	@param[in] p_s_name_t is name of the transition array
	 *	@param[in] p_s_name_s_num is name of the state counter
	 *	@param[in] p_s_name_t_num is name of the transition counter
	 *	@param[in] p_s_class_name is class name (presumably used to qualify
	 *		the printed names; null by default - confirm in the implementation)
	 *	@param[in] n_max_line_length is maximal length of the printed
	 *		lines (presumably in characters)
	 *
	 *	@note It doesn't check for any i/o errors.
	 *	@note The declaration must not be qualified by the class name here,
	 *		standard-conforming compilers reject extra qualification on members.
	 */
	void PrintSource(FILE *p_stream,
		const char *p_s_type_s = "const TScannerDrivingTable::TState",
		const char *p_s_type_t = "const TScannerDrivingTable::TTransition",
		const char *p_s_name_s = "p_state",
		const char *p_s_name_t = "p_transition",
		const char *p_s_name_s_num = "n_state_num",
		const char *p_s_name_t_num = "n_transition_num",
		const char *p_s_class_name = 0,
		int n_max_line_length = 100);

	/**
	 *	@brief frees up allocated data
	 */
	void Free();
};

/**
 *	@brief deterministic finite state machine
 *
 *	Built on CRegExp_FSM with TDFAState states and TDFATransition transitions.
 *	All the member functions are defined outside of this header.
 */
class CRegExp_DFSM : public CRegExp_FSM<TDFAState, TDFATransition> {
protected:
	// helper classes, defined in the implementation file
	class CFindStatePtr;
	class CCopyStates;

#ifdef REG_EXP_MACHINES_DOTTY_CAPABILITY
	// helper classes for PrintDotty()
	class CPrintAcceptingStateNames;
	class CPrintTransitionNames;
#endif // REG_EXP_MACHINES_DOTTY_CAPABILITY

public:
	/**
	 *	@brief finds state with the same bit array as a given state
	 *
	 *	@param[in] r_state is the state to look for
	 *
	 *	@return Returns pointer to the state with the same bit array as r_state,
	 *		or 0 in case no such state was found.
	 *
	 *	@note Regular expression id is not compared.
	 */
	TDFAState *p_FindState(const TDFAState &r_state);

	/**
	 *	@brief creates a new state with bit array of a given size
	 *
	 *	@param[in] n_array_size is size of the state bit array, in bits
	 *
	 *	@return Returns new unused state or 0 in case there was not enough memory.
	 *
	 *	@note This hides the parameterless CRegExp_FSM::p_GetState().
	 */
	TDFAState *p_GetState(unsigned int n_array_size);

#ifdef REG_EXP_MACHINES_DOTTY_CAPABILITY
	/**
	 *	@brief prints dotty format file, containing FSM graph description
	 *
	 *	@param[in] p_out is the output stream
	 *
	 *	@note This doesn't check for any i/o errors.
	 */
	void PrintDotty(FILE *p_out);
#endif // REG_EXP_MACHINES_DOTTY_CAPABILITY

	/**
	 *	@brief minimizes the DFSM using a rather slow algorithm
	 *
	 *	@return Returns true on success, false on failure.
	 *
	 *	@todo Implement something better.
	 */
	bool MakeMinimal();

	/**
	 *	@brief creates driving table for CScanner
	 *
	 *	@param[out] r_table is the driving table to be filled
	 *
	 *	@return Returns true on success, false on failure.
	 *
	 *	@note The transitions come sorted by min-char so binary search can be used in lexer.
	 */
	bool MakeTable(TScannerDrivingTable &r_table) const;
};

/**
 *	@brief nondeterministic finite state machine (the most general)
 *
 *	Built on CRegExp_FSM with TNFAState states and TNFATransition transitions.
 *	Except for b_IsIdentityTransition(), the member functions
 *	are defined outside of this header.
 */
class CRegExp_NFSM : public CRegExp_FSM<TNFAState, TNFATransition> {
protected:
#ifdef REG_EXP_MACHINES_DOTTY_CAPABILITY
	// helper classes for PrintDotty()
	class CPrintAcceptingStateNames;
	class CPrintTransitionNames;
#endif // REG_EXP_MACHINES_DOTTY_CAPABILITY

	// helper classes, defined in the implementation file
	class CReassingState;
	class CGatherStates;
	class CNumberStates;
	class CGatherTransitions;
	class CFindConjunct;
	class CFindLeft;
	class CCompareDFATransitions;
	class CGenerateEndState;
	class CEmitDFATransitions;

public:
	/**
	 *	@brief collapses groups of states, connected by identity transitions
	 *
	 *	As a result of the state generation policy, it sometimes happens that several
	 *	constructs should end up in the same state, but there is no mechanism
	 *	to specify this state, so identity transitions are added instead (do not
	 *	confuse them with epsilon transitions). When all the states are generated,
	 *	each group of states connected by identity transitions should be collapsed
	 *	to a single state, which is the purpose of this function.
	 */
	void CollapseIdentityTransitions();

#ifdef REG_EXP_MACHINES_DOTTY_CAPABILITY
	/**
	 *	@brief prints dotty format file, containing FSM graph description
	 *
	 *	@param[in] p_out is the output stream
	 *
	 *	@note This doesn't check for any i/o errors.
	 */
	void PrintDotty(FILE *p_out);
#endif // REG_EXP_MACHINES_DOTTY_CAPABILITY

	/**
	 *	@brief converts this NFSM to equivalent DFSM
	 *
	 *	@param[out] r_dfsm is the resulting machine (should be blank)
	 *
	 *	@return Returns true on success, false on failure.
	 */
	bool MakeDeterministic(CRegExp_DFSM &r_dfsm);

protected:
	// helpers, declared here only (presumably steps of MakeDeterministic();
	// see the implementation file for their contracts)
	void GatherStates(TDFAState &r_state) const;
	void NumberStates();
	bool GatherTransitions(const TDFAState &r_dfa_state,
		std::vector<TNFATransition> &r_transition_list) const;
	typedef std::vector<TDFATransition>::difference_type dfa_offset_t;
	static bool CreateDisjunctTransitions(TDFAState *p_start,
		const std::vector<TNFATransition> &r_nfa_tran_list,
		std::vector<TDFATransition> &r_dfa_tran_list);

	/**
	 *	@brief identity transition predicate
	 *
	 *	@param[in] r_transition is the transition to be tested
	 *
	 *	@return Returns true in case type of r_transition is tran_Identity, otherwise false.
	 */
	static inline bool b_IsIdentityTransition(const TNFATransition &r_transition)
	{
		return r_transition.n_type == TNFATransition::tran_Identity;
	}
};

/*
 *								--- ~finite state machinery ---
 */

/*
 *								--- CRegExp_Parser ---
 */

/*
 *	class CRegExp_Parser
 *		- regular expression parser
 */
class CRegExp_Parser {
protected:
	bool m_b_parse_errors;

public:
	struct CNode {
	protected:
		int m_n_type;

	public:
		enum {
			node_Unknown,
			node_String, // string of another nodes (can contain strings again)
				node_Char, // single character
				node_AnyChar, // any single character
				node_RangeChar, // single character form given range
				node_Quant, // quantifier node (repeat it's inside)
				node_Option // option node (one of n)
		};

		inline CNode(int n_type = node_Unknown)
			:m_n_type(n_type)
		{}

		virtual ~CNode()
		{}

		virtual TNFAState *p_AddTransitions(TNFAState *p_initial,
			CRegExp_NFSM &r_fsm) const = 0;

#ifdef REG_EXP_COMPILER_DUMP
		virtual void Print(int n_level);

		void PrintIndent(int n_level);
#endif // REG_EXP_COMPILER_DUMP
	};

	struct CNodeString : public CNode {
	protected:
		std::vector<CNode*> m_node_list;

		class CAddSerialTransitions;

	public:
		inline CNodeString(int n_type = node_String)
			:CNode(n_type)
		{}

		virtual ~CNodeString();

		virtual TNFAState *p_AddTransitions(TNFAState *p_initial,
			CRegExp_NFSM &r_fsm) const;

		bool AddNode(CNode *p_node);

#ifdef REG_EXP_COMPILER_DUMP
		virtual void Print(int n_level);
#endif // REG_EXP_COMPILER_DUMP

	protected:
		static inline void DeleteNodes(CNode *p_node)
		{
			delete p_node;
		}
	};

	struct CNodeChar : public CNode {
	protected:
		int m_n_code;

	public:
		inline CNodeChar(int n_code)
			:CNode(node_Char), m_n_code(n_code)
		{
			_ASSERTE(n_code >= n_first_unicode_char && n_code <= n_last_unicode_char);
		}

		virtual TNFAState *p_AddTransitions(TNFAState *p_initial,
			CRegExp_NFSM &r_fsm) const;

#ifdef REG_EXP_COMPILER_DUMP
		virtual void Print(int n_level);
#endif // REG_EXP_COMPILER_DUMP
	};

	struct CNodeAnyChar : public CNode {
	public:
		inline CNodeAnyChar()
			:CNode(node_AnyChar)
		{}

		virtual TNFAState *p_AddTransitions(TNFAState *p_initial,
			CRegExp_NFSM &r_fsm) const;

#ifdef REG_EXP_COMPILER_DUMP
		virtual void Print(int n_level);
#endif // REG_EXP_COMPILER_DUMP
	};

	struct CNodeRangeChar : public CNode {
	protected:
		bool m_b_inverse;
		int m_n_range_num;
		const TInterval *m_p_range;

		class CFindConjunct;

	public:
		inline CNodeRangeChar(bool b_inverse, int n_range_num,
			const TInterval *p_range)
			:CNode(node_RangeChar), m_b_inverse(b_inverse),
			m_n_range_num(n_range_num), m_p_range(p_range)
		{
			for(const TInterval *p_end = m_p_range + m_n_range_num;
			   p_range != p_end; ++ p_range) {
				_ASSERTE(!p_range->b_Empty());
				_ASSERTE(p_range->n_char_min >= n_first_unicode_char &&
					p_range->n_char_max <= n_last_unicode_char);
			}
		}

		virtual TNFAState *p_AddTransitions(TNFAState *p_initial,
			CRegExp_NFSM &r_fsm) const;

#ifdef REG_EXP_COMPILER_DUMP
		virtual void Print(int n_level);
#endif // REG_EXP_COMPILER_DUMP

	protected:
		static inline bool CompareIntervals(const TInterval &r_t_range_a,
			const TInterval &r_t_range_b)
		{
			_ASSERTE(!r_t_range_a.b_Empty());
			_ASSERTE(!r_t_range_b.b_Empty());
			//_ASSERTE(!r_t_range_a.b_Conjunct(r_t_range_b));
			// my implementation of std::sort occasionaly compares pivot with itself
			// assert no collisions or malformed intervals

			return r_t_range_a.n_char_min < r_t_range_b.n_char_min;
		}
	};

	struct CNodeQuant : public CNode {
	protected:
		int m_n_min_repeat, m_n_max_repeat;
		CNode *m_p_subnode;

	public:
		inline CNodeQuant(CNode *p_subnode, int n_min_repeat, int n_max_repeat)
			:CNode(node_Quant), m_p_subnode(p_subnode),
			m_n_min_repeat(n_min_repeat), m_n_max_repeat(n_max_repeat)
		{}

		virtual ~CNodeQuant();

		virtual TNFAState *p_AddTransitions(TNFAState *p_initial,
			CRegExp_NFSM &r_fsm) const;

#ifdef REG_EXP_COMPILER_DUMP
		virtual void Print(int n_level);
#endif // REG_EXP_COMPILER_DUMP
	};

	struct CNodeOption : public CNodeString {
	protected:
		class CAddParallelTransitions {
		protected:
			TNFAState *m_p_initial, *m_p_final;
			CRegExp_NFSM &m_r_fsm;

		public:
			inline CAddParallelTransitions(TNFAState *p_initial, TNFAState *p_final,
				CRegExp_NFSM &r_fsm)
				:m_p_initial(p_initial), m_p_final(p_final), m_r_fsm(r_fsm)
			{}

			inline void operator ()(CNode *p_serial_node)
			{
				if(m_p_final) {
					TNFAState *p_final_node;
					if(!(p_final_node = p_serial_node->p_AddTransitions(m_p_initial, m_r_fsm)) ||
					   (p_final_node != m_p_initial && !m_r_fsm.AddTransition(TNFATransition(
					   p_final_node, m_p_final, TNFATransition::tran_Epsilon)))) // changed identity to epsilon here
						m_p_final = 0;
					// create transition for node and in case
				}
			}

			inline operator TNFAState*() const
			{
				return m_p_final;
			}
		};

	public:
		inline CNodeOption()
			:CNodeString(node_Option)
		{}

		virtual TNFAState *p_AddTransitions(TNFAState *p_initial,
			CRegExp_NFSM &r_fsm) const;

#ifdef REG_EXP_COMPILER_DUMP
		virtual void Print(int n_level);
#endif // REG_EXP_COMPILER_DUMP

		inline int n_Size() const
		{
			return m_node_list.size();
		}
	};

	/*
	 *	inline CRegExp_Parser::CRegExp_Parser()
	 *		- default constructor
	 */
	inline CRegExp_Parser()
		:m_b_parse_errors(false)
	{}

	/*
	 *	inline bool CRegExp_Parser::b_ParseErrors() const
	 *		- returns true in case any parse error(s) occured
	 *		- note this doesn't clear the error flag (ie. once error occured
	 *		  it will always return true for one particular parser instance)
	 */
	inline bool b_ParseErrors() const
	{
		return m_b_parse_errors;
	}

	typedef std::vector<CRegExp_Lexer::TToken>::iterator token_iterator;

	/*
	 *	token_iterator CRegExp_Parser::p_Parse_RegularExpression(token_iterator p_begin,
	 *		token_iterator p_end, CNodeString &r_reg_exp)
	 *		- parses regular expression from tokens p_begin trough p_end
	 *		- parse tree nodes are added to r_reg_exp (which should be blank)
	 *		- returns iterator pointing on last processed token
	 */
	token_iterator p_Parse_RegularExpression(token_iterator p_begin,
		token_iterator p_end, CNodeString &r_reg_exp);

protected:
	CNode *p_Parse_RE_Val(token_iterator &r_p_begin, token_iterator p_end);
	CNode *p_Parse_RE_Quant(token_iterator &r_p_begin, token_iterator p_end);
	CNode *p_Parse_RE_Option(token_iterator &r_p_begin, token_iterator p_end);
};

/*
 *								--- ~CRegExp_Parser ---
 */

#endif // !__REG_EXP_INCLUDED
