The Lexer and RegEx classes now work at a very basic level. Regular expressions are not fully implemented yet, but the structure is in place: for now, each RegEx is a trivial one that accepts only its exact specified character sequence, with none of the actual power of regular expressions.

2013-07-02 01:47:42 -04:00
parent 94a7739bd9
commit 85da0bf646
10 changed files with 72 additions and 16 deletions
@@ -13,7 +13,7 @@ class Lexer {
 		Lexer();
 		Lexer(std::string inputString);
 		~Lexer();
-		void addRegexString(std::string regExString);
+		void addRegEx(std::string regExString);
 		void setInput(std::string inputString);
 		Symbol* next();
 	private:
@@ -32,7 +32,7 @@ class Parser {
 		std::string stateSetToString();
 		void addToTable(State* fromState, Symbol* tranSymbol, ParseAction* action);
 		ParseAction* getTable(int state, Symbol* token);
-		NodeTree* parseInput(Lexer* lexer);
+		NodeTree* parseInput(std::string inputString);

 		std::string grammerToString();
 		std::string grammerToDOT();
@@ -41,6 +41,7 @@ class Parser {

 	private:
 		StringReader reader;
+		Lexer lexer;
 		std::map<std::string, Symbol*> symbols;
 		std::vector<ParseRule*> loadedGrammer;

@@ -15,6 +15,7 @@ class RegEx {

 		int longMatch(std::string stringToMatch);
 		std::string getPattern();
+		std::string toString();
 	private:
 		std::string pattern;
 		RegExState* begin;
@@ -18,6 +18,7 @@ class RegExState {
 		bool characterIs(char inCharacter);
 		std::vector<RegExState*>* advance(char advanceCharacter);
 		bool isGoal();
+		std::string toString();

 	private:
 		std::vector<RegExState*> nextStates;
@@ -59,7 +59,7 @@ int main(int argc, char* argv[]) {
 	//outFile << parser.grammerToDOT() << std::endl;

 	std::cout << programInputFileString << std::endl;
-	NodeTree* parseTree = parser.parseInput(new Lexer(programInputFileString));
+	NodeTree* parseTree = parser.parseInput(programInputFileString);

 	if (parseTree) {
 		std::cout << parseTree->DOTGraphString() << std::endl;
@@ -2,6 +2,7 @@

 Lexer::Lexer() {
 	//Do nothing
+	currentPosition = 0;
 }

 Lexer::Lexer(std::string inputString) {
@@ -17,21 +18,33 @@ void Lexer::setInput(std::string inputString) {
 	input = inputString;
 }

-void Lexer::addRegexString(std::string regExString) {
+void Lexer::addRegEx(std::string regExString) {
 	regExs.push_back(new RegEx(regExString));
 }

 Symbol* Lexer::next() {
+	std::cout << "Current at is " << input.substr(currentPosition,input.length()-1) << " currentPos is " << currentPosition <<std::endl;
+	//If we're at the end, return an eof
+	if (currentPosition == input.length()-1)
+		return new Symbol("$EOF$", false);
 	int longestMatch = 0;
 	RegEx* longestRegEx = NULL;
 	std::string remainingString = input.substr(currentPosition,input.length()-1);
 	for (std::vector<RegEx*>::size_type i = 0; i < regExs.size(); i++) {
+		std::cout << "Trying regex " << regExs[i]->toString() << std::endl;
 		int currentMatch = regExs[i]->longMatch(remainingString);
 		if (currentMatch > longestMatch) {
 			longestMatch = currentMatch;
 			longestRegEx = regExs[i];
 		}
 	}
-	currentPosition += longestMatch;
+	if (longestRegEx != NULL) {
+		currentPosition += longestMatch + 1;
+		std::cout << "Current at is " << input.substr(currentPosition,input.length()-1) << " currentPos is " << currentPosition <<std::endl;
 		return new Symbol(longestRegEx->getPattern(), true);
+	} else {
+		std::cout << "Found no applicable regex" << std::endl;
+		std::cout << "Remaining is " << input.substr(currentPosition,input.length()-1) << std::endl;
+		return NULL;
+	}
 }
@@ -33,7 +33,15 @@ void Parser::loadGrammer(std::string grammerInputString) {
 		//Add the right side, adding new Symbols to symbol map.
 		currToken = reader.word();
 		while (currToken != ";") {
-			currentRule->appendToRight(getOrAddSymbol(currToken, currToken.at(0)=='\"')); //If first character is a ", then is a terminal
+			if (currToken[0] == '\"') {
+				//Remove the quotes
+				currToken = currToken.substr(1,currToken.length()-2);
+				lexer.addRegEx(currToken);
+				currentRule->appendToRight(getOrAddSymbol(currToken, true)); //If first character is a ", then is a terminal
+			} else {
+				currentRule->appendToRight(getOrAddSymbol(currToken, false));
+			}
+
 			currToken = reader.word();
 			//If there are multiple endings to this rule, finish this rule and start a new one with same left handle
 			if (currToken == "|") {
@@ -344,8 +352,9 @@ ParseAction* Parser::getTable(int state, Symbol* token) {
 	return (action);
 }

-NodeTree* Parser::parseInput(Lexer* lexer) {
-	Symbol* token = lexer->next();
+NodeTree* Parser::parseInput(std::string inputString) {
+	lexer.setInput(inputString);
+	Symbol* token = lexer.next();
 	ParseAction* action;

 	stateStack.push(0);
@@ -383,7 +392,7 @@ NodeTree* Parser::parseInput(Lexer* lexer) {
 				std::cout << "Shift " << token->toString() << std::endl;

 				symbolStack.push(token);
-				token = lexer->next();
+				token = lexer.next();
 				stateStack.push(action->shiftState);
 				break;
 			case ParseAction::ACCEPT:
@@ -3,7 +3,7 @@
 RegEx::RegEx(std::string inPattern) {
 	pattern = inPattern;
 	RegExState* current;
-	begin = new RegExState(pattern.at(0));
+	begin = new RegExState(pattern[0]);
 	current = begin;
 	for (int i = 1; i < pattern.length(); i++) {
 		RegExState* next = new RegExState(pattern.at(i));
@@ -17,21 +17,30 @@ RegEx::~RegEx() {
 }

 int RegEx::longMatch(std::string stringToMatch) {
+	//If the beginning character is wrong, exit immediantly. Otherwise, get all the states we can get from adding the second character to the state where we accepted the first
+	if (!begin->characterIs(stringToMatch[0]))
+		return -1;
+	std::cout << "Matched first character: " << stringToMatch[0] << std::endl;
 	int lastMatch = 0;
-	currentStates = *(begin->advance(stringToMatch.at(0)));
+	currentStates = *(begin->advance(stringToMatch[1]));
 	std::vector<RegExState*> nextStates;

-	for (int i = 1; i < stringToMatch.size(); i++) {
+	for (int i = 2; i < stringToMatch.size(); i++) {
 		//Go through every current state. Check to see if it is goal, if so update last goal.
 		//Also, add each state's advance to nextStates
 		for (std::vector<RegExState*>::size_type j = 0; j < currentStates.size(); j++) {
-			if (currentStates[j]->isGoal())
+			if (currentStates[j]->isGoal()) {
 				lastMatch = i-1;
+				std::cout << "Hit goal at " << i << " character: " << stringToMatch[i-1] << std::endl;
+			} else {
+				std::cout << "currentState " << j << ", " << currentStates[j]->toString() << " is not goal" <<std::endl;
+			}
 			std::vector<RegExState*>* addStates = currentStates[j]->advance(stringToMatch.at(i));
 			nextStates.insert(nextStates.end(), addStates->begin(), addStates->end());
 			delete addStates;
 		}
 		//Now, clear our current states and add eaczh one of our addStates if it is not already in current states
+
 		currentStates.clear();
 		for (std::vector<RegExState*>::size_type j = 0; j < nextStates.size(); j++) {
 			bool inCurrStates = false;
@@ -42,6 +51,14 @@ int RegEx::longMatch(std::string stringToMatch) {
 			if (!inCurrStates)
 				currentStates.push_back(nextStates[j]);
 		}
+		if (currentStates.size() != 0)
+			std::cout << "Matched " << i << " character: " << stringToMatch[i-1] << std::endl;
+/*
+		std::cout << "Current states are: ";
+		for (std::vector<RegExState*>::size_type j = 0; j < currentStates.size(); j++)
+			std::cout << currentStates[j]->toString() << " ";
+		std::cout << std::endl;
+*/
 		nextStates.clear();
 		//If we can't continue matching, just return our last matched
 		if (currentStates.size() == 0)
@@ -58,3 +75,7 @@ int RegEx::longMatch(std::string stringToMatch) {
 std::string RegEx::getPattern() {
 	return pattern;
 }
+
+std::string RegEx::toString() {
+	return pattern + " -> " + begin->toString();
+}
@@ -6,6 +6,7 @@ RegExState::RegExState(RegExState* inInnerState) {

 RegExState::RegExState(char inCharacter) {
 	character = inCharacter;
+	inner = NULL;
 }

 RegExState::~RegExState() {
@@ -33,4 +34,13 @@ bool RegExState::isGoal() {
 	return inner == NULL && nextStates.size() == 0;
 }

+std::string RegExState::toString() {
+	std::string string = "";
+	string += character;
+	for (std::vector<RegExState*>::size_type i = 0; i < nextStates.size(); i++)
+		string += "->" + nextStates[i]->toString() + " EC ";
+	//std::cout << "inner = " << inner << " nextStates size = " << nextStates.size() <<std::endl;
+	return string;
+}
+

@@ -4,7 +4,7 @@ Symbol::Symbol(std::string name, bool isTerminal) {
 	this->name = name;
 	this->terminal = isTerminal;
 	this->subTree = NULL;
-	value = "HAHAHA VALUE";
+	value = "NoValue";
 }

 Symbol::Symbol(std::string name, bool isTerminal, std::string value) {