Fixed a lot of bugs. The parser now actually gets through the entire experimental grammar (a largish experimental grammar for Kraken, written to continue testing and to really start language development).

2013-07-10 23:50:53 -04:00
parent 1c85e3693e
commit f84657f1ed
10 changed files with 151 additions and 60 deletions
--- a/include/ParseAction.h
+++ b/include/ParseAction.h
@@ -18,6 +18,7 @@ class ParseAction {
 		ParseAction(ActionType action, ParseRule* reduceRule);
 		ParseAction(ActionType action, int shiftState);
 		~ParseAction();
 		bool const equalsExceptLookahead(const ParseAction &other);
 		bool const operator==(const ParseAction &other);
 		bool const operator!=(const ParseAction &other);
 		std::string toString();
--- a/include/Parser.h
+++ b/include/Parser.h
@@ -24,6 +24,7 @@ class Parser {
 		void loadGrammer(std::string grammerInputString);
 		std::vector<Symbol*>* firstSet(Symbol* token);
 		std::vector<Symbol*>* firstSet(Symbol* token, std::vector<Symbol*> &avoidList);
 		void printFirstSets();
 		std::vector<Symbol*>* incrementiveFollowSet(ParseRule* rule);
 		void createStateSet();
--- a/include/RegEx.h
+++ b/include/RegEx.h
@@ -16,6 +16,8 @@ class RegEx {
 		RegEx(std::string inPattern);
 		~RegEx();
 		void construct();
 		void deperenthesize();
 		int longMatch(std::string stringToMatch);
 		std::string getPattern();
 		std::string toString();
--- a/main.cpp
+++ b/main.cpp
@@ -45,18 +45,23 @@ int main(int argc, char* argv[]) {
 	Parser parser;
 	parser.loadGrammer(grammerInputFileString);
 	//std::cout << "Creating State Set from Main" << std::endl;
 	std::cout << "\n\n\n\n\n\n\n\n\n\nState Set" << std::endl;
 	parser.createStateSet();
 	//std::cout << "finished State Set from Main" << std::endl;
 	//std::cout << "Doing stateSetToString from Main" << std::endl;
 	std::cout << "\n\n\n\n\n\n\n\n\n\nState Set toString" << std::endl;
 	std::cout << parser.stateSetToString() << std::endl;
 	//std::cout << "finished stateSetToString from Main" << std::endl;
 	std::cout << "\n\n\n\n\n\n\n\n\n\nTable" << std::endl;
 	std::cout << parser.tableToString() << std::endl;
-
+	std::cout << "\n\n\n\n\n\n\n\n\n\nGrammer Input File" << std::endl;
 	std::cout << grammerInputFileString << std::endl;
 	std::cout << "\n\n\n\n\n\n\n\n\n\nGrammer toString" << std::endl;
 	std::cout << parser.grammerToString() << std::endl;
 	//std::cout << parser.grammerToDOT() << std::endl;
 	//outFile << parser.grammerToDOT() << std::endl;
 	std::cout << "\n\n\n\n\n\n\n\n\n\nParsing" << std::endl;
 	std::cout << programInputFileString << std::endl;
 	NodeTree* parseTree = parser.parseInput(programInputFileString);
--- a/src/Lexer.cpp
+++ b/src/Lexer.cpp
@@ -23,7 +23,7 @@ void Lexer::addRegEx(std::string regExString) {
 }
 Symbol* Lexer::next() {
-	std::cout << "Current at is " << input.substr(currentPosition,input.length()-1) << " currentPos is " << currentPosition <<std::endl;
+	std::cout << "Current at is \"" << input.substr(currentPosition,input.length()-1) << "\" currentPos is " << currentPosition <<std::endl;
 	//If we're at the end, return an eof
 	if (currentPosition == input.length()-1)
 		return new Symbol("$EOF$", true);
@@ -31,7 +31,7 @@ Symbol* Lexer::next() {
 	RegEx* longestRegEx = NULL;
 	std::string remainingString = input.substr(currentPosition,input.length()-1);
 	for (std::vector<RegEx*>::size_type i = 0; i < regExs.size(); i++) {
-		std::cout << "Trying regex " << regExs[i]->toString() << std::endl;
+		std::cout << "Trying regex " << regExs[i]->getPattern() << std::endl;
 		int currentMatch = regExs[i]->longMatch(remainingString);
 		if (currentMatch > longestMatch) {
 			longestMatch = currentMatch;
@@ -40,11 +40,11 @@ Symbol* Lexer::next() {
 	}
 	if (longestRegEx != NULL) {
 		currentPosition += longestMatch + 1;
-		std::cout << "Current at is " << input.substr(currentPosition,input.length()-1) << " currentPos is " << currentPosition <<std::endl;
+	std::cout << "Current at is \"" << input.substr(currentPosition,input.length()-1) << "\" currentPos is " << currentPosition <<std::endl;
 		return new Symbol(longestRegEx->getPattern(), true);
 	} else {
 		std::cout << "Found no applicable regex" << std::endl;
 		std::cout << "Remaining is " << input.substr(currentPosition,input.length()-1) << std::endl;
-		return NULL;
+		return new Symbol("$NO_APPLICABLE_REGEX$", true);
 	}
 }
--- a/src/ParseAction.cpp
+++ b/src/ParseAction.cpp
@@ -22,6 +22,9 @@ ParseAction::~ParseAction() {
 }
 //Compares this action with other using ParseRule::equalsExceptLookahead for the reduce rules.
 //Returns true when the action types match, the shift states match, and the reduce rules are
 //either the same pointer or compare equal via ParseRule::equalsExceptLookahead.
 //NOTE(review): when the two reduceRule pointers differ, both are dereferenced - this assumes
 //neither is NULL in that case; confirm against the constructors.
 const bool ParseAction::equalsExceptLookahead(const ParseAction &other) {
 	return( action == other.action && ( reduceRule == other.reduceRule || reduceRule->equalsExceptLookahead(*(other.reduceRule)) ) && shiftState == other.shiftState);
 }
 const bool ParseAction::operator==(const ParseAction &other) {
 	return( action == other.action && ( reduceRule == other.reduceRule || *reduceRule == *(other.reduceRule) ) && shiftState == other.shiftState);
--- a/src/Parser.cpp
+++ b/src/Parser.cpp
@@ -73,6 +73,21 @@ void Parser::loadGrammer(std::string grammerInputString) {
 }
 //Convenience entry point for computing the first set of token.
 //Starts with an empty avoid list and forwards to the two-argument overload, which uses that
 //list to skip symbols it has already visited.
 //Returns whatever the two-argument overload returns.
 //NOTE(review): the result appears to be heap-allocated and owned by the caller - confirm.
 std::vector<Symbol*>* Parser::firstSet(Symbol* token) {
 	//std::cout << "Simple first set for " << token->toString() << std::endl;
 	//Fresh avoid list for this top-level call; it is passed by reference to the overload
 	std::vector<Symbol*> avoidList;
 	return firstSet(token, avoidList);
 }
 std::vector<Symbol*>* Parser::firstSet(Symbol* token, std::vector<Symbol*> &avoidList) {
 	//If we've already done this token, don't do it again
 	for (std::vector<Symbol*>::size_type i = 0; i < avoidList.size(); i++)
 		if (*(avoidList[i]) == *token) {
 			return new std::vector<Symbol*>();
 			//std::cout << "Avoiding firstSet for " << token->toString() << std::endl;
 		}
 	avoidList.push_back(token);
 	//std::cout << "Cpx first set for " << token->toString() << std::endl;
 	//std::cout << "Doing first set for " << token->toString() << std::endl;
 	std::vector<Symbol*>* first = new std::vector<Symbol*>();
 	//First, if the symbol is a terminal, than it's first set is just itself.
 	if (token->isTerminal()) {
@@ -96,7 +111,7 @@ std::vector<Symbol*>* Parser::firstSet(Symbol* token) {
 					recursiveFirstSet->push_back(rightToken);
 				} else {
 					//Add the entire set
-					recursiveFirstSet = firstSet(rightToken);
+					recursiveFirstSet = firstSet(rightToken, avoidList);
 				}
 				first->insert(first->end(), recursiveFirstSet->begin(), recursiveFirstSet->end());
 				//Check to see if the current recursiveFirstSet contains NULL, if so, then go through again with the next token. (if there is one)
@@ -106,6 +121,7 @@ std::vector<Symbol*>* Parser::firstSet(Symbol* token) {
 						recFirstSetHasNull = true;
 					}
 				}
 				delete recursiveFirstSet;
 				j++;
 			} while (recFirstSetHasNull && loadedGrammer[i]->getRightSide().size() > j);
 		}
@@ -159,6 +175,7 @@ std::vector<Symbol*>* Parser::incrementiveFollowSet(ParseRule* rule) {
 		for (std::vector<Symbol*>::size_type i = 0; i < symbolFirstSet->size(); i++) {
 			if (*((*symbolFirstSet)[i]) == *nullSymbol) {
 				symbolFirstSetHasNull = true;
 				symbolFirstSet->erase(symbolFirstSet->begin()+i);
 				break;
 			}
 		}
@@ -170,7 +187,17 @@ std::vector<Symbol*>* Parser::incrementiveFollowSet(ParseRule* rule) {
 		symbolFirstSet = rule->getLookahead();
 		followSet->insert(followSet->end(), symbolFirstSet->begin(), symbolFirstSet->end());
 	}
-	return followSet;
+	std::vector<Symbol*>* followSetReturn = new std::vector<Symbol*>();
 	for (std::vector<Symbol*>::size_type i = 0; i < followSet->size(); i++) {
 		bool alreadyIn = false;
 		for (std::vector<Symbol*>::size_type j = 0; j < followSetReturn->size(); j++)
 			if (*((*followSet)[i]) == *((*followSetReturn)[j]))
 				alreadyIn = true;
 		if (!alreadyIn)
 			followSetReturn->push_back((*followSet)[i]);
 	}
 	delete followSet;
 	return followSetReturn;
 }
 void Parser::closure(State* state) {
@@ -185,6 +212,7 @@ void Parser::closure(State* state) {
 			if ( !currentStateRule->isAtEnd() && *(currentStateRule->getAtNextIndex()) == *(currentGramRule->getLeftSide())) {
 				//std::cout << (*stateTotal)[i]->getAtNextIndex()->toString() << " has an applicable production " << loadedGrammer[j]->toString() << std::endl;
 				//Now, add the correct lookahead. This followSet is built based on the current rule's lookahead if at end, or the next Symbol's first set.
 				//std::cout << "Setting lookahead for " << currentGramRule->toString() << " in state " << state->toString() << std::endl;
 				currentGramRule->setLookahead(incrementiveFollowSet(currentStateRule));
 				//Check to make sure not already in
@@ -340,7 +368,7 @@ void Parser::addToTable(State* fromState, Symbol* tranSymbol, ParseAction* actio
 		(*(table[stateNum]))[symbolIndex] = action;
 	}
 	//If the slot is not empty and does not contain ourself, then it is a conflict
-	else if ( *((*(table[stateNum]))[symbolIndex]) != *action) {
+	else if ( !(*(table[stateNum]))[symbolIndex]->equalsExceptLookahead(*action)) {
 		//std::cout << "not Null!" << std::endl;
 		std::cout << "State: " << stateNum << " Conflict between old: " << (*(table[stateNum]))[symbolIndex]->toString() << " and new: " << action->toString() << std::endl; 
 		//Don't overwrite
--- a/src/RegEx.cpp
+++ b/src/RegEx.cpp
@@ -2,9 +2,15 @@
 //Builds a RegEx from the pattern string inPattern.
 //Stores the pattern, then runs the two build passes in order: construct() followed by
 //deperenthesize().
 RegEx::RegEx(std::string inPattern) {
 	pattern = inPattern;
 	//Order matters: deperenthesize() operates on what construct() built
 	construct();
 	deperenthesize();
 }
 void RegEx::construct() {
 	std::vector<RegExState*> previousStates;
 	std::vector<RegExState*> currentStates;
-	std::stack<std::pair<std::vector<RegExState*>, RegExState*> > perenStack;
+	std::stack<std::pair<std::vector<RegExState*>, std::vector<RegExState*> > > perenStack;
 	bool alternating = false;
 	begin = new RegExState();
 	currentStates.push_back(begin);
 	for (int i = 0; i < pattern.length(); i++) {
@@ -42,11 +48,7 @@ RegEx::RegEx(std::string inPattern) {
 			{
 				std::cout << "Alternation at " << i << " in " << pattern << std::endl;
 				//alternation
-				i++;
+				alternating = true;
 				RegExState* next = new RegExState(pattern[i]);
 				for (std::vector<RegExState*>::size_type j = 0; j < previousStates.size(); j++)
 					previousStates[j]->addNext(next);
 				currentStates.push_back(next);
 			}
 				break;
@@ -57,17 +59,35 @@ RegEx::RegEx(std::string inPattern) {
 				//Create a peren node with an inner empty node
 				RegExState* next = new RegExState(new RegExState());
-				for (std::vector<RegExState*>::size_type j = 0; j < currentStates.size(); j++)
+				if (alternating) {
-					currentStates[j]->addNext(next);
+					for (std::vector<RegExState*>::size_type j = 0; j < previousStates.size(); j++)
 						previousStates[j]->addNext(next);
-				previousStates.clear();
+					//Save both current states here as well as the current preren
-				//Save both current states here as well as the current preren
+					std::vector<RegExState*> savePreviousStates = previousStates;
-				std::vector<RegExState*> saveStates = currentStates;
+					currentStates.push_back(next);
-				// saveStates.insert(saveStates.end(), currentStates.begin(), currentStates.end())
+					std::vector<RegExState*> saveCurrentStates = currentStates;
-				perenStack.push(std::make_pair(saveStates, next));
+					perenStack.push(std::make_pair(savePreviousStates, saveCurrentStates));
-				currentStates.clear();
+					previousStates.clear();
-				currentStates.push_back(next->getInner());
+					currentStates.clear();
 					currentStates.push_back(next->getInner());
 					alternating = false;
 				} else {
 					for (std::vector<RegExState*>::size_type j = 0; j < currentStates.size(); j++)
 						currentStates[j]->addNext(next);
 					//Save both current states here as well as the current preren
 					std::vector<RegExState*> savePreviousStates = currentStates;
 					currentStates.clear();
 					currentStates.push_back(next);
 					std::vector<RegExState*> saveCurrentStates = currentStates;
 					perenStack.push(std::make_pair(savePreviousStates, saveCurrentStates));
 					previousStates.clear();
 					currentStates.clear();
 					currentStates.push_back(next->getInner());
 				}
 				std::cout << "Peren is " << next << " Inner is " << currentStates[0] << " = " << next->getInner() << std::endl;
 			}
 				break;
@@ -77,15 +97,16 @@ RegEx::RegEx(std::string inPattern) {
 				std::cout << "End peren at " << i << " in " << pattern << std::endl;
 				//perentheses
 				//Pop off the states that will now be the previous states and the peren node which will now be the current node
-				std::pair<std::vector<RegExState*>, RegExState*> savedPair = perenStack.top();
+				std::pair<std::vector<RegExState*>, std::vector<RegExState*> > savedPair = perenStack.top();
 				perenStack.pop();
 				//Make the it so
 				previousStates = savedPair.first;
 				//Make sure the end of the inner stuff points back to the peren node
 				for (std::vector<RegExState*>::size_type j = 0; j < currentStates.size(); j++)
-					currentStates[j]->addNext(savedPair.second);
+					currentStates[j]->addNext(savedPair.second[savedPair.second.size()-1]);
 					//currentStates[j]->addNext(*(savedPair.second.end()));
 				currentStates.clear();
-				currentStates.push_back(savedPair.second);
+				currentStates = savedPair.second;
 			}
 				break;
@@ -93,42 +114,42 @@ RegEx::RegEx(std::string inPattern) {
 			{
 				i++;
 				std::cout << "Escape! Escaping: " << pattern[i] << std::endl;
-				//Ahh, it's escaping a special character
+				//Ahh, it's escaping a special character, so fall through to the default.
 				RegExState* next = new RegExState(pattern[i]);
 				for (std::vector<RegExState*>::size_type j = 0; j < currentStates.size(); j++) {
 					currentStates[j]->addNext(next);
 					std::cout << "Adding " << next << ", which is " << pattern[i] << " to " << currentStates[j] << std::endl;
 				}
 				previousStates.clear();
 				// previousStates.insert(previousStates.begin(), currentStates.begin(), currentStates.end());
 				previousStates = currentStates;
 				currentStates.clear();
 				currentStates.push_back(next);
 			}
 				break;
 			default:
 			{
 				std::cout << "Regular" << std::endl;
 				//Ahh, it's regular
 				RegExState* next = new RegExState(pattern[i]);
-				for (std::vector<RegExState*>::size_type j = 0; j < currentStates.size(); j++) {
+				//If we're alternating, add next as the next for each previous state, and add self to currentStates
-					currentStates[j]->addNext(next);
+				if (alternating) {
-					std::cout << "Adding " << next << ", which is " << pattern[i] << " to " << currentStates[j] << std::endl;
+					for (std::vector<RegExState*>::size_type j = 0; j < previousStates.size(); j++) {
 						previousStates[j]->addNext(next);
 						std::cout << "Adding " << next << ", which is " << pattern[i] << " to " << previousStates[j] << std::endl;
 					}
 					currentStates.push_back(next);
 					alternating = false;
 				} else {
 					//If we're not alternating, add next as next for all the current states, make the current states the new
 					//previous states, and add ourself as the new current state.
 					for (std::vector<RegExState*>::size_type j = 0; j < currentStates.size(); j++) {
 						currentStates[j]->addNext(next);
 						std::cout << "Adding " << next << ", which is " << pattern[i] << " to " << currentStates[j] << std::endl;
 					}
 					previousStates.clear();
 					previousStates = currentStates;
 					currentStates.clear();
 					currentStates.push_back(next);
 				}
 				previousStates.clear();
 				// previousStates.insert(previousStates.begin(), currentStates.begin(), currentStates.end());
 				previousStates = currentStates;
 				currentStates.clear();
 				currentStates.push_back(next);
 			}
 		}
 	}
 	//last one is goal state
 	for (std::vector<RegExState*>::size_type i = 0; i < currentStates.size(); i++)
 		currentStates[i]->addNext(NULL);
 }
 void RegEx::deperenthesize() {
 	std::cout << "About to de-perenthesize " << begin->toString() << std::endl;
 	//Now go through and expand the peren nodes to regular nodes
@@ -144,13 +165,13 @@ RegEx::RegEx(std::string inPattern) {
 			if ((*nextStates)[j] != NULL && (*nextStates)[j]->getInner() != NULL) {
 				//Fix all the next references pointing to the peren node to point to the inner nodes. (if more than one, push back to add others)
 				std::vector<RegExState*>* insideNextStates = (*nextStates)[j]->getInner()->getNextStates();
-				std::cout << "insideNextStates = " << insideNextStates << " [0] " << (*insideNextStates)[0] << std::endl;
+				//std::cout << "insideNextStates = " << insideNextStates << " [0] " << (*insideNextStates)[0] << std::endl;
 				RegExState* perenState = (*nextStates)[j];
 				(*nextStates)[j] = (*insideNextStates)[0];
-				std::cout << "So now nextstates[j] = " << (*nextStates)[j] << std::endl;
+				//std::cout << "So now nextstates[j] = " << (*nextStates)[j] << std::endl;
 				for (std::vector<RegExState*>::size_type k = 1; k < insideNextStates->size(); k++)
 					nextStates->push_back((*insideNextStates)[k]);
-				std::cout << "Replaced beginning: " << begin->toString() << std::endl;
+				//std::cout << "Replaced beginning: " << begin->toString() << std::endl;
 				//Now, if the peren node is self-referential (has a repitition operator after i), fix it's self-references in the same manner
 				std::vector<RegExState*>* perenNextNodes = perenState->getNextStates();
 				for (std::vector<RegExState*>::size_type k = 0; k < perenNextNodes->size(); k++) {
@@ -166,17 +187,17 @@ RegEx::RegEx(std::string inPattern) {
 				traversalList.push_back(perenState->getInner());
 				for (std::vector<RegExState*>::size_type k = 0; k < traversalList.size(); k++) {
 					std::vector<RegExState*>* nextTraversalStates = traversalList[k]->getNextStates();
-					std::cout << "Traversing! nextTraversalStates from traversalList " << traversalList[k]  << " char = " << traversalList[k]->getCharacter() << std::endl;
+					//std::cout << "Traversing! nextTraversalStates from traversalList " << traversalList[k]  << " char = " << traversalList[k]->getCharacter() << std::endl;
-					std::cout << "with children: ";
+					//std::cout << "with children:" << std::endl;
-					for (std::vector<RegExState*>::size_type l = 0; l < nextTraversalStates->size(); l++)
+					//for (std::vector<RegExState*>::size_type l = 0; l < nextTraversalStates->size(); l++)
-						std::cout << (*nextTraversalStates)[l]->getCharacter() << " ";
+					//	std::cout << "\t\"" << (*nextTraversalStates)[l]->getCharacter() << "\"" << std::endl;
-					std::cout << std::endl; 
+					//std::cout << std::endl; 
 					for (std::vector<RegExState*>::size_type l = 0; l < nextTraversalStates->size(); l++) {
 						//If this node is equal to the peren node we came from, then that means we've reached the end of the inner part of the peren
 						//And we now replace this reference with the next nodes from the peren node
-						std::cout << "Traversal Next is on " << (*nextTraversalStates)[l]->getCharacter() << std::endl;
+						//std::cout << "Traversal Next is on " << (*nextTraversalStates)[l]->getCharacter() << std::endl;
 						if ((*nextTraversalStates)[l] == perenState) {
-							std::cout << "nextTraversalStates[l] = to perenState!" << std::endl;
+						//	std::cout << "nextTraversalStates[l] = to perenState!" << std::endl;
 							std::vector<RegExState*> endPerenNextStates = *(perenState->getNextStates());
 							(*nextTraversalStates)[l] = endPerenNextStates[0];
 							for (std::vector<RegExState*>::size_type n = 1; n < endPerenNextStates.size(); n++)
--- a/src/RegExState.cpp
+++ b/src/RegExState.cpp
@@ -79,7 +79,7 @@ std::string RegExState::toString(std::vector<RegExState*>* avoid) {
 			}
 		}
 		if (inAvoid) {
-			string += "->LoopDetected";
+			string += "->loop";
 			continue;
 		}
--- a/src/StringReader.cpp
+++ b/src/StringReader.cpp
@@ -67,7 +67,37 @@ std::string StringReader::getTokens(std::vector<std::string> stop_chars, bool tr
    }
    if (rd_string[str_pos] == '\"') {
-        found_pos = rd_string.find("\"", str_pos+1);
+        //See if we have an even or odd number of backslashes (that is, this quote is not or is escaped)
        int numBackslashes = 0;
        int countBack = 1;
        while (str_pos-countBack >= 0 && rd_string[str_pos-countBack] == '\\') {
            numBackslashes++;
            countBack++;
        }
        //If the quote is not escaped
        if (numBackslashes % 2 == 0) {
            //Find the next quote
            found_pos = rd_string.find("\"", str_pos+1);
            //Check to see if the quote is escaped
            numBackslashes = 0;
            countBack = 1;
            while (found_pos-countBack >= 0 && rd_string[found_pos-countBack] == '\\') {
                numBackslashes++;
                countBack++;
            }
            //While the quote is escaped
            while (numBackslashes % 2 == 1) {
                //find the next quote
                found_pos = rd_string.find("\"", found_pos+1);
                //Check to see if it's escaped
                numBackslashes = 0;
                countBack = 1;
                while (found_pos-countBack >= 0 && rd_string[found_pos-countBack] == '\\') {
                    numBackslashes++;
                    countBack++;
                }
            }
        }
    }
    if (found_pos == str_pos)                                   //We are at the endline