Fixed a lot of bugs. Actually gets through the entire experimental grammar. (A largeish experimental grammar for Kraken, written to continue testing and to really start language development.)

2013-07-10 23:50:53 -04:00
parent 1c85e3693e
commit f84657f1ed
10 changed files with 151 additions and 60 deletions
@@ -18,6 +18,7 @@ class ParseAction {
 		ParseAction(ActionType action, ParseRule* reduceRule);
 		ParseAction(ActionType action, int shiftState);
 		~ParseAction();
+		bool const equalsExceptLookahead(const ParseAction &other);
 		bool const operator==(const ParseAction &other);
 		bool const operator!=(const ParseAction &other);
 		std::string toString();
@@ -24,6 +24,7 @@ class Parser {

 		void loadGrammer(std::string grammerInputString);
 		std::vector<Symbol*>* firstSet(Symbol* token);
+		std::vector<Symbol*>* firstSet(Symbol* token, std::vector<Symbol*> &avoidList);
 		void printFirstSets();
 		std::vector<Symbol*>* incrementiveFollowSet(ParseRule* rule);
 		void createStateSet();
@@ -16,6 +16,8 @@ class RegEx {
 		RegEx(std::string inPattern);
 		~RegEx();

+		void construct();
+		void deperenthesize();
 		int longMatch(std::string stringToMatch);
 		std::string getPattern();
 		std::string toString();
@@ -45,18 +45,23 @@ int main(int argc, char* argv[]) {
 	Parser parser;
 	parser.loadGrammer(grammerInputFileString);
 	//std::cout << "Creating State Set from Main" << std::endl;
+	std::cout << "\n\n\n\n\n\n\n\n\n\nState Set" << std::endl;
 	parser.createStateSet();
 	//std::cout << "finished State Set from Main" << std::endl;
 	//std::cout << "Doing stateSetToString from Main" << std::endl;
+	std::cout << "\n\n\n\n\n\n\n\n\n\nState Set toString" << std::endl;
 	std::cout << parser.stateSetToString() << std::endl;
 	//std::cout << "finished stateSetToString from Main" << std::endl;
+	std::cout << "\n\n\n\n\n\n\n\n\n\nTable" << std::endl;
 	std::cout << parser.tableToString() << std::endl;
-
+	std::cout << "\n\n\n\n\n\n\n\n\n\nGrammer Input File" << std::endl;
 	std::cout << grammerInputFileString << std::endl;
+	std::cout << "\n\n\n\n\n\n\n\n\n\nGrammer toString" << std::endl;
 	std::cout << parser.grammerToString() << std::endl;
 	//std::cout << parser.grammerToDOT() << std::endl;

 	//outFile << parser.grammerToDOT() << std::endl;
+	std::cout << "\n\n\n\n\n\n\n\n\n\nParsing" << std::endl;

 	std::cout << programInputFileString << std::endl;
 	NodeTree* parseTree = parser.parseInput(programInputFileString);
@@ -23,7 +23,7 @@ void Lexer::addRegEx(std::string regExString) {
 }

 Symbol* Lexer::next() {
-	std::cout << "Current at is " << input.substr(currentPosition,input.length()-1) << " currentPos is " << currentPosition <<std::endl;
+	std::cout << "Current at is \"" << input.substr(currentPosition,input.length()-1) << "\" currentPos is " << currentPosition <<std::endl;
 	//If we're at the end, return an eof
 	if (currentPosition == input.length()-1)
 		return new Symbol("$EOF$", true);
@@ -31,7 +31,7 @@ Symbol* Lexer::next() {
 	RegEx* longestRegEx = NULL;
 	std::string remainingString = input.substr(currentPosition,input.length()-1);
 	for (std::vector<RegEx*>::size_type i = 0; i < regExs.size(); i++) {
-		std::cout << "Trying regex " << regExs[i]->toString() << std::endl;
+		std::cout << "Trying regex " << regExs[i]->getPattern() << std::endl;
 		int currentMatch = regExs[i]->longMatch(remainingString);
 		if (currentMatch > longestMatch) {
 			longestMatch = currentMatch;
@@ -40,11 +40,11 @@ Symbol* Lexer::next() {
 	}
 	if (longestRegEx != NULL) {
 		currentPosition += longestMatch + 1;
-		std::cout << "Current at is " << input.substr(currentPosition,input.length()-1) << " currentPos is " << currentPosition <<std::endl;
+	std::cout << "Current at is \"" << input.substr(currentPosition,input.length()-1) << "\" currentPos is " << currentPosition <<std::endl;
 		return new Symbol(longestRegEx->getPattern(), true);
 	} else {
 		std::cout << "Found no applicable regex" << std::endl;
 		std::cout << "Remaining is " << input.substr(currentPosition,input.length()-1) << std::endl;
-		return NULL;
+		return new Symbol("$NO_APPLICABLE_REGEX$", true);
 	}
 }
@@ -22,6 +22,9 @@ ParseAction::~ParseAction() {

 }

+const bool ParseAction::equalsExceptLookahead(const ParseAction &other) {
+	return( action == other.action && ( reduceRule == other.reduceRule || reduceRule->equalsExceptLookahead(*(other.reduceRule)) ) && shiftState == other.shiftState);
+}

 const bool ParseAction::operator==(const ParseAction &other) {
 	return( action == other.action && ( reduceRule == other.reduceRule || *reduceRule == *(other.reduceRule) ) && shiftState == other.shiftState);
@@ -73,6 +73,21 @@ void Parser::loadGrammer(std::string grammerInputString) {
 }

 std::vector<Symbol*>* Parser::firstSet(Symbol* token) {
+	//std::cout << "Simple first set for " << token->toString() << std::endl;
+	std::vector<Symbol*> avoidList;
+	return firstSet(token, avoidList);
+}
+
+std::vector<Symbol*>* Parser::firstSet(Symbol* token, std::vector<Symbol*> &avoidList) {
+	//If we've already done this token, don't do it again
+	for (std::vector<Symbol*>::size_type i = 0; i < avoidList.size(); i++)
+		if (*(avoidList[i]) == *token) {
+			return new std::vector<Symbol*>();
+			//std::cout << "Avoiding firstSet for " << token->toString() << std::endl;
+		}
+	avoidList.push_back(token);
+	//std::cout << "Cpx first set for " << token->toString() << std::endl;
+	//std::cout << "Doing first set for " << token->toString() << std::endl;
 	std::vector<Symbol*>* first = new std::vector<Symbol*>();
 	//First, if the symbol is a terminal, than it's first set is just itself.
 	if (token->isTerminal()) {
@@ -96,7 +111,7 @@ std::vector<Symbol*>* Parser::firstSet(Symbol* token) {
 					recursiveFirstSet->push_back(rightToken);
 				} else {
 					//Add the entire set
-					recursiveFirstSet = firstSet(rightToken);
+					recursiveFirstSet = firstSet(rightToken, avoidList);
 				}
 				first->insert(first->end(), recursiveFirstSet->begin(), recursiveFirstSet->end());
 				//Check to see if the current recursiveFirstSet contains NULL, if so, then go through again with the next token. (if there is one)
@@ -106,6 +121,7 @@ std::vector<Symbol*>* Parser::firstSet(Symbol* token) {
 						recFirstSetHasNull = true;
 					}
 				}
+				delete recursiveFirstSet;
 				j++;
 			} while (recFirstSetHasNull && loadedGrammer[i]->getRightSide().size() > j);
 		}
@@ -159,6 +175,7 @@ std::vector<Symbol*>* Parser::incrementiveFollowSet(ParseRule* rule) {
 		for (std::vector<Symbol*>::size_type i = 0; i < symbolFirstSet->size(); i++) {
 			if (*((*symbolFirstSet)[i]) == *nullSymbol) {
 				symbolFirstSetHasNull = true;
+				symbolFirstSet->erase(symbolFirstSet->begin()+i);
 				break;
 			}
 		}
@@ -170,7 +187,17 @@ std::vector<Symbol*>* Parser::incrementiveFollowSet(ParseRule* rule) {
 		symbolFirstSet = rule->getLookahead();
 		followSet->insert(followSet->end(), symbolFirstSet->begin(), symbolFirstSet->end());
 	}
-	return followSet;
+	std::vector<Symbol*>* followSetReturn = new std::vector<Symbol*>();
+	for (std::vector<Symbol*>::size_type i = 0; i < followSet->size(); i++) {
+		bool alreadyIn = false;
+		for (std::vector<Symbol*>::size_type j = 0; j < followSetReturn->size(); j++)
+			if (*((*followSet)[i]) == *((*followSetReturn)[j]))
+				alreadyIn = true;
+		if (!alreadyIn)
+			followSetReturn->push_back((*followSet)[i]);
+	}
+	delete followSet;
+	return followSetReturn;
 }

 void Parser::closure(State* state) {
@@ -185,6 +212,7 @@ void Parser::closure(State* state) {
 			if ( !currentStateRule->isAtEnd() && *(currentStateRule->getAtNextIndex()) == *(currentGramRule->getLeftSide())) {
 				//std::cout << (*stateTotal)[i]->getAtNextIndex()->toString() << " has an applicable production " << loadedGrammer[j]->toString() << std::endl;
 				//Now, add the correct lookahead. This followSet is built based on the current rule's lookahead if at end, or the next Symbol's first set.
+				//std::cout << "Setting lookahead for " << currentGramRule->toString() << " in state " << state->toString() << std::endl;
 				currentGramRule->setLookahead(incrementiveFollowSet(currentStateRule));

 				//Check to make sure not already in
@@ -340,7 +368,7 @@ void Parser::addToTable(State* fromState, Symbol* tranSymbol, ParseAction* actio
 		(*(table[stateNum]))[symbolIndex] = action;
 	}
 	//If the slot is not empty and does not contain ourself, then it is a conflict
-	else if ( *((*(table[stateNum]))[symbolIndex]) != *action) {
+	else if ( !(*(table[stateNum]))[symbolIndex]->equalsExceptLookahead(*action)) {
 		//std::cout << "not Null!" << std::endl;
 		std::cout << "State: " << stateNum << " Conflict between old: " << (*(table[stateNum]))[symbolIndex]->toString() << " and new: " << action->toString() << std::endl; 
 		//Don't overwrite
@@ -2,9 +2,15 @@

 RegEx::RegEx(std::string inPattern) {
 	pattern = inPattern;
+	construct();
+	deperenthesize();
+}
+
+void RegEx::construct() {
 	std::vector<RegExState*> previousStates;
 	std::vector<RegExState*> currentStates;
-	std::stack<std::pair<std::vector<RegExState*>, RegExState*> > perenStack;
+	std::stack<std::pair<std::vector<RegExState*>, std::vector<RegExState*> > > perenStack;
+	bool alternating = false;
 	begin = new RegExState();
 	currentStates.push_back(begin);
 	for (int i = 0; i < pattern.length(); i++) {
@@ -42,11 +48,7 @@ RegEx::RegEx(std::string inPattern) {
 			{
 				std::cout << "Alternation at " << i << " in " << pattern << std::endl;
 				//alternation
-				i++;
-				RegExState* next = new RegExState(pattern[i]);
-				for (std::vector<RegExState*>::size_type j = 0; j < previousStates.size(); j++)
-					previousStates[j]->addNext(next);
-				currentStates.push_back(next);
+				alternating = true;
 			}

 				break;
@@ -57,17 +59,35 @@ RegEx::RegEx(std::string inPattern) {
 				//Create a peren node with an inner empty node
 				RegExState* next = new RegExState(new RegExState());

-				for (std::vector<RegExState*>::size_type j = 0; j < currentStates.size(); j++)
-					currentStates[j]->addNext(next);
+				if (alternating) {
+					for (std::vector<RegExState*>::size_type j = 0; j < previousStates.size(); j++)
+						previousStates[j]->addNext(next);

-				previousStates.clear();
-				//Save both current states here as well as the current preren
-				std::vector<RegExState*> saveStates = currentStates;
-				// saveStates.insert(saveStates.end(), currentStates.begin(), currentStates.end())
-				perenStack.push(std::make_pair(saveStates, next));
+					//Save both current states here as well as the current preren
+					std::vector<RegExState*> savePreviousStates = previousStates;
+					currentStates.push_back(next);
+					std::vector<RegExState*> saveCurrentStates = currentStates;
+					perenStack.push(std::make_pair(savePreviousStates, saveCurrentStates));

-				currentStates.clear();
-				currentStates.push_back(next->getInner());
+					previousStates.clear();
+					currentStates.clear();
+					currentStates.push_back(next->getInner());
+					alternating = false;
+				} else {
+					for (std::vector<RegExState*>::size_type j = 0; j < currentStates.size(); j++)
+						currentStates[j]->addNext(next);
+
+					//Save both current states here as well as the current preren
+					std::vector<RegExState*> savePreviousStates = currentStates;
+					currentStates.clear();
+					currentStates.push_back(next);
+					std::vector<RegExState*> saveCurrentStates = currentStates;
+					perenStack.push(std::make_pair(savePreviousStates, saveCurrentStates));
+
+					previousStates.clear();
+					currentStates.clear();
+					currentStates.push_back(next->getInner());
+				}
 				std::cout << "Peren is " << next << " Inner is " << currentStates[0] << " = " << next->getInner() << std::endl;
 			}
 				break;
@@ -77,15 +97,16 @@ RegEx::RegEx(std::string inPattern) {
 				std::cout << "End peren at " << i << " in " << pattern << std::endl;
 				//perentheses
 				//Pop off the states that will now be the previous states and the peren node which will now be the current node
-				std::pair<std::vector<RegExState*>, RegExState*> savedPair = perenStack.top();
+				std::pair<std::vector<RegExState*>, std::vector<RegExState*> > savedPair = perenStack.top();
 				perenStack.pop();
 				//Make the it so
 				previousStates = savedPair.first;
 				//Make sure the end of the inner stuff points back to the peren node
 				for (std::vector<RegExState*>::size_type j = 0; j < currentStates.size(); j++)
-					currentStates[j]->addNext(savedPair.second);
+					currentStates[j]->addNext(savedPair.second[savedPair.second.size()-1]);
+					//currentStates[j]->addNext(*(savedPair.second.end()));
 				currentStates.clear();
-				currentStates.push_back(savedPair.second);
+				currentStates = savedPair.second;
 			}
 				break;

@@ -93,42 +114,42 @@ RegEx::RegEx(std::string inPattern) {
 			{
 				i++;
 				std::cout << "Escape! Escaping: " << pattern[i] << std::endl;
-				//Ahh, it's escaping a special character
-				RegExState* next = new RegExState(pattern[i]);
-				for (std::vector<RegExState*>::size_type j = 0; j < currentStates.size(); j++) {
-					currentStates[j]->addNext(next);
-					std::cout << "Adding " << next << ", which is " << pattern[i] << " to " << currentStates[j] << std::endl;
-				}
-
-				previousStates.clear();
-				// previousStates.insert(previousStates.begin(), currentStates.begin(), currentStates.end());
-				previousStates = currentStates;
-				currentStates.clear();
-				currentStates.push_back(next);
+				//Ahh, it's escaping a special character, so fall through to the default.
 			}
-				break;
 			default:
 			{
 				std::cout << "Regular" << std::endl;
 				//Ahh, it's regular
 				RegExState* next = new RegExState(pattern[i]);
-				for (std::vector<RegExState*>::size_type j = 0; j < currentStates.size(); j++) {
-					currentStates[j]->addNext(next);
-					std::cout << "Adding " << next << ", which is " << pattern[i] << " to " << currentStates[j] << std::endl;
+				//If we're alternating, add next as the next for each previous state, and add self to currentStates
+				if (alternating) {
+					for (std::vector<RegExState*>::size_type j = 0; j < previousStates.size(); j++) {
+						previousStates[j]->addNext(next);
+						std::cout << "Adding " << next << ", which is " << pattern[i] << " to " << previousStates[j] << std::endl;
+					}
+					currentStates.push_back(next);
+					alternating = false;
+				} else {
+					//If we're not alternating, add next as next for all the current states, make the current states the new
+					//previous states, and add ourself as the new current state.
+					for (std::vector<RegExState*>::size_type j = 0; j < currentStates.size(); j++) {
+						currentStates[j]->addNext(next);
+						std::cout << "Adding " << next << ", which is " << pattern[i] << " to " << currentStates[j] << std::endl;
+					}
+					previousStates.clear();
+					previousStates = currentStates;
+					currentStates.clear();
+					currentStates.push_back(next);
 				}
-
-				previousStates.clear();
-				// previousStates.insert(previousStates.begin(), currentStates.begin(), currentStates.end());
-				previousStates = currentStates;
-				currentStates.clear();
-				currentStates.push_back(next);
 			}
 		}
 	}
 	//last one is goal state
 	for (std::vector<RegExState*>::size_type i = 0; i < currentStates.size(); i++)
 		currentStates[i]->addNext(NULL);
+}

+void RegEx::deperenthesize() {
 	std::cout << "About to de-perenthesize " << begin->toString() << std::endl;

 	//Now go through and expand the peren nodes to regular nodes
@@ -144,13 +165,13 @@ RegEx::RegEx(std::string inPattern) {
 			if ((*nextStates)[j] != NULL && (*nextStates)[j]->getInner() != NULL) {
 				//Fix all the next references pointing to the peren node to point to the inner nodes. (if more than one, push back to add others)
 				std::vector<RegExState*>* insideNextStates = (*nextStates)[j]->getInner()->getNextStates();
-				std::cout << "insideNextStates = " << insideNextStates << " [0] " << (*insideNextStates)[0] << std::endl;
+				//std::cout << "insideNextStates = " << insideNextStates << " [0] " << (*insideNextStates)[0] << std::endl;
 				RegExState* perenState = (*nextStates)[j];
 				(*nextStates)[j] = (*insideNextStates)[0];
-				std::cout << "So now nextstates[j] = " << (*nextStates)[j] << std::endl;
+				//std::cout << "So now nextstates[j] = " << (*nextStates)[j] << std::endl;
 				for (std::vector<RegExState*>::size_type k = 1; k < insideNextStates->size(); k++)
 					nextStates->push_back((*insideNextStates)[k]);
-				std::cout << "Replaced beginning: " << begin->toString() << std::endl;
+				//std::cout << "Replaced beginning: " << begin->toString() << std::endl;
 				//Now, if the peren node is self-referential (has a repitition operator after i), fix it's self-references in the same manner
 				std::vector<RegExState*>* perenNextNodes = perenState->getNextStates();
 				for (std::vector<RegExState*>::size_type k = 0; k < perenNextNodes->size(); k++) {
@@ -166,17 +187,17 @@ RegEx::RegEx(std::string inPattern) {
 				traversalList.push_back(perenState->getInner());
 				for (std::vector<RegExState*>::size_type k = 0; k < traversalList.size(); k++) {
 					std::vector<RegExState*>* nextTraversalStates = traversalList[k]->getNextStates();
-					std::cout << "Traversing! nextTraversalStates from traversalList " << traversalList[k]  << " char = " << traversalList[k]->getCharacter() << std::endl;
-					std::cout << "with children: ";
-					for (std::vector<RegExState*>::size_type l = 0; l < nextTraversalStates->size(); l++)
-						std::cout << (*nextTraversalStates)[l]->getCharacter() << " ";
-					std::cout << std::endl; 
+					//std::cout << "Traversing! nextTraversalStates from traversalList " << traversalList[k]  << " char = " << traversalList[k]->getCharacter() << std::endl;
+					//std::cout << "with children:" << std::endl;
+					//for (std::vector<RegExState*>::size_type l = 0; l < nextTraversalStates->size(); l++)
+					//	std::cout << "\t\"" << (*nextTraversalStates)[l]->getCharacter() << "\"" << std::endl;
+					//std::cout << std::endl; 
 					for (std::vector<RegExState*>::size_type l = 0; l < nextTraversalStates->size(); l++) {
 						//If this node is equal to the peren node we came from, then that means we've reached the end of the inner part of the peren
 						//And we now replace this reference with the next nodes from the peren node
-						std::cout << "Traversal Next is on " << (*nextTraversalStates)[l]->getCharacter() << std::endl;
+						//std::cout << "Traversal Next is on " << (*nextTraversalStates)[l]->getCharacter() << std::endl;
 						if ((*nextTraversalStates)[l] == perenState) {
-							std::cout << "nextTraversalStates[l] = to perenState!" << std::endl;
+						//	std::cout << "nextTraversalStates[l] = to perenState!" << std::endl;
 							std::vector<RegExState*> endPerenNextStates = *(perenState->getNextStates());
 							(*nextTraversalStates)[l] = endPerenNextStates[0];
 							for (std::vector<RegExState*>::size_type n = 1; n < endPerenNextStates.size(); n++)
@@ -79,7 +79,7 @@ std::string RegExState::toString(std::vector<RegExState*>* avoid) {
 			}
 		}
 		if (inAvoid) {
-			string += "->LoopDetected";
+			string += "->loop";
 			continue;
 		}

@@ -67,7 +67,37 @@ std::string StringReader::getTokens(std::vector<std::string> stop_chars, bool tr
    }

    if (rd_string[str_pos] == '\"') {
-        found_pos = rd_string.find("\"", str_pos+1);
+        //See if we have an even or odd number of backslashes (that is, this quote is not or is escaped)
+        int numBackslashes = 0;
+        int countBack = 1;
+        while (str_pos-countBack >= 0 && rd_string[str_pos-countBack] == '\\') {
+            numBackslashes++;
+            countBack++;
+        }
+        //If the quote is not escaped
+        if (numBackslashes % 2 == 0) {
+            //Find the next quote
+            found_pos = rd_string.find("\"", str_pos+1);
+            //Check to see if the quote is escaped
+            numBackslashes = 0;
+            countBack = 1;
+            while (found_pos-countBack >= 0 && rd_string[found_pos-countBack] == '\\') {
+                numBackslashes++;
+                countBack++;
+            }
+            //While the quote is escaped
+            while (numBackslashes % 2 == 1) {
+                //find the next quote
+                found_pos = rd_string.find("\"", found_pos+1);
+                //Check to see if it's escaped
+                numBackslashes = 0;
+                countBack = 1;
+                while (found_pos-countBack >= 0 && rd_string[found_pos-countBack] == '\\') {
+                    numBackslashes++;
+                    countBack++;
+                }
+            }
+        }
    }

    if (found_pos == str_pos)                                   //We are at the endline