diff --git a/include/ParseAction.h b/include/ParseAction.h index e7103f0..4d62256 100644 --- a/include/ParseAction.h +++ b/include/ParseAction.h @@ -18,6 +18,7 @@ class ParseAction { ParseAction(ActionType action, ParseRule* reduceRule); ParseAction(ActionType action, int shiftState); ~ParseAction(); + bool const equalsExceptLookahead(const ParseAction &other); bool const operator==(const ParseAction &other); bool const operator!=(const ParseAction &other); std::string toString(); diff --git a/include/Parser.h b/include/Parser.h index 8461525..e9faf9a 100644 --- a/include/Parser.h +++ b/include/Parser.h @@ -24,6 +24,7 @@ class Parser { void loadGrammer(std::string grammerInputString); std::vector* firstSet(Symbol* token); + std::vector* firstSet(Symbol* token, std::vector &avoidList); void printFirstSets(); std::vector* incrementiveFollowSet(ParseRule* rule); void createStateSet(); diff --git a/include/RegEx.h b/include/RegEx.h index 139c082..77db781 100644 --- a/include/RegEx.h +++ b/include/RegEx.h @@ -16,6 +16,8 @@ class RegEx { RegEx(std::string inPattern); ~RegEx(); + void construct(); + void deperenthesize(); int longMatch(std::string stringToMatch); std::string getPattern(); std::string toString(); diff --git a/main.cpp b/main.cpp index e0ddb96..c5c26a8 100644 --- a/main.cpp +++ b/main.cpp @@ -45,18 +45,23 @@ int main(int argc, char* argv[]) { Parser parser; parser.loadGrammer(grammerInputFileString); //std::cout << "Creating State Set from Main" << std::endl; + std::cout << "\n\n\n\n\n\n\n\n\n\nState Set" << std::endl; parser.createStateSet(); //std::cout << "finished State Set from Main" << std::endl; //std::cout << "Doing stateSetToString from Main" << std::endl; + std::cout << "\n\n\n\n\n\n\n\n\n\nState Set toString" << std::endl; std::cout << parser.stateSetToString() << std::endl; //std::cout << "finished stateSetToString from Main" << std::endl; + std::cout << "\n\n\n\n\n\n\n\n\n\nTable" << std::endl; std::cout << parser.tableToString() << std::endl; - + std::cout << "\n\n\n\n\n\n\n\n\n\nGrammer Input File" << std::endl; std::cout << grammerInputFileString << std::endl; + std::cout << "\n\n\n\n\n\n\n\n\n\nGrammer toString" << std::endl; std::cout << parser.grammerToString() << std::endl; //std::cout << parser.grammerToDOT() << std::endl; //outFile << parser.grammerToDOT() << std::endl; + std::cout << "\n\n\n\n\n\n\n\n\n\nParsing" << std::endl; std::cout << programInputFileString << std::endl; NodeTree* parseTree = parser.parseInput(programInputFileString); diff --git a/src/Lexer.cpp b/src/Lexer.cpp index 8dc1f1f..5fba74f 100644 --- a/src/Lexer.cpp +++ b/src/Lexer.cpp @@ -23,7 +23,7 @@ void Lexer::addRegEx(std::string regExString) { } Symbol* Lexer::next() { - std::cout << "Current at is " << input.substr(currentPosition,input.length()-1) << " currentPos is " << currentPosition <::size_type i = 0; i < regExs.size(); i++) { - std::cout << "Trying regex " << regExs[i]->toString() << std::endl; + std::cout << "Trying regex " << regExs[i]->getPattern() << std::endl; int currentMatch = regExs[i]->longMatch(remainingString); if (currentMatch > longestMatch) { longestMatch = currentMatch; @@ -40,11 +40,11 @@ Symbol* Lexer::next() { } if (longestRegEx != NULL) { currentPosition += longestMatch + 1; - std::cout << "Current at is " << input.substr(currentPosition,input.length()-1) << " currentPos is " << currentPosition <getPattern(), true); } else { std::cout << "Found no applicable regex" << std::endl; std::cout << "Remaining is " << input.substr(currentPosition,input.length()-1) << std::endl; - return NULL; + return new Symbol("$NO_APPLICABLE_REGEX$", true); } } \ No newline at end of file diff --git a/src/ParseAction.cpp b/src/ParseAction.cpp index dd9193a..e8a304f 100644 --- a/src/ParseAction.cpp +++ b/src/ParseAction.cpp @@ -22,6 +22,9 @@ ParseAction::~ParseAction() { } +const bool ParseAction::equalsExceptLookahead(const ParseAction &other) { + return( action == other.action && ( reduceRule == other.reduceRule || reduceRule->equalsExceptLookahead(*(other.reduceRule)) ) && shiftState == other.shiftState); +} const bool ParseAction::operator==(const ParseAction &other) { return( action == other.action && ( reduceRule == other.reduceRule || *reduceRule == *(other.reduceRule) ) && shiftState == other.shiftState); diff --git a/src/Parser.cpp b/src/Parser.cpp index d149b62..293e0b8 100644 --- a/src/Parser.cpp +++ b/src/Parser.cpp @@ -73,6 +73,21 @@ void Parser::loadGrammer(std::string grammerInputString) { } std::vector* Parser::firstSet(Symbol* token) { + //std::cout << "Simple first set for " << token->toString() << std::endl; + std::vector avoidList; + return firstSet(token, avoidList); +} + +std::vector* Parser::firstSet(Symbol* token, std::vector &avoidList) { + //If we've already done this token, don't do it again + for (std::vector::size_type i = 0; i < avoidList.size(); i++) + if (*(avoidList[i]) == *token) { + return new std::vector(); + //std::cout << "Avoiding firstSet for " << token->toString() << std::endl; + } + avoidList.push_back(token); + //std::cout << "Cpx first set for " << token->toString() << std::endl; + //std::cout << "Doing first set for " << token->toString() << std::endl; std::vector* first = new std::vector(); //First, if the symbol is a terminal, than it's first set is just itself. if (token->isTerminal()) { @@ -96,7 +111,7 @@ std::vector* Parser::firstSet(Symbol* token) { recursiveFirstSet->push_back(rightToken); } else { //Add the entire set - recursiveFirstSet = firstSet(rightToken); + recursiveFirstSet = firstSet(rightToken, avoidList); } first->insert(first->end(), recursiveFirstSet->begin(), recursiveFirstSet->end()); //Check to see if the current recursiveFirstSet contains NULL, if so, then go through again with the next token. (if there is one) @@ -106,6 +121,7 @@ std::vector* Parser::firstSet(Symbol* token) { recFirstSetHasNull = true; } } + delete recursiveFirstSet; j++; } while (recFirstSetHasNull && loadedGrammer[i]->getRightSide().size() > j); } @@ -159,6 +175,7 @@ std::vector* Parser::incrementiveFollowSet(ParseRule* rule) { for (std::vector::size_type i = 0; i < symbolFirstSet->size(); i++) { if (*((*symbolFirstSet)[i]) == *nullSymbol) { symbolFirstSetHasNull = true; + symbolFirstSet->erase(symbolFirstSet->begin()+i); break; } } @@ -170,7 +187,17 @@ std::vector* Parser::incrementiveFollowSet(ParseRule* rule) { symbolFirstSet = rule->getLookahead(); followSet->insert(followSet->end(), symbolFirstSet->begin(), symbolFirstSet->end()); } - return followSet; + std::vector* followSetReturn = new std::vector(); + for (std::vector::size_type i = 0; i < followSet->size(); i++) { + bool alreadyIn = false; + for (std::vector::size_type j = 0; j < followSetReturn->size(); j++) + if (*((*followSet)[i]) == *((*followSetReturn)[j])) + alreadyIn = true; + if (!alreadyIn) + followSetReturn->push_back((*followSet)[i]); + } + delete followSet; + return followSetReturn; } void Parser::closure(State* state) { @@ -185,6 +212,7 @@ void Parser::closure(State* state) { if ( !currentStateRule->isAtEnd() && *(currentStateRule->getAtNextIndex()) == *(currentGramRule->getLeftSide())) { //std::cout << (*stateTotal)[i]->getAtNextIndex()->toString() << " has an applicable production " << loadedGrammer[j]->toString() << std::endl; //Now, add the correct lookahead. This followSet is built based on the current rule's lookahead if at end, or the next Symbol's first set. + //std::cout << "Setting lookahead for " << currentGramRule->toString() << " in state " << state->toString() << std::endl; currentGramRule->setLookahead(incrementiveFollowSet(currentStateRule)); //Check to make sure not already in @@ -340,7 +368,7 @@ void Parser::addToTable(State* fromState, Symbol* tranSymbol, ParseAction* actio (*(table[stateNum]))[symbolIndex] = action; } //If the slot is not empty and does not contain ourself, then it is a conflict - else if ( *((*(table[stateNum]))[symbolIndex]) != *action) { + else if ( !(*(table[stateNum]))[symbolIndex]->equalsExceptLookahead(*action)) { //std::cout << "not Null!" << std::endl; std::cout << "State: " << stateNum << " Conflict between old: " << (*(table[stateNum]))[symbolIndex]->toString() << " and new: " << action->toString() << std::endl; //Don't overwrite diff --git a/src/RegEx.cpp b/src/RegEx.cpp index 80a1ab6..92c6b84 100644 --- a/src/RegEx.cpp +++ b/src/RegEx.cpp @@ -2,9 +2,15 @@ RegEx::RegEx(std::string inPattern) { pattern = inPattern; + construct(); + deperenthesize(); +} + +void RegEx::construct() { std::vector previousStates; std::vector currentStates; - std::stack, RegExState*> > perenStack; + std::stack, std::vector > > perenStack; + bool alternating = false; begin = new RegExState(); currentStates.push_back(begin); for (int i = 0; i < pattern.length(); i++) { @@ -42,11 +48,7 @@ RegEx::RegEx(std::string inPattern) { { std::cout << "Alternation at " << i << " in " << pattern << std::endl; //alternation - i++; - RegExState* next = new RegExState(pattern[i]); - for (std::vector::size_type j = 0; j < previousStates.size(); j++) - previousStates[j]->addNext(next); - currentStates.push_back(next); + alternating = true; } break; @@ -57,17 +59,35 @@ RegEx::RegEx(std::string inPattern) { //Create a peren node with an inner empty node RegExState* next = new RegExState(new RegExState()); - for (std::vector::size_type j = 0; j < currentStates.size(); j++) - currentStates[j]->addNext(next); + if (alternating) { + for (std::vector::size_type j = 0; j < previousStates.size(); j++) + previousStates[j]->addNext(next); - previousStates.clear(); - //Save both current states here as well as the current preren - std::vector saveStates = currentStates; - // saveStates.insert(saveStates.end(), currentStates.begin(), currentStates.end()) - perenStack.push(std::make_pair(saveStates, next)); + //Save both current states here as well as the current preren + std::vector savePreviousStates = previousStates; + currentStates.push_back(next); + std::vector saveCurrentStates = currentStates; + perenStack.push(std::make_pair(savePreviousStates, saveCurrentStates)); - currentStates.clear(); - currentStates.push_back(next->getInner()); + previousStates.clear(); + currentStates.clear(); + currentStates.push_back(next->getInner()); + alternating = false; + } else { + for (std::vector::size_type j = 0; j < currentStates.size(); j++) + currentStates[j]->addNext(next); + + //Save both current states here as well as the current preren + std::vector savePreviousStates = currentStates; + currentStates.clear(); + currentStates.push_back(next); + std::vector saveCurrentStates = currentStates; + perenStack.push(std::make_pair(savePreviousStates, saveCurrentStates)); + + previousStates.clear(); + currentStates.clear(); + currentStates.push_back(next->getInner()); + } std::cout << "Peren is " << next << " Inner is " << currentStates[0] << " = " << next->getInner() << std::endl; } break; @@ -77,15 +97,16 @@ RegEx::RegEx(std::string inPattern) { std::cout << "End peren at " << i << " in " << pattern << std::endl; //perentheses //Pop off the states that will now be the previous states and the peren node which will now be the current node - std::pair, RegExState*> savedPair = perenStack.top(); + std::pair, std::vector > savedPair = perenStack.top(); perenStack.pop(); //Make the it so previousStates = savedPair.first; //Make sure the end of the inner stuff points back to the peren node for (std::vector::size_type j = 0; j < currentStates.size(); j++) - currentStates[j]->addNext(savedPair.second); + currentStates[j]->addNext(savedPair.second[savedPair.second.size()-1]); + //currentStates[j]->addNext(*(savedPair.second.end())); currentStates.clear(); - currentStates.push_back(savedPair.second); + currentStates = savedPair.second; } break; @@ -93,42 +114,42 @@ RegEx::RegEx(std::string inPattern) { { i++; std::cout << "Escape! Escaping: " << pattern[i] << std::endl; - //Ahh, it's escaping a special character - RegExState* next = new RegExState(pattern[i]); - for (std::vector::size_type j = 0; j < currentStates.size(); j++) { - currentStates[j]->addNext(next); - std::cout << "Adding " << next << ", which is " << pattern[i] << " to " << currentStates[j] << std::endl; - } - - previousStates.clear(); - // previousStates.insert(previousStates.begin(), currentStates.begin(), currentStates.end()); - previousStates = currentStates; - currentStates.clear(); - currentStates.push_back(next); + //Ahh, it's escaping a special character, so fall through to the default. } - break; default: { std::cout << "Regular" << std::endl; //Ahh, it's regular RegExState* next = new RegExState(pattern[i]); - for (std::vector::size_type j = 0; j < currentStates.size(); j++) { - currentStates[j]->addNext(next); - std::cout << "Adding " << next << ", which is " << pattern[i] << " to " << currentStates[j] << std::endl; + //If we're alternating, add next as the next for each previous state, and add self to currentStates + if (alternating) { + for (std::vector::size_type j = 0; j < previousStates.size(); j++) { + previousStates[j]->addNext(next); + std::cout << "Adding " << next << ", which is " << pattern[i] << " to " << previousStates[j] << std::endl; + } + currentStates.push_back(next); + alternating = false; + } else { + //If we're not alternating, add next as next for all the current states, make the current states the new + //previous states, and add ourself as the new current state. + for (std::vector::size_type j = 0; j < currentStates.size(); j++) { + currentStates[j]->addNext(next); + std::cout << "Adding " << next << ", which is " << pattern[i] << " to " << currentStates[j] << std::endl; + } + previousStates.clear(); + previousStates = currentStates; + currentStates.clear(); + currentStates.push_back(next); } - - previousStates.clear(); - // previousStates.insert(previousStates.begin(), currentStates.begin(), currentStates.end()); - previousStates = currentStates; - currentStates.clear(); - currentStates.push_back(next); } } } //last one is goal state for (std::vector::size_type i = 0; i < currentStates.size(); i++) currentStates[i]->addNext(NULL); +} +void RegEx::deperenthesize() { std::cout << "About to de-perenthesize " << begin->toString() << std::endl; //Now go through and expand the peren nodes to regular nodes @@ -144,13 +165,13 @@ RegEx::RegEx(std::string inPattern) { if ((*nextStates)[j] != NULL && (*nextStates)[j]->getInner() != NULL) { //Fix all the next references pointing to the peren node to point to the inner nodes. (if more than one, push back to add others) std::vector* insideNextStates = (*nextStates)[j]->getInner()->getNextStates(); - std::cout << "insideNextStates = " << insideNextStates << " [0] " << (*insideNextStates)[0] << std::endl; + //std::cout << "insideNextStates = " << insideNextStates << " [0] " << (*insideNextStates)[0] << std::endl; RegExState* perenState = (*nextStates)[j]; (*nextStates)[j] = (*insideNextStates)[0]; - std::cout << "So now nextstates[j] = " << (*nextStates)[j] << std::endl; + //std::cout << "So now nextstates[j] = " << (*nextStates)[j] << std::endl; for (std::vector::size_type k = 1; k < insideNextStates->size(); k++) nextStates->push_back((*insideNextStates)[k]); - std::cout << "Replaced beginning: " << begin->toString() << std::endl; + //std::cout << "Replaced beginning: " << begin->toString() << std::endl; //Now, if the peren node is self-referential (has a repitition operator after i), fix it's self-references in the same manner std::vector* perenNextNodes = perenState->getNextStates(); for (std::vector::size_type k = 0; k < perenNextNodes->size(); k++) { @@ -166,17 +187,17 @@ RegEx::RegEx(std::string inPattern) { traversalList.push_back(perenState->getInner()); for (std::vector::size_type k = 0; k < traversalList.size(); k++) { std::vector* nextTraversalStates = traversalList[k]->getNextStates(); - std::cout << "Traversing! nextTraversalStates from traversalList " << traversalList[k] << " char = " << traversalList[k]->getCharacter() << std::endl; - std::cout << "with children: "; - for (std::vector::size_type l = 0; l < nextTraversalStates->size(); l++) - std::cout << (*nextTraversalStates)[l]->getCharacter() << " "; - std::cout << std::endl; + //std::cout << "Traversing! nextTraversalStates from traversalList " << traversalList[k] << " char = " << traversalList[k]->getCharacter() << std::endl; + //std::cout << "with children:" << std::endl; + //for (std::vector::size_type l = 0; l < nextTraversalStates->size(); l++) + // std::cout << "\t\"" << (*nextTraversalStates)[l]->getCharacter() << "\"" << std::endl; + //std::cout << std::endl; for (std::vector::size_type l = 0; l < nextTraversalStates->size(); l++) { //If this node is equal to the peren node we came from, then that means we've reached the end of the inner part of the peren //And we now replace this reference with the next nodes from the peren node - std::cout << "Traversal Next is on " << (*nextTraversalStates)[l]->getCharacter() << std::endl; + //std::cout << "Traversal Next is on " << (*nextTraversalStates)[l]->getCharacter() << std::endl; if ((*nextTraversalStates)[l] == perenState) { - std::cout << "nextTraversalStates[l] = to perenState!" << std::endl; + // std::cout << "nextTraversalStates[l] = to perenState!" << std::endl; std::vector endPerenNextStates = *(perenState->getNextStates()); (*nextTraversalStates)[l] = endPerenNextStates[0]; for (std::vector::size_type n = 1; n < endPerenNextStates.size(); n++) diff --git a/src/RegExState.cpp b/src/RegExState.cpp index c5bd917..c147ad2 100644 --- a/src/RegExState.cpp +++ b/src/RegExState.cpp @@ -79,7 +79,7 @@ std::string RegExState::toString(std::vector* avoid) { } } if (inAvoid) { - string += "->LoopDetected"; + string += "->loop"; continue; } diff --git a/src/StringReader.cpp b/src/StringReader.cpp index 3017117..2675a9a 100644 --- a/src/StringReader.cpp +++ b/src/StringReader.cpp @@ -67,7 +67,37 @@ std::string StringReader::getTokens(std::vector stop_chars, bool tr } if (rd_string[str_pos] == '\"') { - found_pos = rd_string.find("\"", str_pos+1); + //See if we have an even or odd number of backslashes (that is, this quote is not or is escaped) + int numBackslashes = 0; + int countBack = 1; + while (str_pos-countBack >= 0 && rd_string[str_pos-countBack] == '\\') { + numBackslashes++; + countBack++; + } + //If the quote is not escaped + if (numBackslashes % 2 == 0) { + //Find the next quote + found_pos = rd_string.find("\"", str_pos+1); + //Check to see if the quote is escaped + numBackslashes = 0; + countBack = 1; + while (found_pos-countBack >= 0 && rd_string[found_pos-countBack] == '\\') { + numBackslashes++; + countBack++; + } + //While the quote is escaped + while (numBackslashes % 2 == 1) { + //find the next quote + found_pos = rd_string.find("\"", found_pos+1); + //Check to see if it's escaped + numBackslashes = 0; + countBack = 1; + while (found_pos-countBack >= 0 && rd_string[found_pos-countBack] == '\\') { + numBackslashes++; + countBack++; + } + } + } } if (found_pos == str_pos) //We are at the endline