diff --git a/include/RegEx.h b/include/RegEx.h index 55f8edb..139c082 100644 --- a/include/RegEx.h +++ b/include/RegEx.h @@ -6,6 +6,9 @@ #include "Symbol.h" #include +#include +#include +#include class RegEx { public: diff --git a/include/RegExState.h b/include/RegExState.h index da35c05..de4096a 100644 --- a/include/RegExState.h +++ b/include/RegExState.h @@ -18,8 +18,16 @@ class RegExState { void addNext(RegExState* nextState); bool characterIs(char inCharacter); std::vector* advance(char advanceCharacter); + std::vector* getNextStates(); + + RegExState* getInner(); + bool isGoal(); std::string toString(); + std::string toString(RegExState* avoid); + std::string toString(std::vector* avoid); + + char getCharacter(); private: std::vector nextStates; diff --git a/src/RegEx.cpp b/src/RegEx.cpp index f1e93b1..80a1ab6 100644 --- a/src/RegEx.cpp +++ b/src/RegEx.cpp @@ -4,6 +4,7 @@ RegEx::RegEx(std::string inPattern) { pattern = inPattern; std::vector previousStates; std::vector currentStates; + std::stack, RegExState*> > perenStack; begin = new RegExState(); currentStates.push_back(begin); for (int i = 0; i < pattern.length(); i++) { @@ -50,19 +51,75 @@ RegEx::RegEx(std::string inPattern) { break; case '(': + { std::cout << "Begin peren at " << i << " in " << pattern << std::endl; //perentheses + //Create a peren node with an inner empty node + RegExState* next = new RegExState(new RegExState()); + + for (std::vector::size_type j = 0; j < currentStates.size(); j++) + currentStates[j]->addNext(next); + + previousStates.clear(); + //Save both current states here as well as the current preren + std::vector saveStates = currentStates; + // saveStates.insert(saveStates.end(), currentStates.begin(), currentStates.end()) + perenStack.push(std::make_pair(saveStates, next)); + + currentStates.clear(); + currentStates.push_back(next->getInner()); + std::cout << "Peren is " << next << " Inner is " << currentStates[0] << " = " << next->getInner() << std::endl; + } + break; + + case ')': + { + std::cout << "End peren at " << i << " in " << pattern << std::endl; + //perentheses + //Pop off the states that will now be the previous states and the peren node which will now be the current node + std::pair, RegExState*> savedPair = perenStack.top(); + perenStack.pop(); + //Make the it so + previousStates = savedPair.first; + //Make sure the end of the inner stuff points back to the peren node + for (std::vector::size_type j = 0; j < currentStates.size(); j++) + currentStates[j]->addNext(savedPair.second); + currentStates.clear(); + currentStates.push_back(savedPair.second); + } + break; + + case '\\': + { + i++; + std::cout << "Escape! Escaping: " << pattern[i] << std::endl; + //Ahh, it's escaping a special character + RegExState* next = new RegExState(pattern[i]); + for (std::vector::size_type j = 0; j < currentStates.size(); j++) { + currentStates[j]->addNext(next); + std::cout << "Adding " << next << ", which is " << pattern[i] << " to " << currentStates[j] << std::endl; + } + + previousStates.clear(); + // previousStates.insert(previousStates.begin(), currentStates.begin(), currentStates.end()); + previousStates = currentStates; + currentStates.clear(); + currentStates.push_back(next); + } break; default: { std::cout << "Regular" << std::endl; //Ahh, it's regular RegExState* next = new RegExState(pattern[i]); - for (std::vector::size_type j = 0; j < currentStates.size(); j++) + for (std::vector::size_type j = 0; j < currentStates.size(); j++) { currentStates[j]->addNext(next); + std::cout << "Adding " << next << ", which is " << pattern[i] << " to " << currentStates[j] << std::endl; + } previousStates.clear(); - previousStates.insert(previousStates.begin(), currentStates.begin(), currentStates.end()); + // previousStates.insert(previousStates.begin(), currentStates.begin(), currentStates.end()); + previousStates = currentStates; currentStates.clear(); currentStates.push_back(next); } @@ -71,6 +128,82 @@ RegEx::RegEx(std::string inPattern) { //last one is goal state for (std::vector::size_type i = 0; i < currentStates.size(); i++) currentStates[i]->addNext(NULL); + + std::cout << "About to de-perenthesize " << begin->toString() << std::endl; + + //Now go through and expand the peren nodes to regular nodes + std::vector processedStates; + std::vector statesToProcess; + statesToProcess.push_back(begin); + for (std::vector::size_type i = 0; i < statesToProcess.size(); i++) { + //Don't process null (sucess) state + if (statesToProcess[i] == NULL) + continue; + std::vector* nextStates = statesToProcess[i]->getNextStates(); + for (std::vector::size_type j = 0; j < nextStates->size(); j++) { + if ((*nextStates)[j] != NULL && (*nextStates)[j]->getInner() != NULL) { + //Fix all the next references pointing to the peren node to point to the inner nodes. (if more than one, push back to add others) + std::vector* insideNextStates = (*nextStates)[j]->getInner()->getNextStates(); + std::cout << "insideNextStates = " << insideNextStates << " [0] " << (*insideNextStates)[0] << std::endl; + RegExState* perenState = (*nextStates)[j]; + (*nextStates)[j] = (*insideNextStates)[0]; + std::cout << "So now nextstates[j] = " << (*nextStates)[j] << std::endl; + for (std::vector::size_type k = 1; k < insideNextStates->size(); k++) + nextStates->push_back((*insideNextStates)[k]); + std::cout << "Replaced beginning: " << begin->toString() << std::endl; + //Now, if the peren node is self-referential (has a repitition operator after i), fix it's self-references in the same manner + std::vector* perenNextNodes = perenState->getNextStates(); + for (std::vector::size_type k = 0; k < perenNextNodes->size(); k++) { + if ((*perenNextNodes)[k] == perenState) { + (*perenNextNodes)[k] = (*insideNextStates)[0]; + for (std::vector::size_type l = 1; l < insideNextStates->size(); l++) + perenNextNodes->push_back((*insideNextStates)[l]); + } + } + //std::cout << "Fixed self-references: " << begin->toString() << std::endl; + //Need to fix the end too + std::vector traversalList; + traversalList.push_back(perenState->getInner()); + for (std::vector::size_type k = 0; k < traversalList.size(); k++) { + std::vector* nextTraversalStates = traversalList[k]->getNextStates(); + std::cout << "Traversing! nextTraversalStates from traversalList " << traversalList[k] << " char = " << traversalList[k]->getCharacter() << std::endl; + std::cout << "with children: "; + for (std::vector::size_type l = 0; l < nextTraversalStates->size(); l++) + std::cout << (*nextTraversalStates)[l]->getCharacter() << " "; + std::cout << std::endl; + for (std::vector::size_type l = 0; l < nextTraversalStates->size(); l++) { + //If this node is equal to the peren node we came from, then that means we've reached the end of the inner part of the peren + //And we now replace this reference with the next nodes from the peren node + std::cout << "Traversal Next is on " << (*nextTraversalStates)[l]->getCharacter() << std::endl; + if ((*nextTraversalStates)[l] == perenState) { + std::cout << "nextTraversalStates[l] = to perenState!" << std::endl; + std::vector endPerenNextStates = *(perenState->getNextStates()); + (*nextTraversalStates)[l] = endPerenNextStates[0]; + for (std::vector::size_type n = 1; n < endPerenNextStates.size(); n++) + nextTraversalStates->push_back(endPerenNextStates[n]); + //Now make sure we don't now try to continue through and end up processing stuff we just replaced the peren reference with + break; + } else { + traversalList.push_back((*nextTraversalStates)[l]); + } + } + } + } + } + //Now add all these next states to process, only if they haven't already been processed + for (std::vector::size_type j = 0; j < nextStates->size(); j++) { + bool inCurrStates = false; + for (std::vector::size_type k = 0; k < statesToProcess.size(); k++) { + if ((*nextStates)[j] == statesToProcess[k]) + inCurrStates = true; + } + if (!inCurrStates) { + statesToProcess.push_back((*nextStates)[j]); + //std::cout << (*nextStates)[j] << "Is not in states to process" << std::endl; + } + } + } + std::cout << "Finished de-perenthesization " << begin->toString() << std::endl; } RegEx::~RegEx() { @@ -103,7 +236,7 @@ int RegEx::longMatch(std::string stringToMatch) { for (std::vector::size_type j = 0; j < nextStates.size(); j++) { bool inCurrStates = false; for (std::vector::size_type k = 0; k < currentStates.size(); k++) { - if (nextStates[j] == currentStates[i]) + if (nextStates[j] == currentStates[k]) inCurrStates = true; } if (!inCurrStates) @@ -111,12 +244,7 @@ int RegEx::longMatch(std::string stringToMatch) { } if (currentStates.size() != 0) std::cout << "Matched " << i << " character: " << stringToMatch[i-1] << std::endl; -/* - std::cout << "Current states are: "; - for (std::vector::size_type j = 0; j < currentStates.size(); j++) - std::cout << currentStates[j]->toString() << " "; - std::cout << std::endl; -*/ + nextStates.clear(); //If we can't continue matching, just return our last matched if (currentStates.size() == 0) diff --git a/src/RegExState.cpp b/src/RegExState.cpp index 677063f..c5bd917 100644 --- a/src/RegExState.cpp +++ b/src/RegExState.cpp @@ -35,6 +35,14 @@ std::vector* RegExState::advance(char advanceCharacter) { return advanceStates; } +RegExState* RegExState::getInner() { + return inner; +} + +std::vector* RegExState::getNextStates() { + return &nextStates; +} + bool RegExState::isGoal() { //return inner == NULL && nextStates.size() == 0; for (std::vector::size_type i = 0; i < nextStates.size(); i++) @@ -44,17 +52,48 @@ bool RegExState::isGoal() { } std::string RegExState::toString() { + std::vector avoidList; + return toString(&avoidList); +} + +std::string RegExState::toString(RegExState* avoid) { + std::vector avoidList; + avoidList.push_back(avoid); + return toString(&avoidList); +} + +std::string RegExState::toString(std::vector* avoid) { + avoid->push_back(this); std::string string = ""; string += std::string("\"") + character + "\""; - for (std::vector::size_type i = 0; i < nextStates.size(); i++) + if (inner != NULL) { + string += "inner: "; + string += inner->toString(avoid); + string += " end inner "; + } + for (std::vector::size_type i = 0; i < nextStates.size(); i++) { + bool inAvoid = false; + for (std::vector::size_type j = 0; j < avoid->size(); j++) { + if (nextStates[i] == (*avoid)[j]) { + inAvoid = true; + } + } + if (inAvoid) { + string += "->LoopDetected"; + continue; + } + if (nextStates[i] != this && nextStates[i] != NULL) - string += "->" + nextStates[i]->toString() + " EC "; + string += "->" + nextStates[i]->toString(avoid) + " EC "; else if (nextStates[i] == NULL) string += "-> GOAL "; else string += "->this"; + } //std::cout << "inner = " << inner << " nextStates size = " << nextStates.size() <