From 4c2fd967f0c316d9c949cfce1078296271095334 Mon Sep 17 00:00:00 2001 From: Nathan Braswell Date: Sun, 7 Jul 2013 02:13:05 -0400 Subject: [PATCH] Perens now work fully! The RegEx part of Kraken should now be fully legitimate. The only problem is not gracefully letting the user know about faulty input, but that goes for all of Kraken. --- include/RegEx.h | 3 + include/RegExState.h | 8 +++ src/RegEx.cpp | 146 ++++++++++++++++++++++++++++++++++++++++--- src/RegExState.cpp | 45 ++++++++++++- 4 files changed, 190 insertions(+), 12 deletions(-) diff --git a/include/RegEx.h b/include/RegEx.h index 55f8edb..139c082 100644 --- a/include/RegEx.h +++ b/include/RegEx.h @@ -6,6 +6,9 @@ #include "Symbol.h" #include +#include +#include +#include class RegEx { public: diff --git a/include/RegExState.h b/include/RegExState.h index da35c05..de4096a 100644 --- a/include/RegExState.h +++ b/include/RegExState.h @@ -18,8 +18,16 @@ class RegExState { void addNext(RegExState* nextState); bool characterIs(char inCharacter); std::vector* advance(char advanceCharacter); + std::vector* getNextStates(); + + RegExState* getInner(); + bool isGoal(); std::string toString(); + std::string toString(RegExState* avoid); + std::string toString(std::vector* avoid); + + char getCharacter(); private: std::vector nextStates; diff --git a/src/RegEx.cpp b/src/RegEx.cpp index f1e93b1..80a1ab6 100644 --- a/src/RegEx.cpp +++ b/src/RegEx.cpp @@ -4,6 +4,7 @@ RegEx::RegEx(std::string inPattern) { pattern = inPattern; std::vector previousStates; std::vector currentStates; + std::stack, RegExState*> > perenStack; begin = new RegExState(); currentStates.push_back(begin); for (int i = 0; i < pattern.length(); i++) { @@ -50,19 +51,75 @@ RegEx::RegEx(std::string inPattern) { break; case '(': + { std::cout << "Begin peren at " << i << " in " << pattern << std::endl; //perentheses + //Create a peren node with an inner empty node + RegExState* next = new RegExState(new RegExState()); + + for (std::vector::size_type j = 0; j < currentStates.size(); j++) + currentStates[j]->addNext(next); + + previousStates.clear(); + //Save both current states here as well as the current preren + std::vector saveStates = currentStates; + // saveStates.insert(saveStates.end(), currentStates.begin(), currentStates.end()) + perenStack.push(std::make_pair(saveStates, next)); + + currentStates.clear(); + currentStates.push_back(next->getInner()); + std::cout << "Peren is " << next << " Inner is " << currentStates[0] << " = " << next->getInner() << std::endl; + } + break; + + case ')': + { + std::cout << "End peren at " << i << " in " << pattern << std::endl; + //perentheses + //Pop off the states that will now be the previous states and the peren node which will now be the current node + std::pair, RegExState*> savedPair = perenStack.top(); + perenStack.pop(); + //Make the it so + previousStates = savedPair.first; + //Make sure the end of the inner stuff points back to the peren node + for (std::vector::size_type j = 0; j < currentStates.size(); j++) + currentStates[j]->addNext(savedPair.second); + currentStates.clear(); + currentStates.push_back(savedPair.second); + } + break; + + case '\\': + { + i++; + std::cout << "Escape! Escaping: " << pattern[i] << std::endl; + //Ahh, it's escaping a special character + RegExState* next = new RegExState(pattern[i]); + for (std::vector::size_type j = 0; j < currentStates.size(); j++) { + currentStates[j]->addNext(next); + std::cout << "Adding " << next << ", which is " << pattern[i] << " to " << currentStates[j] << std::endl; + } + + previousStates.clear(); + // previousStates.insert(previousStates.begin(), currentStates.begin(), currentStates.end()); + previousStates = currentStates; + currentStates.clear(); + currentStates.push_back(next); + } break; default: { std::cout << "Regular" << std::endl; //Ahh, it's regular RegExState* next = new RegExState(pattern[i]); - for (std::vector::size_type j = 0; j < currentStates.size(); j++) + for (std::vector::size_type j = 0; j < currentStates.size(); j++) { currentStates[j]->addNext(next); + std::cout << "Adding " << next << ", which is " << pattern[i] << " to " << currentStates[j] << std::endl; + } previousStates.clear(); - previousStates.insert(previousStates.begin(), currentStates.begin(), currentStates.end()); + // previousStates.insert(previousStates.begin(), currentStates.begin(), currentStates.end()); + previousStates = currentStates; currentStates.clear(); currentStates.push_back(next); } @@ -71,6 +128,82 @@ RegEx::RegEx(std::string inPattern) { //last one is goal state for (std::vector::size_type i = 0; i < currentStates.size(); i++) currentStates[i]->addNext(NULL); + + std::cout << "About to de-perenthesize " << begin->toString() << std::endl; + + //Now go through and expand the peren nodes to regular nodes + std::vector processedStates; + std::vector statesToProcess; + statesToProcess.push_back(begin); + for (std::vector::size_type i = 0; i < statesToProcess.size(); i++) { + //Don't process null (sucess) state + if (statesToProcess[i] == NULL) + continue; + std::vector* nextStates = statesToProcess[i]->getNextStates(); + for (std::vector::size_type j = 0; j < nextStates->size(); j++) { + if ((*nextStates)[j] != NULL && (*nextStates)[j]->getInner() != NULL) { + //Fix all the next references pointing to the peren node to point to the inner nodes. (if more than one, push back to add others) + std::vector* insideNextStates = (*nextStates)[j]->getInner()->getNextStates(); + std::cout << "insideNextStates = " << insideNextStates << " [0] " << (*insideNextStates)[0] << std::endl; + RegExState* perenState = (*nextStates)[j]; + (*nextStates)[j] = (*insideNextStates)[0]; + std::cout << "So now nextstates[j] = " << (*nextStates)[j] << std::endl; + for (std::vector::size_type k = 1; k < insideNextStates->size(); k++) + nextStates->push_back((*insideNextStates)[k]); + std::cout << "Replaced beginning: " << begin->toString() << std::endl; + //Now, if the peren node is self-referential (has a repitition operator after i), fix it's self-references in the same manner + std::vector* perenNextNodes = perenState->getNextStates(); + for (std::vector::size_type k = 0; k < perenNextNodes->size(); k++) { + if ((*perenNextNodes)[k] == perenState) { + (*perenNextNodes)[k] = (*insideNextStates)[0]; + for (std::vector::size_type l = 1; l < insideNextStates->size(); l++) + perenNextNodes->push_back((*insideNextStates)[l]); + } + } + //std::cout << "Fixed self-references: " << begin->toString() << std::endl; + //Need to fix the end too + std::vector traversalList; + traversalList.push_back(perenState->getInner()); + for (std::vector::size_type k = 0; k < traversalList.size(); k++) { + std::vector* nextTraversalStates = traversalList[k]->getNextStates(); + std::cout << "Traversing! nextTraversalStates from traversalList " << traversalList[k] << " char = " << traversalList[k]->getCharacter() << std::endl; + std::cout << "with children: "; + for (std::vector::size_type l = 0; l < nextTraversalStates->size(); l++) + std::cout << (*nextTraversalStates)[l]->getCharacter() << " "; + std::cout << std::endl; + for (std::vector::size_type l = 0; l < nextTraversalStates->size(); l++) { + //If this node is equal to the peren node we came from, then that means we've reached the end of the inner part of the peren + //And we now replace this reference with the next nodes from the peren node + std::cout << "Traversal Next is on " << (*nextTraversalStates)[l]->getCharacter() << std::endl; + if ((*nextTraversalStates)[l] == perenState) { + std::cout << "nextTraversalStates[l] = to perenState!" << std::endl; + std::vector endPerenNextStates = *(perenState->getNextStates()); + (*nextTraversalStates)[l] = endPerenNextStates[0]; + for (std::vector::size_type n = 1; n < endPerenNextStates.size(); n++) + nextTraversalStates->push_back(endPerenNextStates[n]); + //Now make sure we don't now try to continue through and end up processing stuff we just replaced the peren reference with + break; + } else { + traversalList.push_back((*nextTraversalStates)[l]); + } + } + } + } + } + //Now add all these next states to process, only if they haven't already been processed + for (std::vector::size_type j = 0; j < nextStates->size(); j++) { + bool inCurrStates = false; + for (std::vector::size_type k = 0; k < statesToProcess.size(); k++) { + if ((*nextStates)[j] == statesToProcess[k]) + inCurrStates = true; + } + if (!inCurrStates) { + statesToProcess.push_back((*nextStates)[j]); + //std::cout << (*nextStates)[j] << "Is not in states to process" << std::endl; + } + } + } + std::cout << "Finished de-perenthesization " << begin->toString() << std::endl; } RegEx::~RegEx() { @@ -103,7 +236,7 @@ int RegEx::longMatch(std::string stringToMatch) { for (std::vector::size_type j = 0; j < nextStates.size(); j++) { bool inCurrStates = false; for (std::vector::size_type k = 0; k < currentStates.size(); k++) { - if (nextStates[j] == currentStates[i]) + if (nextStates[j] == currentStates[k]) inCurrStates = true; } if (!inCurrStates) @@ -111,12 +244,7 @@ int RegEx::longMatch(std::string stringToMatch) { } if (currentStates.size() != 0) std::cout << "Matched " << i << " character: " << stringToMatch[i-1] << std::endl; -/* - std::cout << "Current states are: "; - for (std::vector::size_type j = 0; j < currentStates.size(); j++) - std::cout << currentStates[j]->toString() << " "; - std::cout << std::endl; -*/ + nextStates.clear(); //If we can't continue matching, just return our last matched if (currentStates.size() == 0) diff --git a/src/RegExState.cpp b/src/RegExState.cpp index 677063f..c5bd917 100644 --- a/src/RegExState.cpp +++ b/src/RegExState.cpp @@ -35,6 +35,14 @@ std::vector* RegExState::advance(char advanceCharacter) { return advanceStates; } +RegExState* RegExState::getInner() { + return inner; +} + +std::vector* RegExState::getNextStates() { + return &nextStates; +} + bool RegExState::isGoal() { //return inner == NULL && nextStates.size() == 0; for (std::vector::size_type i = 0; i < nextStates.size(); i++) @@ -44,17 +52,48 @@ bool RegExState::isGoal() { } std::string RegExState::toString() { + std::vector avoidList; + return toString(&avoidList); +} + +std::string RegExState::toString(RegExState* avoid) { + std::vector avoidList; + avoidList.push_back(avoid); + return toString(&avoidList); +} + +std::string RegExState::toString(std::vector* avoid) { + avoid->push_back(this); std::string string = ""; string += std::string("\"") + character + "\""; - for (std::vector::size_type i = 0; i < nextStates.size(); i++) + if (inner != NULL) { + string += "inner: "; + string += inner->toString(avoid); + string += " end inner "; + } + for (std::vector::size_type i = 0; i < nextStates.size(); i++) { + bool inAvoid = false; + for (std::vector::size_type j = 0; j < avoid->size(); j++) { + if (nextStates[i] == (*avoid)[j]) { + inAvoid = true; + } + } + if (inAvoid) { + string += "->LoopDetected"; + continue; + } + if (nextStates[i] != this && nextStates[i] != NULL) - string += "->" + nextStates[i]->toString() + " EC "; + string += "->" + nextStates[i]->toString(avoid) + " EC "; else if (nextStates[i] == NULL) string += "-> GOAL "; else string += "->this"; + } //std::cout << "inner = " << inner << " nextStates size = " << nextStates.size() <