Parens now work fully! The RegEx part of Kraken should now be fully legitimate. The only remaining problem is that faulty input is not reported gracefully to the user, but that goes for all of Kraken.

This commit is contained in:
Nathan Braswell
2013-07-07 02:13:05 -04:00
parent 502929963c
commit 4c2fd967f0
4 changed files with 190 additions and 12 deletions

View File

@@ -6,6 +6,9 @@
#include "Symbol.h" #include "Symbol.h"
#include <string> #include <string>
#include <utility>
#include <stack>
#include <vector>
class RegEx { class RegEx {
public: public:

View File

@@ -18,8 +18,16 @@ class RegExState {
void addNext(RegExState* nextState); void addNext(RegExState* nextState);
bool characterIs(char inCharacter); bool characterIs(char inCharacter);
std::vector<RegExState*>* advance(char advanceCharacter); std::vector<RegExState*>* advance(char advanceCharacter);
std::vector<RegExState*>* getNextStates();
RegExState* getInner();
bool isGoal(); bool isGoal();
std::string toString(); std::string toString();
std::string toString(RegExState* avoid);
std::string toString(std::vector<RegExState*>* avoid);
char getCharacter();
private: private:
std::vector<RegExState*> nextStates; std::vector<RegExState*> nextStates;

View File

@@ -4,6 +4,7 @@ RegEx::RegEx(std::string inPattern) {
pattern = inPattern; pattern = inPattern;
std::vector<RegExState*> previousStates; std::vector<RegExState*> previousStates;
std::vector<RegExState*> currentStates; std::vector<RegExState*> currentStates;
std::stack<std::pair<std::vector<RegExState*>, RegExState*> > perenStack;
begin = new RegExState(); begin = new RegExState();
currentStates.push_back(begin); currentStates.push_back(begin);
for (int i = 0; i < pattern.length(); i++) { for (int i = 0; i < pattern.length(); i++) {
@@ -50,19 +51,75 @@ RegEx::RegEx(std::string inPattern) {
break; break;
case '(': case '(':
{
std::cout << "Begin peren at " << i << " in " << pattern << std::endl; std::cout << "Begin peren at " << i << " in " << pattern << std::endl;
//perentheses //perentheses
//Create a peren node with an inner empty node
RegExState* next = new RegExState(new RegExState());
for (std::vector<RegExState*>::size_type j = 0; j < currentStates.size(); j++)
currentStates[j]->addNext(next);
previousStates.clear();
//Save both current states here as well as the current preren
std::vector<RegExState*> saveStates = currentStates;
// saveStates.insert(saveStates.end(), currentStates.begin(), currentStates.end())
perenStack.push(std::make_pair(saveStates, next));
currentStates.clear();
currentStates.push_back(next->getInner());
std::cout << "Peren is " << next << " Inner is " << currentStates[0] << " = " << next->getInner() << std::endl;
}
break;
case ')':
{
std::cout << "End peren at " << i << " in " << pattern << std::endl;
//perentheses
//Pop off the states that will now be the previous states and the peren node which will now be the current node
std::pair<std::vector<RegExState*>, RegExState*> savedPair = perenStack.top();
perenStack.pop();
//Make the it so
previousStates = savedPair.first;
//Make sure the end of the inner stuff points back to the peren node
for (std::vector<RegExState*>::size_type j = 0; j < currentStates.size(); j++)
currentStates[j]->addNext(savedPair.second);
currentStates.clear();
currentStates.push_back(savedPair.second);
}
break;
case '\\':
{
i++;
std::cout << "Escape! Escaping: " << pattern[i] << std::endl;
//Ahh, it's escaping a special character
RegExState* next = new RegExState(pattern[i]);
for (std::vector<RegExState*>::size_type j = 0; j < currentStates.size(); j++) {
currentStates[j]->addNext(next);
std::cout << "Adding " << next << ", which is " << pattern[i] << " to " << currentStates[j] << std::endl;
}
previousStates.clear();
// previousStates.insert(previousStates.begin(), currentStates.begin(), currentStates.end());
previousStates = currentStates;
currentStates.clear();
currentStates.push_back(next);
}
break; break;
default: default:
{ {
std::cout << "Regular" << std::endl; std::cout << "Regular" << std::endl;
//Ahh, it's regular //Ahh, it's regular
RegExState* next = new RegExState(pattern[i]); RegExState* next = new RegExState(pattern[i]);
for (std::vector<RegExState*>::size_type j = 0; j < currentStates.size(); j++) for (std::vector<RegExState*>::size_type j = 0; j < currentStates.size(); j++) {
currentStates[j]->addNext(next); currentStates[j]->addNext(next);
std::cout << "Adding " << next << ", which is " << pattern[i] << " to " << currentStates[j] << std::endl;
}
previousStates.clear(); previousStates.clear();
previousStates.insert(previousStates.begin(), currentStates.begin(), currentStates.end()); // previousStates.insert(previousStates.begin(), currentStates.begin(), currentStates.end());
previousStates = currentStates;
currentStates.clear(); currentStates.clear();
currentStates.push_back(next); currentStates.push_back(next);
} }
@@ -71,6 +128,82 @@ RegEx::RegEx(std::string inPattern) {
//last one is goal state //last one is goal state
for (std::vector<RegExState*>::size_type i = 0; i < currentStates.size(); i++) for (std::vector<RegExState*>::size_type i = 0; i < currentStates.size(); i++)
currentStates[i]->addNext(NULL); currentStates[i]->addNext(NULL);
std::cout << "About to de-perenthesize " << begin->toString() << std::endl;
//Now go through and expand the peren nodes to regular nodes
std::vector<RegExState*> processedStates;
std::vector<RegExState*> statesToProcess;
statesToProcess.push_back(begin);
for (std::vector<RegExState*>::size_type i = 0; i < statesToProcess.size(); i++) {
//Don't process null (sucess) state
if (statesToProcess[i] == NULL)
continue;
std::vector<RegExState*>* nextStates = statesToProcess[i]->getNextStates();
for (std::vector<RegExState*>::size_type j = 0; j < nextStates->size(); j++) {
if ((*nextStates)[j] != NULL && (*nextStates)[j]->getInner() != NULL) {
//Fix all the next references pointing to the peren node to point to the inner nodes. (if more than one, push back to add others)
std::vector<RegExState*>* insideNextStates = (*nextStates)[j]->getInner()->getNextStates();
std::cout << "insideNextStates = " << insideNextStates << " [0] " << (*insideNextStates)[0] << std::endl;
RegExState* perenState = (*nextStates)[j];
(*nextStates)[j] = (*insideNextStates)[0];
std::cout << "So now nextstates[j] = " << (*nextStates)[j] << std::endl;
for (std::vector<RegExState*>::size_type k = 1; k < insideNextStates->size(); k++)
nextStates->push_back((*insideNextStates)[k]);
std::cout << "Replaced beginning: " << begin->toString() << std::endl;
//Now, if the peren node is self-referential (has a repitition operator after i), fix it's self-references in the same manner
std::vector<RegExState*>* perenNextNodes = perenState->getNextStates();
for (std::vector<RegExState*>::size_type k = 0; k < perenNextNodes->size(); k++) {
if ((*perenNextNodes)[k] == perenState) {
(*perenNextNodes)[k] = (*insideNextStates)[0];
for (std::vector<RegExState*>::size_type l = 1; l < insideNextStates->size(); l++)
perenNextNodes->push_back((*insideNextStates)[l]);
}
}
//std::cout << "Fixed self-references: " << begin->toString() << std::endl;
//Need to fix the end too
std::vector<RegExState*> traversalList;
traversalList.push_back(perenState->getInner());
for (std::vector<RegExState*>::size_type k = 0; k < traversalList.size(); k++) {
std::vector<RegExState*>* nextTraversalStates = traversalList[k]->getNextStates();
std::cout << "Traversing! nextTraversalStates from traversalList " << traversalList[k] << " char = " << traversalList[k]->getCharacter() << std::endl;
std::cout << "with children: ";
for (std::vector<RegExState*>::size_type l = 0; l < nextTraversalStates->size(); l++)
std::cout << (*nextTraversalStates)[l]->getCharacter() << " ";
std::cout << std::endl;
for (std::vector<RegExState*>::size_type l = 0; l < nextTraversalStates->size(); l++) {
//If this node is equal to the peren node we came from, then that means we've reached the end of the inner part of the peren
//And we now replace this reference with the next nodes from the peren node
std::cout << "Traversal Next is on " << (*nextTraversalStates)[l]->getCharacter() << std::endl;
if ((*nextTraversalStates)[l] == perenState) {
std::cout << "nextTraversalStates[l] = to perenState!" << std::endl;
std::vector<RegExState*> endPerenNextStates = *(perenState->getNextStates());
(*nextTraversalStates)[l] = endPerenNextStates[0];
for (std::vector<RegExState*>::size_type n = 1; n < endPerenNextStates.size(); n++)
nextTraversalStates->push_back(endPerenNextStates[n]);
//Now make sure we don't now try to continue through and end up processing stuff we just replaced the peren reference with
break;
} else {
traversalList.push_back((*nextTraversalStates)[l]);
}
}
}
}
}
//Now add all these next states to process, only if they haven't already been processed
for (std::vector<RegExState*>::size_type j = 0; j < nextStates->size(); j++) {
bool inCurrStates = false;
for (std::vector<RegExState*>::size_type k = 0; k < statesToProcess.size(); k++) {
if ((*nextStates)[j] == statesToProcess[k])
inCurrStates = true;
}
if (!inCurrStates) {
statesToProcess.push_back((*nextStates)[j]);
//std::cout << (*nextStates)[j] << "Is not in states to process" << std::endl;
}
}
}
std::cout << "Finished de-perenthesization " << begin->toString() << std::endl;
} }
RegEx::~RegEx() { RegEx::~RegEx() {
@@ -103,7 +236,7 @@ int RegEx::longMatch(std::string stringToMatch) {
for (std::vector<RegExState*>::size_type j = 0; j < nextStates.size(); j++) { for (std::vector<RegExState*>::size_type j = 0; j < nextStates.size(); j++) {
bool inCurrStates = false; bool inCurrStates = false;
for (std::vector<RegExState*>::size_type k = 0; k < currentStates.size(); k++) { for (std::vector<RegExState*>::size_type k = 0; k < currentStates.size(); k++) {
if (nextStates[j] == currentStates[i]) if (nextStates[j] == currentStates[k])
inCurrStates = true; inCurrStates = true;
} }
if (!inCurrStates) if (!inCurrStates)
@@ -111,12 +244,7 @@ int RegEx::longMatch(std::string stringToMatch) {
} }
if (currentStates.size() != 0) if (currentStates.size() != 0)
std::cout << "Matched " << i << " character: " << stringToMatch[i-1] << std::endl; std::cout << "Matched " << i << " character: " << stringToMatch[i-1] << std::endl;
/*
std::cout << "Current states are: ";
for (std::vector<RegExState*>::size_type j = 0; j < currentStates.size(); j++)
std::cout << currentStates[j]->toString() << " ";
std::cout << std::endl;
*/
nextStates.clear(); nextStates.clear();
//If we can't continue matching, just return our last matched //If we can't continue matching, just return our last matched
if (currentStates.size() == 0) if (currentStates.size() == 0)

View File

@@ -35,6 +35,14 @@ std::vector<RegExState*>* RegExState::advance(char advanceCharacter) {
return advanceStates; return advanceStates;
} }
// Accessor for this state's `inner` pointer.
// The RegEx constructor treats a state whose inner pointer is non-NULL as a
// parenthesis ("peren") node and later expands it into ordinary states.
RegExState* RegExState::getInner() {
    return this->inner;
}
std::vector<RegExState*>* RegExState::getNextStates() {
return &nextStates;
}
bool RegExState::isGoal() { bool RegExState::isGoal() {
//return inner == NULL && nextStates.size() == 0; //return inner == NULL && nextStates.size() == 0;
for (std::vector<RegExState*>::size_type i = 0; i < nextStates.size(); i++) for (std::vector<RegExState*>::size_type i = 0; i < nextStates.size(); i++)
@@ -44,17 +52,48 @@ bool RegExState::isGoal() {
} }
std::string RegExState::toString() { std::string RegExState::toString() {
std::vector<RegExState*> avoidList;
return toString(&avoidList);
}
// Convenience overload: renders this state while treating the single state
// `avoid` as already visited. It seeds a one-element avoid list and defers to
// the list-based toString overload, which does the actual traversal.
std::string RegExState::toString(RegExState* avoid) {
    std::vector<RegExState*> visited(1, avoid);
    return toString(&visited);
}
std::string RegExState::toString(std::vector<RegExState*>* avoid) {
avoid->push_back(this);
std::string string = ""; std::string string = "";
string += std::string("\"") + character + "\""; string += std::string("\"") + character + "\"";
for (std::vector<RegExState*>::size_type i = 0; i < nextStates.size(); i++) if (inner != NULL) {
string += "inner: ";
string += inner->toString(avoid);
string += " end inner ";
}
for (std::vector<RegExState*>::size_type i = 0; i < nextStates.size(); i++) {
bool inAvoid = false;
for (std::vector<RegExState*>::size_type j = 0; j < avoid->size(); j++) {
if (nextStates[i] == (*avoid)[j]) {
inAvoid = true;
}
}
if (inAvoid) {
string += "->LoopDetected";
continue;
}
if (nextStates[i] != this && nextStates[i] != NULL) if (nextStates[i] != this && nextStates[i] != NULL)
string += "->" + nextStates[i]->toString() + " EC "; string += "->" + nextStates[i]->toString(avoid) + " EC ";
else if (nextStates[i] == NULL) else if (nextStates[i] == NULL)
string += "-> GOAL "; string += "-> GOAL ";
else else
string += "->this"; string += "->this";
}
//std::cout << "inner = " << inner << " nextStates size = " << nextStates.size() <<std::endl; //std::cout << "inner = " << inner << " nextStates size = " << nextStates.size() <<std::endl;
return string; return string;
} }
char RegExState::getCharacter() {
return character;
}