From 0d47a039862873387be143fe485512e869245fce Mon Sep 17 00:00:00 2001 From: Nathan Braswell Date: Tue, 7 Jan 2014 21:31:56 -0500 Subject: [PATCH] Fixed regex! Much simpler and sensible implementation too. --- include/RegEx.h | 3 +- include/util.h | 1 + krakenGrammer.kgm | 8 +- src/Lexer.cpp | 1 - src/RegEx.cpp | 234 ++++++++++++++-------------------------------- src/util.cpp | 12 +++ 6 files changed, 92 insertions(+), 167 deletions(-) diff --git a/include/RegEx.h b/include/RegEx.h index 7a9e7de..2e18c35 100644 --- a/include/RegEx.h +++ b/include/RegEx.h @@ -16,8 +16,7 @@ class RegEx { RegEx(std::string inPattern); ~RegEx(); - void construct(); - void deperenthesize(); + RegExState* construct(std::vector* ending, std::string pattern); int longMatch(std::string stringToMatch); std::string getPattern(); std::string toString(); diff --git a/include/util.h b/include/util.h index 42b3d8d..4cf10a6 100644 --- a/include/util.h +++ b/include/util.h @@ -12,5 +12,6 @@ std::string intToString(int theInt); std::string replaceExEscape(std::string first, std::string search, std::string replace); std::string strSlice(std::string str, int begin, int end); +int findPerenEnd(std::string str, int i); #endif diff --git a/krakenGrammer.kgm b/krakenGrammer.kgm index 43dff1b..d437ec0 100644 --- a/krakenGrammer.kgm +++ b/krakenGrammer.kgm @@ -19,7 +19,13 @@ if_comp = "__if_comp__" WS identifier WS if_comp_pred ; if_comp_pred = code_block | simple_passthrough ; simple_passthrough = "__simple_passthrough__" WS triple_quoted_string ; -triple_quoted_string = "((b)|a)*" ; +triple_quoted_string = "\"\"\"((\"\"(`|1|2|3|4|5|6|7|8|9|0|-|=| |q|w|e|r|t|y|u|i|o|p|[|]|\\|a|s|d|f|g|h|j|k|l|;|'| +|z|x|c|v|b|n|m|,|.|/|~|!|@|#|$|%|^|&|\*|\(|\)|_|\+|Q|W|E|R|T|Y|U|I|O|P|{|}|\||A|S|D|F|G|H|J|K|L|:|\"|Z|X|C|V|B|N|M|<|>|\?| )+)|(\"(`|1|2|3|4|5|6|7|8|9|0|-|=| |q|w|e|r|t|y|u|i|o|p|[|]|\\|a|s|d|f|g|h|j|k|l|;|'| +|z|x|c|v|b|n|m|,|.|/|~|!|@|#|$|%|^|&|\*|\(|\)|_|\+|Q|W|E|R|T|Y|U|I|O|P|{|}|\||A|S|D|F|G|H|J|K|L|:|\"|Z|X|C|V|B|N|M|<|>|\?| )+))*(`|1|2|3|4|5|6|7|8|9|0|-|=| |q|w|e|r|t|y|u|i|o|p|[|]|\\|a|s|d|f|g|h|j|k|l|;|'| +|z|x|c|v|b|n|m|,|.|/|~|!|@|#|$|%|^|&|\*|\(|\)|_|\+|Q|W|E|R|T|Y|U|I|O|P|{|}|\||A|S|D|F|G|H|J|K|L|:|\"|Z|X|C|V|B|N|M|<|>|\?| )*(((`|1|2|3|4|5|6|7|8|9|0|-|=| |q|w|e|r|t|y|u|i|o|p|[|]|\\|a|s|d|f|g|h|j|k|l|;|'| +|z|x|c|v|b|n|m|,|.|/|~|!|@|#|$|%|^|&|\*|\(|\)|_|\+|Q|W|E|R|T|Y|U|I|O|P|{|}|\||A|S|D|F|G|H|J|K|L|:|\"|Z|X|C|V|B|N|M|<|>|\?| )+\")|((`|1|2|3|4|5|6|7|8|9|0|-|=| |q|w|e|r|t|y|u|i|o|p|[|]|\\|a|s|d|f|g|h|j|k|l|;|'| +|z|x|c|v|b|n|m|,|.|/|~|!|@|#|$|%|^|&|\*|\(|\)|_|\+|Q|W|E|R|T|Y|U|I|O|P|{|}|\||A|S|D|F|G|H|J|K|L|:|\"|Z|X|C|V|B|N|M|<|>|\?| )+\"\")|((`|1|2|3|4|5|6|7|8|9|0|-|=| |q|w|e|r|t|y|u|i|o|p|[|]|\\|a|s|d|f|g|h|j|k|l|;|'| +|z|x|c|v|b|n|m|,|.|/|~|!|@|#|$|%|^|&|\*|\(|\)|_|\+|Q|W|E|R|T|Y|U|I|O|P|{|}|\||A|S|D|F|G|H|J|K|L|:|\"|Z|X|C|V|B|N|M|<|>|\?| )+))*\"\"\"" ; identifier = alpha | alpha alphanumeric ; diff --git a/src/Lexer.cpp b/src/Lexer.cpp index 9aa6059..b6f7033 100644 --- a/src/Lexer.cpp +++ b/src/Lexer.cpp @@ -20,7 +20,6 @@ void Lexer::setInput(std::string inputString) { } void Lexer::addRegEx(std::string regExString) { - std::cout << regExString << " at lexer" << std::endl; regExs.push_back(new RegEx(regExString)); } diff --git a/src/RegEx.cpp b/src/RegEx.cpp index 3b36158..5a408a6 100644 --- a/src/RegEx.cpp +++ b/src/RegEx.cpp @@ -2,49 +2,56 @@ #include RegEx::RegEx(std::string inPattern) { - std::cout << inPattern << " at rexex" << std::endl; pattern = inPattern; - construct(); - std::cout << inPattern << " at rexex post" << std::endl; - deperenthesize(); + std::vector ending; + begin = construct(&ending, inPattern); + //last one is goal state, add it to the end of all of these last states + for (std::vector::size_type i = 0; i < ending.size(); i++) + ending[i]->addNext(NULL); } -void RegEx::construct() { - std::vector previousStates; - std::vector currentStates; - std::stack, std::vector > > perenStack; +RegExState* RegEx::construct(std::vector* ending, std::string pattern) { + //In the RegEx re-write, instead of doing complicated unperenthesising, we keep track of both the "front" and the "end" of a state. + //(these could be different if the state is perenthesezed) + std::vector previousStatesBegin; + std::vector previousStatesEnd; + std::vector currentStatesBegin; + std::vector currentStatesEnd; + bool alternating = false; - begin = new RegExState(); - currentStates.push_back(begin); + RegExState* begin = new RegExState(); + currentStatesBegin.push_back(begin); + currentStatesEnd.push_back(begin); + for (int i = 0; i < pattern.length(); i++) { switch (pattern[i]) { case '*': { //std::cout << "Star at " << i << " in " << pattern << std::endl; - // for (std::vector::size_type j = 0; j < currentStates.size(); j++) - // for (std::vector::size_type k = 0; k < currentStates.size(); k++) - // currentStates[j]->addNext(currentStates[k]); - currentStates[currentStates.size()-1]->addNext(currentStates[currentStates.size()-1]); + //NOTE: Because of the re-write, this is necessary again + for (std::vector::size_type j = 0; j < currentStatesEnd.size(); j++) + for (std::vector::size_type k = 0; k < currentStatesBegin.size(); k++) + currentStatesEnd[j]->addNext(currentStatesBegin[k]); //Make the ends point to the beginnings //add all previous states to current states to enable skipping over the starred item - currentStates.insert(currentStates.end(), previousStates.begin(), previousStates.end()); + currentStatesBegin.insert(currentStatesBegin.end(), previousStatesBegin.begin(), previousStatesBegin.end()); + currentStatesEnd.insert(currentStatesEnd.end(), previousStatesEnd.begin(), previousStatesEnd.end()); } break; case '+': { //std::cout << "Plus at " << i << " in " << pattern << std::endl; - //OtherThingy - //current->addNext(current); - // for (std::vector::size_type j = 0; j < currentStates.size(); j++) - // for (std::vector::size_type k = 0; k < currentStates.size(); k++) - // currentStates[j]->addNext(currentStates[k]); - currentStates[currentStates.size()-1]->addNext(currentStates[currentStates.size()-1]); + //NOTE: Because of the re-write, this is necessary again + for (std::vector::size_type j = 0; j < currentStatesEnd.size(); j++) + for (std::vector::size_type k = 0; k < currentStatesBegin.size(); k++) + currentStatesEnd[j]->addNext(currentStatesBegin[k]); //Make the ends point to the beginnings } break; case '?': { //std::cout << "Question at " << i << " in " << pattern << std::endl; //add all previous states to current states to enable skipping over the questioned item - currentStates.insert(currentStates.end(), previousStates.begin(), previousStates.end()); + currentStatesBegin.insert(currentStatesBegin.end(), previousStatesBegin.begin(), previousStatesBegin.end()); + currentStatesEnd.insert(currentStatesEnd.end(), previousStatesEnd.begin(), previousStatesEnd.end()); } break; case '|': @@ -59,59 +66,31 @@ void RegEx::construct() { { //std::cout << "Begin peren at " << i << " in " << pattern << std::endl; //perentheses - //Create a peren node with an inner empty node - RegExState* next = new RegExState(new RegExState()); - + std::vector innerEnds; + int perenEnd = findPerenEnd(pattern, i); + RegExState* innerBegin = construct(&innerEnds, strSlice(pattern, i+1, perenEnd)); + i = perenEnd; + std::vector innerBegins = *(innerBegin->getNextStates()); if (alternating) { - for (std::vector::size_type j = 0; j < previousStates.size(); j++) - previousStates[j]->addNext(next); - - //Save both current states here as well as the current preren - std::vector savePreviousStates = previousStates; - currentStates.push_back(next); - std::vector saveCurrentStates = currentStates; - perenStack.push(std::make_pair(savePreviousStates, saveCurrentStates)); - - previousStates.clear(); - currentStates.clear(); - currentStates.push_back(next->getInner()); - alternating = false; + for (std::vector::size_type j = 0; j < previousStatesEnd.size(); j++) + for (std::vector::size_type k = 0; k < innerBegins.size(); k++) + previousStatesEnd[j]->addNext(innerBegins[k]); + currentStatesBegin.insert(currentStatesBegin.end(), innerBegins.begin(), innerBegins.end()); + currentStatesEnd.insert(currentStatesEnd.end(), innerEnds.begin(), innerEnds.end()); } else { - for (std::vector::size_type j = 0; j < currentStates.size(); j++) - currentStates[j]->addNext(next); - - //Save both current states here as well as the current preren - std::vector savePreviousStates = currentStates; - currentStates.clear(); - currentStates.push_back(next); - std::vector saveCurrentStates = currentStates; - perenStack.push(std::make_pair(savePreviousStates, saveCurrentStates)); - - previousStates.clear(); - currentStates.clear(); - currentStates.push_back(next->getInner()); + for (std::vector::size_type j = 0; j < currentStatesEnd.size(); j++) + for (std::vector::size_type k = 0; k < innerBegins.size(); k++) + currentStatesEnd[j]->addNext(innerBegins[k]); + previousStatesBegin = currentStatesBegin; + previousStatesEnd = currentStatesEnd; + currentStatesBegin = innerBegins; + currentStatesEnd = innerEnds; } - //std::cout << "Peren is " << next << " Inner is " << currentStates[0] << " = " << next->getInner() << std::endl; + alternating = false; } break; - case ')': - { - //std::cout << "End peren at " << i << " in " << pattern << std::endl; - //perentheses - //Pop off the states that will now be the previous states and the peren node which will now be the current node - std::pair, std::vector > savedPair = perenStack.top(); - perenStack.pop(); - //Make the it so - previousStates = savedPair.first; - //Make sure the end of the inner stuff points back to the peren node - for (std::vector::size_type j = 0; j < currentStates.size(); j++) - currentStates[j]->addNext(savedPair.second[savedPair.second.size()-1]); - //currentStates[j]->addNext(*(savedPair.second.end())); - currentStates.clear(); - currentStates = savedPair.second; - } - break; + // ) does not need a case as we skip over it after finding it in ('s case case '\\': { @@ -126,109 +105,33 @@ void RegEx::construct() { RegExState* next = new RegExState(pattern[i]); //If we're alternating, add next as the next for each previous state, and add self to currentStates if (alternating) { - for (std::vector::size_type j = 0; j < previousStates.size(); j++) { - previousStates[j]->addNext(next); - //std::cout << "Adding " << next << ", which is " << pattern[i] << " to " << previousStates[j] << std::endl; - } - currentStates.push_back(next); + for (std::vector::size_type j = 0; j < previousStatesEnd.size(); j++) + previousStatesEnd[j]->addNext(next); + currentStatesBegin.push_back(next); + currentStatesEnd.push_back(next); alternating = false; } else { //If we're not alternating, add next as next for all the current states, make the current states the new //previous states, and add ourself as the new current state. - for (std::vector::size_type j = 0; j < currentStates.size(); j++) { - currentStates[j]->addNext(next); - //std::cout << "Adding " << next << ", which is " << pattern[i] << " to " << currentStates[j] << std::endl; - } - previousStates.clear(); - previousStates = currentStates; - currentStates.clear(); - currentStates.push_back(next); + for (std::vector::size_type j = 0; j < currentStatesEnd.size(); j++) + currentStatesEnd[j]->addNext(next); + + previousStatesBegin.clear(); + previousStatesEnd.clear(); + previousStatesBegin = currentStatesBegin; + previousStatesEnd = currentStatesEnd; + currentStatesBegin.clear(); + currentStatesEnd.clear(); + currentStatesBegin.push_back(next); + currentStatesEnd.push_back(next); } } } } - //last one is goal state - for (std::vector::size_type i = 0; i < currentStates.size(); i++) - currentStates[i]->addNext(NULL); + (*ending) = currentStatesEnd; + return(begin); } -void RegEx::deperenthesize() { - //std::cout << "About to de-perenthesize " << begin->toString() << std::endl; - - //Now go through and expand the peren nodes to regular nodes - std::vector processedStates; - std::vector statesToProcess; - statesToProcess.push_back(begin); - for (std::vector::size_type i = 0; i < statesToProcess.size(); i++) { - //Don't process null (sucess) state - if (statesToProcess[i] == NULL) - continue; - std::vector* nextStates = statesToProcess[i]->getNextStates(); - for (std::vector::size_type j = 0; j < nextStates->size(); j++) { - if ((*nextStates)[j] != NULL && (*nextStates)[j]->getInner() != NULL) { - //Fix all the next references pointing to the peren node to point to the inner nodes. (if more than one, push back to add others) - std::vector* insideNextStates = (*nextStates)[j]->getInner()->getNextStates(); - //std::cout << "insideNextStates = " << insideNextStates << " [0] " << (*insideNextStates)[0] << std::endl; - RegExState* perenState = (*nextStates)[j]; - (*nextStates)[j] = (*insideNextStates)[0]; - //std::cout << "So now nextstates[j] = " << (*nextStates)[j] << std::endl; - for (std::vector::size_type k = 1; k < insideNextStates->size(); k++) - nextStates->push_back((*insideNextStates)[k]); - //std::cout << "Replaced beginning: " << begin->toString() << std::endl; - //Now, if the peren node is self-referential (has a repitition operator after i), fix it's self-references in the same manner - std::vector* perenNextNodes = perenState->getNextStates(); - for (std::vector::size_type k = 0; k < perenNextNodes->size(); k++) { - if ((*perenNextNodes)[k] == perenState) { - (*perenNextNodes)[k] = (*insideNextStates)[0]; - for (std::vector::size_type l = 1; l < insideNextStates->size(); l++) - perenNextNodes->push_back((*insideNextStates)[l]); - } - } - //std::cout << "Fixed self-references: " << begin->toString() << std::endl; - //Need to fix the end too - std::vector traversalList; - traversalList.push_back(perenState->getInner()); - for (std::vector::size_type k = 0; k < traversalList.size(); k++) { - std::vector* nextTraversalStates = traversalList[k]->getNextStates(); - //std::cout << "Traversing! nextTraversalStates from traversalList " << traversalList[k] << " char = " << traversalList[k]->getCharacter() << std::endl; - //std::cout << "with children:" << std::endl; - //for (std::vector::size_type l = 0; l < nextTraversalStates->size(); l++) - // std::cout << "\t\"" << (*nextTraversalStates)[l]->getCharacter() << "\"" << std::endl; - //std::cout << std::endl; - for (std::vector::size_type l = 0; l < nextTraversalStates->size(); l++) { - //If this node is equal to the peren node we came from, then that means we've reached the end of the inner part of the peren - //And we now replace this reference with the next nodes from the peren node - //std::cout << "Traversal Next is on " << (*nextTraversalStates)[l]->getCharacter() << std::endl; - if ((*nextTraversalStates)[l] == perenState) { - // std::cout << "nextTraversalStates[l] = to perenState!" << std::endl; - std::vector endPerenNextStates = *(perenState->getNextStates()); - (*nextTraversalStates)[l] = endPerenNextStates[0]; - for (std::vector::size_type n = 1; n < endPerenNextStates.size(); n++) - nextTraversalStates->push_back(endPerenNextStates[n]); - //Now make sure we don't now try to continue through and end up processing stuff we just replaced the peren reference with - break; - } else { - traversalList.push_back((*nextTraversalStates)[l]); - } - } - } - } - } - //Now add all these next states to process, only if they haven't already been processed - for (std::vector::size_type j = 0; j < nextStates->size(); j++) { - bool inCurrStates = false; - for (std::vector::size_type k = 0; k < statesToProcess.size(); k++) { - if ((*nextStates)[j] == statesToProcess[k]) - inCurrStates = true; - } - if (!inCurrStates) { - statesToProcess.push_back((*nextStates)[j]); - //std::cout << (*nextStates)[j] << "Is not in states to process" << std::endl; - } - } - } - //std::cout << "Finished de-perenthesization " << begin->toString() << std::endl; -} RegEx::~RegEx() { //No cleanup necessary @@ -315,9 +218,14 @@ void RegEx::test() { { RegEx re("((ab)|c)*"); assert(re.longMatch("ababc") == 5); - assert(re.longMatch("ad") == 1); + assert(re.longMatch("ad") == 0); assert(re.longMatch("ababccd") == 6); } + { + RegEx re("bbb((bba+)|(ba+))*a*((a+b)|(a+bb)|(a+))*bbb") ; + std::cout << re.longMatch("bbbababbbaaaaaaaaaaaaaaaaaaabbb") << std::endl; + + } std::cout << "RegEx tests pass\n"; } diff --git a/src/util.cpp b/src/util.cpp index 03d5cd3..a08e776 100644 --- a/src/util.cpp +++ b/src/util.cpp @@ -40,3 +40,15 @@ std::string strSlice(std::string str, int begin, int end) { end += str.length()+1; return str.substr(begin, end-begin); } + +int findPerenEnd(std::string str, int i) { + int numHangingOpen = 0; + for (; i< str.length(); i++) { + if (str[i] == '(') + numHangingOpen++; + else if (str[i] == ')') + numHangingOpen--; + if (numHangingOpen == 0) + return i; + } +}