2013-07-01 22:45:33 -04:00
# include "RegEx.h"
// Builds a matcher for `inPattern`: stores the pattern text, builds the state
// graph with construct(), then expands parenthesis nodes with deperenthesize().
RegEx::RegEx(std::string inPattern) {
    this->pattern = inPattern;
    this->construct();
    this->deperenthesize();
}
void RegEx : : construct ( ) {
2013-07-03 23:40:36 -04:00
std : : vector < RegExState * > previousStates ;
std : : vector < RegExState * > currentStates ;
2013-07-10 23:50:53 -04:00
std : : stack < std : : pair < std : : vector < RegExState * > , std : : vector < RegExState * > > > perenStack ;
bool alternating = false ;
2013-07-02 13:14:40 -04:00
begin = new RegExState ( ) ;
2013-07-03 23:40:36 -04:00
currentStates . push_back ( begin ) ;
2013-07-02 13:14:40 -04:00
for ( int i = 0 ; i < pattern . length ( ) ; i + + ) {
2013-07-03 23:40:36 -04:00
switch ( pattern [ i ] ) {
case ' * ' :
{
2013-07-28 19:45:08 -04:00
//std::cout << "Star at " << i << " in " << pattern << std::endl;
2013-07-04 15:10:32 -04:00
// for (std::vector<RegExState*>::size_type j = 0; j < currentStates.size(); j++)
// for (std::vector<RegExState*>::size_type k = 0; k < currentStates.size(); k++)
// currentStates[j]->addNext(currentStates[k]);
currentStates [ currentStates . size ( ) - 1 ] - > addNext ( currentStates [ currentStates . size ( ) - 1 ] ) ;
2013-07-03 23:40:36 -04:00
//add all previous states to current states to enable skipping over the starred item
currentStates . insert ( currentStates . end ( ) , previousStates . begin ( ) , previousStates . end ( ) ) ;
}
break ;
case ' + ' :
{
2013-07-28 19:45:08 -04:00
//std::cout << "Plus at " << i << " in " << pattern << std::endl;
2013-07-03 23:40:36 -04:00
//OtherThingy
//current->addNext(current);
2013-07-04 15:10:32 -04:00
// for (std::vector<RegExState*>::size_type j = 0; j < currentStates.size(); j++)
// for (std::vector<RegExState*>::size_type k = 0; k < currentStates.size(); k++)
// currentStates[j]->addNext(currentStates[k]);
currentStates [ currentStates . size ( ) - 1 ] - > addNext ( currentStates [ currentStates . size ( ) - 1 ] ) ;
2013-07-03 23:40:36 -04:00
}
break ;
case ' ? ' :
{
2013-07-28 19:45:08 -04:00
//std::cout << "Question at " << i << " in " << pattern << std::endl;
2013-07-03 23:40:36 -04:00
//add all previous states to current states to enable skipping over the questioned item
currentStates . insert ( currentStates . end ( ) , previousStates . begin ( ) , previousStates . end ( ) ) ;
}
break ;
case ' | ' :
2013-07-04 15:10:32 -04:00
{
2013-07-28 19:45:08 -04:00
//std::cout << "Alternation at " << i << " in " << pattern << std::endl;
2013-07-03 23:40:36 -04:00
//alternation
2013-07-10 23:50:53 -04:00
alternating = true ;
2013-07-04 15:10:32 -04:00
}
2013-07-03 23:40:36 -04:00
break ;
case ' ( ' :
2013-07-07 02:13:05 -04:00
{
2013-07-28 19:45:08 -04:00
//std::cout << "Begin peren at " << i << " in " << pattern << std::endl;
2013-07-03 23:40:36 -04:00
//perentheses
2013-07-07 02:13:05 -04:00
//Create a peren node with an inner empty node
RegExState * next = new RegExState ( new RegExState ( ) ) ;
2013-07-10 23:50:53 -04:00
if ( alternating ) {
for ( std : : vector < RegExState * > : : size_type j = 0 ; j < previousStates . size ( ) ; j + + )
previousStates [ j ] - > addNext ( next ) ;
2013-07-07 02:13:05 -04:00
2013-07-10 23:50:53 -04:00
//Save both current states here as well as the current preren
std : : vector < RegExState * > savePreviousStates = previousStates ;
currentStates . push_back ( next ) ;
std : : vector < RegExState * > saveCurrentStates = currentStates ;
perenStack . push ( std : : make_pair ( savePreviousStates , saveCurrentStates ) ) ;
2013-07-07 02:13:05 -04:00
2013-07-10 23:50:53 -04:00
previousStates . clear ( ) ;
currentStates . clear ( ) ;
currentStates . push_back ( next - > getInner ( ) ) ;
alternating = false ;
} else {
for ( std : : vector < RegExState * > : : size_type j = 0 ; j < currentStates . size ( ) ; j + + )
currentStates [ j ] - > addNext ( next ) ;
//Save both current states here as well as the current preren
std : : vector < RegExState * > savePreviousStates = currentStates ;
currentStates . clear ( ) ;
currentStates . push_back ( next ) ;
std : : vector < RegExState * > saveCurrentStates = currentStates ;
perenStack . push ( std : : make_pair ( savePreviousStates , saveCurrentStates ) ) ;
previousStates . clear ( ) ;
currentStates . clear ( ) ;
currentStates . push_back ( next - > getInner ( ) ) ;
}
2013-07-28 19:45:08 -04:00
//std::cout << "Peren is " << next << " Inner is " << currentStates[0] << " = " << next->getInner() << std::endl;
2013-07-07 02:13:05 -04:00
}
break ;
case ' ) ' :
{
2013-07-28 19:45:08 -04:00
//std::cout << "End peren at " << i << " in " << pattern << std::endl;
2013-07-07 02:13:05 -04:00
//perentheses
//Pop off the states that will now be the previous states and the peren node which will now be the current node
2013-07-10 23:50:53 -04:00
std : : pair < std : : vector < RegExState * > , std : : vector < RegExState * > > savedPair = perenStack . top ( ) ;
2013-07-07 02:13:05 -04:00
perenStack . pop ( ) ;
//Make the it so
previousStates = savedPair . first ;
//Make sure the end of the inner stuff points back to the peren node
for ( std : : vector < RegExState * > : : size_type j = 0 ; j < currentStates . size ( ) ; j + + )
2013-07-10 23:50:53 -04:00
currentStates [ j ] - > addNext ( savedPair . second [ savedPair . second . size ( ) - 1 ] ) ;
//currentStates[j]->addNext(*(savedPair.second.end()));
2013-07-07 02:13:05 -04:00
currentStates . clear ( ) ;
2013-07-10 23:50:53 -04:00
currentStates = savedPair . second ;
2013-07-07 02:13:05 -04:00
}
break ;
case ' \\ ' :
{
i + + ;
2013-07-28 19:45:08 -04:00
//std::cout << "Escape! Escaping: " << pattern[i] << std::endl;
2013-07-10 23:50:53 -04:00
//Ahh, it's escaping a special character, so fall through to the default.
2013-07-07 02:13:05 -04:00
}
2013-07-03 23:40:36 -04:00
default :
{
2013-07-28 19:45:08 -04:00
//std::cout << "Regular" << std::endl;
2013-07-03 23:40:36 -04:00
//Ahh, it's regular
RegExState * next = new RegExState ( pattern [ i ] ) ;
2013-07-10 23:50:53 -04:00
//If we're alternating, add next as the next for each previous state, and add self to currentStates
if ( alternating ) {
for ( std : : vector < RegExState * > : : size_type j = 0 ; j < previousStates . size ( ) ; j + + ) {
previousStates [ j ] - > addNext ( next ) ;
2013-07-28 19:45:08 -04:00
//std::cout << "Adding " << next << ", which is " << pattern[i] << " to " << previousStates[j] << std::endl;
2013-07-10 23:50:53 -04:00
}
currentStates . push_back ( next ) ;
alternating = false ;
} else {
//If we're not alternating, add next as next for all the current states, make the current states the new
//previous states, and add ourself as the new current state.
for ( std : : vector < RegExState * > : : size_type j = 0 ; j < currentStates . size ( ) ; j + + ) {
currentStates [ j ] - > addNext ( next ) ;
2013-07-28 19:45:08 -04:00
//std::cout << "Adding " << next << ", which is " << pattern[i] << " to " << currentStates[j] << std::endl;
2013-07-10 23:50:53 -04:00
}
previousStates . clear ( ) ;
previousStates = currentStates ;
currentStates . clear ( ) ;
currentStates . push_back ( next ) ;
2013-07-07 02:13:05 -04:00
}
2013-07-03 23:40:36 -04:00
}
}
2013-07-01 22:45:33 -04:00
}
2013-07-03 23:40:36 -04:00
//last one is goal state
for ( std : : vector < RegExState * > : : size_type i = 0 ; i < currentStates . size ( ) ; i + + )
currentStates [ i ] - > addNext ( NULL ) ;
2013-07-10 23:50:53 -04:00
}
2013-07-07 02:13:05 -04:00
2013-07-10 23:50:53 -04:00
void RegEx : : deperenthesize ( ) {
2013-07-28 19:45:08 -04:00
//std::cout << "About to de-perenthesize " << begin->toString() << std::endl;
2013-07-07 02:13:05 -04:00
//Now go through and expand the peren nodes to regular nodes
std : : vector < RegExState * > processedStates ;
std : : vector < RegExState * > statesToProcess ;
statesToProcess . push_back ( begin ) ;
for ( std : : vector < RegExState * > : : size_type i = 0 ; i < statesToProcess . size ( ) ; i + + ) {
//Don't process null (sucess) state
if ( statesToProcess [ i ] = = NULL )
continue ;
std : : vector < RegExState * > * nextStates = statesToProcess [ i ] - > getNextStates ( ) ;
for ( std : : vector < RegExState * > : : size_type j = 0 ; j < nextStates - > size ( ) ; j + + ) {
if ( ( * nextStates ) [ j ] ! = NULL & & ( * nextStates ) [ j ] - > getInner ( ) ! = NULL ) {
//Fix all the next references pointing to the peren node to point to the inner nodes. (if more than one, push back to add others)
std : : vector < RegExState * > * insideNextStates = ( * nextStates ) [ j ] - > getInner ( ) - > getNextStates ( ) ;
2013-07-10 23:50:53 -04:00
//std::cout << "insideNextStates = " << insideNextStates << " [0] " << (*insideNextStates)[0] << std::endl;
2013-07-07 02:13:05 -04:00
RegExState * perenState = ( * nextStates ) [ j ] ;
( * nextStates ) [ j ] = ( * insideNextStates ) [ 0 ] ;
2013-07-10 23:50:53 -04:00
//std::cout << "So now nextstates[j] = " << (*nextStates)[j] << std::endl;
2013-07-07 02:13:05 -04:00
for ( std : : vector < RegExState * > : : size_type k = 1 ; k < insideNextStates - > size ( ) ; k + + )
nextStates - > push_back ( ( * insideNextStates ) [ k ] ) ;
2013-07-10 23:50:53 -04:00
//std::cout << "Replaced beginning: " << begin->toString() << std::endl;
2013-07-07 02:13:05 -04:00
//Now, if the peren node is self-referential (has a repitition operator after i), fix it's self-references in the same manner
std : : vector < RegExState * > * perenNextNodes = perenState - > getNextStates ( ) ;
for ( std : : vector < RegExState * > : : size_type k = 0 ; k < perenNextNodes - > size ( ) ; k + + ) {
if ( ( * perenNextNodes ) [ k ] = = perenState ) {
( * perenNextNodes ) [ k ] = ( * insideNextStates ) [ 0 ] ;
for ( std : : vector < RegExState * > : : size_type l = 1 ; l < insideNextStates - > size ( ) ; l + + )
perenNextNodes - > push_back ( ( * insideNextStates ) [ l ] ) ;
}
}
//std::cout << "Fixed self-references: " << begin->toString() << std::endl;
//Need to fix the end too
std : : vector < RegExState * > traversalList ;
traversalList . push_back ( perenState - > getInner ( ) ) ;
for ( std : : vector < RegExState * > : : size_type k = 0 ; k < traversalList . size ( ) ; k + + ) {
std : : vector < RegExState * > * nextTraversalStates = traversalList [ k ] - > getNextStates ( ) ;
2013-07-10 23:50:53 -04:00
//std::cout << "Traversing! nextTraversalStates from traversalList " << traversalList[k] << " char = " << traversalList[k]->getCharacter() << std::endl;
//std::cout << "with children:" << std::endl;
//for (std::vector<RegExState*>::size_type l = 0; l < nextTraversalStates->size(); l++)
// std::cout << "\t\"" << (*nextTraversalStates)[l]->getCharacter() << "\"" << std::endl;
//std::cout << std::endl;
2013-07-07 02:13:05 -04:00
for ( std : : vector < RegExState * > : : size_type l = 0 ; l < nextTraversalStates - > size ( ) ; l + + ) {
//If this node is equal to the peren node we came from, then that means we've reached the end of the inner part of the peren
//And we now replace this reference with the next nodes from the peren node
2013-07-10 23:50:53 -04:00
//std::cout << "Traversal Next is on " << (*nextTraversalStates)[l]->getCharacter() << std::endl;
2013-07-07 02:13:05 -04:00
if ( ( * nextTraversalStates ) [ l ] = = perenState ) {
2013-07-10 23:50:53 -04:00
// std::cout << "nextTraversalStates[l] = to perenState!" << std::endl;
2013-07-07 02:13:05 -04:00
std : : vector < RegExState * > endPerenNextStates = * ( perenState - > getNextStates ( ) ) ;
( * nextTraversalStates ) [ l ] = endPerenNextStates [ 0 ] ;
for ( std : : vector < RegExState * > : : size_type n = 1 ; n < endPerenNextStates . size ( ) ; n + + )
nextTraversalStates - > push_back ( endPerenNextStates [ n ] ) ;
//Now make sure we don't now try to continue through and end up processing stuff we just replaced the peren reference with
break ;
} else {
traversalList . push_back ( ( * nextTraversalStates ) [ l ] ) ;
}
}
}
}
}
//Now add all these next states to process, only if they haven't already been processed
for ( std : : vector < RegExState * > : : size_type j = 0 ; j < nextStates - > size ( ) ; j + + ) {
bool inCurrStates = false ;
for ( std : : vector < RegExState * > : : size_type k = 0 ; k < statesToProcess . size ( ) ; k + + ) {
if ( ( * nextStates ) [ j ] = = statesToProcess [ k ] )
inCurrStates = true ;
}
if ( ! inCurrStates ) {
statesToProcess . push_back ( ( * nextStates ) [ j ] ) ;
//std::cout << (*nextStates)[j] << "Is not in states to process" << std::endl;
}
}
}
2013-07-28 19:45:08 -04:00
//std::cout << "Finished de-perenthesization " << begin->toString() << std::endl;
2013-07-01 22:45:33 -04:00
}
// Intentionally empty: nothing is released here, including the states that
// construct() allocates with `new`.
// NOTE(review): confirm whether those states are freed elsewhere; if not, they leak.
RegEx::~RegEx()
{
}
int RegEx : : longMatch ( std : : string stringToMatch ) {
2013-07-02 01:47:42 -04:00
//If the beginning character is wrong, exit immediantly. Otherwise, get all the states we can get from adding the second character to the state where we accepted the first
2013-07-02 13:14:40 -04:00
int lastMatch = - 1 ;
currentStates = * ( begin - > advance ( stringToMatch [ 0 ] ) ) ;
2013-07-01 22:45:33 -04:00
std : : vector < RegExState * > nextStates ;
2013-07-02 13:14:40 -04:00
for ( int i = 1 ; i < stringToMatch . size ( ) ; i + + ) {
2013-07-01 22:45:33 -04:00
//Go through every current state. Check to see if it is goal, if so update last goal.
//Also, add each state's advance to nextStates
for ( std : : vector < RegExState * > : : size_type j = 0 ; j < currentStates . size ( ) ; j + + ) {
2013-07-02 01:47:42 -04:00
if ( currentStates [ j ] - > isGoal ( ) ) {
2013-07-01 22:45:33 -04:00
lastMatch = i - 1 ;
2013-08-06 01:49:45 -04:00
//std::cout << "Hit goal at " << i << " character: " << stringToMatch[i-1] << std::endl;
2013-07-02 01:47:42 -04:00
} else {
2013-08-06 01:49:45 -04:00
//std::cout << "currentState " << j << ", " << currentStates[j]->toString() << " is not goal" <<std::endl;
2013-07-02 01:47:42 -04:00
}
2013-07-01 22:45:33 -04:00
std : : vector < RegExState * > * addStates = currentStates [ j ] - > advance ( stringToMatch . at ( i ) ) ;
nextStates . insert ( nextStates . end ( ) , addStates - > begin ( ) , addStates - > end ( ) ) ;
delete addStates ;
}
//Now, clear our current states and add eaczh one of our addStates if it is not already in current states
2013-07-02 01:47:42 -04:00
2013-07-01 22:45:33 -04:00
currentStates . clear ( ) ;
for ( std : : vector < RegExState * > : : size_type j = 0 ; j < nextStates . size ( ) ; j + + ) {
bool inCurrStates = false ;
for ( std : : vector < RegExState * > : : size_type k = 0 ; k < currentStates . size ( ) ; k + + ) {
2013-07-07 02:13:05 -04:00
if ( nextStates [ j ] = = currentStates [ k ] )
2013-07-01 22:45:33 -04:00
inCurrStates = true ;
}
if ( ! inCurrStates )
currentStates . push_back ( nextStates [ j ] ) ;
}
2013-08-16 00:03:26 -04:00
// if (currentStates.size() != 0)
// std::cout << "Matched " << i << " character: " << stringToMatch[i-1] << std::endl;
2013-07-07 02:13:05 -04:00
2013-07-01 22:45:33 -04:00
nextStates . clear ( ) ;
//If we can't continue matching, just return our last matched
if ( currentStates . size ( ) = = 0 )
break ;
}
//Check to see if we match on the last character in the string
for ( std : : vector < RegExState * > : : size_type j = 0 ; j < currentStates . size ( ) ; j + + ) {
if ( currentStates [ j ] - > isGoal ( ) )
lastMatch = stringToMatch . size ( ) - 1 ;
}
return lastMatch ;
}
std : : string RegEx : : getPattern ( ) {
return pattern ;
}
2013-07-02 01:47:42 -04:00
std : : string RegEx : : toString ( ) {
return pattern + " -> " + begin - > toString ( ) ;
}