diff --git a/include/Lexer.h b/include/Lexer.h index 8e87d84..02223a9 100644 --- a/include/Lexer.h +++ b/include/Lexer.h @@ -16,9 +16,10 @@ class Lexer { void addRegEx(std::string regExString); void setInput(std::string inputString); Symbol next(); + static void test(); private: std::vector regExs; std::string input; int currentPosition; }; -#endif \ No newline at end of file +#endif diff --git a/include/RegEx.h b/include/RegEx.h index 77db781..7a9e7de 100644 --- a/include/RegEx.h +++ b/include/RegEx.h @@ -21,9 +21,10 @@ class RegEx { int longMatch(std::string stringToMatch); std::string getPattern(); std::string toString(); + static void test(); private: std::string pattern; RegExState* begin; std::vector currentStates; }; -#endif \ No newline at end of file +#endif diff --git a/main.cpp b/main.cpp index 1c1bf8e..a6d0fe0 100644 --- a/main.cpp +++ b/main.cpp @@ -19,6 +19,8 @@ int main(int argc, char* argv[]) { if (argc == 2 && std::string(argv[1]) == "--test") { StringReader::test(); + RegEx::test(); + Lexer::test(); return 0; } diff --git a/src/Lexer.cpp b/src/Lexer.cpp index 72055ea..a8dccea 100644 --- a/src/Lexer.cpp +++ b/src/Lexer.cpp @@ -1,4 +1,5 @@ #include "Lexer.h" +#include Lexer::Lexer() { //Do nothing @@ -23,13 +24,13 @@ void Lexer::addRegEx(std::string regExString) { } Symbol Lexer::next() { - //std::cout << "Current at is \"" << input.substr(currentPosition,input.length()-1) << "\" currentPos is " << currentPosition << " out of " << input.length() <= input.length()-1) + if (currentPosition >= input.length()) return Symbol("$EOF$", true); int longestMatch = -1; RegEx* longestRegEx = NULL; - std::string remainingString = input.substr(currentPosition,input.length()-1); + std::string remainingString = input.substr(currentPosition); for (std::vector::size_type i = 0; i < regExs.size(); i++) { //std::cout << "Trying regex " << regExs[i]->getPattern() << std::endl; int currentMatch = regExs[i]->longMatch(remainingString); @@ -39,13 +40,77 @@ Symbol Lexer::next() { } } if (longestRegEx != NULL) { - std::string eatenString = input.substr(currentPosition, longestMatch+1); - currentPosition += longestMatch + 1; - //std::cout << "Current at is \"" << input.substr(currentPosition,input.length()-1) << "\" currentPos is " << currentPosition <getPattern(), true, eatenString); } else { //std::cout << "Found no applicable regex" << std::endl; - //std::cout << "Remaining is ||" << input.substr(currentPosition,input.length()-1) << "||" << std::endl; + //std::cout << "Remaining is ||" << input.substr(currentPosition) << "||" << std::endl; return Symbol(); } -} \ No newline at end of file +} + +void Lexer::test() { + Symbol s; + { + Lexer lex; + lex.addRegEx("b"); + lex.setInput("bb"); + s = lex.next(); + assert(s.getName() == "b" && s.getValue() == "b"); + s = lex.next(); + assert(s.getName() == "b" && s.getValue() == "b"); + assert(lex.next() == Symbol("$EOF$", true)); + } + + { + Lexer lex; + lex.addRegEx("a*"); + lex.addRegEx("b"); + lex.setInput("aaabaabb"); + s = lex.next(); + assert(s.getName() == "a*" && s.getValue() == "aaa"); + s = lex.next(); + assert(s.getName() == "b" && s.getValue() == "b"); + s = lex.next(); + assert(s.getName() == "a*" && s.getValue() == "aa"); + s = lex.next(); + assert(s.getName() == "b" && s.getValue() == "b"); + s = lex.next(); + assert(s.getName() == "b" && s.getValue() == "b"); + assert(lex.next() == Symbol("$EOF$", true)); + } + + // Test a lexer error condition. + { + Lexer lex; + lex.addRegEx("a|b"); + lex.setInput("blah"); + s = lex.next(); + assert(s.getName() == "a|b" && s.getValue() == "b"); + assert(lex.next() == Symbol()); + } + + // Lexer can consume all the input at once. + { + Lexer lex; + lex.addRegEx("xyzzy"); + lex.setInput("xyzzy"); + s = lex.next(); + assert(s.getName() == "xyzzy" && s.getValue() == "xyzzy"); + assert(lex.next() == Symbol("$EOF$", true)); + } + + // Lexer produces the longest match, not the first. + { + Lexer lex; + lex.addRegEx("int"); + lex.addRegEx("(i|n|t|e)+"); + lex.setInput("intent"); + s = lex.next(); + assert(s.getName() == "(i|n|t|e)+" && s.getValue() == "intent"); + } + + std::cout << "Lexer tests passed\n"; +} diff --git a/src/RegEx.cpp b/src/RegEx.cpp index 293acbe..b54f6ab 100644 --- a/src/RegEx.cpp +++ b/src/RegEx.cpp @@ -1,4 +1,5 @@ #include "RegEx.h" +#include RegEx::RegEx(std::string inPattern) { pattern = inPattern; @@ -232,17 +233,18 @@ RegEx::~RegEx() { } int RegEx::longMatch(std::string stringToMatch) { - //If the beginning character is wrong, exit immediantly. Otherwise, get all the states we can get from adding the second character to the state where we accepted the first + // Start in the begin state (only). int lastMatch = -1; - currentStates = *(begin->advance(stringToMatch[0])); + currentStates.clear(); + currentStates.push_back(begin); std::vector nextStates; - for (int i = 1; i < stringToMatch.size(); i++) { + for (int i = 0; i < stringToMatch.size(); i++) { //Go through every current state. Check to see if it is goal, if so update last goal. //Also, add each state's advance to nextStates for (std::vector::size_type j = 0; j < currentStates.size(); j++) { if (currentStates[j]->isGoal()) { - lastMatch = i-1; + lastMatch = i; //std::cout << "Hit goal at " << i << " character: " << stringToMatch[i-1] << std::endl; } else { //std::cout << "currentState " << j << ", " << currentStates[j]->toString() << " is not goal" <::size_type j = 0; j < currentStates.size(); j++) { if (currentStates[j]->isGoal()) - lastMatch = stringToMatch.size()-1; + lastMatch = stringToMatch.size(); } return lastMatch; } @@ -286,3 +288,27 @@ std::string RegEx::getPattern() { std::string RegEx::toString() { return pattern + " -> " + begin->toString(); } + +void RegEx::test() { + { + RegEx re("a*"); + assert(re.longMatch("a") == 1); + assert(re.longMatch("aa") == 2); + assert(re.longMatch("aaaab") == 4); + assert(re.longMatch("b") == 0); + } + + { + RegEx re("a+"); + assert(re.longMatch("aa") == 2); + assert(re.longMatch("aaaab") == 4); + assert(re.longMatch("b") == -1); + } + + { + RegEx re("a(bc)?"); + assert(re.longMatch("ab") == 1); + } + + std::cout << "RegEx tests pass\n"; +}