From 7859b297254dfa180a71e9140d7904902719053d Mon Sep 17 00:00:00 2001 From: Jason Orendorff Date: Sat, 26 Oct 2013 23:05:25 -0700 Subject: [PATCH 1/6] Fix a minor bug in the lexer (it would not match the last character of the input) and add Lexer tests. --- include/Lexer.h | 3 ++- main.cpp | 1 + src/Lexer.cpp | 49 +++++++++++++++++++++++++++++++++++++++++++++++-- 3 files changed, 50 insertions(+), 3 deletions(-) diff --git a/include/Lexer.h b/include/Lexer.h index 8e87d84..02223a9 100644 --- a/include/Lexer.h +++ b/include/Lexer.h @@ -16,9 +16,10 @@ class Lexer { void addRegEx(std::string regExString); void setInput(std::string inputString); Symbol next(); + static void test(); private: std::vector regExs; std::string input; int currentPosition; }; -#endif \ No newline at end of file +#endif diff --git a/main.cpp b/main.cpp index 1c1bf8e..fb254d1 100644 --- a/main.cpp +++ b/main.cpp @@ -19,6 +19,7 @@ int main(int argc, char* argv[]) { if (argc == 2 && std::string(argv[1]) == "--test") { StringReader::test(); + Lexer::test(); return 0; } diff --git a/src/Lexer.cpp b/src/Lexer.cpp index 72055ea..1dcb4da 100644 --- a/src/Lexer.cpp +++ b/src/Lexer.cpp @@ -1,4 +1,5 @@ #include "Lexer.h" +#include Lexer::Lexer() { //Do nothing @@ -25,7 +26,7 @@ void Lexer::addRegEx(std::string regExString) { Symbol Lexer::next() { //std::cout << "Current at is \"" << input.substr(currentPosition,input.length()-1) << "\" currentPos is " << currentPosition << " out of " << input.length() <= input.length()-1) + if (currentPosition >= input.length()) return Symbol("$EOF$", true); int longestMatch = -1; RegEx* longestRegEx = NULL; @@ -48,4 +49,48 @@ Symbol Lexer::next() { //std::cout << "Remaining is ||" << input.substr(currentPosition,input.length()-1) << "||" << std::endl; return Symbol(); } -} \ No newline at end of file +} + +void Lexer::test() { + Symbol s; + { + Lexer lex; + lex.addRegEx("b"); + lex.setInput("bb"); + s = lex.next(); + assert(s.getName() == "b" && s.getValue() == "b"); 
+ s = lex.next(); + assert(s.getName() == "b" && s.getValue() == "b"); + assert(lex.next() == Symbol("$EOF$", true)); + } + + { + Lexer lex; + lex.addRegEx("a*"); + lex.addRegEx("b"); + lex.setInput("aaabaabb"); + s = lex.next(); + assert(s.getName() == "a*" && s.getValue() == "aaa"); + s = lex.next(); + assert(s.getName() == "b" && s.getValue() == "b"); + s = lex.next(); + assert(s.getName() == "a*" && s.getValue() == "aa"); + s = lex.next(); + assert(s.getName() == "b" && s.getValue() == "b"); + s = lex.next(); + assert(s.getName() == "b" && s.getValue() == "b"); + assert(lex.next() == Symbol("$EOF$", true)); + } + + // Test a lexer error condition. + { + Lexer lex; + lex.addRegEx("a|b"); + lex.setInput("blah"); + s = lex.next(); + assert(s.getName() == "a|b" && s.getValue() == "b"); + assert(lex.next() == Symbol()); + } + + std::cout << "Lexer tests passed\n"; +} From d2d38e2516d37a5e2f48dadd843d03a75795ecd0 Mon Sep 17 00:00:00 2001 From: Jason Orendorff Date: Sat, 26 Oct 2013 23:29:23 -0700 Subject: [PATCH 2/6] Fix what looks like an off-by-one error in RegEx::longMatch()'s lastMatch calculation, and a corresponding +1 in code using longestMatch, and add a test. 
--- include/RegEx.h | 3 ++- main.cpp | 1 + src/Lexer.cpp | 4 ++-- src/RegEx.cpp | 14 ++++++++++++-- 4 files changed, 17 insertions(+), 5 deletions(-) diff --git a/include/RegEx.h b/include/RegEx.h index 77db781..7a9e7de 100644 --- a/include/RegEx.h +++ b/include/RegEx.h @@ -21,9 +21,10 @@ class RegEx { int longMatch(std::string stringToMatch); std::string getPattern(); std::string toString(); + static void test(); private: std::string pattern; RegExState* begin; std::vector currentStates; }; -#endif \ No newline at end of file +#endif diff --git a/main.cpp b/main.cpp index fb254d1..a6d0fe0 100644 --- a/main.cpp +++ b/main.cpp @@ -19,6 +19,7 @@ int main(int argc, char* argv[]) { if (argc == 2 && std::string(argv[1]) == "--test") { StringReader::test(); + RegEx::test(); Lexer::test(); return 0; } diff --git a/src/Lexer.cpp b/src/Lexer.cpp index 1dcb4da..55b5720 100644 --- a/src/Lexer.cpp +++ b/src/Lexer.cpp @@ -40,8 +40,8 @@ Symbol Lexer::next() { } } if (longestRegEx != NULL) { - std::string eatenString = input.substr(currentPosition, longestMatch+1); - currentPosition += longestMatch + 1; + std::string eatenString = input.substr(currentPosition, longestMatch); + currentPosition += longestMatch; //std::cout << "Current at is \"" << input.substr(currentPosition,input.length()-1) << "\" currentPos is " << currentPosition <getPattern(), true, eatenString); } else { diff --git a/src/RegEx.cpp b/src/RegEx.cpp index 293acbe..7dd309f 100644 --- a/src/RegEx.cpp +++ b/src/RegEx.cpp @@ -1,4 +1,5 @@ #include "RegEx.h" +#include RegEx::RegEx(std::string inPattern) { pattern = inPattern; @@ -242,7 +243,7 @@ int RegEx::longMatch(std::string stringToMatch) { //Also, add each state's advance to nextStates for (std::vector::size_type j = 0; j < currentStates.size(); j++) { if (currentStates[j]->isGoal()) { - lastMatch = i-1; + lastMatch = i; //std::cout << "Hit goal at " << i << " character: " << stringToMatch[i-1] << std::endl; } else { //std::cout << "currentState " << j << ", " 
<< currentStates[j]->toString() << " is not goal" <::size_type j = 0; j < currentStates.size(); j++) { if (currentStates[j]->isGoal()) - lastMatch = stringToMatch.size()-1; + lastMatch = stringToMatch.size(); } return lastMatch; } @@ -286,3 +287,12 @@ std::string RegEx::getPattern() { std::string RegEx::toString() { return pattern + " -> " + begin->toString(); } + +void RegEx::test() { + { + RegEx re("a*"); + assert(re.longMatch("aa") == 2); + } + + std::cout << "RegEx tests pass\n"; +} From 9336193aaf3f2f8df209a4204c4f60c71d2460a0 Mon Sep 17 00:00:00 2001 From: Jason Orendorff Date: Sat, 26 Oct 2013 23:52:54 -0700 Subject: [PATCH 3/6] Allow RegEx matches of length 0. This seems more correct to me. (However, this is not super important in practice. Grammar files should not contain regular expressions that could match the empty string; if such a RegEx matched 0 characters once, it would match again and again forever, since it wouldn't consume any input.) --- src/RegEx.cpp | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/RegEx.cpp b/src/RegEx.cpp index 7dd309f..88f4f91 100644 --- a/src/RegEx.cpp +++ b/src/RegEx.cpp @@ -233,12 +233,13 @@ RegEx::~RegEx() { } int RegEx::longMatch(std::string stringToMatch) { - //If the beginning character is wrong, exit immediantly. Otherwise, get all the states we can get from adding the second character to the state where we accepted the first + // Start in the begin state (only). int lastMatch = -1; - currentStates = *(begin->advance(stringToMatch[0])); + currentStates.clear(); + currentStates.push_back(begin); std::vector nextStates; - for (int i = 1; i < stringToMatch.size(); i++) { + for (int i = 0; i < stringToMatch.size(); i++) { //Go through every current state. Check to see if it is goal, if so update last goal. 
//Also, add each state's advance to nextStates for (std::vector::size_type j = 0; j < currentStates.size(); j++) { @@ -292,6 +293,7 @@ void RegEx::test() { { RegEx re("a*"); assert(re.longMatch("aa") == 2); + assert(re.longMatch("b") == 0); } std::cout << "RegEx tests pass\n"; From 0c50ad4197494b64633710040e3f94d3065f154a Mon Sep 17 00:00:00 2001 From: Jason Orendorff Date: Sat, 26 Oct 2013 23:56:40 -0700 Subject: [PATCH 4/6] Add passing RegEx tests. --- src/RegEx.cpp | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/src/RegEx.cpp b/src/RegEx.cpp index 88f4f91..b54f6ab 100644 --- a/src/RegEx.cpp +++ b/src/RegEx.cpp @@ -292,9 +292,23 @@ std::string RegEx::toString() { void RegEx::test() { { RegEx re("a*"); + assert(re.longMatch("a") == 1); assert(re.longMatch("aa") == 2); + assert(re.longMatch("aaaab") == 4); assert(re.longMatch("b") == 0); } + { + RegEx re("a+"); + assert(re.longMatch("aa") == 2); + assert(re.longMatch("aaaab") == 4); + assert(re.longMatch("b") == -1); + } + + { + RegEx re("a(bc)?"); + assert(re.longMatch("ab") == 1); + } + std::cout << "RegEx tests pass\n"; } From 4d156fa743f76db11c5cfd104b3f575047ab2419 Mon Sep 17 00:00:00 2001 From: Jason Orendorff Date: Sun, 27 Oct 2013 00:00:55 -0700 Subject: [PATCH 5/6] Fix substr() bug when the Lexer consumes all the input at once. 
--- src/Lexer.cpp | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/src/Lexer.cpp b/src/Lexer.cpp index 55b5720..52062a2 100644 --- a/src/Lexer.cpp +++ b/src/Lexer.cpp @@ -24,13 +24,13 @@ void Lexer::addRegEx(std::string regExString) { } Symbol Lexer::next() { - //std::cout << "Current at is \"" << input.substr(currentPosition,input.length()-1) << "\" currentPos is " << currentPosition << " out of " << input.length() <= input.length()) return Symbol("$EOF$", true); int longestMatch = -1; RegEx* longestRegEx = NULL; - std::string remainingString = input.substr(currentPosition,input.length()-1); + std::string remainingString = input.substr(currentPosition); for (std::vector::size_type i = 0; i < regExs.size(); i++) { //std::cout << "Trying regex " << regExs[i]->getPattern() << std::endl; int currentMatch = regExs[i]->longMatch(remainingString); @@ -42,11 +42,11 @@ Symbol Lexer::next() { if (longestRegEx != NULL) { std::string eatenString = input.substr(currentPosition, longestMatch); currentPosition += longestMatch; - //std::cout << "Current at is \"" << input.substr(currentPosition,input.length()-1) << "\" currentPos is " << currentPosition <getPattern(), true, eatenString); } else { //std::cout << "Found no applicable regex" << std::endl; - //std::cout << "Remaining is ||" << input.substr(currentPosition,input.length()-1) << "||" << std::endl; + //std::cout << "Remaining is ||" << input.substr(currentPosition) << "||" << std::endl; return Symbol(); } } @@ -92,5 +92,15 @@ void Lexer::test() { assert(lex.next() == Symbol()); } + // Lexer can consume all the input at once. 
+ { + Lexer lex; + lex.addRegEx("xyzzy"); + lex.setInput("xyzzy"); + s = lex.next(); + assert(s.getName() == "xyzzy" && s.getValue() == "xyzzy"); + assert(lex.next() == Symbol("$EOF$", true)); + } + std::cout << "Lexer tests passed\n"; } From 14a4f822ae40b60a8084d42b303e801929817fa4 Mon Sep 17 00:00:00 2001 From: Jason Orendorff Date: Sun, 27 Oct 2013 00:01:39 -0700 Subject: [PATCH 6/6] Add a passing Lexer test. --- src/Lexer.cpp | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/Lexer.cpp b/src/Lexer.cpp index 52062a2..a8dccea 100644 --- a/src/Lexer.cpp +++ b/src/Lexer.cpp @@ -102,5 +102,15 @@ void Lexer::test() { assert(lex.next() == Symbol("$EOF$", true)); } + // Lexer produces the longest match, not the first. + { + Lexer lex; + lex.addRegEx("int"); + lex.addRegEx("(i|n|t|e)+"); + lex.setInput("intent"); + s = lex.next(); + assert(s.getName() == "(i|n|t|e)+" && s.getValue() == "intent"); + } + std::cout << "Lexer tests passed\n"; }