Merge pull request #2 from jorendorff/master
Minor bug fixes and tests for RegEx and Lexer, lots of thanks to jorendorff
This commit is contained in:
@@ -16,6 +16,7 @@ class Lexer {
|
||||
void addRegEx(std::string regExString);
|
||||
void setInput(std::string inputString);
|
||||
Symbol next();
|
||||
static void test();
|
||||
private:
|
||||
std::vector<RegEx*> regExs;
|
||||
std::string input;
|
||||
|
||||
@@ -21,6 +21,7 @@ class RegEx {
|
||||
int longMatch(std::string stringToMatch);
|
||||
std::string getPattern();
|
||||
std::string toString();
|
||||
static void test();
|
||||
private:
|
||||
std::string pattern;
|
||||
RegExState* begin;
|
||||
|
||||
2
main.cpp
2
main.cpp
@@ -19,6 +19,8 @@
|
||||
int main(int argc, char* argv[]) {
|
||||
if (argc == 2 && std::string(argv[1]) == "--test") {
|
||||
StringReader::test();
|
||||
RegEx::test();
|
||||
Lexer::test();
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
#include "Lexer.h"
|
||||
#include <cassert>
|
||||
|
||||
Lexer::Lexer() {
|
||||
//Do nothing
|
||||
@@ -23,13 +24,13 @@ void Lexer::addRegEx(std::string regExString) {
|
||||
}
|
||||
|
||||
Symbol Lexer::next() {
|
||||
//std::cout << "Current at is \"" << input.substr(currentPosition,input.length()-1) << "\" currentPos is " << currentPosition << " out of " << input.length() <<std::endl;
|
||||
//std::cout << "Current at is \"" << input.substr(currentPosition) << "\" currentPos is " << currentPosition << " out of " << input.length() <<std::endl;
|
||||
//If we're at the end, return an eof
|
||||
if (currentPosition >= input.length()-1)
|
||||
if (currentPosition >= input.length())
|
||||
return Symbol("$EOF$", true);
|
||||
int longestMatch = -1;
|
||||
RegEx* longestRegEx = NULL;
|
||||
std::string remainingString = input.substr(currentPosition,input.length()-1);
|
||||
std::string remainingString = input.substr(currentPosition);
|
||||
for (std::vector<RegEx*>::size_type i = 0; i < regExs.size(); i++) {
|
||||
//std::cout << "Trying regex " << regExs[i]->getPattern() << std::endl;
|
||||
int currentMatch = regExs[i]->longMatch(remainingString);
|
||||
@@ -39,13 +40,77 @@ Symbol Lexer::next() {
|
||||
}
|
||||
}
|
||||
if (longestRegEx != NULL) {
|
||||
std::string eatenString = input.substr(currentPosition, longestMatch+1);
|
||||
currentPosition += longestMatch + 1;
|
||||
//std::cout << "Current at is \"" << input.substr(currentPosition,input.length()-1) << "\" currentPos is " << currentPosition <<std::endl;
|
||||
std::string eatenString = input.substr(currentPosition, longestMatch);
|
||||
currentPosition += longestMatch;
|
||||
//std::cout << "Current at is \"" << input.substr(currentPosition) << "\" currentPos is " << currentPosition <<std::endl;
|
||||
return Symbol(longestRegEx->getPattern(), true, eatenString);
|
||||
} else {
|
||||
//std::cout << "Found no applicable regex" << std::endl;
|
||||
//std::cout << "Remaining is ||" << input.substr(currentPosition,input.length()-1) << "||" << std::endl;
|
||||
//std::cout << "Remaining is ||" << input.substr(currentPosition) << "||" << std::endl;
|
||||
return Symbol();
|
||||
}
|
||||
}
|
||||
|
||||
// Self-test for Lexer, run from main() when the program is started with "--test".
// Each braced scope builds a fresh Lexer, registers one or more patterns, feeds it
// an input string, and asserts on the sequence of Symbols returned by next().
// Prints "Lexer tests passed" once every assertion has held.
// NOTE: several checks call lex.next() inside assert(); if the build defines NDEBUG
// those calls are compiled out and this function verifies nothing.
void Lexer::test() {
|
||||
Symbol s;
|
||||
// Single one-character pattern: "bb" must yield two "b" tokens, then the EOF symbol.
{
|
||||
Lexer lex;
|
||||
lex.addRegEx("b");
|
||||
lex.setInput("bb");
|
||||
s = lex.next();
|
||||
assert(s.getName() == "b" && s.getValue() == "b");
|
||||
s = lex.next();
|
||||
assert(s.getName() == "b" && s.getValue() == "b");
|
||||
assert(lex.next() == Symbol("$EOF$", true));
|
||||
}
|
||||
|
||||
// Two patterns: runs of 'a' come back as one "a*" token each, every 'b' as its own token.
{
|
||||
Lexer lex;
|
||||
lex.addRegEx("a*");
|
||||
lex.addRegEx("b");
|
||||
lex.setInput("aaabaabb");
|
||||
s = lex.next();
|
||||
assert(s.getName() == "a*" && s.getValue() == "aaa");
|
||||
s = lex.next();
|
||||
assert(s.getName() == "b" && s.getValue() == "b");
|
||||
s = lex.next();
|
||||
assert(s.getName() == "a*" && s.getValue() == "aa");
|
||||
s = lex.next();
|
||||
assert(s.getName() == "b" && s.getValue() == "b");
|
||||
s = lex.next();
|
||||
assert(s.getName() == "b" && s.getValue() == "b");
|
||||
assert(lex.next() == Symbol("$EOF$", true));
|
||||
}
|
||||
|
||||
// Test a lexer error condition.
|
||||
// After the leading 'b' is consumed, nothing matches "lah", so next() is expected
// to return a default-constructed Symbol.
{
|
||||
Lexer lex;
|
||||
lex.addRegEx("a|b");
|
||||
lex.setInput("blah");
|
||||
s = lex.next();
|
||||
assert(s.getName() == "a|b" && s.getValue() == "b");
|
||||
assert(lex.next() == Symbol());
|
||||
}
|
||||
|
||||
// Lexer can consume all the input at once.
|
||||
{
|
||||
Lexer lex;
|
||||
lex.addRegEx("xyzzy");
|
||||
lex.setInput("xyzzy");
|
||||
s = lex.next();
|
||||
assert(s.getName() == "xyzzy" && s.getValue() == "xyzzy");
|
||||
assert(lex.next() == Symbol("$EOF$", true));
|
||||
}
|
||||
|
||||
// Lexer produces the longest match, not the first.
|
||||
// "int" is registered first but only covers three characters; the second pattern
// covers the whole of "intent" and must win.
{
|
||||
Lexer lex;
|
||||
lex.addRegEx("int");
|
||||
lex.addRegEx("(i|n|t|e)+");
|
||||
lex.setInput("intent");
|
||||
s = lex.next();
|
||||
assert(s.getName() == "(i|n|t|e)+" && s.getValue() == "intent");
|
||||
}
|
||||
|
||||
std::cout << "Lexer tests passed\n";
|
||||
}
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
#include "RegEx.h"
|
||||
#include <cassert>
|
||||
|
||||
RegEx::RegEx(std::string inPattern) {
|
||||
pattern = inPattern;
|
||||
@@ -232,17 +233,18 @@ RegEx::~RegEx() {
|
||||
}
|
||||
|
||||
int RegEx::longMatch(std::string stringToMatch) {
|
||||
//If the beginning character is wrong, exit immediately. Otherwise, get all the states we can get from adding the second character to the state where we accepted the first
|
||||
// Start in the begin state (only).
|
||||
int lastMatch = -1;
|
||||
currentStates = *(begin->advance(stringToMatch[0]));
|
||||
currentStates.clear();
|
||||
currentStates.push_back(begin);
|
||||
std::vector<RegExState*> nextStates;
|
||||
|
||||
for (int i = 1; i < stringToMatch.size(); i++) {
|
||||
for (int i = 0; i < stringToMatch.size(); i++) {
|
||||
//Go through every current state. Check to see if it is goal, if so update last goal.
|
||||
//Also, add each state's advance to nextStates
|
||||
for (std::vector<RegExState*>::size_type j = 0; j < currentStates.size(); j++) {
|
||||
if (currentStates[j]->isGoal()) {
|
||||
lastMatch = i-1;
|
||||
lastMatch = i;
|
||||
//std::cout << "Hit goal at " << i << " character: " << stringToMatch[i-1] << std::endl;
|
||||
} else {
|
||||
//std::cout << "currentState " << j << ", " << currentStates[j]->toString() << " is not goal" <<std::endl;
|
||||
@@ -274,7 +276,7 @@ int RegEx::longMatch(std::string stringToMatch) {
|
||||
//Check to see if we match on the last character in the string
|
||||
for (std::vector<RegExState*>::size_type j = 0; j < currentStates.size(); j++) {
|
||||
if (currentStates[j]->isGoal())
|
||||
lastMatch = stringToMatch.size()-1;
|
||||
lastMatch = stringToMatch.size();
|
||||
}
|
||||
return lastMatch;
|
||||
}
|
||||
@@ -286,3 +288,27 @@ std::string RegEx::getPattern() {
|
||||
// Returns a printable description of this regex: the pattern string, then " -> ",
// then whatever the begin state's own toString() produces.
std::string RegEx::toString() {
|
||||
return pattern + " -> " + begin->toString();
|
||||
}
|
||||
|
||||
// Self-test for RegEx::longMatch, run from main() when the program is started with "--test".
// The assertions pin the return convention: the number of leading characters of the
// input covered by the longest match, 0 when only the empty string matches, and -1
// when nothing matches. Prints "RegEx tests pass" once every assertion has held.
void RegEx::test() {
|
||||
// "a*" may match zero characters, so an input starting with 'b' yields 0 rather than -1.
{
|
||||
RegEx re("a*");
|
||||
assert(re.longMatch("a") == 1);
|
||||
assert(re.longMatch("aa") == 2);
|
||||
assert(re.longMatch("aaaab") == 4);
|
||||
assert(re.longMatch("b") == 0);
|
||||
}
|
||||
|
||||
// "a+" needs at least one 'a', so an input starting with 'b' is a failure (-1).
{
|
||||
RegEx re("a+");
|
||||
assert(re.longMatch("aa") == 2);
|
||||
assert(re.longMatch("aaaab") == 4);
|
||||
assert(re.longMatch("b") == -1);
|
||||
}
|
||||
|
||||
// The optional group "(bc)" is only half present in "ab", so just the leading 'a' counts.
{
|
||||
RegEx re("a(bc)?");
|
||||
assert(re.longMatch("ab") == 1);
|
||||
}
|
||||
|
||||
std::cout << "RegEx tests pass\n";
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user