From 85da0bf646cf0d6845a4384dac325226ebfcd25c Mon Sep 17 00:00:00 2001 From: Nathan Braswell Date: Tue, 2 Jul 2013 01:47:42 -0400 Subject: [PATCH] Lexer/RegExs now work on a very basic level. Regular expressions have not been fully implemented, but the structure is there. It creates trivial regexs that only accept a specified sequence, no actual regular expression power. --- include/Lexer.h | 2 +- include/Parser.h | 3 ++- include/RegEx.h | 1 + include/RegExState.h | 1 + main.cpp | 2 +- src/Lexer.cpp | 21 +++++++++++++++++---- src/Parser.cpp | 17 +++++++++++++---- src/RegEx.cpp | 29 +++++++++++++++++++++++++---- src/RegExState.cpp | 10 ++++++++++ src/Symbol.cpp | 2 +- 10 files changed, 72 insertions(+), 16 deletions(-) diff --git a/include/Lexer.h b/include/Lexer.h index 34bc455..d6f6946 100644 --- a/include/Lexer.h +++ b/include/Lexer.h @@ -13,7 +13,7 @@ class Lexer { Lexer(); Lexer(std::string inputString); ~Lexer(); - void addRegexString(std::string regExString); + void addRegEx(std::string regExString); void setInput(std::string inputString); Symbol* next(); private: diff --git a/include/Parser.h b/include/Parser.h index 901ec64..bb3a9f4 100644 --- a/include/Parser.h +++ b/include/Parser.h @@ -32,7 +32,7 @@ class Parser { std::string stateSetToString(); void addToTable(State* fromState, Symbol* tranSymbol, ParseAction* action); ParseAction* getTable(int state, Symbol* token); - NodeTree* parseInput(Lexer* lexer); + NodeTree* parseInput(std::string inputString); std::string grammerToString(); std::string grammerToDOT(); @@ -41,6 +41,7 @@ class Parser { private: StringReader reader; + Lexer lexer; std::map symbols; std::vector loadedGrammer; diff --git a/include/RegEx.h b/include/RegEx.h index 4ba73a6..55f8edb 100644 --- a/include/RegEx.h +++ b/include/RegEx.h @@ -15,6 +15,7 @@ class RegEx { int longMatch(std::string stringToMatch); std::string getPattern(); + std::string toString(); private: std::string pattern; RegExState* begin; diff --git a/include/RegExState.h b/include/RegExState.h index 5b37b57..a843071 100644 --- a/include/RegExState.h +++ b/include/RegExState.h @@ -18,6 +18,7 @@ class RegExState { bool characterIs(char inCharacter); std::vector* advance(char advanceCharacter); bool isGoal(); + std::string toString(); private: std::vector nextStates; diff --git a/main.cpp b/main.cpp index c6392c0..e0ddb96 100644 --- a/main.cpp +++ b/main.cpp @@ -59,7 +59,7 @@ int main(int argc, char* argv[]) { //outFile << parser.grammerToDOT() << std::endl; std::cout << programInputFileString << std::endl; - NodeTree* parseTree = parser.parseInput(new Lexer(programInputFileString)); + NodeTree* parseTree = parser.parseInput(programInputFileString); if (parseTree) { std::cout << parseTree->DOTGraphString() << std::endl; diff --git a/src/Lexer.cpp b/src/Lexer.cpp index 2af87ad..c08c4b4 100644 --- a/src/Lexer.cpp +++ b/src/Lexer.cpp @@ -2,6 +2,7 @@ Lexer::Lexer() { //Do nothing + currentPosition = 0; } Lexer::Lexer(std::string inputString) { @@ -17,21 +18,33 @@ void Lexer::setInput(std::string inputString) { input = inputString; } -void Lexer::addRegexString(std::string regExString) { +void Lexer::addRegEx(std::string regExString) { regExs.push_back(new RegEx(regExString)); } Symbol* Lexer::next() { + std::cout << "Current at is " << input.substr(currentPosition,input.length()-1) << " currentPos is " << currentPosition <::size_type i = 0; i < regExs.size(); i++) { + std::cout << "Trying regex " << regExs[i]->toString() << std::endl; int currentMatch = regExs[i]->longMatch(remainingString); if (currentMatch > longestMatch) { longestMatch = currentMatch; longestRegEx = regExs[i]; } } - currentPosition += longestMatch; - return new Symbol(longestRegEx->getPattern(), true); + if (longestRegEx != NULL) { + currentPosition += longestMatch + 1; + std::cout << "Current at is " << input.substr(currentPosition,input.length()-1) << " currentPos is " << currentPosition <getPattern(), true); + } else { + std::cout << "Found no applicable regex" << std::endl; + std::cout << "Remaining is " << input.substr(currentPosition,input.length()-1) << std::endl; + return NULL; + } } \ No newline at end of file diff --git a/src/Parser.cpp b/src/Parser.cpp index 9a26168..6126b22 100644 --- a/src/Parser.cpp +++ b/src/Parser.cpp @@ -33,7 +33,15 @@ void Parser::loadGrammer(std::string grammerInputString) { //Add the right side, adding new Symbols to symbol map. currToken = reader.word(); while (currToken != ";") { - currentRule->appendToRight(getOrAddSymbol(currToken, currToken.at(0)=='\"')); //If first character is a ", then is a terminal + if (currToken[0] == '\"') { + //Remove the quotes + currToken = currToken.substr(1,currToken.length()-2); + lexer.addRegEx(currToken); + currentRule->appendToRight(getOrAddSymbol(currToken, true)); //If first character is a ", then is a terminal + } else { + currentRule->appendToRight(getOrAddSymbol(currToken, false)); + } + currToken = reader.word(); //If there are multiple endings to this rule, finish this rule and start a new one with same left handle if (currToken == "|") { @@ -344,8 +352,9 @@ ParseAction* Parser::getTable(int state, Symbol* token) { return (action); } -NodeTree* Parser::parseInput(Lexer* lexer) { - Symbol* token = lexer->next(); +NodeTree* Parser::parseInput(std::string inputString) { + lexer.setInput(inputString); + Symbol* token = lexer.next(); ParseAction* action; stateStack.push(0); @@ -383,7 +392,7 @@ NodeTree* Parser::parseInput(Lexer* lexer) { std::cout << "Shift " << token->toString() << std::endl; symbolStack.push(token); - token = lexer->next(); + token = lexer.next(); stateStack.push(action->shiftState); break; case ParseAction::ACCEPT: diff --git a/src/RegEx.cpp b/src/RegEx.cpp index 89bc3a4..22f2bbc 100644 --- a/src/RegEx.cpp +++ b/src/RegEx.cpp @@ -3,7 +3,7 @@ RegEx::RegEx(std::string inPattern) { pattern = inPattern; RegExState* current; - begin = new RegExState(pattern.at(0)); + begin = new RegExState(pattern[0]); current = begin; for (int i = 1; i < pattern.length(); i++) { RegExState* next = new RegExState(pattern.at(i)); @@ -17,21 +17,30 @@ RegEx::~RegEx() { } int RegEx::longMatch(std::string stringToMatch) { + //If the beginning character is wrong, exit immediantly. Otherwise, get all the states we can get from adding the second character to the state where we accepted the first + if (!begin->characterIs(stringToMatch[0])) + return -1; + std::cout << "Matched first character: " << stringToMatch[0] << std::endl; int lastMatch = 0; - currentStates = *(begin->advance(stringToMatch.at(0))); + currentStates = *(begin->advance(stringToMatch[1])); std::vector nextStates; - for (int i = 1; i < stringToMatch.size(); i++) { + for (int i = 2; i < stringToMatch.size(); i++) { //Go through every current state. Check to see if it is goal, if so update last goal. //Also, add each state's advance to nextStates for (std::vector::size_type j = 0; j < currentStates.size(); j++) { - if (currentStates[j]->isGoal()) + if (currentStates[j]->isGoal()) { lastMatch = i-1; + std::cout << "Hit goal at " << i << " character: " << stringToMatch[i-1] << std::endl; + } else { + std::cout << "currentState " << j << ", " << currentStates[j]->toString() << " is not goal" <* addStates = currentStates[j]->advance(stringToMatch.at(i)); nextStates.insert(nextStates.end(), addStates->begin(), addStates->end()); delete addStates; } //Now, clear our current states and add eaczh one of our addStates if it is not already in current states + currentStates.clear(); for (std::vector::size_type j = 0; j < nextStates.size(); j++) { bool inCurrStates = false; @@ -42,6 +51,14 @@ int RegEx::longMatch(std::string stringToMatch) { if (!inCurrStates) currentStates.push_back(nextStates[j]); } + if (currentStates.size() != 0) + std::cout << "Matched " << i << " character: " << stringToMatch[i-1] << std::endl; +/* + std::cout << "Current states are: "; + for (std::vector::size_type j = 0; j < currentStates.size(); j++) + std::cout << currentStates[j]->toString() << " "; + std::cout << std::endl; +*/ nextStates.clear(); //If we can't continue matching, just return our last matched if (currentStates.size() == 0) @@ -58,3 +75,7 @@ int RegEx::longMatch(std::string stringToMatch) { std::string RegEx::getPattern() { return pattern; } + +std::string RegEx::toString() { + return pattern + " -> " + begin->toString(); +} diff --git a/src/RegExState.cpp b/src/RegExState.cpp index ba43093..ade0522 100644 --- a/src/RegExState.cpp +++ b/src/RegExState.cpp @@ -6,6 +6,7 @@ RegExState::RegExState(RegExState* inInnerState) { RegExState::RegExState(char inCharacter) { character = inCharacter; + inner = NULL; } RegExState::~RegExState() { @@ -33,4 +34,13 @@ bool RegExState::isGoal() { return inner == NULL && nextStates.size() == 0; } +std::string RegExState::toString() { + std::string string = ""; + string += character; + for (std::vector::size_type i = 0; i < nextStates.size(); i++) + string += "->" + nextStates[i]->toString() + " EC "; + //std::cout << "inner = " << inner << " nextStates size = " << nextStates.size() <name = name; this->terminal = isTerminal; this->subTree = NULL; - value = "HAHAHA VALUE"; + value = "NoValue"; } Symbol::Symbol(std::string name, bool isTerminal, std::string value) {