diff --git a/CMakeLists.txt b/CMakeLists.txt
index aaf817a..a644098 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -4,7 +4,7 @@ project(Kraken)
 
 set( MY_INCLUDES ${PROJECT_SOURCE_DIR}/include)
 
-set( MY_SOURCES main.cpp src/Parser.cpp src/ParseAction.cpp src/ParseRule.cpp src/Symbol.cpp src/StringReader.cpp src/NodeTree.cpp src/State.cpp src/util.cpp src/Lexer.cpp )
+set( MY_SOURCES main.cpp src/Parser.cpp src/ParseAction.cpp src/ParseRule.cpp src/Symbol.cpp src/StringReader.cpp src/NodeTree.cpp src/State.cpp src/util.cpp src/Lexer.cpp src/RegEx.cpp src/RegExState.cpp )
 
 include_directories( ${MY_INCLUDES} )
 
diff --git a/include/Lexer.h b/include/Lexer.h
index d8b1b7b..34bc455 100644
--- a/include/Lexer.h
+++ b/include/Lexer.h
@@ -3,6 +3,7 @@
 
 #include "util.h"
 #include "StringReader.h"
+#include "RegEx.h"
 #include "Symbol.h"
 
 #include <string>
@@ -12,9 +13,12 @@ class Lexer {
 		Lexer();
 		Lexer(std::string inputString);
 		~Lexer();
+		void addRegexString(std::string regExString);
 		void setInput(std::string inputString);
 		Symbol* next();
 	private:
-		StringReader reader;
+		std::vector<RegEx*> regExs;
+		std::string input;
+		int currentPosition;
 };
 #endif
\ No newline at end of file
diff --git a/include/RegEx.h b/include/RegEx.h
new file mode 100644
index 0000000..4ba73a6
--- /dev/null
+++ b/include/RegEx.h
@@ -0,0 +1,23 @@
+#ifndef REGEX_H
+#define REGEX_H
+
+#include "util.h"
+#include "RegExState.h"
+#include "Symbol.h"
+
+#include <string>
+
+class RegEx {
+	public:
+		RegEx();
+		RegEx(std::string inPattern);
+		~RegEx();
+
+		int longMatch(std::string stringToMatch);
+		std::string getPattern();
+	private:
+		std::string pattern;
+		RegExState* begin;
+		std::vector<RegExState*> currentStates;
+};
+#endif
\ No newline at end of file
diff --git a/include/RegExState.h b/include/RegExState.h
new file mode 100644
index 0000000..5b37b57
--- /dev/null
+++ b/include/RegExState.h
@@ -0,0 +1,27 @@
+#ifndef REGEXSTATE_H
+#define REGEXSTATE_H
+
+#include "util.h"
+#include "Symbol.h"
+
+#include <string>
+#include <vector>
+
+class RegExState {
+	public:
+		RegExState(RegExState* inInnerState);
+		RegExState(char inCharacter);
+
+		~RegExState();
+
+		void addNext(RegExState* nextState);
+		bool characterIs(char inCharacter);
+		std::vector<RegExState*>* advance(char advanceCharacter);
+		bool isGoal();
+
+	private:
+		std::vector<RegExState*> nextStates;
+		RegExState* inner;
+		char character;
+};
+#endif
\ No newline at end of file
diff --git a/include/Symbol.h b/include/Symbol.h
index 5831f6e..54e6a88 100644
--- a/include/Symbol.h
+++ b/include/Symbol.h
@@ -16,6 +17,7 @@ class NodeTree;
 class Symbol {
 	public:
 		Symbol(std::string name, bool isTerminal);
+		Symbol(std::string name, bool isTerminal, std::string value);
 		Symbol(std::string name, bool isTerminal, NodeTree* tree);
 		~Symbol();
 		bool const operator==(const Symbol &other);
@@ -30,8 +31,6 @@ class Symbol {
 		std::string value;
 		bool terminal;
 		NodeTree* subTree;
-
-
 };
 
 #endif
\ No newline at end of file
diff --git a/src/Lexer.cpp b/src/Lexer.cpp
index 964104a..2af87ad 100644
--- a/src/Lexer.cpp
+++ b/src/Lexer.cpp
@@ -5,7 +5,8 @@ Lexer::Lexer() {
 }
 
 Lexer::Lexer(std::string inputString) {
-	reader.setString(inputString);
+	input = inputString;
+	currentPosition = 0;
 }
 
 Lexer::~Lexer() {
@@ -13,12 +14,37 @@ Lexer::~Lexer() {
 }
 
 void Lexer::setInput(std::string inputString) {
-	reader.setString(inputString);
+	input = inputString;
+	//Restart lexing at the beginning of the new input
+	currentPosition = 0;
+}
+
+//Build a RegEx from regExString and add it to the patterns that next() tries
+void Lexer::addRegexString(std::string regExString) {
+	regExs.push_back(new RegEx(regExString));
 }
 
+//Return a Symbol named after the pattern with the longest match at currentPosition and
+//advance past the matched text. Returns the "$EOF$" Symbol when the input is used up or
+//no pattern matches. The caller owns the returned Symbol.
 Symbol* Lexer::next() {
-	std::string token = reader.word();
-	if (token != "")
-		return new Symbol("\""+token+"\"", true);
-	return new Symbol("$EOF$", false);
+	//Nothing left to match once the whole input has been consumed
+	if (currentPosition >= static_cast<int>(input.length()))
+		return new Symbol("$EOF$", false);
+	int longestMatch = 0;
+	RegEx * longestRegEx = NULL;
+	//substr's second argument is a count, so omit it to take the rest of the input
+	std::string remainingString = input.substr(currentPosition);
+	for (std::vector<RegEx*>::size_type i = 0; i < regExs.size(); i++) {
+		int currentMatch = regExs[i]->longMatch(remainingString);
+		if (currentMatch > longestMatch) {
+			longestMatch = currentMatch;
+			longestRegEx = regExs[i];
+		}
+	}
+	//No pattern matched: longestRegEx is still NULL and must not be dereferenced
+	if (longestRegEx == NULL)
+		return new Symbol("$EOF$", false);
+	currentPosition += longestMatch;
+	return new Symbol(longestRegEx->getPattern(), true);
 }
\ No newline at end of file
diff --git a/src/RegEx.cpp b/src/RegEx.cpp
new file mode 100644
index 0000000..89bc3a4
--- /dev/null
+++ b/src/RegEx.cpp
@@ -0,0 +1,69 @@
+#include "RegEx.h"
+
+//Build a chain of states, one per character of inPattern, each linked to the next.
+//pattern.at(0) throws std::out_of_range when inPattern is empty.
+RegEx::RegEx(std::string inPattern) {
+	pattern = inPattern;
+	RegExState* current;
+	begin = new RegExState(pattern.at(0));
+	current = begin;
+	for (std::string::size_type i = 1; i < pattern.length(); i++) {
+		RegExState* next = new RegExState(pattern.at(i));
+		current->addNext(next);
+		current = next;
+	}
+}
+
+RegEx::~RegEx() {
+	//NOTE(review): the states allocated in the constructor are never freed here
+}
+
+//Return the number of leading characters of stringToMatch that this pattern matches,
+//or 0 when it does not match at all.
+int RegEx::longMatch(std::string stringToMatch) {
+	int lastMatch = 0;
+	currentStates.clear();
+	//begin holds the first pattern character, so it has to match the first input character itself
+	if (stringToMatch.empty() || !begin->characterIs(stringToMatch.at(0)))
+		return lastMatch;
+	currentStates.push_back(begin);
+	std::vector<RegExState*> nextStates;
+
+	for (std::string::size_type i = 1; i < stringToMatch.size(); i++) {
+		//currentStates holds the states reached after consuming the first i characters.
+		//Go through every current state. Check to see if it is goal, if so update last goal.
+		//Also, add each state's advance to nextStates
+		for (std::vector<RegExState*>::size_type j = 0; j < currentStates.size(); j++) {
+			if (currentStates[j]->isGoal())
+				lastMatch = static_cast<int>(i);
+			std::vector<RegExState*>* addStates = currentStates[j]->advance(stringToMatch.at(i));
+			nextStates.insert(nextStates.end(), addStates->begin(), addStates->end());
+			delete addStates;
+		}
+		//Now, clear our current states and add each one of our addStates if it is not already in current states
+		currentStates.clear();
+		for (std::vector<RegExState*>::size_type j = 0; j < nextStates.size(); j++) {
+			bool inCurrStates = false;
+			for (std::vector<RegExState*>::size_type k = 0; k < currentStates.size(); k++) {
+				if (nextStates[j] == currentStates[k])
+					inCurrStates = true;
+			}
+			if (!inCurrStates)
+				currentStates.push_back(nextStates[j]);
+		}
+		nextStates.clear();
+		//If we can't continue matching, just return our last matched
+		if (currentStates.size() == 0)
+			break;
+	}
+	//Check to see if we match on the last character in the string
+	for (std::vector<RegExState*>::size_type j = 0; j < currentStates.size(); j++) {
+		if (currentStates[j]->isGoal())
+			lastMatch = static_cast<int>(stringToMatch.size());
+	}
+	return lastMatch;
+}
+
+std::string RegEx::getPattern() {
+	return pattern;
+}
diff --git a/src/RegExState.cpp b/src/RegExState.cpp
new file mode 100644
index 0000000..ba43093
--- /dev/null
+++ b/src/RegExState.cpp
@@ -0,0 +1,45 @@
+#include "RegExState.h"
+
+//Build a state that stores inInnerState as its inner state
+RegExState::RegExState(RegExState* inInnerState) {
+	inner = inInnerState;
+	//Give character a defined value so characterIs never reads an uninitialized member
+	character = '\0';
+}
+
+//Build a state that matches exactly inCharacter
+RegExState::RegExState(char inCharacter) {
+	character = inCharacter;
+	//isGoal compares inner against NULL, so it must be initialized here
+	inner = NULL;
+}
+
+RegExState::~RegExState() {
+	//No cleanup necessary
+}
+
+void RegExState::addNext(RegExState* nextState) {
+	nextStates.push_back(nextState);
+}
+
+bool RegExState::characterIs(char inCharacter) {
+	return character == inCharacter;
+}
+
+//Return a newly allocated vector of the next states that accept advanceCharacter.
+//The caller must delete the returned vector.
+std::vector<RegExState*>* RegExState::advance(char advanceCharacter) {
+	std::vector<RegExState*>* advanceStates = new std::vector<RegExState*>();
+	for (std::vector<RegExState*>::size_type i = 0; i < nextStates.size(); i++) {
+		if (nextStates[i]->characterIs(advanceCharacter))
+			advanceStates->push_back(nextStates[i]);
+	}
+	return advanceStates;
+}
+
+//A state is a goal when it has no inner state and no next states
+bool RegExState::isGoal() {
+	return inner == NULL && nextStates.size() == 0;
+}
+
+
diff --git a/src/Symbol.cpp b/src/Symbol.cpp
index 4b0e3d5..913c98f 100644
--- a/src/Symbol.cpp
+++ b/src/Symbol.cpp
@@ -7,6 +7,13 @@ Symbol::Symbol(std::string name, bool isTerminal) {
 	value = "HAHAHA VALUE";
 }
 
+Symbol::Symbol(std::string name, bool isTerminal, std::string value) {
+	this->name = name;
+	this->terminal = isTerminal;
+	this->subTree = NULL;
+	this->value = value;
+}
+
 Symbol::Symbol(std::string name, bool isTerminal, NodeTree* tree) {
 	this->name = name;
 	this->terminal = isTerminal;