The Lexer and RegEx classes now work at a very basic level. Regular expressions have not been fully implemented, but the structure is in place. For now, each RegEx is a trivial one that accepts only its exact specified character sequence; it has no actual regular-expression power yet.

This commit is contained in:
Nathan Braswell
2013-07-02 01:47:42 -04:00
parent 94a7739bd9
commit 85da0bf646
10 changed files with 72 additions and 16 deletions

View File

@@ -13,7 +13,7 @@ class Lexer {
Lexer(); Lexer();
Lexer(std::string inputString); Lexer(std::string inputString);
~Lexer(); ~Lexer();
void addRegexString(std::string regExString); void addRegEx(std::string regExString);
void setInput(std::string inputString); void setInput(std::string inputString);
Symbol* next(); Symbol* next();
private: private:

View File

@@ -32,7 +32,7 @@ class Parser {
std::string stateSetToString(); std::string stateSetToString();
void addToTable(State* fromState, Symbol* tranSymbol, ParseAction* action); void addToTable(State* fromState, Symbol* tranSymbol, ParseAction* action);
ParseAction* getTable(int state, Symbol* token); ParseAction* getTable(int state, Symbol* token);
NodeTree* parseInput(Lexer* lexer); NodeTree* parseInput(std::string inputString);
std::string grammerToString(); std::string grammerToString();
std::string grammerToDOT(); std::string grammerToDOT();
@@ -41,6 +41,7 @@ class Parser {
private: private:
StringReader reader; StringReader reader;
Lexer lexer;
std::map<std::string, Symbol*> symbols; std::map<std::string, Symbol*> symbols;
std::vector<ParseRule*> loadedGrammer; std::vector<ParseRule*> loadedGrammer;

View File

@@ -15,6 +15,7 @@ class RegEx {
int longMatch(std::string stringToMatch); int longMatch(std::string stringToMatch);
std::string getPattern(); std::string getPattern();
std::string toString();
private: private:
std::string pattern; std::string pattern;
RegExState* begin; RegExState* begin;

View File

@@ -18,6 +18,7 @@ class RegExState {
bool characterIs(char inCharacter); bool characterIs(char inCharacter);
std::vector<RegExState*>* advance(char advanceCharacter); std::vector<RegExState*>* advance(char advanceCharacter);
bool isGoal(); bool isGoal();
std::string toString();
private: private:
std::vector<RegExState*> nextStates; std::vector<RegExState*> nextStates;

View File

@@ -59,7 +59,7 @@ int main(int argc, char* argv[]) {
//outFile << parser.grammerToDOT() << std::endl; //outFile << parser.grammerToDOT() << std::endl;
std::cout << programInputFileString << std::endl; std::cout << programInputFileString << std::endl;
NodeTree* parseTree = parser.parseInput(new Lexer(programInputFileString)); NodeTree* parseTree = parser.parseInput(programInputFileString);
if (parseTree) { if (parseTree) {
std::cout << parseTree->DOTGraphString() << std::endl; std::cout << parseTree->DOTGraphString() << std::endl;

View File

@@ -2,6 +2,7 @@
Lexer::Lexer() { Lexer::Lexer() {
//Do nothing //Do nothing
currentPosition = 0;
} }
Lexer::Lexer(std::string inputString) { Lexer::Lexer(std::string inputString) {
@@ -17,21 +18,33 @@ void Lexer::setInput(std::string inputString) {
input = inputString; input = inputString;
} }
void Lexer::addRegexString(std::string regExString) { void Lexer::addRegEx(std::string regExString) {
regExs.push_back(new RegEx(regExString)); regExs.push_back(new RegEx(regExString));
} }
Symbol* Lexer::next() { Symbol* Lexer::next() {
std::cout << "Current at is " << input.substr(currentPosition,input.length()-1) << " currentPos is " << currentPosition <<std::endl;
//If we're at the end, return an eof
if (currentPosition == input.length()-1)
return new Symbol("$EOF$", false);
int longestMatch = 0; int longestMatch = 0;
RegEx * longestRegEx = NULL; RegEx* longestRegEx = NULL;
std::string remainingString = input.substr(currentPosition,input.length()-1); std::string remainingString = input.substr(currentPosition,input.length()-1);
for (std::vector<RegEx*>::size_type i = 0; i < regExs.size(); i++) { for (std::vector<RegEx*>::size_type i = 0; i < regExs.size(); i++) {
std::cout << "Trying regex " << regExs[i]->toString() << std::endl;
int currentMatch = regExs[i]->longMatch(remainingString); int currentMatch = regExs[i]->longMatch(remainingString);
if (currentMatch > longestMatch) { if (currentMatch > longestMatch) {
longestMatch = currentMatch; longestMatch = currentMatch;
longestRegEx = regExs[i]; longestRegEx = regExs[i];
} }
} }
currentPosition += longestMatch; if (longestRegEx != NULL) {
return new Symbol(longestRegEx->getPattern(), true); currentPosition += longestMatch + 1;
std::cout << "Current at is " << input.substr(currentPosition,input.length()-1) << " currentPos is " << currentPosition <<std::endl;
return new Symbol(longestRegEx->getPattern(), true);
} else {
std::cout << "Found no applicable regex" << std::endl;
std::cout << "Remaining is " << input.substr(currentPosition,input.length()-1) << std::endl;
return NULL;
}
} }

View File

@@ -33,7 +33,15 @@ void Parser::loadGrammer(std::string grammerInputString) {
//Add the right side, adding new Symbols to symbol map. //Add the right side, adding new Symbols to symbol map.
currToken = reader.word(); currToken = reader.word();
while (currToken != ";") { while (currToken != ";") {
currentRule->appendToRight(getOrAddSymbol(currToken, currToken.at(0)=='\"')); //If first character is a ", then is a terminal if (currToken[0] == '\"') {
//Remove the quotes
currToken = currToken.substr(1,currToken.length()-2);
lexer.addRegEx(currToken);
currentRule->appendToRight(getOrAddSymbol(currToken, true)); //If first character is a ", then is a terminal
} else {
currentRule->appendToRight(getOrAddSymbol(currToken, false));
}
currToken = reader.word(); currToken = reader.word();
//If there are multiple endings to this rule, finish this rule and start a new one with same left handle //If there are multiple endings to this rule, finish this rule and start a new one with same left handle
if (currToken == "|") { if (currToken == "|") {
@@ -344,8 +352,9 @@ ParseAction* Parser::getTable(int state, Symbol* token) {
return (action); return (action);
} }
NodeTree* Parser::parseInput(Lexer* lexer) { NodeTree* Parser::parseInput(std::string inputString) {
Symbol* token = lexer->next(); lexer.setInput(inputString);
Symbol* token = lexer.next();
ParseAction* action; ParseAction* action;
stateStack.push(0); stateStack.push(0);
@@ -383,7 +392,7 @@ NodeTree* Parser::parseInput(Lexer* lexer) {
std::cout << "Shift " << token->toString() << std::endl; std::cout << "Shift " << token->toString() << std::endl;
symbolStack.push(token); symbolStack.push(token);
token = lexer->next(); token = lexer.next();
stateStack.push(action->shiftState); stateStack.push(action->shiftState);
break; break;
case ParseAction::ACCEPT: case ParseAction::ACCEPT:

View File

@@ -3,7 +3,7 @@
RegEx::RegEx(std::string inPattern) { RegEx::RegEx(std::string inPattern) {
pattern = inPattern; pattern = inPattern;
RegExState* current; RegExState* current;
begin = new RegExState(pattern.at(0)); begin = new RegExState(pattern[0]);
current = begin; current = begin;
for (int i = 1; i < pattern.length(); i++) { for (int i = 1; i < pattern.length(); i++) {
RegExState* next = new RegExState(pattern.at(i)); RegExState* next = new RegExState(pattern.at(i));
@@ -17,21 +17,30 @@ RegEx::~RegEx() {
} }
int RegEx::longMatch(std::string stringToMatch) { int RegEx::longMatch(std::string stringToMatch) {
//If the beginning character is wrong, exit immediantly. Otherwise, get all the states we can get from adding the second character to the state where we accepted the first
if (!begin->characterIs(stringToMatch[0]))
return -1;
std::cout << "Matched first character: " << stringToMatch[0] << std::endl;
int lastMatch = 0; int lastMatch = 0;
currentStates = *(begin->advance(stringToMatch.at(0))); currentStates = *(begin->advance(stringToMatch[1]));
std::vector<RegExState*> nextStates; std::vector<RegExState*> nextStates;
for (int i = 1; i < stringToMatch.size(); i++) { for (int i = 2; i < stringToMatch.size(); i++) {
//Go through every current state. Check to see if it is goal, if so update last goal. //Go through every current state. Check to see if it is goal, if so update last goal.
//Also, add each state's advance to nextStates //Also, add each state's advance to nextStates
for (std::vector<RegExState*>::size_type j = 0; j < currentStates.size(); j++) { for (std::vector<RegExState*>::size_type j = 0; j < currentStates.size(); j++) {
if (currentStates[j]->isGoal()) if (currentStates[j]->isGoal()) {
lastMatch = i-1; lastMatch = i-1;
std::cout << "Hit goal at " << i << " character: " << stringToMatch[i-1] << std::endl;
} else {
std::cout << "currentState " << j << ", " << currentStates[j]->toString() << " is not goal" <<std::endl;
}
std::vector<RegExState*>* addStates = currentStates[j]->advance(stringToMatch.at(i)); std::vector<RegExState*>* addStates = currentStates[j]->advance(stringToMatch.at(i));
nextStates.insert(nextStates.end(), addStates->begin(), addStates->end()); nextStates.insert(nextStates.end(), addStates->begin(), addStates->end());
delete addStates; delete addStates;
} }
//Now, clear our current states and add eaczh one of our addStates if it is not already in current states //Now, clear our current states and add eaczh one of our addStates if it is not already in current states
currentStates.clear(); currentStates.clear();
for (std::vector<RegExState*>::size_type j = 0; j < nextStates.size(); j++) { for (std::vector<RegExState*>::size_type j = 0; j < nextStates.size(); j++) {
bool inCurrStates = false; bool inCurrStates = false;
@@ -42,6 +51,14 @@ int RegEx::longMatch(std::string stringToMatch) {
if (!inCurrStates) if (!inCurrStates)
currentStates.push_back(nextStates[j]); currentStates.push_back(nextStates[j]);
} }
if (currentStates.size() != 0)
std::cout << "Matched " << i << " character: " << stringToMatch[i-1] << std::endl;
/*
std::cout << "Current states are: ";
for (std::vector<RegExState*>::size_type j = 0; j < currentStates.size(); j++)
std::cout << currentStates[j]->toString() << " ";
std::cout << std::endl;
*/
nextStates.clear(); nextStates.clear();
//If we can't continue matching, just return our last matched //If we can't continue matching, just return our last matched
if (currentStates.size() == 0) if (currentStates.size() == 0)
@@ -58,3 +75,7 @@ int RegEx::longMatch(std::string stringToMatch) {
std::string RegEx::getPattern() { std::string RegEx::getPattern() {
return pattern; return pattern;
} }
// Debug rendering of this regex: the source pattern, the separator " -> ",
// and then whatever the first state (begin) reports from its own toString().
std::string RegEx::toString() {
    std::string description(pattern);
    description += " -> ";
    description += begin->toString();
    return description;
}

View File

@@ -6,6 +6,7 @@ RegExState::RegExState(RegExState* inInnerState) {
RegExState::RegExState(char inCharacter) { RegExState::RegExState(char inCharacter) {
character = inCharacter; character = inCharacter;
inner = NULL;
} }
RegExState::~RegExState() { RegExState::~RegExState() {
@@ -33,4 +34,13 @@ bool RegExState::isGoal() {
return inner == NULL && nextStates.size() == 0; return inner == NULL && nextStates.size() == 0;
} }
std::string RegExState::toString() {
std::string string = "";
string += character;
for (std::vector<RegExState*>::size_type i = 0; i < nextStates.size(); i++)
string += "->" + nextStates[i]->toString() + " EC ";
//std::cout << "inner = " << inner << " nextStates size = " << nextStates.size() <<std::endl;
return string;
}

View File

@@ -4,7 +4,7 @@ Symbol::Symbol(std::string name, bool isTerminal) {
this->name = name; this->name = name;
this->terminal = isTerminal; this->terminal = isTerminal;
this->subTree = NULL; this->subTree = NULL;
value = "HAHAHA VALUE"; value = "NoValue";
} }
Symbol::Symbol(std::string name, bool isTerminal, std::string value) { Symbol::Symbol(std::string name, bool isTerminal, std::string value) {