The Lexer and RegEx classes now work at a very basic level. Regular expressions have not been fully implemented yet, but the structure is in place: for now each RegEx is a trivial one that accepts only the exact character sequence it was given, with none of the actual power of regular expressions.
This commit is contained in:
@@ -13,7 +13,7 @@ class Lexer {
|
|||||||
Lexer();
|
Lexer();
|
||||||
Lexer(std::string inputString);
|
Lexer(std::string inputString);
|
||||||
~Lexer();
|
~Lexer();
|
||||||
void addRegexString(std::string regExString);
|
void addRegEx(std::string regExString);
|
||||||
void setInput(std::string inputString);
|
void setInput(std::string inputString);
|
||||||
Symbol* next();
|
Symbol* next();
|
||||||
private:
|
private:
|
||||||
|
|||||||
@@ -32,7 +32,7 @@ class Parser {
|
|||||||
std::string stateSetToString();
|
std::string stateSetToString();
|
||||||
void addToTable(State* fromState, Symbol* tranSymbol, ParseAction* action);
|
void addToTable(State* fromState, Symbol* tranSymbol, ParseAction* action);
|
||||||
ParseAction* getTable(int state, Symbol* token);
|
ParseAction* getTable(int state, Symbol* token);
|
||||||
NodeTree* parseInput(Lexer* lexer);
|
NodeTree* parseInput(std::string inputString);
|
||||||
|
|
||||||
std::string grammerToString();
|
std::string grammerToString();
|
||||||
std::string grammerToDOT();
|
std::string grammerToDOT();
|
||||||
@@ -41,6 +41,7 @@ class Parser {
|
|||||||
|
|
||||||
private:
|
private:
|
||||||
StringReader reader;
|
StringReader reader;
|
||||||
|
Lexer lexer;
|
||||||
std::map<std::string, Symbol*> symbols;
|
std::map<std::string, Symbol*> symbols;
|
||||||
std::vector<ParseRule*> loadedGrammer;
|
std::vector<ParseRule*> loadedGrammer;
|
||||||
|
|
||||||
|
|||||||
@@ -15,6 +15,7 @@ class RegEx {
|
|||||||
|
|
||||||
int longMatch(std::string stringToMatch);
|
int longMatch(std::string stringToMatch);
|
||||||
std::string getPattern();
|
std::string getPattern();
|
||||||
|
std::string toString();
|
||||||
private:
|
private:
|
||||||
std::string pattern;
|
std::string pattern;
|
||||||
RegExState* begin;
|
RegExState* begin;
|
||||||
|
|||||||
@@ -18,6 +18,7 @@ class RegExState {
|
|||||||
bool characterIs(char inCharacter);
|
bool characterIs(char inCharacter);
|
||||||
std::vector<RegExState*>* advance(char advanceCharacter);
|
std::vector<RegExState*>* advance(char advanceCharacter);
|
||||||
bool isGoal();
|
bool isGoal();
|
||||||
|
std::string toString();
|
||||||
|
|
||||||
private:
|
private:
|
||||||
std::vector<RegExState*> nextStates;
|
std::vector<RegExState*> nextStates;
|
||||||
|
|||||||
2
main.cpp
2
main.cpp
@@ -59,7 +59,7 @@ int main(int argc, char* argv[]) {
|
|||||||
//outFile << parser.grammerToDOT() << std::endl;
|
//outFile << parser.grammerToDOT() << std::endl;
|
||||||
|
|
||||||
std::cout << programInputFileString << std::endl;
|
std::cout << programInputFileString << std::endl;
|
||||||
NodeTree* parseTree = parser.parseInput(new Lexer(programInputFileString));
|
NodeTree* parseTree = parser.parseInput(programInputFileString);
|
||||||
|
|
||||||
if (parseTree) {
|
if (parseTree) {
|
||||||
std::cout << parseTree->DOTGraphString() << std::endl;
|
std::cout << parseTree->DOTGraphString() << std::endl;
|
||||||
|
|||||||
@@ -2,6 +2,7 @@
|
|||||||
|
|
||||||
Lexer::Lexer() {
|
Lexer::Lexer() {
|
||||||
//Do nothing
|
//Do nothing
|
||||||
|
currentPosition = 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
Lexer::Lexer(std::string inputString) {
|
Lexer::Lexer(std::string inputString) {
|
||||||
@@ -17,21 +18,33 @@ void Lexer::setInput(std::string inputString) {
|
|||||||
input = inputString;
|
input = inputString;
|
||||||
}
|
}
|
||||||
|
|
||||||
void Lexer::addRegexString(std::string regExString) {
|
void Lexer::addRegEx(std::string regExString) {
|
||||||
regExs.push_back(new RegEx(regExString));
|
regExs.push_back(new RegEx(regExString));
|
||||||
}
|
}
|
||||||
|
|
||||||
Symbol* Lexer::next() {
|
Symbol* Lexer::next() {
|
||||||
|
std::cout << "Current at is " << input.substr(currentPosition,input.length()-1) << " currentPos is " << currentPosition <<std::endl;
|
||||||
|
//If we're at the end, return an eof
|
||||||
|
if (currentPosition == input.length()-1)
|
||||||
|
return new Symbol("$EOF$", false);
|
||||||
int longestMatch = 0;
|
int longestMatch = 0;
|
||||||
RegEx * longestRegEx = NULL;
|
RegEx* longestRegEx = NULL;
|
||||||
std::string remainingString = input.substr(currentPosition,input.length()-1);
|
std::string remainingString = input.substr(currentPosition,input.length()-1);
|
||||||
for (std::vector<RegEx*>::size_type i = 0; i < regExs.size(); i++) {
|
for (std::vector<RegEx*>::size_type i = 0; i < regExs.size(); i++) {
|
||||||
|
std::cout << "Trying regex " << regExs[i]->toString() << std::endl;
|
||||||
int currentMatch = regExs[i]->longMatch(remainingString);
|
int currentMatch = regExs[i]->longMatch(remainingString);
|
||||||
if (currentMatch > longestMatch) {
|
if (currentMatch > longestMatch) {
|
||||||
longestMatch = currentMatch;
|
longestMatch = currentMatch;
|
||||||
longestRegEx = regExs[i];
|
longestRegEx = regExs[i];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
currentPosition += longestMatch;
|
if (longestRegEx != NULL) {
|
||||||
return new Symbol(longestRegEx->getPattern(), true);
|
currentPosition += longestMatch + 1;
|
||||||
|
std::cout << "Current at is " << input.substr(currentPosition,input.length()-1) << " currentPos is " << currentPosition <<std::endl;
|
||||||
|
return new Symbol(longestRegEx->getPattern(), true);
|
||||||
|
} else {
|
||||||
|
std::cout << "Found no applicable regex" << std::endl;
|
||||||
|
std::cout << "Remaining is " << input.substr(currentPosition,input.length()-1) << std::endl;
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
@@ -33,7 +33,15 @@ void Parser::loadGrammer(std::string grammerInputString) {
|
|||||||
//Add the right side, adding new Symbols to symbol map.
|
//Add the right side, adding new Symbols to symbol map.
|
||||||
currToken = reader.word();
|
currToken = reader.word();
|
||||||
while (currToken != ";") {
|
while (currToken != ";") {
|
||||||
currentRule->appendToRight(getOrAddSymbol(currToken, currToken.at(0)=='\"')); //If first character is a ", then is a terminal
|
if (currToken[0] == '\"') {
|
||||||
|
//Remove the quotes
|
||||||
|
currToken = currToken.substr(1,currToken.length()-2);
|
||||||
|
lexer.addRegEx(currToken);
|
||||||
|
currentRule->appendToRight(getOrAddSymbol(currToken, true)); //If first character is a ", then is a terminal
|
||||||
|
} else {
|
||||||
|
currentRule->appendToRight(getOrAddSymbol(currToken, false));
|
||||||
|
}
|
||||||
|
|
||||||
currToken = reader.word();
|
currToken = reader.word();
|
||||||
//If there are multiple endings to this rule, finish this rule and start a new one with same left handle
|
//If there are multiple endings to this rule, finish this rule and start a new one with same left handle
|
||||||
if (currToken == "|") {
|
if (currToken == "|") {
|
||||||
@@ -344,8 +352,9 @@ ParseAction* Parser::getTable(int state, Symbol* token) {
|
|||||||
return (action);
|
return (action);
|
||||||
}
|
}
|
||||||
|
|
||||||
NodeTree* Parser::parseInput(Lexer* lexer) {
|
NodeTree* Parser::parseInput(std::string inputString) {
|
||||||
Symbol* token = lexer->next();
|
lexer.setInput(inputString);
|
||||||
|
Symbol* token = lexer.next();
|
||||||
ParseAction* action;
|
ParseAction* action;
|
||||||
|
|
||||||
stateStack.push(0);
|
stateStack.push(0);
|
||||||
@@ -383,7 +392,7 @@ NodeTree* Parser::parseInput(Lexer* lexer) {
|
|||||||
std::cout << "Shift " << token->toString() << std::endl;
|
std::cout << "Shift " << token->toString() << std::endl;
|
||||||
|
|
||||||
symbolStack.push(token);
|
symbolStack.push(token);
|
||||||
token = lexer->next();
|
token = lexer.next();
|
||||||
stateStack.push(action->shiftState);
|
stateStack.push(action->shiftState);
|
||||||
break;
|
break;
|
||||||
case ParseAction::ACCEPT:
|
case ParseAction::ACCEPT:
|
||||||
|
|||||||
@@ -3,7 +3,7 @@
|
|||||||
RegEx::RegEx(std::string inPattern) {
|
RegEx::RegEx(std::string inPattern) {
|
||||||
pattern = inPattern;
|
pattern = inPattern;
|
||||||
RegExState* current;
|
RegExState* current;
|
||||||
begin = new RegExState(pattern.at(0));
|
begin = new RegExState(pattern[0]);
|
||||||
current = begin;
|
current = begin;
|
||||||
for (int i = 1; i < pattern.length(); i++) {
|
for (int i = 1; i < pattern.length(); i++) {
|
||||||
RegExState* next = new RegExState(pattern.at(i));
|
RegExState* next = new RegExState(pattern.at(i));
|
||||||
@@ -17,21 +17,30 @@ RegEx::~RegEx() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
int RegEx::longMatch(std::string stringToMatch) {
|
int RegEx::longMatch(std::string stringToMatch) {
|
||||||
|
//If the beginning character is wrong, exit immediantly. Otherwise, get all the states we can get from adding the second character to the state where we accepted the first
|
||||||
|
if (!begin->characterIs(stringToMatch[0]))
|
||||||
|
return -1;
|
||||||
|
std::cout << "Matched first character: " << stringToMatch[0] << std::endl;
|
||||||
int lastMatch = 0;
|
int lastMatch = 0;
|
||||||
currentStates = *(begin->advance(stringToMatch.at(0)));
|
currentStates = *(begin->advance(stringToMatch[1]));
|
||||||
std::vector<RegExState*> nextStates;
|
std::vector<RegExState*> nextStates;
|
||||||
|
|
||||||
for (int i = 1; i < stringToMatch.size(); i++) {
|
for (int i = 2; i < stringToMatch.size(); i++) {
|
||||||
//Go through every current state. Check to see if it is goal, if so update last goal.
|
//Go through every current state. Check to see if it is goal, if so update last goal.
|
||||||
//Also, add each state's advance to nextStates
|
//Also, add each state's advance to nextStates
|
||||||
for (std::vector<RegExState*>::size_type j = 0; j < currentStates.size(); j++) {
|
for (std::vector<RegExState*>::size_type j = 0; j < currentStates.size(); j++) {
|
||||||
if (currentStates[j]->isGoal())
|
if (currentStates[j]->isGoal()) {
|
||||||
lastMatch = i-1;
|
lastMatch = i-1;
|
||||||
|
std::cout << "Hit goal at " << i << " character: " << stringToMatch[i-1] << std::endl;
|
||||||
|
} else {
|
||||||
|
std::cout << "currentState " << j << ", " << currentStates[j]->toString() << " is not goal" <<std::endl;
|
||||||
|
}
|
||||||
std::vector<RegExState*>* addStates = currentStates[j]->advance(stringToMatch.at(i));
|
std::vector<RegExState*>* addStates = currentStates[j]->advance(stringToMatch.at(i));
|
||||||
nextStates.insert(nextStates.end(), addStates->begin(), addStates->end());
|
nextStates.insert(nextStates.end(), addStates->begin(), addStates->end());
|
||||||
delete addStates;
|
delete addStates;
|
||||||
}
|
}
|
||||||
//Now, clear our current states and add eaczh one of our addStates if it is not already in current states
|
//Now, clear our current states and add eaczh one of our addStates if it is not already in current states
|
||||||
|
|
||||||
currentStates.clear();
|
currentStates.clear();
|
||||||
for (std::vector<RegExState*>::size_type j = 0; j < nextStates.size(); j++) {
|
for (std::vector<RegExState*>::size_type j = 0; j < nextStates.size(); j++) {
|
||||||
bool inCurrStates = false;
|
bool inCurrStates = false;
|
||||||
@@ -42,6 +51,14 @@ int RegEx::longMatch(std::string stringToMatch) {
|
|||||||
if (!inCurrStates)
|
if (!inCurrStates)
|
||||||
currentStates.push_back(nextStates[j]);
|
currentStates.push_back(nextStates[j]);
|
||||||
}
|
}
|
||||||
|
if (currentStates.size() != 0)
|
||||||
|
std::cout << "Matched " << i << " character: " << stringToMatch[i-1] << std::endl;
|
||||||
|
/*
|
||||||
|
std::cout << "Current states are: ";
|
||||||
|
for (std::vector<RegExState*>::size_type j = 0; j < currentStates.size(); j++)
|
||||||
|
std::cout << currentStates[j]->toString() << " ";
|
||||||
|
std::cout << std::endl;
|
||||||
|
*/
|
||||||
nextStates.clear();
|
nextStates.clear();
|
||||||
//If we can't continue matching, just return our last matched
|
//If we can't continue matching, just return our last matched
|
||||||
if (currentStates.size() == 0)
|
if (currentStates.size() == 0)
|
||||||
@@ -58,3 +75,7 @@ int RegEx::longMatch(std::string stringToMatch) {
|
|||||||
std::string RegEx::getPattern() {
|
std::string RegEx::getPattern() {
|
||||||
return pattern;
|
return pattern;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
std::string RegEx::toString() {
|
||||||
|
return pattern + " -> " + begin->toString();
|
||||||
|
}
|
||||||
|
|||||||
@@ -6,6 +6,7 @@ RegExState::RegExState(RegExState* inInnerState) {
|
|||||||
|
|
||||||
RegExState::RegExState(char inCharacter) {
|
RegExState::RegExState(char inCharacter) {
|
||||||
character = inCharacter;
|
character = inCharacter;
|
||||||
|
inner = NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
RegExState::~RegExState() {
|
RegExState::~RegExState() {
|
||||||
@@ -33,4 +34,13 @@ bool RegExState::isGoal() {
|
|||||||
return inner == NULL && nextStates.size() == 0;
|
return inner == NULL && nextStates.size() == 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
std::string RegExState::toString() {
|
||||||
|
std::string string = "";
|
||||||
|
string += character;
|
||||||
|
for (std::vector<RegExState*>::size_type i = 0; i < nextStates.size(); i++)
|
||||||
|
string += "->" + nextStates[i]->toString() + " EC ";
|
||||||
|
//std::cout << "inner = " << inner << " nextStates size = " << nextStates.size() <<std::endl;
|
||||||
|
return string;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -4,7 +4,7 @@ Symbol::Symbol(std::string name, bool isTerminal) {
|
|||||||
this->name = name;
|
this->name = name;
|
||||||
this->terminal = isTerminal;
|
this->terminal = isTerminal;
|
||||||
this->subTree = NULL;
|
this->subTree = NULL;
|
||||||
value = "HAHAHA VALUE";
|
value = "NoValue";
|
||||||
}
|
}
|
||||||
|
|
||||||
Symbol::Symbol(std::string name, bool isTerminal, std::string value) {
|
Symbol::Symbol(std::string name, bool isTerminal, std::string value) {
|
||||||
|
|||||||
Reference in New Issue
Block a user