The Lexer and RegEx classes now work at a very basic level. Regular expressions have not been fully implemented, but the structure is in place: for now it creates trivial regexes that accept only a specified character sequence, with no actual regular-expression power.
This commit is contained in:
@@ -13,7 +13,7 @@ class Lexer {
|
||||
Lexer();
|
||||
Lexer(std::string inputString);
|
||||
~Lexer();
|
||||
void addRegexString(std::string regExString);
|
||||
void addRegEx(std::string regExString);
|
||||
void setInput(std::string inputString);
|
||||
Symbol* next();
|
||||
private:
|
||||
|
||||
@@ -32,7 +32,7 @@ class Parser {
|
||||
std::string stateSetToString();
|
||||
void addToTable(State* fromState, Symbol* tranSymbol, ParseAction* action);
|
||||
ParseAction* getTable(int state, Symbol* token);
|
||||
NodeTree* parseInput(Lexer* lexer);
|
||||
NodeTree* parseInput(std::string inputString);
|
||||
|
||||
std::string grammerToString();
|
||||
std::string grammerToDOT();
|
||||
@@ -41,6 +41,7 @@ class Parser {
|
||||
|
||||
private:
|
||||
StringReader reader;
|
||||
Lexer lexer;
|
||||
std::map<std::string, Symbol*> symbols;
|
||||
std::vector<ParseRule*> loadedGrammer;
|
||||
|
||||
|
||||
@@ -15,6 +15,7 @@ class RegEx {
|
||||
|
||||
int longMatch(std::string stringToMatch);
|
||||
std::string getPattern();
|
||||
std::string toString();
|
||||
private:
|
||||
std::string pattern;
|
||||
RegExState* begin;
|
||||
|
||||
@@ -18,6 +18,7 @@ class RegExState {
|
||||
bool characterIs(char inCharacter);
|
||||
std::vector<RegExState*>* advance(char advanceCharacter);
|
||||
bool isGoal();
|
||||
std::string toString();
|
||||
|
||||
private:
|
||||
std::vector<RegExState*> nextStates;
|
||||
|
||||
2
main.cpp
2
main.cpp
@@ -59,7 +59,7 @@ int main(int argc, char* argv[]) {
|
||||
//outFile << parser.grammerToDOT() << std::endl;
|
||||
|
||||
std::cout << programInputFileString << std::endl;
|
||||
NodeTree* parseTree = parser.parseInput(new Lexer(programInputFileString));
|
||||
NodeTree* parseTree = parser.parseInput(programInputFileString);
|
||||
|
||||
if (parseTree) {
|
||||
std::cout << parseTree->DOTGraphString() << std::endl;
|
||||
|
||||
@@ -2,6 +2,7 @@
|
||||
|
||||
//Default constructor: does not set any input text; it only resets the scan position.
Lexer::Lexer() {
|
||||
//Start scanning from the first character
|
||||
currentPosition = 0;
|
||||
}
|
||||
|
||||
Lexer::Lexer(std::string inputString) {
|
||||
@@ -17,21 +18,33 @@ void Lexer::setInput(std::string inputString) {
|
||||
input = inputString;
|
||||
}
|
||||
|
||||
//Registers a pattern with the lexer: builds a new RegEx from regExString and
//appends it to regExs, the list that next() walks through in order.
void Lexer::addRegexString(std::string regExString) {
|
||||
void Lexer::addRegEx(std::string regExString) {
|
||||
regExs.push_back(new RegEx(regExString));
|
||||
}
|
||||
|
||||
//Returns the next token of the input as a newly allocated Symbol, or NULL when
//no registered regex matches.
//Every RegEx in regExs is run against the text starting at currentPosition and
//the one reporting the largest longMatch() value is chosen. The comparison is a
//strict greater-than, so the earliest-added regex wins ties and a result of 0 or
//less is never selected. The returned Symbol is named after the winning pattern
//and flagged as a terminal; at end of input a Symbol named $EOF$ with the
//terminal flag false is returned instead.
Symbol* Lexer::next() {
|
||||
//Debug trace of the unconsumed input
//NOTE(review): substr takes (position, count) rather than (begin, end); with a count of length()-1 a call from position 0 leaves off the final character - confirm that is intended
std::cout << "Current at is " << input.substr(currentPosition,input.length()-1) << " currentPos is " << currentPosition <<std::endl;
|
||||
//If we're at the end, return an eof
//NOTE(review): the test is equality against length()-1, so the character at the final index is never lexed (presumably a trailing newline - confirm)
//NOTE(review): it looks like an empty input, or a position that steps past length()-1, would never satisfy this test - confirm
if (currentPosition == input.length()-1)
|
||||
return new Symbol("$EOF$", false);
|
||||
//Best candidate so far: length reported by longMatch() and the regex that produced it
int longestMatch = 0;
|
||||
RegEx* longestRegEx = NULL;
|
||||
//Text handed to each regex (same substr call as the trace above)
std::string remainingString = input.substr(currentPosition,input.length()-1);
|
||||
for (std::vector<RegEx*>::size_type i = 0; i < regExs.size(); i++) {
|
||||
std::cout << "Trying regex " << regExs[i]->toString() << std::endl;
|
||||
int currentMatch = regExs[i]->longMatch(remainingString);
|
||||
//Strictly greater: the first regex reaching a given length keeps it
if (currentMatch > longestMatch) {
|
||||
longestMatch = currentMatch;
|
||||
longestRegEx = regExs[i];
|
||||
}
|
||||
}
|
||||
//NOTE(review): this listing is a rendered diff without +/- markers, so this increment and the one inside the branch below may be the removed and added versions of the same statement; if both are live the position advances twice - confirm
currentPosition += longestMatch;
|
||||
if (longestRegEx != NULL) {
|
||||
//Consume the matched text; the extra 1 presumably converts the index-style result of longMatch() into a length - confirm
currentPosition += longestMatch + 1;
|
||||
std::cout << "Current at is " << input.substr(currentPosition,input.length()-1) << " currentPos is " << currentPosition <<std::endl;
|
||||
//The token is named after the pattern text of the winning regex and marked terminal
return new Symbol(longestRegEx->getPattern(), true);
|
||||
} else {
|
||||
//Nothing matched: print what is left of the input and return NULL
std::cout << "Found no applicable regex" << std::endl;
|
||||
std::cout << "Remaining is " << input.substr(currentPosition,input.length()-1) << std::endl;
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
@@ -33,7 +33,15 @@ void Parser::loadGrammer(std::string grammerInputString) {
|
||||
//Add the right side, adding new Symbols to symbol map.
|
||||
currToken = reader.word();
|
||||
while (currToken != ";") {
|
||||
currentRule->appendToRight(getOrAddSymbol(currToken, currToken.at(0)=='\"')); //If first character is a ", then is a terminal
|
||||
if (currToken[0] == '\"') {
|
||||
//Remove the quotes
|
||||
currToken = currToken.substr(1,currToken.length()-2);
|
||||
lexer.addRegEx(currToken);
|
||||
currentRule->appendToRight(getOrAddSymbol(currToken, true)); //If first character is a ", then is a terminal
|
||||
} else {
|
||||
currentRule->appendToRight(getOrAddSymbol(currToken, false));
|
||||
}
|
||||
|
||||
currToken = reader.word();
|
||||
//If there are multiple endings to this rule, finish this rule and start a new one with same left handle
|
||||
if (currToken == "|") {
|
||||
@@ -344,8 +352,9 @@ ParseAction* Parser::getTable(int state, Symbol* token) {
|
||||
return (action);
|
||||
}
|
||||
|
||||
NodeTree* Parser::parseInput(Lexer* lexer) {
|
||||
Symbol* token = lexer->next();
|
||||
NodeTree* Parser::parseInput(std::string inputString) {
|
||||
lexer.setInput(inputString);
|
||||
Symbol* token = lexer.next();
|
||||
ParseAction* action;
|
||||
|
||||
stateStack.push(0);
|
||||
@@ -383,7 +392,7 @@ NodeTree* Parser::parseInput(Lexer* lexer) {
|
||||
std::cout << "Shift " << token->toString() << std::endl;
|
||||
|
||||
symbolStack.push(token);
|
||||
token = lexer->next();
|
||||
token = lexer.next();
|
||||
stateStack.push(action->shiftState);
|
||||
break;
|
||||
case ParseAction::ACCEPT:
|
||||
|
||||
@@ -3,7 +3,7 @@
|
||||
RegEx::RegEx(std::string inPattern) {
|
||||
pattern = inPattern;
|
||||
RegExState* current;
|
||||
begin = new RegExState(pattern.at(0));
|
||||
begin = new RegExState(pattern[0]);
|
||||
current = begin;
|
||||
for (int i = 1; i < pattern.length(); i++) {
|
||||
RegExState* next = new RegExState(pattern.at(i));
|
||||
@@ -17,21 +17,30 @@ RegEx::~RegEx() {
|
||||
}
|
||||
|
||||
int RegEx::longMatch(std::string stringToMatch) {
|
||||
//If the beginning character is wrong, exit immediately. Otherwise, get all the states we can get from adding the second character to the state where we accepted the first
|
||||
if (!begin->characterIs(stringToMatch[0]))
|
||||
return -1;
|
||||
std::cout << "Matched first character: " << stringToMatch[0] << std::endl;
|
||||
int lastMatch = 0;
|
||||
currentStates = *(begin->advance(stringToMatch.at(0)));
|
||||
currentStates = *(begin->advance(stringToMatch[1]));
|
||||
std::vector<RegExState*> nextStates;
|
||||
|
||||
for (int i = 1; i < stringToMatch.size(); i++) {
|
||||
for (int i = 2; i < stringToMatch.size(); i++) {
|
||||
//Go through every current state. Check to see if it is goal, if so update last goal.
|
||||
//Also, add each state's advance to nextStates
|
||||
for (std::vector<RegExState*>::size_type j = 0; j < currentStates.size(); j++) {
|
||||
if (currentStates[j]->isGoal())
|
||||
if (currentStates[j]->isGoal()) {
|
||||
lastMatch = i-1;
|
||||
std::cout << "Hit goal at " << i << " character: " << stringToMatch[i-1] << std::endl;
|
||||
} else {
|
||||
std::cout << "currentState " << j << ", " << currentStates[j]->toString() << " is not goal" <<std::endl;
|
||||
}
|
||||
std::vector<RegExState*>* addStates = currentStates[j]->advance(stringToMatch.at(i));
|
||||
nextStates.insert(nextStates.end(), addStates->begin(), addStates->end());
|
||||
delete addStates;
|
||||
}
|
||||
//Now, clear our current states and add each one of our addStates if it is not already in current states
|
||||
|
||||
currentStates.clear();
|
||||
for (std::vector<RegExState*>::size_type j = 0; j < nextStates.size(); j++) {
|
||||
bool inCurrStates = false;
|
||||
@@ -42,6 +51,14 @@ int RegEx::longMatch(std::string stringToMatch) {
|
||||
if (!inCurrStates)
|
||||
currentStates.push_back(nextStates[j]);
|
||||
}
|
||||
if (currentStates.size() != 0)
|
||||
std::cout << "Matched " << i << " character: " << stringToMatch[i-1] << std::endl;
|
||||
/*
|
||||
std::cout << "Current states are: ";
|
||||
for (std::vector<RegExState*>::size_type j = 0; j < currentStates.size(); j++)
|
||||
std::cout << currentStates[j]->toString() << " ";
|
||||
std::cout << std::endl;
|
||||
*/
|
||||
nextStates.clear();
|
||||
//If we can't continue matching, just return our last matched
|
||||
if (currentStates.size() == 0)
|
||||
@@ -58,3 +75,7 @@ int RegEx::longMatch(std::string stringToMatch) {
|
||||
//Returns a copy of the pattern string stored in this regex.
std::string RegEx::getPattern() {
|
||||
return pattern;
|
||||
}
|
||||
|
||||
//Debug representation of the regex: the pattern text, an arrow, then the
//toString() of the begin state.
std::string RegEx::toString() {
|
||||
return pattern + " -> " + begin->toString();
|
||||
}
|
||||
|
||||
@@ -6,6 +6,7 @@ RegExState::RegExState(RegExState* inInnerState) {
|
||||
|
||||
//Builds a state for the single character inCharacter.
RegExState::RegExState(char inCharacter) {
|
||||
character = inCharacter;
|
||||
//No inner state for a plain character state; isGoal() tests inner against NULL
inner = NULL;
|
||||
}
|
||||
|
||||
RegExState::~RegExState() {
|
||||
@@ -33,4 +34,13 @@ bool RegExState::isGoal() {
|
||||
return inner == NULL && nextStates.size() == 0;
|
||||
}
|
||||
|
||||
//Debug representation of this state and everything reachable from it: the
//state's own character followed, for each entry of nextStates, by an arrow, that
//state's toString() and the marker text EC.
//NOTE(review): recurses into every next state, so it would presumably not terminate on a cyclic state graph - confirm once looping constructs are implemented
std::string RegExState::toString() {
|
||||
std::string string = "";
|
||||
string += character;
|
||||
//Append the rendering of each successor state
for (std::vector<RegExState*>::size_type i = 0; i < nextStates.size(); i++)
|
||||
string += "->" + nextStates[i]->toString() + " EC ";
|
||||
//std::cout << "inner = " << inner << " nextStates size = " << nextStates.size() <<std::endl;
|
||||
return string;
|
||||
}
|
||||
|
||||
|
||||
|
||||
@@ -4,7 +4,7 @@ Symbol::Symbol(std::string name, bool isTerminal) {
|
||||
this->name = name;
|
||||
this->terminal = isTerminal;
|
||||
this->subTree = NULL;
|
||||
value = "HAHAHA VALUE";
|
||||
value = "NoValue";
|
||||
}
|
||||
|
||||
Symbol::Symbol(std::string name, bool isTerminal, std::string value) {
|
||||
|
||||
Reference in New Issue
Block a user