The Lexer and RegEx classes now work at a very basic level. Regular expressions have not been fully implemented, but the structure is in place: for now the lexer creates trivial regexes that accept only a specified literal sequence, with no actual regular-expression power.

2013-07-02 01:47:42 -04:00
parent 94a7739bd9
commit 85da0bf646
10 changed files with 72 additions and 16 deletions
--- a/include/Lexer.h
+++ b/include/Lexer.h
@@ -13,7 +13,7 @@ class Lexer {
 		Lexer();
 		Lexer(std::string inputString);
 		~Lexer();
-		void addRegexString(std::string regExString);
+		void addRegEx(std::string regExString);
 		void setInput(std::string inputString);
 		Symbol* next();
 	private:
--- a/include/Parser.h
+++ b/include/Parser.h
@@ -32,7 +32,7 @@ class Parser {
 		std::string stateSetToString();
 		void addToTable(State* fromState, Symbol* tranSymbol, ParseAction* action);
 		ParseAction* getTable(int state, Symbol* token);
-		NodeTree* parseInput(Lexer* lexer);
+		NodeTree* parseInput(std::string inputString);
 		std::string grammerToString();
 		std::string grammerToDOT();
@@ -41,6 +41,7 @@ class Parser {
 	private:
 		StringReader reader;
 		Lexer lexer;
 		std::map<std::string, Symbol*> symbols;
 		std::vector<ParseRule*> loadedGrammer;
--- a/include/RegEx.h
+++ b/include/RegEx.h
@@ -15,6 +15,7 @@ class RegEx {
 		int longMatch(std::string stringToMatch);
 		std::string getPattern();
 		std::string toString();
 	private:
 		std::string pattern;
 		RegExState* begin;
--- a/include/RegExState.h
+++ b/include/RegExState.h
@@ -18,6 +18,7 @@ class RegExState {
 		bool characterIs(char inCharacter);
 		std::vector<RegExState*>* advance(char advanceCharacter);
 		bool isGoal();
 		std::string toString();
 	private:
 		std::vector<RegExState*> nextStates;
--- a/main.cpp
+++ b/main.cpp
@@ -59,7 +59,7 @@ int main(int argc, char* argv[]) {
 	//outFile << parser.grammerToDOT() << std::endl;
 	std::cout << programInputFileString << std::endl;
-	NodeTree* parseTree = parser.parseInput(new Lexer(programInputFileString));
+	NodeTree* parseTree = parser.parseInput(programInputFileString);
 	if (parseTree) {
 		std::cout << parseTree->DOTGraphString() << std::endl;
--- a/src/Lexer.cpp
+++ b/src/Lexer.cpp
@@ -2,6 +2,7 @@
 Lexer::Lexer() {
 	//Do nothing
 	currentPosition = 0;
 }
 Lexer::Lexer(std::string inputString) {
@@ -17,21 +18,33 @@ void Lexer::setInput(std::string inputString) {
 	input = inputString;
 }
-void Lexer::addRegexString(std::string regExString) {
+void Lexer::addRegEx(std::string regExString) {
 	regExs.push_back(new RegEx(regExString));
 }
 Symbol* Lexer::next() {
 	std::cout << "Current at is " << input.substr(currentPosition,input.length()-1) << " currentPos is " << currentPosition <<std::endl;
 	//If we're at the end, return an eof
 	if (currentPosition == input.length()-1)
 		return new Symbol("$EOF$", false);
 	int longestMatch = 0;
-	RegEx * longestRegEx = NULL;
+	RegEx* longestRegEx = NULL;
 	std::string remainingString = input.substr(currentPosition,input.length()-1);
 	for (std::vector<RegEx*>::size_type i = 0; i < regExs.size(); i++) {
 		std::cout << "Trying regex " << regExs[i]->toString() << std::endl;
 		int currentMatch = regExs[i]->longMatch(remainingString);
 		if (currentMatch > longestMatch) {
 			longestMatch = currentMatch;
 			longestRegEx = regExs[i];
 		}
 	}
-	currentPosition += longestMatch;
+	if (longestRegEx != NULL) {
-	return new Symbol(longestRegEx->getPattern(), true);
+		currentPosition += longestMatch + 1;
 		std::cout << "Current at is " << input.substr(currentPosition,input.length()-1) << " currentPos is " << currentPosition <<std::endl;
 		return new Symbol(longestRegEx->getPattern(), true);
 	} else {
 		std::cout << "Found no applicable regex" << std::endl;
 		std::cout << "Remaining is " << input.substr(currentPosition,input.length()-1) << std::endl;
 		return NULL;
 	}
 }
--- a/src/Parser.cpp
+++ b/src/Parser.cpp
@@ -33,7 +33,15 @@ void Parser::loadGrammer(std::string grammerInputString) {
 		//Add the right side, adding new Symbols to symbol map.
 		currToken = reader.word();
 		while (currToken != ";") {
-			currentRule->appendToRight(getOrAddSymbol(currToken, currToken.at(0)=='\"')); //If first character is a ", then is a terminal
+			if (currToken[0] == '\"') {
 				//Remove the quotes
 				currToken = currToken.substr(1,currToken.length()-2);
 				lexer.addRegEx(currToken);
 				currentRule->appendToRight(getOrAddSymbol(currToken, true)); //If first character is a ", then is a terminal
 			} else {
 				currentRule->appendToRight(getOrAddSymbol(currToken, false));
 			}
 			currToken = reader.word();
 			//If there are multiple endings to this rule, finish this rule and start a new one with same left handle
 			if (currToken == "|") {
@@ -344,8 +352,9 @@ ParseAction* Parser::getTable(int state, Symbol* token) {
 	return (action);
 }
-NodeTree* Parser::parseInput(Lexer* lexer) {
+NodeTree* Parser::parseInput(std::string inputString) {
-	Symbol* token = lexer->next();
+	lexer.setInput(inputString);
 	Symbol* token = lexer.next();
 	ParseAction* action;
 	stateStack.push(0);
@@ -383,7 +392,7 @@ NodeTree* Parser::parseInput(Lexer* lexer) {
 				std::cout << "Shift " << token->toString() << std::endl;
 				symbolStack.push(token);
-				token = lexer->next();
+				token = lexer.next();
 				stateStack.push(action->shiftState);
 				break;
 			case ParseAction::ACCEPT:
--- a/src/RegEx.cpp
+++ b/src/RegEx.cpp
@@ -3,7 +3,7 @@
 RegEx::RegEx(std::string inPattern) {
 	pattern = inPattern;
 	RegExState* current;
-	begin = new RegExState(pattern.at(0));
+	begin = new RegExState(pattern[0]);
 	current = begin;
 	for (int i = 1; i < pattern.length(); i++) {
 		RegExState* next = new RegExState(pattern.at(i));
@@ -17,21 +17,30 @@ RegEx::~RegEx() {
 }
 int RegEx::longMatch(std::string stringToMatch) {
 	//If the beginning character is wrong, exit immediantly. Otherwise, get all the states we can get from adding the second character to the state where we accepted the first
 	if (!begin->characterIs(stringToMatch[0]))
 		return -1;
 	std::cout << "Matched first character: " << stringToMatch[0] << std::endl;
 	int lastMatch = 0;
-	currentStates = *(begin->advance(stringToMatch.at(0)));
+	currentStates = *(begin->advance(stringToMatch[1]));
 	std::vector<RegExState*> nextStates;
-	for (int i = 1; i < stringToMatch.size(); i++) {
+	for (int i = 2; i < stringToMatch.size(); i++) {
 		//Go through every current state. Check to see if it is goal, if so update last goal.
 		//Also, add each state's advance to nextStates
 		for (std::vector<RegExState*>::size_type j = 0; j < currentStates.size(); j++) {
-			if (currentStates[j]->isGoal())
+			if (currentStates[j]->isGoal()) {
 				lastMatch = i-1;
 				std::cout << "Hit goal at " << i << " character: " << stringToMatch[i-1] << std::endl;
 			} else {
 				std::cout << "currentState " << j << ", " << currentStates[j]->toString() << " is not goal" <<std::endl;
 			}
 			std::vector<RegExState*>* addStates = currentStates[j]->advance(stringToMatch.at(i));
 			nextStates.insert(nextStates.end(), addStates->begin(), addStates->end());
 			delete addStates;
 		}
 		//Now, clear our current states and add eaczh one of our addStates if it is not already in current states
 		currentStates.clear();
 		for (std::vector<RegExState*>::size_type j = 0; j < nextStates.size(); j++) {
 			bool inCurrStates = false;
@@ -42,6 +51,14 @@ int RegEx::longMatch(std::string stringToMatch) {
 			if (!inCurrStates)
 				currentStates.push_back(nextStates[j]);
 		}
 		if (currentStates.size() != 0)
 			std::cout << "Matched " << i << " character: " << stringToMatch[i-1] << std::endl;
 /*
 		std::cout << "Current states are: ";
 		for (std::vector<RegExState*>::size_type j = 0; j < currentStates.size(); j++)
 			std::cout << currentStates[j]->toString() << " ";
 		std::cout << std::endl;
 */
 		nextStates.clear();
 		//If we can't continue matching, just return our last matched
 		if (currentStates.size() == 0)
@@ -58,3 +75,7 @@ int RegEx::longMatch(std::string stringToMatch) {
 std::string RegEx::getPattern() {
 	return pattern;
 }
 std::string RegEx::toString() {
 	return pattern + " -> " + begin->toString();
 }
--- a/src/RegExState.cpp
+++ b/src/RegExState.cpp
@@ -6,6 +6,7 @@ RegExState::RegExState(RegExState* inInnerState) {
 RegExState::RegExState(char inCharacter) {
 	character = inCharacter;
 	inner = NULL;
 }
 RegExState::~RegExState() {
@@ -33,4 +34,13 @@ bool RegExState::isGoal() {
 	return inner == NULL && nextStates.size() == 0;
 }
 std::string RegExState::toString() {
 	std::string string = "";
 	string += character;
 	for (std::vector<RegExState*>::size_type i = 0; i < nextStates.size(); i++)
 		string += "->" + nextStates[i]->toString() + " EC ";
 	//std::cout << "inner = " << inner << " nextStates size = " << nextStates.size() <<std::endl;
 	return string;
 }
--- a/src/Symbol.cpp
+++ b/src/Symbol.cpp
@@ -4,7 +4,7 @@ Symbol::Symbol(std::string name, bool isTerminal) {
 	this->name = name;
 	this->terminal = isTerminal;
 	this->subTree = NULL;
-	value = "HAHAHA VALUE";
+	value = "NoValue";
 }
 Symbol::Symbol(std::string name, bool isTerminal, std::string value) {