408 lines
15 KiB
C++
408 lines
15 KiB
C++
#include "Parser.h"
|
|
|
|
//Constructs a parser with no grammar loaded.
//The three sentinel symbols are built with their terminal flag set to true, and the
//EOF and null sentinels are handed to the parse table.
Parser::Parser()
    : EOFSymbol("$EOF$", true),
      nullSymbol("$NULL$", true),
      invalidSymbol("$INVALID$", true)
{
    table.setSymbols(EOFSymbol, nullSymbol);
}
|
|
|
|
//Destructor. Performs no explicit cleanup.
//NOTE(review): the ParseRule objects allocated in loadGrammer and the State objects allocated in
//createStateSet/addStates are not released here - confirm who owns them.
Parser::~Parser() {}
|
|
|
|
//Writes the parse table to the given output file stream.
//All of the work is delegated to the table itself.
void Parser::exportTable(std::ofstream &file)
{
    table.exportTable(file);
}
|
|
//Loads the parse table from a raw data buffer.
//All of the work is delegated to the table itself.
void Parser::importTable(char* tableData)
{
    table.importTable(tableData);
}
|
|
|
|
//Returns the symbol registered under the pair (symbolString, isTerminal).
//If no such symbol has been registered yet, one is created, stored in the symbol map and returned.
Symbol Parser::getOrAddSymbol(std::string symbolString, bool isTerminal)
{
    const std::pair<std::string, bool> key(symbolString, isTerminal);

    auto known = symbols.find(key);
    if (known != symbols.end())
        return known->second;

    //First time we see this name/terminal combination: register it
    Symbol created(symbolString, isTerminal);
    symbols[key] = created;
    return created;
}
|
|
|
|
void Parser::loadGrammer(std::string grammerInputString) {
|
|
reader.setString(grammerInputString);
|
|
|
|
std::string currToken = reader.word(false); //Don't truncate so we can find the newline correctly (needed for comments)
|
|
|
|
while(currToken != "") {
|
|
//First, if this starts with a '#', skip this
|
|
if (currToken.front() == '#') {
|
|
//If this line is more than one token long, eat it
|
|
//std::cout << "Ate: " << currToken << std::endl;
|
|
if (currToken.back() != '\n') {
|
|
std::string ate = reader.line();
|
|
//std::cout << "Eating " << ate << " b/c grammer comment" << std::endl;
|
|
}
|
|
currToken = reader.word(false);
|
|
continue;
|
|
}
|
|
if (currToken.back() == '\n' || currToken.back() == ' ' || currToken.back() == '\t')
|
|
currToken.erase(currToken.size()-1);
|
|
|
|
//Load the left of the rule
|
|
ParseRule* currentRule = new ParseRule();
|
|
Symbol leftSide = getOrAddSymbol(currToken, false); //Left handle is never a terminal
|
|
currentRule->setLeftHandle(leftSide);
|
|
reader.word(); //Remove the =
|
|
//Add the right side, adding Symbols to symbol map.
|
|
currToken = reader.word();
|
|
while (currToken != ";") {
|
|
|
|
//If there are multiple endings to this rule, finish this rule and start a new one with same left handle
|
|
while (currToken == "|") {
|
|
//If we haven't added anything, that means that this is a null rule
|
|
if (currentRule->getRightSide().size() == 0)
|
|
currentRule->appendToRight(nullSymbol);
|
|
|
|
loadedGrammer.push_back(currentRule);
|
|
currentRule = new ParseRule();
|
|
currentRule->setLeftHandle(leftSide);
|
|
currToken = reader.word();
|
|
}
|
|
|
|
if (currToken == ";")
|
|
break;
|
|
|
|
if (currToken[0] == '\"') {
|
|
//Remove the quotes
|
|
currToken = currToken.substr(1,currToken.length()-2);
|
|
lexer.addRegEx(currToken);
|
|
currentRule->appendToRight(getOrAddSymbol(currToken, true)); //If first character is a ", then is a terminal
|
|
} else {
|
|
currentRule->appendToRight(getOrAddSymbol(currToken, false));
|
|
}
|
|
currToken = reader.word();
|
|
}
|
|
//Add new rule to grammer
|
|
//If we haven't added anything, that means that this is a null rule
|
|
if (currentRule->getRightSide().size() == 0)
|
|
currentRule->appendToRight(nullSymbol);
|
|
|
|
loadedGrammer.push_back(currentRule);
|
|
//Get next token
|
|
currToken = reader.word(false);
|
|
}
|
|
//std::cout << "Parsed!\n";
|
|
|
|
// for (std::vector<ParseRule*>::size_type i = 0; i < loadedGrammer.size(); i++)
|
|
// std::cout << loadedGrammer[i]->toString() << std::endl;
|
|
}
|
|
|
|
void Parser::createStateSet() {
|
|
std::cout << "Begining creation of stateSet" << std::endl;
|
|
//First state has no parents
|
|
|
|
//Set the first state's basis to be the goal rule with lookahead EOF
|
|
ParseRule* goalRule = loadedGrammer[0]->clone();
|
|
std::vector<Symbol> goalRuleLookahead;
|
|
goalRuleLookahead.push_back(EOFSymbol);
|
|
goalRule->setLookahead(goalRuleLookahead);
|
|
State* zeroState = new State(0, goalRule);
|
|
stateSets.push_back(zeroState);
|
|
std::queue<State*> toDo;
|
|
toDo.push(zeroState);
|
|
//std::cout << "Begining for main set for loop" << std::endl;
|
|
int count = 0;
|
|
while (toDo.size()) {
|
|
if (count % 200 == 0)
|
|
std::cout << "while count: " << count << std::endl;
|
|
count++;
|
|
//closure
|
|
closure(toDo.front());
|
|
//Add the new states
|
|
addStates(&stateSets, toDo.front(), &toDo);
|
|
toDo.pop();
|
|
}
|
|
table.remove(1, EOFSymbol);
|
|
}
|
|
|
|
//Returns the index in stateSets of the first state that compares equal (by value, via operator==)
//to the given state, or -1 if there is none.
int Parser::stateNum(State* state)
{
    std::vector<State*>::size_type index = 0;
    while (index < stateSets.size()) {
        if (*(stateSets[index]) == *state)
            return static_cast<int>(index);
        ++index;
    }
    return -1;
}
|
|
|
|
//Computes the first set of a symbol.
//
//  token        - the symbol whose first set is wanted
//  avoidList    - symbols already being expanded further up the recursion; asking for one of them
//                 again yields an empty result so that recursive grammars terminate
//  addNewTokens - when true the result is stored in tokenFirstSet for later calls
//
//A terminal's first set is just itself. For a non-terminal, every rule with that left side
//contributes the first set of its first right-side symbol and, for as long as the symbol just
//handled is nullable, of the symbols that follow it.
//
//The result may still contain the same symbol more than once when several rules contribute it;
//callers must not rely on uniqueness. (Previously each terminal additionally re-inserted everything
//collected for the symbols before it, because the scratch vector was never cleared.)
std::vector<Symbol> Parser::firstSet(Symbol token, std::vector<Symbol> avoidList, bool addNewTokens) {
    auto cached = tokenFirstSet.find(token);
    if (cached != tokenFirstSet.end())
        return cached->second;

    //If we're already in the middle of doing this token, don't do it again
    for (Symbol& beingExpanded : avoidList)
        if (beingExpanded == token)
            return std::vector<Symbol>();
    avoidList.push_back(token);

    std::vector<Symbol> first;

    //First, if the symbol is a terminal, than it's first set is just itself.
    if (token.isTerminal()) {
        first.push_back(token);
        return(first);
    }

    //Otherwise go through the grammer: for each rule with this token on its left side add the first
    //set of the production's first token. If that one is nullable, do the next one too (if it exists).
    for (ParseRule* rule : loadedGrammer) {
        if (!(token == rule->getLeftSide()))
            continue;

        std::vector<Symbol> rightSide = rule->getRightSide();
        for (std::vector<Symbol>::size_type j = 0; j < rightSide.size(); j++) {
            Symbol rightToken = rightSide[j];
            if (rightToken.isTerminal()) {
                first.push_back(rightToken);
            } else {
                //Don't add children to cache, as early termination may cause them to be incomplete
                std::vector<Symbol> nested = firstSet(rightToken, avoidList, false);
                first.insert(first.end(), nested.begin(), nested.end());
            }
            //Only look past this symbol if it can derive nothing
            if (!isNullable(rightToken))
                break;
        }
    }

    if (addNewTokens)
        tokenFirstSet[token] = first;
    return(first);
}
|
|
|
|
//Tells whether the given symbol is nullable, using the tokenNullable cache.
//On a cache miss the answer is computed by isNullableHelper (started with an empty visited set)
//and stored in the cache before being returned.
bool Parser::isNullable(Symbol token)
{
    auto cached = tokenNullable.find(token);
    if (cached != tokenNullable.end())
        return cached->second;

    const bool result = isNullableHelper(token, std::set<Symbol>());
    tokenNullable[token] = result;
    return result;
}
|
|
//We use this helper function to recurse because it is possible to wind up with loops, and if so we want
|
|
//early termination. However, this means that nullable determinations in the middle of the loop are inaccurate
|
|
//(since we terminated early), so we don't want to save them. Thus, for simplicity, only the main method will
|
|
//add to the cache. This is somewhat unfortunate for preformance, but the necessary additions to keep track of
|
|
//invalidated state are more complicated than it's worth.
|
|
bool Parser::isNullableHelper(Symbol token, std::set<Symbol> done) {
|
|
if (token.isTerminal())
|
|
return token == nullSymbol;
|
|
if (done.find(token) != done.end())
|
|
return false;
|
|
done.insert(token);
|
|
if (tokenNullable.find(token) != tokenNullable.end())
|
|
return tokenNullable[token];
|
|
|
|
for (std::vector<ParseRule*>::size_type i = 0; i < loadedGrammer.size(); i++) {
|
|
if (token == loadedGrammer[i]->getLeftSide()) {
|
|
auto rightSide = loadedGrammer[i]->getRightSide();
|
|
bool ruleNullable = true;
|
|
for (int j = 0; j < rightSide.size(); j++) {
|
|
if (!isNullableHelper(rightSide[j], done)) {
|
|
ruleNullable = false;
|
|
break;
|
|
}
|
|
}
|
|
if (ruleNullable)
|
|
return true;
|
|
}
|
|
}
|
|
return false;
|
|
}
|
|
|
|
//Return the correct lookahead. This followSet is built based on the current rule's lookahead if at end, or the next Symbol's first set.
|
|
std::vector<Symbol> Parser::incrementiveFollowSet(ParseRule* rule) {
|
|
//Advance the pointer past the current Symbol (the one we want the followset for) to the next symbol (which might be in our follow set, or might be the end)
|
|
rule = rule->clone();
|
|
rule->advancePointer();
|
|
|
|
//Get the first set of the next Symbol. If it contains nullSymbol, keep doing for the next one
|
|
std::vector<Symbol> followSet;
|
|
std::vector<Symbol> symbolFirstSet;
|
|
bool symbolFirstSetHasNull = true;
|
|
while (symbolFirstSetHasNull && !rule->isAtEnd()) {
|
|
symbolFirstSetHasNull = false;
|
|
symbolFirstSet = firstSet(rule->getAtNextIndex());
|
|
for (std::vector<Symbol>::size_type i = 0; i < symbolFirstSet.size(); i++) {
|
|
if (symbolFirstSet[i] == nullSymbol) {
|
|
symbolFirstSetHasNull = true;
|
|
symbolFirstSet.erase(symbolFirstSet.begin()+i);
|
|
break;
|
|
}
|
|
}
|
|
followSet.insert(followSet.end(), symbolFirstSet.begin(), symbolFirstSet.end());
|
|
rule->advancePointer();
|
|
}
|
|
if (rule->isAtEnd()) {
|
|
symbolFirstSet = rule->getLookahead();
|
|
followSet.insert(followSet.end(), symbolFirstSet.begin(), symbolFirstSet.end());
|
|
}
|
|
std::vector<Symbol> followSetReturn;
|
|
for (std::vector<Symbol>::size_type i = 0; i < followSet.size(); i++) {
|
|
bool alreadyIn = false;
|
|
for (std::vector<Symbol>::size_type j = 0; j < followSetReturn.size(); j++)
|
|
if (followSet[i] == followSetReturn[j]) {
|
|
alreadyIn = true;
|
|
break;
|
|
}
|
|
if (!alreadyIn)
|
|
followSetReturn.push_back(followSet[i]);
|
|
}
|
|
delete rule;
|
|
return followSetReturn;
|
|
}
|
|
|
|
//Completes a state by adding every grammar rule that can start at a symbol the state is waiting for.
//
//For each rule of the state whose pointer is not at the end, every grammar rule whose left side
//equals the symbol after the pointer is considered. It receives the lookahead computed by
//incrementiveFollowSet for the state rule and is then either
//  - merged: if the state already holds the same rule (ignoring lookahead), the lookahead is added
//    to that existing rule, or
//  - appended to the state's remaining rules.
//Rules appended here are themselves visited later in the same pass.
//
//A grammar rule is only cloned once its left side is known to match, so non-matching rules cost no
//allocation.
void Parser::closure(State* state) {
    std::vector<ParseRule*> stateTotal = state->getTotal();
    for (std::vector<ParseRule*>::size_type i = 0; i < stateTotal.size(); i++) {
        ParseRule* currentStateRule = stateTotal[i];
        //If it's at it's end, move on. We can't advance it.
        if (currentStateRule->isAtEnd())
            continue;

        for (std::vector<ParseRule*>::size_type j = 0; j < loadedGrammer.size(); j++) {
            //Only grammar rules producing the symbol after the pointer are applicable
            if (!(currentStateRule->getAtNextIndex() == loadedGrammer[j]->getLeftSide()))
                continue;

            //Now, add the correct lookahead. It is recomputed for every match because merging below
            //may have extended the lookahead of currentStateRule itself.
            ParseRule* currentGramRule = loadedGrammer[j]->clone();
            currentGramRule->setLookahead(incrementiveFollowSet(currentStateRule));

            //Check to make sure not already in
            bool isAlreadyInState = false;
            for (std::vector<ParseRule*>::size_type k = 0; k < stateTotal.size(); k++) {
                if (stateTotal[k]->equalsExceptLookahead(*currentGramRule)) {
                    stateTotal[k]->addLookahead(currentGramRule->getLookahead());
                    isAlreadyInState = true;
                    delete currentGramRule;
                    break;
                }
            }

            if (!isAlreadyInState) {
                state->remaining.push_back(currentGramRule);
                //Refresh our view so the rule just added gets visited by the outer loop too
                stateTotal = state->getTotal();
            }
        }
    }
}
|
|
|
|
//Adds state if it doesn't already exist.
|
|
void Parser::addStates(std::vector< State* >* stateSets, State* state, std::queue<State*>* toDo) {
|
|
std::vector< State* > newStates;
|
|
//For each rule in the state we already have
|
|
std::vector<ParseRule*> currStateTotal = state->getTotal();
|
|
for (std::vector<ParseRule*>::size_type i = 0; i < currStateTotal.size(); i++) {
|
|
//Clone the current rule
|
|
ParseRule* advancedRule = currStateTotal[i]->clone();
|
|
//Try to advance the pointer, if sucessful see if it is the correct next symbol
|
|
if (advancedRule->advancePointer()) {
|
|
//Technically, it should be the set of rules sharing this symbol advanced past in the basis for new state
|
|
|
|
//So search our new states to see if any of them use this advanced symbol as a base.
|
|
//If so, add this rule to them.
|
|
//If not, create it.
|
|
bool symbolAlreadyInState = false;
|
|
for (std::vector< State* >::size_type j = 0; j < newStates.size(); j++) {
|
|
if (newStates[j]->basis[0]->getAtIndex() == advancedRule->getAtIndex()) {
|
|
symbolAlreadyInState = true;
|
|
//So now check to see if this exact rule is in this state
|
|
if (!newStates[j]->containsRule(advancedRule))
|
|
newStates[j]->basis.push_back(advancedRule);
|
|
//We found a state with the same symbol, so stop searching
|
|
break;
|
|
}
|
|
}
|
|
if (!symbolAlreadyInState) {
|
|
State* newState = new State(stateSets->size()+newStates.size(),advancedRule, state);
|
|
newStates.push_back(newState);
|
|
}
|
|
} else {
|
|
delete advancedRule;
|
|
}
|
|
//Also add any completed rules as reduces in the action table
|
|
//See if reduce
|
|
//Also, this really only needs to be done for the state's basis, but we're already iterating through, so...
|
|
std::vector<Symbol> lookahead = currStateTotal[i]->getLookahead();
|
|
if (currStateTotal[i]->isAtEnd()) {
|
|
for (std::vector<Symbol>::size_type j = 0; j < lookahead.size(); j++)
|
|
table.add(stateNum(state), lookahead[j], new ParseAction(ParseAction::REDUCE, currStateTotal[i]));
|
|
} else if (currStateTotal[i]->getAtNextIndex() == nullSymbol) {
|
|
//If is a rule that produces only NULL, add in the approprite reduction, but use a new rule with a right side of length 0. (so we don't pop off stack)
|
|
ParseRule* nullRule = currStateTotal[i]->clone();
|
|
nullRule->setRightSide(std::vector<Symbol>());
|
|
for (std::vector<Symbol>::size_type j = 0; j < lookahead.size(); j++)
|
|
table.add(stateNum(state), lookahead[j], new ParseAction(ParseAction::REDUCE, nullRule));
|
|
}
|
|
}
|
|
//Put all our new states in the set of states only if they're not already there.
|
|
bool stateAlreadyInAllStates = false;
|
|
Symbol currStateSymbol;
|
|
for (std::vector< State * >::size_type i = 0; i < newStates.size(); i++) {
|
|
stateAlreadyInAllStates = false;
|
|
currStateSymbol = (*(newStates[i]->getBasis()))[0]->getAtIndex();
|
|
for (std::vector< State * >::size_type j = 0; j < stateSets->size(); j++) {
|
|
if (newStates[i]->basisEquals(*((*stateSets)[j]))) {
|
|
stateAlreadyInAllStates = true;
|
|
//If it does exist, we should add it as the shift/goto in the action table
|
|
(*stateSets)[j]->addParents(newStates[i]->getParents());
|
|
table.add(stateNum(state), currStateSymbol, new ParseAction(ParseAction::SHIFT, j));
|
|
break;
|
|
}
|
|
}
|
|
if (!stateAlreadyInAllStates) {
|
|
//If the state does not already exist, add it and add it as the shift/goto in the action table
|
|
stateSets->push_back(newStates[i]);
|
|
toDo->push(newStates[i]);
|
|
table.add(stateNum(state), currStateSymbol, new ParseAction(ParseAction::SHIFT, stateSets->size()-1));
|
|
}
|
|
}
|
|
}
|
|
|
|
std::string Parser::stateSetToString() {
|
|
std::string concat = "";
|
|
for (std::vector< State *>::size_type i = 0; i < stateSets.size(); i++) {
|
|
concat += intToString(i) + " is " + stateSets[i]->toString();
|
|
}
|
|
return concat;
|
|
}
|
|
|
|
|
|
std::string Parser::tableToString() {
|
|
return table.toString();
|
|
}
|
|
|
|
//parseInput is now pure virtual
|
|
|
|
std::string Parser::grammerToString() {
|
|
//Iterate through the vector, adding string representation of each grammer rule
|
|
std::cout << "About to toString\n";
|
|
std::string concat = "";
|
|
for (int i = 0; i < loadedGrammer.size(); i++) {
|
|
concat += loadedGrammer[i]->toString() + "\n";
|
|
}
|
|
return(concat);
|
|
}
|
|
|
|
std::string Parser::grammerToDOT() {
|
|
//Iterate through the vector, adding DOT representation of each grammer rule
|
|
//std::cout << "About to DOT export\n";
|
|
std::string concat = "";
|
|
for (int i = 0; i < loadedGrammer.size(); i++) {
|
|
concat += loadedGrammer[i]->toDOT();
|
|
}
|
|
return("digraph Kraken_Grammer { \n" + concat + "}");
|
|
}
|
|
|