547 lines
20 KiB
Plaintext
547 lines
20 KiB
Plaintext
import string
|
|
import vector
|
|
import set
|
|
import stack
|
|
import map
|
|
import symbol
|
|
import regex
|
|
import io
|
|
import util
|
|
|
|
// Splits grammar source text into words.
//
// Words are separated by runs of spaces and by newlines. A '#' starts a comment
// that runs through the end of its line; each comment is echoed with io::print
// and produces no word. A double-quoted string is kept together as part of a
// single word even if it contains spaces; a '"' preceded by an odd number of
// backslashes is treated as escaped and does not end the string.
//
// gram_str: the grammar text to split
// returns:  the words in order of appearance; never contains an empty word
fun split_into_words(gram_str: string::string): vector::vector<string::string> {
    var out.construct(): vector::vector<string>
    var len = gram_str.length()
    // index of the first character of the word currently being scanned
    var begin = 0
    for (var i = 0; i < len; i++;) {
        // `while` rather than `if`: back-to-back comment lines must all be
        // consumed here, otherwise the second '#' would be scanned as a word
        while (i < len && gram_str[i] == '#') {
            while (i < len && gram_str[i] != '\n') i++
            // step over the newline, unless the comment ran into the end of input
            if (i < len) i++
            io::print("comment: "); io::print(gram_str.slice(begin, i))
            begin = i
        }
        if (i < len && gram_str[i] == '"') {
            i++
            while (i < len && gram_str[i] != '"') {
                i++
                // if we hit a " we check to see if an odd number of backslashes precede it
                // (meaning that the " is escaped), and if so, we move on. Otherwise, we found
                // the end of the quoted string
                if (i < len && gram_str[i] == '"') {
                    var escaped = 0
                    // cannot run off the front: the opening quote stops the scan
                    while (gram_str[i-(1+escaped)] == '\\') escaped++
                    if (escaped % 2)
                        i++
                }
            }
        }
        if (i < len && gram_str[i] == ' ') {
            // leading spaces on a line would otherwise yield an empty word
            if (i != begin)
                out.add(gram_str.slice(begin, i))
            // allow multiple spaces between words
            while (i < len && gram_str[i] == ' ') i++
            begin = i
            // the for loop's i++ brings us back to the first non-space character
            i--
        }
        if (i < len && gram_str[i] == '\n') {
            if (i != begin)
                out.add(gram_str.slice(begin, i))
            begin = i + 1
        }
    }
    // flush a final word that was not followed by a space or a newline
    if (begin < len)
        out.add(gram_str.slice(begin, len))
    return out
}
|
|
|
|
// Builds a grammer from grammar source text.
//
// The text is split with split_into_words and consumed one word at a time
// (each word is also echoed with io::print/io::println):
//   - the first word of a production is its left-hand side (a non-terminal)
//   - "=" is skipped
//   - "|" ends the current alternative and starts another for the same left side
//   - ";" ends the current alternative and the production
//   - a word starting with '"' is a terminal, either plain ("hia*") or
//     decorated ("hia*":hi_with_as); the text between the quotes becomes its regex
//   - any other word is a non-terminal on the right-hand side
//
// gram_str: the grammar text
// returns:  the populated grammer (rules, non_terminals and terminals filled in)
fun load_grammer(gram_str: string::string): grammer {
    var gram.construct(): grammer
    var leftSide = symbol::symbol("", false)
    var doLeftSide = true
    var rightSide = vector::vector<symbol::symbol>()
    split_into_words(gram_str).for_each(fun(word: string::string) {
        io::print("word: "); io::println(word)
        // an empty word carries no information and would make word[0] below
        // read out of bounds, so skip it
        if (word.length() == 0) {
            return
        }
        if (word == "=") {
            // do nothing
        } else if (word == "|") {
            gram.rules.add(rule(leftSide, rightSide))
            rightSide = vector::vector<symbol::symbol>()
        } else if (word == ";") {
            gram.rules.add(rule(leftSide, rightSide))
            rightSide = vector::vector<symbol::symbol>()
            doLeftSide = true
        } else {
            if (doLeftSide) {
                leftSide = symbol::symbol(word, false)
                gram.non_terminals.add(leftSide)
            } else {
                if (word[0] == '"') {
                    // ok, we support both plain terminals "hia*"
                    // and decorated terminals "hia*":hi_with_as
                    // so first check to find the ending " and see if it's
                    // the end of the string
                    var last_quote = word.length()-1
                    // terminates at the latest on word[0], which is a quote
                    while(word[last_quote] != '"') last_quote--
                    if (last_quote != word.length()-1) {
                        // decorated: skip the closing quote and the ':' to reach the name
                        var decorated = symbol::symbol(word.slice(last_quote+2, -1), true)
                        rightSide.add(decorated)
                        gram.terminals.add(util::make_pair(decorated, regex::regex(word.slice(1,last_quote))))
                    } else {
                        // plain: the symbol is named by the whole quoted word
                        var plain = symbol::symbol(word, true)
                        rightSide.add(plain)
                        gram.terminals.add(util::make_pair(plain, regex::regex(word.slice(1,last_quote))))
                    }
                } else {
                    var non_term = symbol::symbol(word, false)
                    rightSide.add(non_term)
                    gram.non_terminals.add(non_term)
                }
            }
            // every word after the first one of a production is on the right side
            doLeftSide = false
        }
    })
    return gram
}
|
|
|
|
// A grammar: its rules, its terminal and non-terminal symbols, the first sets
// filled in by calculate_first_set() and the parse table filled in by
// calculate_state_automaton().
obj grammer (Object) {
    var rules: vector::vector<rule>
    var non_terminals: set::set<symbol::symbol>
    // each terminal symbol paired with a regex
    var terminals: vector::vector<util::pair<symbol::symbol, regex::regex>>
    // symbol -> first set of that symbol, see calculate_first_set()
    var first_set_map: map::map<symbol::symbol, set::set<symbol::symbol>>
    var parse_table: table

    // Constructs every member and returns this object.
    fun construct(): *grammer {
        rules.construct()
        non_terminals.construct()
        terminals.construct()
        first_set_map.construct()
        parse_table.construct()
        // the declared return type is *grammer, so hand back the object
        return this
    }
    // Copy-constructs every member from the matching member of old.
    fun copy_construct(old: *grammer) {
        rules.copy_construct(&old->rules)
        non_terminals.copy_construct(&old->non_terminals)
        terminals.copy_construct(&old->terminals)
        first_set_map.copy_construct(&old->first_set_map)
        parse_table.copy_construct(&old->parse_table)
    }
    // Destroys the current contents, then copy-constructs from other.
    fun operator=(other: grammer) {
        destruct()
        copy_construct(&other)
    }
    // Destructs every member.
    fun destruct() {
        rules.destruct()
        non_terminals.destruct()
        terminals.destruct()
        first_set_map.destruct()
        parse_table.destruct()
    }

    // Fills first_set_map for every terminal and non-terminal, repeating the
    // pass over the rules until a pass adds nothing new.
    fun calculate_first_set() {
        // the first set of a terminal is itself
        terminals.for_each( fun(terminal: util::pair<symbol::symbol, regex::regex>)
            first_set_map[terminal.first] = set::set(terminal.first)
        )
        // start out the non-terminals as empty sets
        non_terminals.for_each( fun(non_terminal: symbol::symbol)
            first_set_map[non_terminal] = set::set<symbol::symbol>()
        )
        var changed = true
        while (changed) {
            changed = false
            rules.for_each( fun(r: rule) {
                var rule_lookahead = first_vector(r.rhs)
                // only test while nothing has changed yet in this pass, so an
                // earlier "changed" is never overwritten by a later "unchanged"
                if (!changed) {
                    changed = !first_set_map[r.lhs].contains(rule_lookahead)
                }
                first_set_map[r.lhs].add(rule_lookahead)
            })
        }
    }
    // Returns the first set of the symbol sequence rhs, looked up through
    // first_set_map. An empty sequence yields just the null symbol.
    fun first_vector(rhs: ref vector::vector<symbol::symbol>): set::set<symbol::symbol> {
        var toRet = set::set<symbol::symbol>()
        if (rhs.size) {
            for (var i = 0; i < rhs.size; i++;) {
                var lookahead = first_set_map[rhs[i]]
                if (lookahead.contains(symbol::null_symbol())) {
                    // remove the null if this is not the last in the rule
                    if (i != rhs.size-1)
                        lookahead.remove(symbol::null_symbol())
                    toRet.add(lookahead)
                } else {
                    // this symbol's first set has no null, so later symbols
                    // cannot contribute
                    toRet.add(lookahead)
                    break
                }
            }
        } else {
            toRet.add(symbol::null_symbol())
        }
        return toRet
    }

    // Builds the item-set states reachable from rules[0] (with the eof symbol
    // as lookahead) and records push and reduce actions in parse_table, then
    // prints every state and the table.
    fun calculate_state_automaton() {
        var first_state = closure(state(vector::vector(rules[0].with_lookahead(set::set(symbol::eof_symbol())))))
        var states = vector::vector(first_state) // vector instead of set because we need to iterate by index
        var newItems = stack::stack(0) // 0 is the index of the first and only item in states
        var count = 0
        while (newItems.size()) {
            // progress output every 200 iterations
            if (count%200 == 0) {
                io::print("calculate_state_automaton while")
                io::println(count)
            }
            count++
            var I = newItems.pop()
            var possGoto = set::set<symbol::symbol>()
            states[I].items.for_each(fun(r: ref rule) {
                if (!r.at_end())
                    possGoto.add(r.next())
                // if r is at end or the rest reduces to null, add a reduce for each lookahead symbol
                // NOTE(review): after_next() skips the symbol right after the dot, so
                // this tests the symbols after r.next(), not everything after the
                // dot - confirm that is the intended "rest"
                if ( r.at_end() || first_vector(r.after_next()).contains(symbol::null_symbol()) ) {
                    var rule_no = rules.find(r.plain())
                    r.lookahead.for_each(fun(sym: ref symbol::symbol) {
                        parse_table.add_reduce(I, sym, rule_no)
                    })
                }
            })
            possGoto.for_each(fun(X: ref symbol::symbol) {
                var goneState = goto(states[I], X)
                if (goneState.items.size) {
                    var already_state = states.find(goneState)
                    if (already_state == -1) {
                        // new state: it will live at index states.size once added
                        parse_table.add_push(I, X, states.size)
                        newItems.push(states.size)
                        states.add(goneState)
                    } else {
                        parse_table.add_push(I, X, already_state)
                    }
                }
            })
        }
        io::println("ALL STATES:\n")
        states.for_each(fun(i: ref state) {
            io::println("STATE:\n")
            i.items.for_each(fun(r: ref rule) {
                io::println(string::string("\t") + r.to_string())
            })
        })
        io::println(" there were : states")
        io::println(states.size)
        io::println(" there were : table")
        parse_table.print_string()
    }

    // Replaces the items of initial with their closure and returns the state.
    fun closure(initial: ref state): state {
        initial.items = closure(initial.items)
        return initial
    }
    // Expands initial in place until a full pass neither adds an item nor
    // grows a lookahead set, then returns it.
    fun closure(initial: ref vector::vector<rule>): vector::vector<rule> {
        var continueIt = true
        while (continueIt) {
            continueIt = false
            for (var i = 0; i < initial.size; i++;) {
                if (initial[i].at_end()) {
                    continue
                }
                rules.for_each(fun(r: ref rule) {
                    // if i is |a::=c . Bb, a|, we're doing each B::=... in rules
                    if (r.lhs != initial[i].next())
                        return // continue the for-each
                    // add r with lookahead
                    var newLookahead = first_vector(initial[i].after_next())
                    if (newLookahead.contains(symbol::null_symbol())) {
                        // what follows can vanish, so the item's own lookahead applies too
                        newLookahead.remove(symbol::null_symbol())
                        newLookahead.add(initial[i].lookahead)
                    }
                    var alreadyInInSomeForm = false
                    for (var index = 0; index < initial.size; index++;) {
                        if (initial[index].equals_but_lookahead(r)) {
                            alreadyInInSomeForm = true
                            if (!initial[index].lookahead.contains(newLookahead)) {
                                // same item with a smaller lookahead set: merge and go around again
                                initial[index].lookahead += newLookahead
                                continueIt = true
                                return // continue the rules for-each
                            }
                        }
                    }
                    if (!alreadyInInSomeForm) {
                        continueIt = true
                        initial.add(r.with_lookahead(newLookahead))
                    }
                })
            }
        }
        return initial
    }

    // Returns the state reached from I on symbol X: every item of I whose next
    // symbol is X, advanced past it, then closed over.
    fun goto(I: ref state, X: ref symbol::symbol): state {
        // loop through i, find all that have thing::= something . X more,
        // add thing ::= something X . more
        var jPrime = vector::vector<rule>()
        I.items.for_each(fun(i: ref rule) {
            if (!i.at_end() && i.next() == X)
                jPrime.add(i.advanced())
        })
        // return closure(that)?
        return state(closure(jPrime))
    }

    // Renders the rules, non-terminals and terminals (with their regex strings)
    // as a multi-line string.
    fun to_string(): string::string {
        var result = string::string("grammer rules:")
        rules.for_each( fun(i : rule) { result += string::string("\n\t") + i.to_string(); } )
        result += "\nnon_terminals:"
        non_terminals.for_each( fun(i : symbol::symbol) { result += string::string("\n\t") + i.to_string(); } )
        result += "\nterminals:"
        terminals.for_each( fun(i : util::pair<symbol::symbol, regex::regex>) { result += string::string("\n\t") + i.first.to_string() + ": " + i.second.regexString; } )
        return result
    }
}
|
|
|
|
// Factory for a rule `lhs -> rhs`. The rule is constructed first, so every
// field other than lhs and rhs keeps the value rule.construct() gives it.
fun rule(lhs: symbol::symbol, rhs: vector::vector<symbol::symbol>): rule {
    var fresh.construct(): rule
    fresh.rhs = rhs
    fresh.lhs = lhs
    return fresh
}
|
|
|
|
// A grammar rule `lhs -> rhs` that doubles as a parser item: position is the
// index into rhs of the symbol right after the dot, and lookahead is the
// item's lookahead set.
obj rule (Object) {
    var lhs: symbol::symbol
    var rhs: vector::vector<symbol::symbol>
    var position: int
    var lookahead: set::set<symbol::symbol>

    // Constructs every member, puts the dot at position 0 and returns this object.
    fun construct(): *rule {
        lhs.construct()
        rhs.construct()
        position = 0
        lookahead.construct()
        // the declared return type is *rule, so hand back the object
        return this
    }
    // Copies every field from other.
    fun copy_construct(other: *rule) {
        lhs.copy_construct(&other->lhs)
        rhs.copy_construct(&other->rhs)
        position = other->position
        lookahead.copy_construct(&other->lookahead)
    }
    // Destroys the current contents, then copy-constructs from other.
    fun operator=(other: rule) {
        destruct()
        copy_construct(&other)
    }
    // Full equality: sides, dot position and lookahead set must all match.
    fun operator==(other: ref rule):bool {
        return lhs == other.lhs && rhs == other.rhs &&
               position == other.position && lookahead == other.lookahead
    }
    // Equality that ignores the lookahead set.
    fun equals_but_lookahead(other: ref rule):bool {
        return lhs == other.lhs && rhs == other.rhs &&
               position == other.position
    }
    // Destructs the members that own storage (position is a plain int).
    fun destruct() {
        lhs.destruct()
        rhs.destruct()
        lookahead.destruct()
    }

    // The symbol right after the dot. Performs no bounds check of its own;
    // check at_end() first.
    fun next(): ref symbol::symbol {
        return rhs[position]
    }
    // The symbols that follow next(), i.e. rhs from position + 1 on.
    fun after_next(): vector::vector<symbol::symbol> {
        return rhs.slice(position + 1, -1)
    }
    // True once the dot has moved past the last symbol of rhs.
    fun at_end(): bool {
        return position >= rhs.size
    }
    // The same lhs and rhs as a freshly built rule, without this item's
    // position and lookahead.
    fun plain(): rule {
        return rule(lhs, rhs)
    }
    // A copy of this item at the same position with newLookahead as its lookahead set.
    fun with_lookahead(newLookahead: set::set<symbol::symbol>): rule {
        var toRet = rule(lhs, rhs)
        toRet.position = position
        toRet.lookahead = newLookahead
        return toRet
    }
    // A copy of this item with the dot moved one symbol to the right.
    fun advanced(): rule {
        var toRet = rule(lhs, rhs)
        toRet.position = position+1
        toRet.lookahead = lookahead
        return toRet
    }

    // Renders the item as `lhs -> sym, sym, ... |lookahead {...}` with " . "
    // marking the dot.
    fun to_string(): string::string {
        var result = lhs.name + " -> "
        for (var i = 0; i < rhs.size; i++;)
            if (i == position)
                result += string::string(" . ") + rhs[i].to_string() + ", ";
            else
                result += rhs[i].to_string() + ", ";
        // a dot past the last symbol is never hit by the loop above
        if (position == rhs.size)
            result += " . "
        result += "|lookahead {"
        lookahead.for_each(fun(i: symbol::symbol) {
            result += i.to_string()
        })
        result += "}"
        return result
    }
}
|
|
|
|
// Factory for a state holding a copy of the given items.
fun state(itemsIn: ref vector::vector<rule>): state {
    var built.construct(itemsIn): state
    return built
}
|
|
|
|
// A parser state: a vector of items (rules carrying a position and a lookahead set).
obj state (Object) {
    var items: vector::vector<rule>

    // Constructs an empty state and returns this object.
    fun construct(): *state {
        items.construct()
        // the declared return type is *state, so hand back the object
        return this
    }
    // Constructs a state holding a copy of itemsIn and returns this object.
    fun construct(itemsIn: ref vector::vector<rule>): *state {
        items.copy_construct(&itemsIn)
        return this
    }
    // Copies the items of other.
    fun copy_construct(other: *state) {
        items.copy_construct(&other->items)
    }
    // Destroys the current contents, then copy-constructs from other.
    fun operator=(other: state) {
        destruct()
        copy_construct(&other)
    }
    fun destruct() {
        items.destruct()
    }
    // Two states are equal when their item vectors compare equal.
    fun operator==(other: ref state):bool {
        return items == other.items
    }
    // Placeholder text; the items are not rendered.
    fun to_string(): string::string {
        return string::string("woo a state")
    }
}
|
|
|
|
// Kinds of parse action, kept as plain ints because the language has no
// enums to use here.
var push = 0
var reduce = 1
// The next two are not currently used: accepting is the reduce of the goal
// rule, and rejecting is simply the absence of any action.
var accept = 2
var reject = 3
|
|
// Factory for an action of kind act carrying state_or_rule as its payload.
fun action(act: int, state_or_rule: int): action {
    var made: action
    made.state_or_rule = state_or_rule
    made.act = act
    return made
}
|
|
// One entry of the parse table: an action kind plus an int payload.
obj action {
    // one of the action-kind ints; really need those enums
    var act: int
    // payload: a state index or a rule index, as the name says
    var state_or_rule: int
    // Equal when both the kind and the payload match.
    fun operator==(other: action): bool {
        if (act != other.act)
            return false
        return state_or_rule == other.state_or_rule
    }
}
|
|
|
|
// The parse table.
obj table (Object) {
    // a 2 dimensional table made of a vector and a map that maps from stateno & symbol to a vector of parse actions
    var items: vector::vector<map::map<symbol::symbol, vector::vector<action>>>

    // Constructs an empty table and returns this object.
    fun construct(): *table {
        items.construct()
        // the declared return type is *table, so hand back the object
        return this
    }
    // Copies the rows of other.
    fun copy_construct(other: *table) {
        items.copy_construct(&other->items)
    }
    // Destroys the current contents, then copy-constructs from other.
    fun operator=(other: table) {
        destruct()
        copy_construct(&other)
    }
    fun destruct() {
        items.destruct()
    }
    // Appends empty rows until index include_state exists.
    fun expand_to(include_state: int) {
        while (include_state >= items.size)
            items.addEnd(map::map<symbol::symbol, vector::vector<action>>())
    }
    // we always "clean" the symbol before using it so that having different data doesn't
    // prevent us from finding the symbol in the table
    fun clean_symbol(sym: ref symbol::symbol): symbol::symbol {
        return symbol::symbol(sym.name, sym.terminal)
    }
    // Shared body of add_push / add_reduce / add_accept: appends to_add to the
    // actions stored for (from_state, on_symbol), creating the row and the
    // entry when they do not exist yet.
    fun add_action(from_state: int, on_symbol: ref symbol::symbol, to_add: action) {
        expand_to(from_state)
        var cleaned_symbol = clean_symbol(on_symbol)
        if (items[from_state].contains_key(cleaned_symbol))
            items[from_state][cleaned_symbol].addEnd(to_add)
        else
            items[from_state].set(cleaned_symbol, vector::vector(to_add))
    }
    // Records "in from_state, on on_symbol, push to_state".
    fun add_push(from_state: int, on_symbol: ref symbol::symbol, to_state: int) {
        add_action(from_state, on_symbol, action(push, to_state))
    }
    // Records "in from_state, on on_symbol, reduce by rule number by_rule_no".
    fun add_reduce(from_state: int, on_symbol: ref symbol::symbol, by_rule_no: int) {
        add_action(from_state, on_symbol, action(reduce, by_rule_no))
    }
    // Records "in from_state, on on_symbol, accept".
    fun add_accept(from_state: int, on_symbol: ref symbol::symbol) {
        add_action(from_state, on_symbol, action(accept, 0))
    }
    // Returns the actions stored for (state, on_symbol), or an empty vector
    // when the table has no such row or no entry for the symbol.
    fun get(state: int, on_symbol: symbol::symbol): vector::vector<action> {
        var cleaned_symbol = clean_symbol(on_symbol)
        // no row or no entry means no actions, rather than indexing blindly
        if (state < 0 || state >= items.size || !items[state].contains_key(cleaned_symbol))
            return vector::vector<action>()
        return items[state][cleaned_symbol]
    }
    // Returns the first push action stored for (state, on_symbol). When there
    // is none, prints a message and returns action(-1,-1).
    fun get_shift(state: int, on_symbol: symbol::symbol): action {
        var actions = get(state, on_symbol)
        for (var i = 0; i < actions.size; i++;)
            if (actions[i].act == push)
                return actions[i]
        io::println("tried to get a shift when none existed")
        return action(-1,-1)
    }
    // Prints the number of rows, then one line per stored action, grouped by state.
    fun print_string() {
        io::print("woo a table of size: ")
        io::println(items.size)
        for (var i = 0; i < items.size; i++;) {
            io::print("for state: ")
            io::println(i)
            items[i].for_each(fun(sym: symbol::symbol, actions: vector::vector<action>) {
                actions.for_each(fun(action: action) {
                    io::print("\ton symbol: ")
                    io::print(sym.to_string())
                    io::print(" do action: ")
                    if (action.act == push)
                        io::print("push ")
                    else if (action.act == reduce)
                        io::print("reduce ")
                    else if (action.act == accept)
                        io::print("accept ")
                    else if (action.act == reject)
                        io::print("reject ")
                    io::print(action.state_or_rule)
                    io::println()
                })
            })
        }
    }
}
|
|
|