import str
import vec
import set
import stack
import map
import symbol
import regex
import io
import util
import serialize

// Split the text of a grammar description into words.
//
// Words are separated by runs of spaces and by newlines. A double-quoted
// section (with backslash-escaped quotes) is scanned as a unit, so spaces
// inside it do not split the word. A '#' starts a comment that runs to the end
// of the line; the comment text is echoed with io::print and not returned.
//
// gram_str: the grammar text
// returns:  the words in order of appearance, never containing an empty word
fun split_into_words(gram_str: str::str): vec::vec<str::str> {
    var out.construct(): vec::vec<str::str>
    var gram_len = gram_str.length()
    var begin = 0
    for (var i = 0; i < gram_len; i++;) {
        // a loop rather than a single test so that back-to-back comment lines are all skipped
        while (i < gram_len && gram_str[i] == '#') {
            while (i < gram_len && gram_str[i] != '\n')
                i++
            // consume the newline as well, unless the comment runs to the end of the input
            if (i < gram_len)
                i++
            io::print("comment: "); io::print(gram_str.slice(begin, i))
            begin = i
        }
        if (i < gram_len && gram_str[i] == '"') {
            i++
            while (i < gram_len && gram_str[i] != '"') {
                i++
                // if we hit a " we check to see if an odd number of backslashes precede it
                // (meaning that the " is escaped), and if so, we move on. Otherwise, we found
                // the end of the quoted str
                if (i < gram_len && gram_str[i] == '"') {
                    var escaped = 0
                    // always stops at or before the opening quote, so the index stays >= 0
                    while (gram_str[i-(1+escaped)] == '\\')
                        escaped++
                    if (escaped % 2)
                        i++
                }
            }
        }
        if (i < gram_len && gram_str[i] == ' ') {
            // a space directly after a separator (e.g. an indented line) closes no word
            if (i != begin)
                out.add(gram_str.slice(begin, i))
            // allow multiple spaces between words
            while (i < gram_len && gram_str[i] == ' ')
                i++
            begin = i
            // step back so the loop increment lands on the first character after the spaces
            i--
        }
        if (i < gram_len && gram_str[i] == '\n') {
            if (i != begin)
                out.add(gram_str.slice(begin, i))
            begin = i + 1
        }
    }
    // keep a final word that is not followed by a space or newline
    if (begin < gram_len)
        out.add(gram_str.slice(begin, gram_len))
    return out
}

// Build a grammer from its textual description.
//
// The words produced by split_into_words are interpreted as follows:
//   "="  is ignored
//   "|"  finishes the current alternative and starts another for the same left side
//   ";"  finishes the current alternative; the next word is a new left side
// The first word after the start or after ";" is the left-hand non-terminal.
// Every other word is a right-hand symbol: a word starting with '"' is a
// terminal whose regex is the text between the first and last quote, anything
// else is a non-terminal.
//
// Only the rules, non_terminals and terminals are filled in here; the first
// sets and the parse table are computed by the grammer's own methods.
fun load_grammer(gram_str: str::str): grammer {
    var gram.construct(): grammer
    var leftSide = symbol::symbol("", false)
    var doLeftSide = true
    var rightSide = vec::vec<symbol::symbol>()
    split_into_words(gram_str).for_each(fun(word: str::str) {
        if (word == "=") {
            // do nothing
        } else if (word == "|") {
            gram.rules.add(rule(leftSide, rightSide))
            rightSide = vec::vec<symbol::symbol>()
        } else if (word == ";") {
            gram.rules.add(rule(leftSide, rightSide))
            rightSide = vec::vec<symbol::symbol>()
            doLeftSide = true
        } else {
            if (doLeftSide) {
                leftSide = symbol::symbol(word, false)
                gram.non_terminals.add(leftSide)
            } else {
                if (word[0] == '"') {
                    // ok, we support both plain terminals "hia*"
                    // and decorated terminals "hia*":hi_with_as
                    // so first check to find the ending " and see if it's
                    // the end of the str
                    var last_quote = word.length()-1
                    while (word[last_quote] != '"')
                        last_quote--
                    var pattern = regex::regex(word.slice(1, last_quote))
                    // a plain terminal is named by the whole quoted word...
                    var terminal = symbol::symbol(word, true)
                    // ...a decorated one by the text after the closing quote and the ':'
                    if (last_quote != word.length()-1)
                        terminal = symbol::symbol(word.slice(last_quote+2, -1), true)
                    rightSide.add(terminal)
                    gram.terminals.add(util::make_pair(terminal, pattern))
                } else {
                    var non_term = symbol::symbol(word, false)
                    rightSide.add(non_term)
                    gram.non_terminals.add(non_term)
                }
            }
            doLeftSide = false
        }
    })
    return gram
}

// A grammar together with the data derived from it: the first sets of its
// symbols and the parse table built from its item-set automaton.
obj grammer (Object, Serializable) {
    var rules: vec::vec<rule>
    var non_terminals: set::set<symbol::symbol>
    var terminals: vec::vec<util::pair<symbol::symbol, regex::regex>>
    var first_set_map: map::map<symbol::symbol, set::set<symbol::symbol>>
    var parse_table: table

    fun construct(): *grammer {
        rules.construct()
        non_terminals.construct()
        terminals.construct()
        first_set_map.construct()
        parse_table.construct()
        return this
    }
    fun copy_construct(old: *grammer) {
        rules.copy_construct(&old->rules)
        non_terminals.copy_construct(&old->non_terminals)
        terminals.copy_construct(&old->terminals)
        first_set_map.copy_construct(&old->first_set_map)
        parse_table.copy_construct(&old->parse_table)
    }
    fun operator=(other: grammer) {
        destruct()
        copy_construct(&other)
    }
    fun destruct() {
        rules.destruct()
        non_terminals.destruct()
        terminals.destruct()
        first_set_map.destruct()
        parse_table.destruct()
    }

    // Serialize every member, in declaration order, into one buffer.
    fun serialize(): vec::vec<char> {
        return serialize::serialize(rules) + serialize::serialize(non_terminals) + serialize::serialize(terminals) + serialize::serialize(first_set_map) + serialize::serialize(parse_table)
    }
    // Read every member back from `it` starting at `pos`, in the order
    // serialize wrote them. Returns the position after the last member.
    fun unserialize(it: ref vec::vec<char>, pos: int): int {
        // do it in place. Actually looks nicer too
        pos = rules.unserialize(it, pos)
        pos = non_terminals.unserialize(it, pos)
        pos = terminals.unserialize(it, pos)
        pos = first_set_map.unserialize(it, pos)
        pos = parse_table.unserialize(it, pos)
        return pos
    }

    // Fill first_set_map for every terminal and non-terminal by iterating
    // over the rules until no first set changes any more.
    fun calculate_first_set() {
        // the first set of a terminal is itself
        terminals.for_each(fun(terminal: util::pair<symbol::symbol, regex::regex>) {
            first_set_map[terminal.first] = set::set(terminal.first)
        })
        // start out the non-terminals as empty sets
        non_terminals.for_each(fun(non_terminal: symbol::symbol) {
            first_set_map[non_terminal] = set::set<symbol::symbol>()
        })

        var changed = true
        while (changed) {
            changed = false
            rules.for_each(fun(r: rule) {
                var rule_lookahead = first_vector(r.rhs)
                // once a change has been seen in this pass there is no need to test again
                if (!changed) {
                    changed = !first_set_map[r.lhs].contains(rule_lookahead)
                }
                first_set_map[r.lhs].add(rule_lookahead)
            })
        }
    }

    // First set of a sequence of symbols, looked up in first_set_map.
    // An empty sequence yields just the null symbol. Otherwise the first sets
    // of the symbols are accumulated left to right, stopping after the first
    // symbol whose first set does not contain the null symbol.
    fun first_vector(rhs: ref vec::vec<symbol::symbol>): set::set<symbol::symbol> {
        var toRet = set::set<symbol::symbol>()
        if (rhs.size) {
            for (var i = 0; i < rhs.size; i++;) {
                // this is a copy, so removing the null below does not touch first_set_map
                var lookahead = first_set_map[rhs[i]]
                if (lookahead.contains(symbol::null_symbol())) {
                    // remove the null if this is not the last in the rule
                    if (i != rhs.size-1)
                        lookahead.remove(symbol::null_symbol())
                    toRet.add(lookahead)
                } else {
                    toRet.add(lookahead)
                    break
                }
            }
        } else {
            toRet.add(symbol::null_symbol())
        }
        return toRet
    }

    // Build the item-set automaton starting from rules[0] with the eof symbol
    // as lookahead, and record a push for every transition and a reduce for
    // every completed (or nullable-tailed) item in parse_table.
    // Prints progress every 200 processed states and the final state count.
    fun calculate_state_automaton() {
        // rules[0] is the start rule; without it there is nothing to build
        if (rules.size == 0) {
            io::println("calculate_state_automaton: grammer has no rules")
            return
        }
        var first_state = closure(state(vec::vec(rules[0].with_lookahead(set::set(symbol::eof_symbol())))))
        var states = vec::vec(first_state) // vec instead of set because we need to iterate by index
        var newItems = stack::stack(0) // 0 is the index of the first and only item in states
        var count = 0
        while (newItems.size()) {
            if (count%200 == 0) {
                io::print("calculate_state_automaton while")
                io::println(count)
            }
            count++
            var I = newItems.pop()
            var possGoto = set::set<symbol::symbol>()
            states[I].items.for_each(fun(r: ref rule) {
                if (!r.at_end())
                    possGoto.add(r.next())
                // if r is at end or the rest reduces to null, add a reduce for each lookahead symbol
                if ( r.at_end() || first_vector(r.after()).contains(symbol::null_symbol()) ) {
                    var rule_no = rules.find(r.plain())
                    r.lookahead.for_each(fun(sym: ref symbol::symbol) {
                        parse_table.add_reduce(I, sym, rule_no, r.position)
                    })
                }
            })
            possGoto.for_each(fun(X: ref symbol::symbol) {
                var goneState = goto(states[I], X)
                if (goneState.items.size) {
                    var already_state = states.find(goneState)
                    if (already_state == -1) {
                        // a state we have not seen: it gets the next index and is queued for processing
                        parse_table.add_push(I, X, states.size)
                        newItems.push(states.size)
                        states.add(goneState)
                    } else {
                        parse_table.add_push(I, X, already_state)
                    }
                }
            })
        }
        io::println(" there were : states")
        io::println(states.size)
    }

    // Close the items of `initial` in place and return the state.
    fun closure(initial: ref state): state {
        initial.items = closure(initial.items)
        return initial
    }

    // Close a set of items in place: for every item with the dot before a
    // non-terminal B, make sure every rule B ::= ... is present with the
    // lookahead computed from what follows B (falling back to the item's own
    // lookahead when that tail can be null). Repeats until nothing is added
    // and no lookahead set grows. Returns a copy of the closed item vec.
    fun closure(initial: ref vec::vec<rule>): vec::vec<rule> {
        var continueIt = true
        while (continueIt) {
            continueIt = false
            for (var i = 0; i < initial.size; i++;) {
                if (initial[i].at_end()) {
                    continue
                }
                rules.for_each(fun(r: ref rule) {
                    // if i is |a::=c . Bb, a|, we're doing each B::=... in rules
                    if (r.lhs != initial[i].next())
                        return // continue the for-each
                    // add r with lookahead
                    var newLookahead = first_vector(initial[i].after_next())
                    if (newLookahead.contains(symbol::null_symbol())) {
                        newLookahead.remove(symbol::null_symbol())
                        newLookahead.add(initial[i].lookahead)
                    }
                    var alreadyInInSomeForm = false
                    for (var index = 0; index < initial.size; index++;) {
                        if (initial[index].equals_but_lookahead(r)) {
                            alreadyInInSomeForm = true
                            // same item with a smaller lookahead set: merge instead of adding a duplicate
                            if (!initial[index].lookahead.contains(newLookahead)) {
                                initial[index].lookahead += newLookahead
                                continueIt = true
                                return // continue the rules for-each
                            }
                        }
                    }
                    if (!alreadyInInSomeForm) {
                        continueIt = true
                        initial.add(r.with_lookahead(newLookahead))
                    }
                })
            }
        }
        return initial
    }

    // The state reached from I on symbol X: every item of I with the dot
    // directly before X, advanced past it, then closed.
    fun goto(I: ref state, X: ref symbol::symbol): state {
        // loop through i, find all that have thing::= something . X more,
        // add thing ::= something X . more
        var jPrime = vec::vec<rule>()
        I.items.for_each(fun(i: ref rule) {
            if (!i.at_end() && i.next() == X)
                jPrime.add(i.advanced())
        })
        return state(closure(jPrime))
    }

    // Human-readable listing of the rules, non-terminals and terminals.
    fun to_string(): str::str {
        var result = str::str("grammer rules:")
        rules.for_each( fun(i : rule) { result += str::str("\n\t") + i.to_string(); } )
        result += "\nnon_terminals:"
        non_terminals.for_each( fun(i : symbol::symbol) { result += str::str("\n\t") + i.to_string(); } )
        result += "\nterminals:"
        terminals.for_each( fun(i : util::pair<symbol::symbol, regex::regex>) { result += str::str("\n\t") + i.first.to_string() + ": " + i.second.regexString; } )
        return result
    }
}

// Make a rule lhs ::= rhs with the dot at position 0 and an empty lookahead.
fun rule(lhs: symbol::symbol, rhs: vec::vec<symbol::symbol>): rule {
    var toRet.construct(): rule
    toRet.lhs = lhs
    toRet.rhs = rhs
    return toRet
}

// A grammar rule that doubles as a parser item: `position` is the index of
// the dot within rhs and `lookahead` the item's lookahead symbols.
obj rule (Object, Serializable) {
    var lhs: symbol::symbol
    var rhs: vec::vec<symbol::symbol>
    var position: int
    var lookahead: set::set<symbol::symbol>

    fun serialize(): vec::vec<char> {
        return serialize::serialize(lhs) + serialize::serialize(rhs) + serialize::serialize(position) + serialize::serialize(lookahead)
    }
    // Read the members back in the order serialize wrote them and return the
    // position after the last one.
    fun unserialize(it: ref vec::vec<char>, pos: int): int {
        pos = lhs.unserialize(it, pos)
        pos = rhs.unserialize(it, pos)
        util::unpack(position, pos) = serialize::unserialize<int>(it, pos)
        return lookahead.unserialize(it, pos)
    }

    fun construct(): *rule {
        lhs.construct()
        rhs.construct()
        position = 0
        lookahead.construct()
        return this
    }
    fun copy_construct(other: *rule) {
        lhs.copy_construct(&other->lhs)
        rhs.copy_construct(&other->rhs)
        position = other->position
        lookahead.copy_construct(&other->lookahead)
    }
    fun operator=(other: rule) {
        destruct()
        copy_construct(&other)
    }
    fun operator==(other: ref rule):bool {
        return lhs == other.lhs && rhs == other.rhs &&
                position == other.position && lookahead == other.lookahead
    }
    // Equality on everything except the lookahead set.
    fun equals_but_lookahead(other: ref rule):bool {
        return lhs == other.lhs && rhs == other.rhs &&
                position == other.position
    }
    fun destruct() {
        lhs.destruct()
        rhs.destruct()
        lookahead.destruct()
    }

    // The symbol directly after the dot. Only valid when !at_end().
    fun next(): ref symbol::symbol {
        return rhs[position]
    }
    // Everything after the dot, including the next symbol.
    fun after(): vec::vec<symbol::symbol> {
        return rhs.slice(position, -1)
    }
    // Everything after the symbol that follows the dot.
    fun after_next(): vec::vec<symbol::symbol> {
        return rhs.slice(position + 1, -1)
    }
    // True when the dot is past the last right-hand symbol.
    fun at_end(): bool {
        return position >= rhs.size
    }
    // The same rule with the dot at 0 and no lookahead, as stored in grammer.rules.
    fun plain(): rule {
        return rule(lhs, rhs)
    }
    // A copy with the same dot position and the given lookahead.
    fun with_lookahead(newLookahead: set::set<symbol::symbol>): rule {
        var toRet = rule(lhs, rhs)
        toRet.position = position
        toRet.lookahead = newLookahead
        return toRet
    }
    // A copy with the dot moved one symbol to the right.
    fun advanced(): rule {
        var toRet = rule(lhs, rhs)
        toRet.position = position+1
        toRet.lookahead = lookahead
        return toRet
    }

    fun to_string(): str::str {
        var result = lhs.name + " -> "
        for (var i = 0; i < rhs.size; i++;) {
            if (i == position) {
                result += str::str(" . ") + rhs[i].to_string() + ", "
            } else {
                result += rhs[i].to_string() + ", "
            }
        }
        // the dot sits after the last symbol
        if (position == rhs.size)
            result += " . "
        result += "|lookahead {"
        lookahead.for_each(fun(i: symbol::symbol) {
            result += i.to_string()
        })
        result += "}"
        return result
    }
}

// Make a state holding a copy of the given items.
fun state(itemsIn: ref vec::vec<rule>): state {
    var toRet.construct(itemsIn): state
    return toRet
}

// One state of the automaton: a collection of items.
obj state (Object) {
    var items: vec::vec<rule>

    fun construct(): *state {
        items.construct()
        return this
    }
    fun construct(itemsIn: ref vec::vec<rule>): *state {
        items.copy_construct(&itemsIn)
        return this
    }
    fun copy_construct(other: *state) {
        items.copy_construct(&other->items)
    }
    fun operator=(other: state) {
        destruct()
        copy_construct(&other)
    }
    fun destruct() {
        items.destruct()
    }
    fun operator==(other: ref state):bool {
        return items == other.items
    }
    fun to_string(): str::str {
        return str::str("woo a state")
    }
}

adt action_type {
    push,
    reduce,
    // note that these two are not actually currently used
    // accept is the reduce of the goal rule and reject is the
    // absence of actions
    accept,
    reject,
    invalid
}

// Make an action that carries no rule position (stored as -1).
fun action(act: action_type, state_or_rule: int): action {
    return action(act, state_or_rule, -1)
}
// Make an action with every field given explicitly.
fun action(act: action_type, state_or_rule: int, rule_position: int): action {
    var toRet: action
    toRet.act = act
    toRet.state_or_rule = state_or_rule
    toRet.rule_position = rule_position
    return toRet
}

// One entry of the parse table.
obj action {
    var act: action_type
    var state_or_rule: int // sigh: target state for a push, rule number for a reduce
    var rule_position: int // sigh: dot position of the reduced item, -1 when not given

    fun operator==(other: action): bool {
        return act == other.act && state_or_rule == other.state_or_rule && rule_position == other.rule_position
    }
    // Print the action kind followed by state_or_rule and rule_position.
    fun print() {
        match (act) {
            action_type::push()
                io::print("push ")
            action_type::reduce()
                io::print("reduce ")
            action_type::accept()
                io::print("accept ")
            action_type::reject()
                io::print("reject ")
            // get_shift hands back an invalid action when no shift exists
            action_type::invalid()
                io::print("invalid ")
        }
        io::print(state_or_rule)
        io::print(" ")
        io::print(rule_position)
        io::println()
    }
}

obj table (Object, Serializable) {
    // a 2 dimensional table made of a vec and a map that maps from stateno & symbol to a vec of parse actions
    var items: vec::vec<map::map<symbol::symbol, vec::vec<action>>>

    fun construct(): *table {
        items.construct()
        return this
    }
    fun copy_construct(other: *table) {
        items.copy_construct(&other->items)
    }
    fun operator=(other: table) {
        destruct()
        copy_construct(&other)
    }
    fun destruct() {
        items.destruct()
    }
    fun serialize(): vec::vec<char> {
        return serialize::serialize(items)
    }
    fun unserialize(it: ref vec::vec<char>, pos: int): int {
        pos = items.unserialize(it, pos)
        return pos
    }

    // Grow the table with empty rows until row `include_state` exists.
    fun expand_to(include_state: int) {
        while (include_state >= items.size)
            items.addEnd(map::map<symbol::symbol, vec::vec<action>>())
    }
    // we always "clean" the symbol before using it so that having different data doesn't
    // prevent us from finding the symbol in the table
    fun clean_symbol(sym: ref symbol::symbol): symbol::symbol {
        return symbol::symbol(sym.name, sym.terminal)
    }
    // Append `act` to the cell for (from_state, on_symbol), creating the row
    // and the cell as needed. Shared by add_push, add_reduce and add_accept.
    fun add_action(from_state: int, on_symbol: ref symbol::symbol, act: action) {
        expand_to(from_state)
        var cleaned_symbol = clean_symbol(on_symbol)
        if (items[from_state].contains_key(cleaned_symbol)) {
            items[from_state][cleaned_symbol].addEnd(act)
        } else {
            items[from_state].set(cleaned_symbol, vec::vec(act))
        }
    }
    fun add_push(from_state: int, on_symbol: ref symbol::symbol, to_state: int) {
        add_action(from_state, on_symbol, action(action_type::push(), to_state))
    }
    fun add_reduce(from_state: int, on_symbol: ref symbol::symbol, by_rule_no: int, rule_position: int) {
        add_action(from_state, on_symbol, action(action_type::reduce(), by_rule_no, rule_position))
    }
    fun add_accept(from_state: int, on_symbol: ref symbol::symbol) {
        add_action(from_state, on_symbol, action(action_type::accept(), 0))
    }

    // All actions for (state, on_symbol); empty when there are none,
    // including when `state` has no row in the table.
    fun get(state: int, on_symbol: ref symbol::symbol): vec::vec<action> {
        var cleaned_symbol = clean_symbol(on_symbol)
        // rows only exist up to the highest state that had an action added
        if (state >= 0 && state < items.size && items[state].contains_key(cleaned_symbol))
            return items[state][cleaned_symbol]
        return vec::vec<action>()
    }
    // The first push action for (state, on_symbol). When there is none a
    // diagnostic is printed and an invalid action is returned.
    fun get_shift(state: int, on_symbol: ref symbol::symbol): action {
        var actions = get(state, on_symbol)
        for (var i = 0; i < actions.size; i++;)
            if (actions[i].act == action_type::push())
                return actions[i]
        io::println("tried to get a shift when none existed")
        io::print("for state ")
        io::print(state)
        io::print(" and symbol ")
        io::println(on_symbol.to_string())
        return action(action_type::invalid(),-1)
    }
    // Every reduce action for (state, on_symbol).
    fun get_reduces(state: int, on_symbol: ref symbol::symbol): vec::vec<action> {
        return get(state, on_symbol).filter(fun(act: action):bool { return act.act == action_type::reduce(); })
    }
    // Print every action of every state.
    fun print_string() {
        io::print("woo a table of size: ")
        io::println(items.size)
        for (var i = 0; i < items.size; i++;) {
            io::print("for state: ")
            io::println(i)
            items[i].for_each(fun(sym: symbol::symbol, actions: vec::vec<action>) {
                actions.for_each(fun(act: action) {
                    io::print("\ton symbol: ")
                    io::print(sym.to_string())
                    io::print(" do action: ")
                    act.print()
                })
            })
        }
    }
}