Files
kraken/stdlib/grammer.krak

421 lines
15 KiB
Plaintext
Raw Normal View History

import string
import vector
import set
import stack
2015-07-13 12:16:30 -04:00
import map
import symbol
import regex
import io
import util
2015-07-06 12:49:29 -04:00
// Breaks grammar text into the words that load_grammer consumes.
//
// Words are separated by runs of spaces or by newlines. A double-quoted
// section stays inside one word even when it contains spaces; a '"' preceded
// by an odd number of backslashes is escaped and does not end the section.
// A '#' starts a comment that runs through the end of its line; the text from
// the current word start through the end of the comment is printed and is not
// returned as a word.
//
// gram_str: the grammar text to split
// returns: the words, in the order they appear in gram_str
fun split_into_words(gram_str: string::string): vector::vector<string::string> {
    var out.construct(): vector::vector<string::string>
    var gram_len = gram_str.length()
    var begin = 0
    for (var i = 0; i < gram_len; i++;) {
        // loop (rather than a single if) so that several comment lines in a
        // row are all skipped; every scan is bounded by gram_len so a comment
        // on the last line without a newline cannot run off the end
        while (i < gram_len && gram_str[i] == '#') {
            while (i < gram_len && gram_str[i] != '\n') i++
            // step over the newline itself when there is one
            if (i < gram_len)
                i++
            io::print("comment: ")
            io::print(gram_str.slice(begin, i))
            begin = i
        }
        if (i >= gram_len)
            break
        if (gram_str[i] == '"') {
            i++
            while (i < gram_len && gram_str[i] != '"') {
                i++
                // if we hit a " we check to see if an odd number of backslashes precede it
                // (meaning that the " is escaped), and if so, we move on. Otherwise, we found
                // the end of the quoted string
                if (i < gram_len && gram_str[i] == '"') {
                    var escaped = 0
                    while (i-(1+escaped) >= 0 && gram_str[i-(1+escaped)] == '\\') escaped++
                    if (escaped % 2)
                        i++
                }
            }
            // unterminated quote: leave the rest for the final flush below
            if (i >= gram_len)
                break
        }
        if (gram_str[i] == ' ') {
            // same guard as the newline case: a space right at a word start
            // (leading indentation, space after a comment) yields no word
            if (i != begin)
                out.add(gram_str.slice(begin, i))
            // allow multiple spaces between words
            while (i < gram_len && gram_str[i] == ' ') i++
            begin = i
            // the for loop's i++ brings us back to begin
            i--
        }
        if (gram_str[i] == '\n') {
            if (i != begin)
                out.add(gram_str.slice(begin, i))
            begin = i + 1
        }
    }
    // flush the last word when the text does not end with a newline
    if (begin < gram_len)
        out.add(gram_str.slice(begin, gram_len))
    return out
}
// Builds a grammer from grammar text.
//
// The text is split with split_into_words and read one word at a time:
//   - the first word of a rule is its left-hand side (a non-terminal)
//   - "=" is skipped
//   - "|" ends the current alternative and starts another for the same left side
//   - ";" ends the current alternative and the rule; the next word is a new left side
//   - a word starting with '"' is a terminal, either plain ("hia*") or
//     decorated with a name ("hia*":hi_with_as)
//   - any other word is a non-terminal
// Each word is also printed, prefixed with "word: ".
//
// gram_str: the grammar text
// returns: a grammer with rules, non_terminals and terminals filled in
fun load_grammer(gram_str: string::string): grammer {
    var gram.construct(): grammer
    var leftSide = symbol::symbol("", false)
    var doLeftSide = true
    var rightSide = vector::vector<symbol::symbol>()
    split_into_words(gram_str).for_each(fun(word: string::string) {
        // an empty word names no symbol, and word[0] below would read past its end
        if (word.length() == 0) {
            return
        }
        io::print("word: "); io::println(word)
        if (word == "=") {
            // do nothing
        } else if (word == "|") {
            gram.rules.add(rule(leftSide, rightSide))
            rightSide = vector::vector<symbol::symbol>()
        } else if (word == ";") {
            gram.rules.add(rule(leftSide, rightSide))
            rightSide = vector::vector<symbol::symbol>()
            doLeftSide = true
        } else {
            if (doLeftSide) {
                leftSide = symbol::symbol(word, false)
                gram.non_terminals.add(leftSide)
            } else {
                if (word[0] == '"') {
                    // ok, we support both plain terminals "hia*"
                    // and decorated terminals "hia*":hi_with_as
                    // so first find the ending " and see if it's
                    // the end of the word
                    var last_quote = word.length()-1
                    while(word[last_quote] != '"') last_quote--
                    // a plain terminal is named by the whole word; a decorated
                    // one by the text after the ':' that follows the closing "
                    var term_name = word
                    if (last_quote != word.length()-1) {
                        term_name = word.slice(last_quote+2, -1)
                    }
                    // build the symbol once and use it for both the rule and the terminal list
                    var terminal = symbol::symbol(term_name, true)
                    rightSide.add(terminal)
                    // the regex is the text between the quotes
                    gram.terminals.add(util::make_pair(terminal, regex::regex(word.slice(1,last_quote))))
                } else {
                    var non_term = symbol::symbol(word, false)
                    rightSide.add(non_term)
                    gram.non_terminals.add(non_term)
                }
            }
            doLeftSide = false
        }
    })
    return gram
}
// A grammar: its rules, the symbols they use, and the data derived from them
// (the first sets and the starting state of the item-set automaton).
obj grammer (Object) {
    // the productions; rules[0] is the one calculate_state_automaton starts from
    var rules: vector::vector<rule>
    var non_terminals: set::set<symbol::symbol>
    // each terminal symbol paired with the regex that matches it
    var terminals: vector::vector<util::pair<symbol::symbol, regex::regex>>
    // filled by calculate_first_set, read by first_vector
    var first_set_map: map::map<symbol::symbol, set::set<symbol::symbol>>
    // set by calculate_state_automaton to the closure of the starting item
    var state_automata: state

    // Constructs every member empty.
    // returns: this object, matching the declared *grammer return type
    fun construct(): *grammer {
        rules.construct()
        non_terminals.construct()
        terminals.construct()
        first_set_map.construct()
        state_automata.construct()
        return this
    }

    // Copy-constructs every member from old.
    fun copy_construct(old: *grammer) {
        rules.copy_construct(&old->rules)
        non_terminals.copy_construct(&old->non_terminals)
        terminals.copy_construct(&old->terminals)
        first_set_map.copy_construct(&old->first_set_map)
        state_automata.copy_construct(&old->state_automata)
    }

    // Assignment: destroys the current contents, then copies other's.
    fun operator=(other: grammer) {
        destruct()
        copy_construct(&other)
    }

    // Destructs every member.
    fun destruct() {
        rules.destruct()
        non_terminals.destruct()
        terminals.destruct()
        first_set_map.destruct()
        state_automata.destruct()
    }

    // Fills first_set_map for every terminal and non-terminal.
    // Each rule's first_vector(rhs) is added to the entry of its lhs, and the
    // pass over the rules repeats until one adds nothing new.
    fun calculate_first_set() {
        // the first set of a terminal is itself
        terminals.for_each( fun(terminal: util::pair<symbol::symbol, regex::regex>)
            first_set_map[terminal.first] = set::set(terminal.first)
        )
        // start out the non-terminals as empty sets
        non_terminals.for_each( fun(non_terminal: symbol::symbol)
            first_set_map[non_terminal] = set::set<symbol::symbol>()
        )
        var changed = true
        while (changed) {
            changed = false
            rules.for_each( fun(r: rule) {
                var rule_lookahead = first_vector(r.rhs)
                // once a change has been seen this pass, keep it recorded
                if (!changed) {
                    changed = !first_set_map[r.lhs].contains(rule_lookahead)
                }
                first_set_map[r.lhs].add(rule_lookahead)
            })
        }
    }

    // Computes the first set of a sequence of symbols from first_set_map.
    //
    // Walks rhs left to right adding each symbol's set, and stops after the
    // first symbol whose set does not contain null_symbol. null_symbol is only
    // kept when it comes from the last symbol. An empty rhs gives a set
    // holding just null_symbol.
    //
    // rhs: the symbols to look at
    // returns: the set of symbols collected
    fun first_vector(rhs: vector::vector<symbol::symbol>): set::set<symbol::symbol> {
        var toRet = set::set<symbol::symbol>()
        if (rhs.size) {
            for (var i = 0; i < rhs.size; i++;) {
                // a copy, so removing the null below leaves the map's set alone
                var lookahead = first_set_map[rhs[i]]
                if (lookahead.contains(symbol::null_symbol())) {
                    // remove the null if this is not the last in the rule
                    if (i != rhs.size-1)
                        lookahead.remove(symbol::null_symbol())
                    toRet.add(lookahead)
                } else {
                    toRet.add(lookahead)
                    break
                }
            }
        } else {
            toRet.add(symbol::null_symbol())
        }
        return toRet
    }

    // Builds the item-set states reachable from rules[0] and prints them.
    //
    // state_automata becomes the closure of rules[0] with eof_symbol as its
    // lookahead. Every further state found through goto is collected in a
    // local vector that is printed at the end and not stored on the object.
    fun calculate_state_automaton() {
        // rules[0] below needs at least one rule to exist
        if (rules.size == 0) {
            io::println("calculate_state_automaton: grammer has no rules")
            return
        }
        state_automata.items = vector::vector(rules[0].with_lookahead(set::set(symbol::eof_symbol())))
        io::println("pre first closure")
        state_automata = closure(state_automata)
        io::println("post first closure")
        var states = vector::vector(state_automata) // vector instead of set because we need to iterate by index
        var newItems = stack::stack(0) // 0 is the index of the first and only item in states
        var count = 0
        while (newItems.size()) {
            // progress report every 200 states processed
            if (count%200 == 0) {
                io::print("calculate_state_automaton while")
                io::println(count)
            }
            count++
            var I = newItems.pop()
            // every symbol that sits right after the position in some item of state I
            var possGoto = set::set<symbol::symbol>()
            states[I].items.for_each(fun(r: ref rule) {
                if (!r.at_end())
                    possGoto.add(r.next())
            })
            possGoto.for_each(fun(X: ref symbol::symbol) {
                var goneState = goto(states[I], X)
                // only keep non-empty states we have not seen yet
                if (goneState.items.size && !states.contains(goneState)) {
                    newItems.push(states.size)
                    states.add(goneState)
                }
            })
        }
        io::println("ALL STATES:\n")
        states.for_each(fun(i: ref state) {
            io::println("STATE:\n")
            i.items.for_each(fun(r: ref rule) {
                io::println(string::string("\t") + r.to_string())
            })
        })
        io::println(" there were : states")
        io::println(states.size)
    }

    // Replaces the items of initial with their closure.
    // initial is taken by ref, so the caller's state is changed as well as returned.
    fun closure(initial: ref state): state {
        initial.items = closure(initial.items)
        return initial
    }

    // Extends initial until it is closed under the grammar's rules.
    //
    // For every item whose next symbol is the lhs of some rule, that rule is
    // added with a lookahead computed from what follows the next symbol; when
    // an item equal in everything but lookahead is already present, the new
    // lookahead is merged into it instead. Repeats until a full pass changes
    // nothing. initial is taken by ref and is modified in place.
    //
    // returns: the closed vector of items
    fun closure(initial: ref vector::vector<rule>): vector::vector<rule> {
        var continueIt = true
        while (continueIt) {
            continueIt = false
            for (var i = 0; i < initial.size; i++;) {
                if (initial[i].at_end()) {
                    continue
                }
                rules.for_each(fun(r: ref rule) {
                    // if i is |a::=c . Bb, a|, we're doing each B::=... in rules
                    if (r.lhs != initial[i].next())
                        return // continue the for-each
                    // add r with lookahead
                    var newLookahead = first_vector(initial[i].after_next())
                    // when the rest of the item can be empty, the item's own
                    // lookahead can follow too
                    if (newLookahead.contains(symbol::null_symbol())) {
                        newLookahead.remove(symbol::null_symbol())
                        newLookahead.add(initial[i].lookahead)
                    }
                    var alreadyInInSomeForm = false
                    for (var index = 0; index < initial.size; index++;) {
                        if (initial[index].equals_but_lookahead(r)) {
                            alreadyInInSomeForm = true
                            if (!initial[index].lookahead.contains(newLookahead)) {
                                // same item, new lookahead symbols: merge them in
                                initial[index].lookahead += newLookahead
                                continueIt = true
                                return // continue the rules for-each
                            }
                        }
                    }
                    if (!alreadyInInSomeForm) {
                        continueIt = true
                        initial.add(r.with_lookahead(newLookahead))
                    }
                })
            }
        }
        return initial
    }

    // Computes the state reached from I by moving past symbol X.
    //
    // returns: the closure of every item of I whose next symbol is X, advanced by one
    fun goto(I: ref state, X: ref symbol::symbol): state {
        // loop through i, find all that have thing::= something . X more,
        // add thing ::= something X . more
        var jPrime = vector::vector<rule>()
        I.items.for_each(fun(i: ref rule) {
            if (!i.at_end() && i.next() == X)
                jPrime.add(i.advanced())
        })
        return state(closure(jPrime))
    }

    // Renders the rules, non-terminals, terminals (with their regex strings)
    // and state_automata as text.
    fun to_string(): string::string {
        var result = string::string("grammer rules:")
        rules.for_each( fun(i : rule) { result += string::string("\n\t") + i.to_string(); } )
        result += "\nnon_terminals:"
        non_terminals.for_each( fun(i : symbol::symbol) { result += string::string("\n\t") + i.to_string(); } )
        result += "\nterminals:"
        terminals.for_each( fun(i : util::pair<symbol::symbol, regex::regex>) { result += string::string("\n\t") + i.first.to_string() + ": " + i.second.regexString; } )
        result += "\nstate:"
        result += state_automata.to_string()
        return result
    }
}
// Makes a rule for lhs -> rhs.
// The rule is constructed first, so everything other than lhs and rhs keeps
// whatever rule's construct() gives it.
//
// lhs: the symbol on the left of the production
// rhs: the symbols on the right, in order
// returns: the new rule
fun rule(lhs: symbol::symbol, rhs: vector::vector<symbol::symbol>): rule {
    var production.construct(): rule
    production.rhs = rhs
    production.lhs = lhs
    return production
}
// One production together with a position inside its right-hand side and a
// set of lookahead symbols.
obj rule (Object) {
    var lhs: symbol::symbol
    var rhs: vector::vector<symbol::symbol>
    // index into rhs of the next symbol; rhs.size or more means at the end
    var position: int
    var lookahead: set::set<symbol::symbol>

    // Constructs an empty rule with position 0.
    // returns: this object, matching the declared *rule return type
    fun construct(): *rule {
        lhs.construct()
        rhs.construct()
        position = 0
        lookahead.construct()
        return this
    }

    // Copy-constructs every member from other.
    fun copy_construct(other: *rule) {
        lhs.copy_construct(&other->lhs)
        rhs.copy_construct(&other->rhs)
        position = other->position
        lookahead.copy_construct(&other->lookahead)
    }

    // Assignment: destroys the current contents, then copies other's.
    fun operator=(other: rule) {
        destruct()
        copy_construct(&other)
    }

    // Full equality: lhs, rhs, position and lookahead all match.
    fun operator==(other: ref rule):bool {
        return lhs == other.lhs && rhs == other.rhs &&
                position == other.position && lookahead == other.lookahead
    }

    // Equality that ignores lookahead: lhs, rhs and position match.
    fun equals_but_lookahead(other: ref rule):bool {
        return lhs == other.lhs && rhs == other.rhs &&
                position == other.position
    }

    // Destructs the members that have destructors (position is a plain int).
    fun destruct() {
        lhs.destruct()
        rhs.destruct()
        lookahead.destruct()
    }

    // The symbol at the current position. Does no bounds check itself;
    // the callers in this file test at_end() first.
    fun next(): ref symbol::symbol {
        return rhs[position]
    }

    // The symbols that come after the one at the current position.
    fun after_next(): vector::vector<symbol::symbol> {
        return rhs.slice(position + 1, -1)
    }

    // True when the position is past the last symbol of rhs.
    fun at_end(): bool {
        return position >= rhs.size
    }

    // A copy of this rule (same lhs, rhs and position) with newLookahead as its lookahead.
    fun with_lookahead(newLookahead: set::set<symbol::symbol>): rule {
        var toRet = rule(lhs, rhs)
        toRet.position = position
        toRet.lookahead = newLookahead
        return toRet
    }

    // A copy of this rule with the position moved one symbol to the right.
    fun advanced(): rule {
        var toRet = rule(lhs, rhs)
        toRet.position = position+1
        toRet.lookahead = lookahead
        return toRet
    }

    // Renders the rule as "lhs -> a, b, ", with " . " marking the position,
    // followed by the lookahead symbols inside "|lookahead {...}".
    fun to_string(): string::string {
        var result = lhs.name + " -> "
        for (var i = 0; i < rhs.size; i++;)
            if (i == position)
                result += string::string(" . ") + rhs[i].to_string() + ", ";
            else
                result += rhs[i].to_string() + ", ";
        // the loop never prints the marker when the position is at the end
        if (position == rhs.size)
            result += " . "
        result += "|lookahead {"
        lookahead.for_each(fun(i: symbol::symbol) {
            result += i.to_string()
        })
        result += "}"
        return result
    }
}
2015-08-04 01:57:53 -04:00
// Makes a state from itemsIn by handing it to state's construct(itemsIn).
//
// itemsIn: the items the state should hold
// returns: the new state
fun state(itemsIn: ref vector::vector<rule>): state {
    var made.construct(itemsIn): state
    return made
}
2015-08-03 14:38:17 -04:00
// A state of the automaton: a vector of rule items.
obj state (Object) {
    var items: vector::vector<rule>

    // Constructs a state with no items.
    // returns: this object, matching the declared *state return type
    fun construct(): *state {
        items.construct()
        return this
    }

    // Constructs a state holding a copy of itemsIn.
    // returns: this object, matching the declared *state return type
    fun construct(itemsIn: ref vector::vector<rule>): *state {
        items.copy_construct(&itemsIn)
        return this
    }

    // Copy-constructs items from other.
    fun copy_construct(other: *state) {
        items.copy_construct(&other->items)
    }

    // Assignment: destroys the current items, then copies other's.
    fun operator=(other: state) {
        destruct()
        copy_construct(&other)
    }

    // Destructs items.
    fun destruct() {
        items.destruct()
    }

    // Two states are equal when their item vectors compare equal.
    fun operator==(other: ref state):bool {
        return items == other.items
    }

    // Placeholder text; the items are not included.
    fun to_string(): string::string {
        return string::string("woo a state")
    }
}