kraken/stdlib/grammer.krak

import string
import vector
import set
import map
import symbol
import regex
import io
import util

// Splits the text of a grammar definition into words.
//
// Words are separated by spaces and newlines. A double-quoted section stays
// inside its word verbatim (spaces included); inside quotes a backslash escapes
// the character after it, so \" does not end the quoted section. A '#' outside
// quotes starts a comment that runs through the end of the line; comments are
// echoed with io::print and are never returned as words.
//
// gram_str: the full grammar text
// returns:  the words in order of appearance; no returned word is empty
//
// Every index into gram_str is bounds-checked, so an unterminated comment or
// quoted section, trailing spaces, or a missing final newline are all safe.
fun split_into_words(gram_str: string::string): vector::vector<string::string> {
    var out.construct(): vector::vector<string::string>
    var total = gram_str.length()
    // index of the first character of the word currently being scanned
    var begin = 0
    var i = 0
    while (i < total) {
        if (gram_str[i] == '#') {
            // a word running straight into the '#' still counts as a word
            if (i != begin)
                out.add(gram_str.slice(begin, i))
            var comment_start = i
            while (i < total) {
                if (gram_str[i] == '\n')
                    break
                i++
            }
            // consume the newline that ends the comment, if there is one
            if (i < total)
                i++
            io::print("comment: "); io::print(gram_str.slice(comment_start, i))
            begin = i
        } else if (gram_str[i] == '"') {
            // step over the opening quote, then look for the matching close
            i++
            while (i < total) {
                if (gram_str[i] == '"')
                    break
                // a backslash escapes whatever follows it, so skip both characters
                if (gram_str[i] == '\\')
                    i++
                i++
            }
            // step over the closing quote (absent when the section is unterminated)
            if (i < total)
                i++
        } else if (gram_str[i] == ' ') {
            // leading or repeated spaces must not produce empty words
            if (i != begin)
                out.add(gram_str.slice(begin, i))
            while (i < total) {
                if (gram_str[i] != ' ')
                    break
                i++
            }
            begin = i
        } else if (gram_str[i] == '\n') {
            if (i != begin)
                out.add(gram_str.slice(begin, i))
            i++
            begin = i
        } else {
            i++
        }
    }
    // the last word has no trailing separator when the text lacks a final newline
    if (begin < total)
        out.add(gram_str.slice(begin, total))
    return out
}

// Builds a grammer from the text of a grammar definition.
//
// The text is split with split_into_words and consumed one word at a time:
//   - the first word of a production is its left-hand side non-terminal
//   - "=" is skipped
//   - "|" finishes the current alternative and starts another for the same
//     left-hand side
//   - ";" finishes the current alternative and the production
//   - a word starting with '"' is a terminal: either plain, "hia*", or
//     decorated with a name, "hia*":hi_with_as
//   - any other word is a non-terminal
// Every word is echoed with io::print / io::println as it is processed.
fun load_grammer(gram_str: string::string): grammer {
    var gram.construct(): grammer
    var current_lhs = symbol::symbol("", false)
    var current_rhs = vector::vector<symbol::symbol>()
    // true while the next ordinary word names the left-hand side of a production
    var expecting_lhs = true
    split_into_words(gram_str).for_each(fun(word: string::string) {
        io::print("word: "); io::println(word)
        if (word == "=") {
            // separator between the left-hand side and its alternatives
        } else if (word == "|") {
            gram.rules.add(rule(current_lhs, current_rhs))
            current_rhs = vector::vector<symbol::symbol>()
        } else if (word == ";") {
            gram.rules.add(rule(current_lhs, current_rhs))
            current_rhs = vector::vector<symbol::symbol>()
            expecting_lhs = true
        } else if (expecting_lhs) {
            current_lhs = symbol::symbol(word, false)
            gram.non_terminals.add(current_lhs)
            expecting_lhs = false
        } else if (word[0] == '"') {
            // find the closing quote: anything after it is the ":name" decoration
            var last_quote = word.length()-1
            while (word[last_quote] != '"') last_quote--
            // a plain terminal is named by the whole word, quotes included;
            // a decorated one by the text after the ':' that follows the quote
            var term_name = word
            if (last_quote != word.length()-1)
                term_name = word.slice(last_quote+2, -1)
            var term = symbol::symbol(term_name, true)
            current_rhs.add(term)
            // the regex is the text between the quotes
            gram.terminals.add(util::make_pair(term, regex::regex(word.slice(1,last_quote))))
            expecting_lhs = false
        } else {
            var non_term = symbol::symbol(word, false)
            current_rhs.add(non_term)
            gram.non_terminals.add(non_term)
            expecting_lhs = false
        }
    })
    return gram
}

// A grammar: its production rules, the symbols those rules mention, and the
// FIRST sets filled in by calculate_first_set.
obj grammer (Object) {
    // the productions of the grammar
    var rules: vector::vector<rule>
    // every non-terminal symbol registered with the grammar
    var non_terminals: set::set<symbol::symbol>
    // each terminal symbol paired with the regex that matches it
    var terminals: vector::vector<util::pair<symbol::symbol, regex::regex>>
    // symbol -> FIRST set; empty until calculate_first_set is called
    var first_set_map: map::map<symbol::symbol, set::set<symbol::symbol>>

    // Constructs every member as empty and returns this, as the declared
    // *grammer return type requires.
    fun construct(): *grammer {
        rules.construct()
        non_terminals.construct()
        terminals.construct()
        first_set_map.construct()
        return this
    }
    // Initializes this grammer as a member-wise copy of old.
    fun copy_construct(old: *grammer) {
        rules.copy_construct(&old->rules)
        non_terminals.copy_construct(&old->non_terminals)
        terminals.copy_construct(&old->terminals)
        first_set_map.copy_construct(&old->first_set_map)
    }
    // Replaces the current contents with a copy of other.
    fun operator=(other: grammer) {
        destruct()
        copy_construct(&other)
    }
    // Destructs every member.
    fun destruct() {
        rules.destruct()
        non_terminals.destruct()
        terminals.destruct()
        first_set_map.destruct()
    }

    // Fills first_set_map with the FIRST set of every terminal and
    // non-terminal by iterating over the rules until nothing changes.
    fun calculate_first_set() {
        // the first set of a terminal is itself
        terminals.for_each( fun(terminal: util::pair<symbol::symbol, regex::regex>)
            first_set_map[terminal.first] = set::set(terminal.first)
        )
        // start out the non-terminals as empty sets
        non_terminals.for_each( fun(non_terminal: symbol::symbol)
            first_set_map[non_terminal] = set::set<symbol::symbol>()
        )
        // FIRST of a right-hand side, using the sets computed so far
        var first_helper = fun(rhs: vector::vector<symbol::symbol>): set::set<symbol::symbol> {
            var toRet = set::set<symbol::symbol>()
            if (rhs.size) {
                for (var i = 0; i < rhs.size; i++;) {
                    var lookahead = first_set_map[rhs[i]]
                    var nullable = lookahead.contains(symbol::null_symbol())
                    // a null only survives when it comes from the last symbol,
                    // i.e. when the whole right-hand side can be null
                    if (nullable) {
                        if (i != rhs.size-1)
                            lookahead.remove(symbol::null_symbol())
                    }
                    toRet.add(lookahead)
                    // later symbols only contribute while everything before
                    // them can be null
                    if (!nullable)
                        break
                }
            } else {
                // an empty right-hand side derives only null
                toRet.add(symbol::null_symbol())
            }
            return toRet
        }
        // keep folding each rule's lookahead into its left-hand side's set
        // until a full pass over the rules adds nothing new
        var changed = true
        while (changed) {
            changed = false
            rules.for_each( fun(r: rule) {
                var rule_lookahead = first_helper(r.rhs)
                if (!first_set_map[r.lhs].contains(rule_lookahead))
                    changed = true
                first_set_map[r.lhs].add(rule_lookahead)
            })
        }
    }

    // Returns a multi-line listing of the rules, the non-terminals, and the
    // terminals together with their regex strings.
    fun to_string(): string::string {
        var result = string::string("grammer rules:")
        rules.for_each( fun(i : rule) { result += string::string("\n\t") + i.to_string(); } )
        result += "\nnon_terminals:"
        non_terminals.for_each( fun(i : symbol::symbol) { result += string::string("\n\t") + i.to_string(); } )
        result += "\nterminals:"
        terminals.for_each( fun(i : util::pair<symbol::symbol, regex::regex>) { result += string::string("\n\t") + i.first.to_string() + ": " + i.second.regexString; } )
        return result
    }
}

// Factory for a rule: constructs an empty rule, then fills in its two sides.
//
// lhs: the symbol the rule produces
// rhs: the symbols it expands to, in order
fun rule(lhs: symbol::symbol, rhs: vector::vector<symbol::symbol>): rule {
    var made.construct(): rule
    made.rhs = rhs
    made.lhs = lhs
    return made
}

// A single production: one left-hand side symbol and the sequence of symbols
// it expands to.
obj rule (Object) {
    // the symbol this rule produces
    var lhs: symbol::symbol
    // the symbols the left-hand side expands to, in order
    var rhs: vector::vector<symbol::symbol>
    // starts at 0; presumably an index into rhs used by a parser - nothing in
    // this file reads it, so confirm against the callers
    var position: int
    // starts empty; presumably the lookahead symbols attached to this rule by a
    // parser - nothing in this file fills it, so confirm against the callers
    var lookahead: set::set<symbol::symbol>

    // Constructs an empty rule with position 0 and returns this, as the
    // declared *rule return type requires.
    fun construct(): *rule {
        lhs.construct()
        rhs.construct()
        position = 0
        lookahead.construct()
        return this
    }
    // Initializes this rule as a member-wise copy of other.
    fun copy_construct(other: *rule) {
        lhs.copy_construct(&other->lhs)
        rhs.copy_construct(&other->rhs)
        position = other->position
        lookahead.copy_construct(&other->lookahead)
    }
    // Replaces the current contents with a copy of other.
    fun operator=(other: rule) {
        destruct()
        copy_construct(&other)
    }
    // Destructs the members that own resources; position is a plain int.
    fun destruct() {
        lhs.destruct()
        rhs.destruct()
        lookahead.destruct()
    }

    // Renders the rule as "<lhs name> -> " followed by each right-hand side
    // symbol's to_string(), each followed by ", " (so the result ends with a
    // trailing ", " whenever rhs is not empty).
    fun to_string(): string::string {
        var result = lhs.name + " -> "
        rhs.for_each( fun(i : symbol::symbol) { result += i.to_string() + ", "; } )
        return result
    }
}