kraken/stdlib/regex.krak

import io
import string
import util
import vector
import string
import mem
import set
import util
import serialize

// Convenience overload: wraps the raw *char pattern in a string::string and
// hands it to the string::string overload of regex().
fun regex(in: *char):regex {
    var pattern = string::string(in)
    return regex(pattern)
}
// Builds a regex object from the given pattern string (via regex::construct)
// and returns it by value.
fun regex(in: string::string):regex {
    var compiled.construct(in):regex
    return compiled
}

// One node of the regex state graph. A state accepts every character in the
// inclusive range [characterBegin, characterEnd] and records which states may
// follow it.
obj regexState (Object) {
    // lower and upper bound of the accepted range; equal for a single character
    var characterBegin: char
    var characterEnd: char
    // states reachable after this one
    var next_states: set::set<*regexState>

    // Single-character state: the range collapses to charIn..charIn.
    fun construct(charIn:char): *regexState {
        return construct(charIn, charIn)
    }
    // Range state accepting charFirst..charSecond, with no successors yet.
    fun construct(charFirst:char, charSecond:char): *regexState {
        next_states.construct()
        characterBegin = charFirst
        characterEnd = charSecond
        return this
    }
    // Default state: accepts only the character with value 0.
    fun construct(): *regexState {
        return construct((0) cast char, (0) cast char)
    }
    // Copies the range and the successor set from old. The successor set holds
    // pointers, so the successor states themselves are shared, not cloned.
    fun copy_construct(old:*regexState): void {
        next_states.copy_construct(&old->next_states)
        characterBegin = old->characterBegin
        characterEnd = old->characterEnd
    }
    // Releases the successor set (not the states it points to).
    fun destruct():void {
        next_states.destruct()
    }
    // Returns the successors whose range contains input.
    fun match_char(input: char): set::set<*regexState> {
        return next_states.filter(fun(candidate:*regexState):bool { return input >= candidate->characterBegin && candidate->characterEnd >= input; })
    }
    // True when some successor is an end marker. regex::construct builds the
    // end marker as a state for the character with value 1, which is what is
    // tested for here.
    fun is_end():bool {
        return next_states.any_true(fun(successor: *regexState):bool { return successor->characterBegin == 1; })
    }
}

// A regular expression. Patterns without any special character are kept as
// plain strings and matched by direct comparison; every other pattern is
// compiled into a graph of regexState nodes that is shared between copies
// through a reference counter.
//
// Supported syntax (as handled by compile): concatenation, '*', '+', '?',
// '|', grouping with '(' ')', ranges written as "[a-b]" and '\' escapes.
obj regex (Object, Serializable) {
    // the pattern text this regex was built from
    var regexString: string::string
    // entry state of the compiled graph; only assigned when !is_straight_string
    var begin: *regexState
    // number of regex objects sharing the graph; only assigned when !is_straight_string
    var referenceCounter: *int
    // true when regexString holds none of \ ( ) [ * + ? | and is matched literally
    var is_straight_string: bool

    // Constructs an empty regex.
    fun construct(): *regex {
        regexString.construct()
        // An empty pattern is a literal, exactly what construct(string) would
        // decide for "". Setting the flag here keeps destruct(),
        // copy_construct() and long_match() from branching on an uninitialised
        // value and touching the unset begin / referenceCounter pointers.
        is_straight_string = true
        return this
    }
    // Constructs a regex from regexStringIn. A pattern with no special
    // character is kept as a plain string; anything else is compiled into a
    // state graph with a fresh reference count of 1.
    fun construct(regexStringIn: string::string): *regex {
        regexString.copy_construct(&regexStringIn)
        is_straight_string = true
        for (var i = 0; i < regexString.length(); i++;) {
            // simple implementation doesn't count escaped characters as straight string
            if (regexString[i] == '\\' || regexString[i] == '(' || regexString[i] == ')' || regexString[i] == '[' || regexString[i] == '*' || regexString[i] == '+' || regexString[i] == '?' || regexString[i] == '|') {
                is_straight_string = false
                break
            }
        }
        if (!is_straight_string) {
            referenceCounter = mem::new<int>()
            *referenceCounter = 1
            var beginningAndEnd = compile(regexStringIn)
            // init our begin, and the end state as the next state of each end
            begin = beginningAndEnd.first
            // the end marker is a state for character value 1; regexState::is_end looks for it
            var end = mem::new<regexState>()->construct((1) cast char)
            beginningAndEnd.second.for_each(fun(it: *regexState): void { it->next_states.add(end); })
        }
        return this
    }

    // Copies old. A compiled graph is shared rather than cloned: the new
    // object points at the same begin state and bumps the shared counter.
    fun copy_construct(old:*regex):void {
        regexString.copy_construct(&old->regexString)
        is_straight_string = old->is_straight_string
        if (!is_straight_string) {
            begin = old->begin
            referenceCounter = old->referenceCounter
            *referenceCounter += 1
        }
    }

    // Releases the pattern string and drops one reference to the compiled
    // graph; the last owner deletes the graph and the counter.
    fun destruct():void {
        regexString.destruct()
        if (!is_straight_string) {
            *referenceCounter -= 1
            if (*referenceCounter == 0) {
                // walks the graph through next_states, starting at begin
                util::safe_recursive_delete(begin, fun(it: *regexState): set::set<*regexState> { return it->next_states; } )
                mem::delete(referenceCounter)
            }
        }
    }
    // Only the pattern text is serialized; the graph is rebuilt on unserialize.
    fun serialize(): vector::vector<char> {
        return serialize::serialize(regexString)
    }
    // Reads a pattern string from it starting at pos, constructs this regex
    // from it and returns the position after the consumed data.
    // NOTE(review): this calls construct() on this object, so it presumably
    // expects an object that has not been constructed yet - confirm with callers.
    fun unserialize(it: ref vector::vector<char>, pos: int): int {
        var temp = string::string()
        util::unpack(temp, pos) = serialize::unserialize<string::string>(it, pos)
        construct(temp)
        return pos
    }

    // Two regexes are equal when their pattern text is equal.
    fun operator==(other: regex):bool {
        return regexString == other.regexString
    }

    // Assignment: release the current contents, then copy from other.
    fun operator=(other: regex):void {
        destruct()
        copy_construct(&other)
    }

    // Compiles regex_string into a state graph.
    // Returns a pair of
    //   first  - a freshly allocated placeholder start state whose next_states
    //            are the real first states of the pattern
    //   second - the set of states the pattern can currently end on
    // The caller is responsible for attaching an end marker to the states in
    // second (see construct). Groups are compiled by recursive calls.
    // NOTE(review): the placeholder start state of a nested group is never
    // linked into the outer graph by the code below, so it looks like it is
    // never freed. It cannot simply be deleted here, because it may still be a
    // member of the returned end set (e.g. for "()" or "(a?)") - confirm.
    fun compile(regex_string: string::string): util::pair<*regexState, set::set<*regexState>> {
        /*io::println(regex_string)*/
        var first = mem::new<regexState>()->construct()
        // begin/end sets of the item before the current one, and of the current item
        var previous_begin = set::set<*regexState>()
        var previous_end = set::set<*regexState>()
        var current_begin = set::set(first)
        var current_end = set::set(first)
        // set by '|': the next item is an alternative to the current one
        var alternating = false
        // set by '\': the next character is taken literally
        var escapeing = false

        for (var i = 0; i < regex_string.length(); i++;) {
            if (regex_string[i] == '*' && !escapeing) {
                // zero or more: loop the item back onto itself...
                current_end.for_each(fun(item: *regexState) item->next_states.add_all(current_begin);)
                // ...and merge in the previous item's sets so it can be skipped
                current_begin.add_all(previous_begin)
                current_end.add_all(previous_end)

            } else if (regex_string[i] == '+' && !escapeing) {
                // one or more: only the loop back
                current_end.for_each(fun(item: *regexState) item->next_states.add_all(current_begin);)

            } else if (regex_string[i] == '?' && !escapeing) {
                // zero or one: only the skip
                current_begin.add_all(previous_begin)
                current_end.add_all(previous_end)

            } else if (regex_string[i] == '|' && !escapeing) {
                alternating = true

            } else if (regex_string[i] == '(' && !escapeing) {
                // note that we don't have a ')' case, as we skip past it with our indices
                var perenEnd = i + 1
                // scan forward to one past the matching close paren
                for (var depth = 1; depth > 0; perenEnd++;) {
                    if (perenEnd >= regex_string.length())
                        util::error(string::string("can't find matching peren in: ") + regex_string)
                    // be careful, this isn't quite right yet
                    /*var not_non_special = perenEnd == 0 || (regex_string[perenEnd-1] != '\\' && regex_string[perenEnd-1] != '[' && (perenEnd+1 >= regex_string.length() || regex_string[perenEnd+1] != ']'))*/
                    // a paren directly after '[' or directly before ']' is not counted
                    var not_non_special = perenEnd == 0 || (regex_string[perenEnd-1] != '[' && (perenEnd+1 >= regex_string.length() || regex_string[perenEnd+1] != ']'))
                    if (regex_string[perenEnd] == '(' && not_non_special)
                        depth++
                    else if (regex_string[perenEnd] == ')' && not_non_special)
                        depth--
                }
                var innerBeginEnd = compile(regex_string.slice(i+1, perenEnd-1))
                // NOTE: perenEnd is one past the close paren
                i = perenEnd-1

                if (alternating) {
                    // alternative: hang the group off the previous item and widen the current sets
                    previous_end.for_each(fun(it: *regexState):void { it->next_states.add_all(innerBeginEnd.first->next_states); } )
                    current_begin.add_all(innerBeginEnd.first->next_states)
                    current_end.add_all(innerBeginEnd.second)
                } else {
                    // sequence: the group follows the current item and becomes the new current item
                    current_end.for_each(fun(it: *regexState):void { it->next_states.add_all(innerBeginEnd.first->next_states); } )
                    previous_begin = current_begin
                    previous_end = current_end
                    current_begin = innerBeginEnd.first->next_states
                    current_end = innerBeginEnd.second
                }
                alternating = false

            } else if (regex_string[i] == '\\' && !escapeing) {
                escapeing = true

            } else {
                // a literal character (possibly escaped) or a "[a-b]" range
                var next: *regexState
                if (regex_string[i] == '[' && !escapeing) {
                    // regex_string[i+1] and regex_string[i+3] are read below, so
                    // reject a range that is cut off by the end of the pattern
                    // instead of indexing past it
                    if (i + 3 >= regex_string.length())
                        util::error(string::string("malformed character range in: ") + regex_string)
                    next = mem::new<regexState>()->construct(regex_string[i+1], regex_string[i+3])
                    i += 4 // [a-b] is 5, i++ adds one
                } else {
                    next = mem::new<regexState>()->construct(regex_string[i])
                }
                if (alternating) {
                    previous_end.for_each(fun(it: *regexState):void { it->next_states.add(next); })
                    current_begin.add(next)
                    current_end.add(next)
                } else {
                    current_end.for_each(fun(it: *regexState):void { it->next_states.add(next); })
                    previous_begin = current_begin
                    previous_end = current_end
                    current_begin = set::set(next)
                    current_end = set::set(next)
                }
                escapeing = false
                alternating = false
            }
        }
        var beginAndEnd = util::make_pair(first, current_end)
        return beginAndEnd
    }

    // Returns the length of the longest match starting at the beginning of
    // to_match, or -1 when nothing matches.
    fun long_match(to_match: *char): int { return long_match(string::string(to_match)); }
    fun long_match(to_match: string::string): int return long_match(to_match.getBackingMemory(), 0, to_match.length())
    // Matches against to_match[position .. end) and returns the length of the
    // longest match that starts at position, or -1 when nothing matches.
    fun long_match(to_match: *char, position: int, end: int): int {
        if (is_straight_string) {
            // literal pattern: it either matches in full or not at all
            if (regexString.length() > end-position)
                return -1
            for (var i = 0; i < regexString.length(); i++;)
                if (regexString[i] != to_match[position+i])
                    return -1
            return regexString.length();
        }
        /*var next = set::set(begin)*/
        // set of states the match can currently be in
        var next.construct(): set::set<*regexState>
        next.add(begin)
        var longest = -1
        for (var i = 0; i < end-position; i++;) {
            // no live states left: report the best match seen so far
            if (next.size() == 0)
                return longest
            // some live state can end here, so the first i characters are a match
            if (next.any_true(fun(state: *regexState):bool { return state->is_end(); }))
                longest = i
            // advance every live state over the next input character
            next = next.flatten_map(fun(state: *regexState): set::set<*regexState> { return state->match_char(to_match[position+i]); })
        }
        // all input consumed: a live state that can end means the whole range matched
        if (next.any_true(fun(state: *regexState):bool { return state->is_end(); }))
            return end-position
        return longest
    }
}