kraken/stdlib/regex.krak

import io
import vector
import string
import mem
import set
import util
import conversions

// Convenience overload: wraps a raw character pointer in a string::string and
// forwards to the string::string overload of regex().
fun regex(in: *char):regex {
    var pattern = string::string(in)
    return regex(pattern)
}
// Factory function: builds a regex object from the pattern `in` and returns it by value.
fun regex(in: string::string):regex {
    // declares `out` as a regex and runs its construct(string::string) on it in one
    // statement; that constructor is the one that compiles the pattern
    var out.construct(in):regex
    return out
}

// A single node in the state graph built by regex::compile.
// `character` is what has to be read to enter this node; `next_states` holds
// the nodes reachable from it. The node does not own the nodes it points to:
// destruct() only tears down the vector of pointers.
obj regexState (Object) {
    var character: char
    var next_states: vector::vector<*regexState>

    // Initialise this node so that it is entered on charIn; starts with no successors.
    fun construct(charIn:char): *regexState {
        next_states.construct()
        character = charIn
        return this
    }
    // Default node: uses character code 0 as its character.
    fun construct(): *regexState {
        var zeroChar = conversions::to_char(0)
        return construct(zeroChar)
    }
    // Copies the character and the successor pointers (the successors themselves are shared).
    fun copy_construct(old:*regexState): void {
        next_states.copy_construct(&old->next_states)
        character = old->character
    }
    fun destruct():void {
        next_states.destruct()
    }
    // Returns every successor of this node whose character equals input, in order.
    fun match(input: char): vector::vector<*regexState> {
        var matching = vector::vector<*regexState>()
        for (var idx = 0; idx < next_states.size; idx++;)
            if (next_states[idx]->character == input)
                matching.add(next_states[idx])
        return matching
    }
    // True when at least one successor carries character code 1, the marker
    // regex::construct puts on the accepting node.
    fun is_end():bool {
        for (var idx = 0; idx < next_states.size; idx++;)
            if (next_states[idx]->character == 1)
                return true
        return false
    }
}

// A regular expression compiled into a graph of heap-allocated regexState nodes.
//
// Copies of a regex share one graph: copy_construct copies the `begin` pointer
// and bumps *referenceCounter, and destruct only frees the graph (and the
// counter) once the count drops to zero.
//
// Supported pattern syntax (see compile): literal characters, `*`, `+`, `?`,
// `|`, grouping with `(` `)`, and `\` to take the next character literally.
obj regex (Object) {
    var regexString: string::string
    var begin: *regexState
    var referenceCounter: *int

    // Default constructor: an empty pattern string and a graph that consists of
    // a lone start state, so long_match never reports a match.
    fun construct(): *regex {
        regexString.construct()
        // destruct() and operator= dereference referenceCounter and walk the graph
        // from begin unconditionally, so a default-constructed regex needs both to
        // be valid; previously they were left uninitialised.
        referenceCounter = mem::new<int>()
        *referenceCounter = 1
        begin = mem::new<regexState>()->construct()
        return this
    }

    // Compiles regexStringIn into a fresh state graph owned (count 1) by this object.
    fun construct(regexStringIn: string::string): *regex {
        regexString.copy_construct(&regexStringIn)
        referenceCounter = mem::new<int>()
        *referenceCounter = 1

        var beginningAndEnd = compile(regexStringIn)
        // init our begin, and the end state as the next state of each end
        begin = beginningAndEnd.first
        // the accepting node is marked with character code 1, which is what
        // regexState::is_end looks for
        var end = mem::new<regexState>()->construct(conversions::to_char(1))
        beginningAndEnd.second.for_each(fun(it: *regexState): void { it->next_states.add(end); })
        return this
    }

    // Shares old's graph instead of cloning it: only the pattern string is
    // copied, the graph pointer is aliased and the shared count is incremented.
    fun copy_construct(old:*regex):void {
        regexString.copy_construct(&old->regexString)
        begin = old->begin
        referenceCounter = old->referenceCounter
        *referenceCounter += 1
        /*construct(old->regexString)*/
        /*begin = mem::safe_recursive_clone(old->begin, fun(it: *regexState, cloner: fun(*regexState):*regexState, register: fun(*regexState):void): void {*/
            /*var newOne = mem::new<regexState>()->construct(it->character)*/
            /*register(newOne)*/
            /*it->next_states.for_each(fun(next_state: *regexState) {*/
                /*newOne->next_states.add(cloner(next_state))*/
            /*})*/
        /*})*/
    }

    // Drops this object's reference; the last owner frees the graph and the counter.
    fun destruct():void {
        regexString.destruct()
        *referenceCounter -= 1
        if (*referenceCounter == 0) {
            // the graph can contain cycles (from `*` and `+`), so it is handed to
            // safe_recursive_delete together with a function listing each node's successors
            mem::safe_recursive_delete(begin, fun(it: *regexState): set::set<*regexState> { return set::from_vector(it->next_states); } )
            mem::delete(referenceCounter)
        }
    }

    // Two regexes are equal when their pattern strings are equal.
    fun operator==(other: regex):bool {
        return regexString == other.regexString
    }

    // Releases the current graph reference, then shares other's graph.
    fun operator=(other: regex):void {
        destruct()
        copy_construct(&other)
    }

    // Builds the state graph for regex_string.
    // Returns a pair of (start state, states at which the pattern can end).
    // The start state is a placeholder that consumes nothing; the first real
    // states are its next_states.
    //
    // While scanning, current_begin/current_end are the entry and exit states
    // of the most recent term, and previous_begin/previous_end those of the
    // term before it.
    fun compile(regex_string: string::string): util::pair<*regexState, vector::vector<*regexState>> {
        var first = mem::new<regexState>()->construct()
        var previous_begin = vector::vector<*regexState>()
        var previous_end = vector::vector<*regexState>()
        var current_begin = vector::vector(first)
        var current_end = vector::vector(first)
        var alternating = false
        var escapeing = false

        for (var i = 0; i < regex_string.length(); i++;) {
            if (regex_string[i] == '*' && !escapeing) {
                // loop the term back onto itself, and also let it be skipped
                for (var j = 0; j < current_end.size; j++;)
                    current_end[j]->next_states.add_all(current_begin)
                current_begin.add_all(previous_begin)
                current_end.add_all(previous_end)

            } else if (regex_string[i] == '+' && !escapeing) {
                // loop the term back onto itself, but it can not be skipped
                for (var j = 0; j < current_end.size; j++;)
                    current_end[j]->next_states.add_all(current_begin)

            } else if (regex_string[i] == '?' && !escapeing) {
                // let the term be skipped
                current_begin.add_all(previous_begin)
                current_end.add_all(previous_end)

            } else if (regex_string[i] == '|' && !escapeing) {
                // the next term is attached beside the current one instead of after it
                alternating = true

            } else if (regex_string[i] == '(' && !escapeing) {
                // note that we don't have a ')' case, as we skip past it with our indicies
                var perenEnd = i + 1
                var closed = false
                // The scan is bounded by the pattern length so an unterminated group
                // can not index past the end of the string.
                // NOTE(review): the scan compares raw characters, so a backslash-escaped
                // paren inside the group is still counted as nesting.
                for (var depth = 1; depth > 0 && perenEnd < regex_string.length(); perenEnd++;) {
                    if (regex_string[perenEnd] == '(')
                        depth++
                    else if (regex_string[perenEnd] == ')') {
                        depth--
                        if (depth == 0)
                            closed = true
                    }
                }
                // NOTE: when closed, perenEnd is one past the close peren, so the inner
                // pattern stops just before it; an unterminated group runs to the end
                // of the pattern
                var innerStop = perenEnd
                if (closed)
                    innerStop = perenEnd - 1
                var innerBeginEnd = compile(regex_string.slice(i+1, innerStop))
                i = perenEnd-1

                // NOTE(review): innerBeginEnd.first (the sub-compile's placeholder start
                // state) is only read for its next_states here and is never freed.
                if (alternating) {
                    previous_end.for_each(fun(it: *regexState):void { it->next_states.add_all(innerBeginEnd.first->next_states); } )
                    current_begin.add_all(innerBeginEnd.first->next_states)
                    current_end.add_all(innerBeginEnd.second)
                } else {
                    current_end.for_each(fun(it: *regexState):void { it->next_states.add_all(innerBeginEnd.first->next_states); } )
                    previous_begin = current_begin
                    previous_end = current_end
                    current_begin = innerBeginEnd.first->next_states
                    current_end = innerBeginEnd.second
                }
                alternating = false

            } else if (regex_string[i] == '\\' && !escapeing) {
                // the following character is taken literally (handled by the else branch)
                escapeing = true

            } else {
                // literal character (or an escaped special character)
                var next = mem::new<regexState>()->construct(regex_string[i])
                if (alternating) {
                    previous_end.for_each(fun(it: *regexState):void { it->next_states.add(next); })
                    current_begin.add(next)
                    current_end.add(next)
                } else {
                    current_end.for_each(fun(it: *regexState):void { it->next_states.add(next); })
                    previous_begin = current_begin
                    previous_end = current_end
                    current_begin = vector::vector(next)
                    current_end = vector::vector(next)
                }
                escapeing = false
                alternating = false
            }
        }
        var beginAndEnd = util::make_pair(first, current_end)
        return beginAndEnd
    }

    // Overload taking a raw character pointer; see the string::string version.
    fun long_match(to_match: *char): int { return long_match(string::string(to_match)); }
    // Returns the length of the longest prefix of to_match accepted by this
    // regex, or -1 when no prefix (not even the empty one) is accepted.
    fun long_match(to_match: string::string): int {
        // the set of states the match could currently be in
        var next = vector::vector(begin)
        var longest = -1
        for (var i = 0; i < to_match.length(); i++;) {
            // nothing left alive: no longer prefix can match
            if (next.size == 0)
                return longest
            // some live state can reach the accepting node after consuming i characters
            if (next.any_true(fun(state: *regexState):bool { return state->is_end(); }))
                longest = i
            //next = next.flatten_map<*regexState>(fun(state: *regexState): vector::vector<*regexState> { return state->match(to_match[i]); })
            next = next.flatten_map(fun(state: *regexState): vector::vector<*regexState> { return state->match(to_match[i]); })
        }
        // the whole input was consumed; check whether it is accepted in full
        if (next.any_true(fun(state: *regexState):bool { return state->is_end(); }))
            return to_match.length()
        return longest
    }
}
Some bugfixes, allow overloading of [] and add that to vector and string, work on regex. Need closures before that finishes.... 2015-06-08 21:47:02 -04:00			`import io`
			`import vector`
			`import string`
Tons of stuff. Regex still a work in progress, along with related template member function scoping bugs 2015-06-09 20:02:02 -04:00			`import mem`
Fixed the close over methods and member vars bug, but there's something remaining causing the safe_recursive_delete not to work. Gotta save progress and do other stuff 2015-06-27 18:06:02 -04:00			`import set`
More work on regex, fixed whitespace around && and operator= for vector 2015-06-14 11:13:30 -04:00			`import util`
Tons of stuff. Regex still a work in progress, along with related template member function scoping bugs 2015-06-09 20:02:02 -04:00			`import conversions`
Some bugfixes, allow overloading of [] and add that to vector and string, work on regex. Need closures before that finishes.... 2015-06-08 21:47:02 -04:00
Swapped pointers to the other side for types to prevent ambiguity, i.e. int instead of int 2015-07-04 17:02:51 -04:00			`fun regex(in: *char):regex {`
Some bugfixes, allow overloading of [] and add that to vector and string, work on regex. Need closures before that finishes.... 2015-06-08 21:47:02 -04:00			`return regex(string::string(in))`
			`}`
			`fun regex(in: string::string):regex {`
			`var out.construct(in):regex`
			`return out`
			`}`

Little break work on grammer, added Object trait to other stdlib objects 2015-06-30 02:40:46 -04:00			`obj regexState (Object) {`
Some bugfixes, allow overloading of [] and add that to vector and string, work on regex. Need closures before that finishes.... 2015-06-08 21:47:02 -04:00			`var character: char`
Swapped pointers to the other side for types to prevent ambiguity, i.e. int instead of int 2015-07-04 17:02:51 -04:00			`var next_states: vector::vector<*regexState>`
			`fun construct(charIn:char): *regexState {`
Some bugfixes, allow overloading of [] and add that to vector and string, work on regex. Need closures before that finishes.... 2015-06-08 21:47:02 -04:00			`character = charIn`
			`next_states.construct()`
			`return this`
			`}`
Swapped pointers to the other side for types to prevent ambiguity, i.e. int instead of int 2015-07-04 17:02:51 -04:00			`fun construct(): *regexState {`
Tons of stuff. Regex still a work in progress, along with related template member function scoping bugs 2015-06-09 20:02:02 -04:00			`return construct(conversions::to_char(0))`
Some bugfixes, allow overloading of [] and add that to vector and string, work on regex. Need closures before that finishes.... 2015-06-08 21:47:02 -04:00			`}`
Swapped pointers to the other side for types to prevent ambiguity, i.e. int instead of int 2015-07-04 17:02:51 -04:00			`fun copy_construct(old:*regexState): void {`
Tons of stuff. Regex still a work in progress, along with related template member function scoping bugs 2015-06-09 20:02:02 -04:00			`character = old->character`
			`next_states.copy_construct(&old->next_states)`
Some bugfixes, allow overloading of [] and add that to vector and string, work on regex. Need closures before that finishes.... 2015-06-08 21:47:02 -04:00			`}`
			`fun destruct():void {`
			`next_states.destruct()`
			`}`
Swapped pointers to the other side for types to prevent ambiguity, i.e. int instead of int 2015-07-04 17:02:51 -04:00			`fun match(input: char): vector::vector<*regexState> {`
			`return next_states.filter(fun(it:*regexState):bool { return it->character == input; })`
Fixed bug where no parameter function calls were not typechecked and function/struct name collision. Improved regex library to where it can do straight-line regexs 2015-06-12 14:16:28 -04:00			`}`
			`fun is_end():bool {`
Swapped pointers to the other side for types to prevent ambiguity, i.e. int instead of int 2015-07-04 17:02:51 -04:00			`return next_states.any_true(fun(state: *regexState):bool { return state->character == 1; })`
Some bugfixes, allow overloading of [] and add that to vector and string, work on regex. Need closures before that finishes.... 2015-06-08 21:47:02 -04:00			`}`
			`}`

Little break work on grammer, added Object trait to other stdlib objects 2015-06-30 02:40:46 -04:00			`obj regex (Object) {`
Some bugfixes, allow overloading of [] and add that to vector and string, work on regex. Need closures before that finishes.... 2015-06-08 21:47:02 -04:00			`var regexString: string::string`
Swapped pointers to the other side for types to prevent ambiguity, i.e. int instead of int 2015-07-04 17:02:51 -04:00			`var begin: *regexState`
Changed regex to reference count internal structure instead of cloning because it too way too long. Added terminal decorators to grammer and lexer 2015-07-08 13:43:06 -04:00			`var referenceCounter: *int`
More work on regex, fixed whitespace around && and operator= for vector 2015-06-14 11:13:30 -04:00
Changed regex to reference count internal structure instead of cloning because it too way too long. Added terminal decorators to grammer and lexer 2015-07-08 13:43:06 -04:00			`fun construct(): *regex {`
			`regexString.construct()`
			`return this`
			`}`
Swapped pointers to the other side for types to prevent ambiguity, i.e. int instead of int 2015-07-04 17:02:51 -04:00			`fun construct(regexStringIn: string::string): *regex {`
Some bugfixes, allow overloading of [] and add that to vector and string, work on regex. Need closures before that finishes.... 2015-06-08 21:47:02 -04:00			`regexString.copy_construct(&regexStringIn)`
Changed regex to reference count internal structure instead of cloning because it too way too long. Added terminal decorators to grammer and lexer 2015-07-08 13:43:06 -04:00			`referenceCounter = mem::new<int>()`
			`*referenceCounter = 1`
Some bugfixes, allow overloading of [] and add that to vector and string, work on regex. Need closures before that finishes.... 2015-06-08 21:47:02 -04:00
More work on regex, fixed whitespace around && and operator= for vector 2015-06-14 11:13:30 -04:00			`var beginningAndEnd = compile(regexStringIn)`
Some more bugfixes, got regex working as well as the cpp version. (leaks memory like craaazy) 2015-06-14 18:13:52 -04:00			`// init our begin, and the end state as the next state of each end`
			`begin = beginningAndEnd.first`
Closures work\! 2015-06-26 13:29:37 -04:00			`var end = mem::new<regexState>()->construct(conversions::to_char(1))`
Swapped pointers to the other side for types to prevent ambiguity, i.e. int instead of int 2015-07-04 17:02:51 -04:00			`beginningAndEnd.second.for_each(fun(it: *regexState): void { it->next_states.add(end); })`
Some bugfixes, allow overloading of [] and add that to vector and string, work on regex. Need closures before that finishes.... 2015-06-08 21:47:02 -04:00			`return this`
			`}`
More work on regex, fixed whitespace around && and operator= for vector 2015-06-14 11:13:30 -04:00
Swapped pointers to the other side for types to prevent ambiguity, i.e. int instead of int 2015-07-04 17:02:51 -04:00			`fun copy_construct(old:*regex):void {`
Changed regex to reference count internal structure instead of cloning because it too way too long. Added terminal decorators to grammer and lexer 2015-07-08 13:43:06 -04:00			`regexString.copy_construct(&old->regexString)`
			`begin = old->begin`
			`referenceCounter = old->referenceCounter`
			`*referenceCounter += 1`
			`/construct(old->regexString)/`
Added mem::safe_recursive_clone, and while it works for regex, it's actually slower then remaking it. Hmmmm, maybe because some of the stdlib is inefficent 2015-07-07 00:46:00 -04:00			`/begin = mem::safe_recursive_clone(old->begin, fun(it: regexState, cloner: fun(regexState):regexState, register: fun(regexState):void): void {/`
			`/var newOne = mem::new<regexState>()->construct(it->character)/`
			`/register(newOne)/`
			`/it->next_states.for_each(fun(next_state: regexState) {*/`
			`/newOne->next_states.add(cloner(next_state))/`
			`/})/`
			`/})/`
Some bugfixes, allow overloading of [] and add that to vector and string, work on regex. Need closures before that finishes.... 2015-06-08 21:47:02 -04:00			`}`
More work on regex, fixed whitespace around && and operator= for vector 2015-06-14 11:13:30 -04:00
Some bugfixes, allow overloading of [] and add that to vector and string, work on regex. Need closures before that finishes.... 2015-06-08 21:47:02 -04:00			`fun destruct():void {`
			`regexString.destruct()`
Changed regex to reference count internal structure instead of cloning because it too way too long. Added terminal decorators to grammer and lexer 2015-07-08 13:43:06 -04:00			`*referenceCounter -= 1`
			`if (*referenceCounter == 0) {`
			`mem::safe_recursive_delete(begin, fun(it: regexState): set::set<regexState> { return set::from_vector(it->next_states); } )`
			`mem::delete(referenceCounter)`
			`}`
Some bugfixes, allow overloading of [] and add that to vector and string, work on regex. Need closures before that finishes.... 2015-06-08 21:47:02 -04:00			`}`
More work on regex, fixed whitespace around && and operator= for vector 2015-06-14 11:13:30 -04:00
More work on grammer and standard library! It can kinda load grammer now! Kinda. Marcus ran into the function pointer returns pointer ambiguity, so that'll have to be done tomorrow. 2015-07-04 03:21:36 -04:00			`fun operator==(other: regex):bool {`
			`return regexString == other.regexString`
			`}`

More work on regex, fixed whitespace around && and operator= for vector 2015-06-14 11:13:30 -04:00			`fun operator=(other: regex):void {`
			`destruct()`
Added mem::safe_recursive_clone, and while it works for regex, it's actually slower then remaking it. Hmmmm, maybe because some of the stdlib is inefficent 2015-07-07 00:46:00 -04:00			`copy_construct(&other)`
More work on regex, fixed whitespace around && and operator= for vector 2015-06-14 11:13:30 -04:00			`}`

Swapped pointers to the other side for types to prevent ambiguity, i.e. int instead of int 2015-07-04 17:02:51 -04:00			`fun compile(regex_string: string::string): util::pair<regexState, vector::vector<regexState>> {`
More work on regex, fixed whitespace around && and operator= for vector 2015-06-14 11:13:30 -04:00			`var first = mem::new<regexState>()->construct()`
Swapped pointers to the other side for types to prevent ambiguity, i.e. int instead of int 2015-07-04 17:02:51 -04:00			`var previous_begin = vector::vector<*regexState>()`
			`var previous_end = vector::vector<*regexState>()`
More work on regex, fixed whitespace around && and operator= for vector 2015-06-14 11:13:30 -04:00			`var current_begin = vector::vector(first)`
			`var current_end = vector::vector(first)`
			`var alternating = false`
			`var escapeing = false`

			`for (var i = 0; i < regex_string.length(); i++;) {`
			`if (regex_string[i] == '*' && !escapeing) {`
			`for (var j = 0; j < current_end.size; j++;)`
			`current_end[j]->next_states.add_all(current_begin)`
			`current_begin.add_all(previous_begin)`
			`current_end.add_all(previous_end)`
Clean up debugging, a little more test 2015-06-15 21:32:09 -04:00
More work on regex, fixed whitespace around && and operator= for vector 2015-06-14 11:13:30 -04:00			`} else if (regex_string[i] == '+' && !escapeing) {`
			`for (var j = 0; j < current_end.size; j++;)`
			`current_end[j]->next_states.add_all(current_begin)`
Clean up debugging, a little more test 2015-06-15 21:32:09 -04:00
More work on regex, fixed whitespace around && and operator= for vector 2015-06-14 11:13:30 -04:00			`} else if (regex_string[i] == '?' && !escapeing) {`
			`current_begin.add_all(previous_begin)`
			`current_end.add_all(previous_end)`
Clean up debugging, a little more test 2015-06-15 21:32:09 -04:00
More work on regex, fixed whitespace around && and operator= for vector 2015-06-14 11:13:30 -04:00			`} else if (regex_string[i] == '\|' && !escapeing) {`
			`alternating = true`
Clean up debugging, a little more test 2015-06-15 21:32:09 -04:00
More work on regex, fixed whitespace around && and operator= for vector 2015-06-14 11:13:30 -04:00			`} else if (regex_string[i] == '(' && !escapeing) {`
Some more bugfixes, got regex working as well as the cpp version. (leaks memory like craaazy) 2015-06-14 18:13:52 -04:00			`// note that we don't have a ')' case, as we skip past it with our indicies`
			`var perenEnd = i + 1`
			`for (var depth = 1; depth > 0; perenEnd++;)`
			`if (regex_string[perenEnd] == '(')`
			`depth++`
			`else if (regex_string[perenEnd] == ')')`
			`depth--`
			`var innerBeginEnd = compile(regex_string.slice(i+1, perenEnd-1))`
			`// NOTE: perenEnd is one past the close peren`
			`i = perenEnd-1`

			`if (alternating) {`
Swapped pointers to the other side for types to prevent ambiguity, i.e. int instead of int 2015-07-04 17:02:51 -04:00			`previous_end.for_each(fun(it: *regexState):void { it->next_states.add_all(innerBeginEnd.first->next_states); } )`
Some more bugfixes, got regex working as well as the cpp version. (leaks memory like craaazy) 2015-06-14 18:13:52 -04:00			`current_begin.add_all(innerBeginEnd.first->next_states)`
			`current_end.add_all(innerBeginEnd.second)`
			`} else {`
Swapped pointers to the other side for types to prevent ambiguity, i.e. int instead of int 2015-07-04 17:02:51 -04:00			`current_end.for_each(fun(it: *regexState):void { it->next_states.add_all(innerBeginEnd.first->next_states); } )`
Some more bugfixes, got regex working as well as the cpp version. (leaks memory like craaazy) 2015-06-14 18:13:52 -04:00			`previous_begin = current_begin`
			`previous_end = current_end`
			`current_begin = innerBeginEnd.first->next_states`
			`current_end = innerBeginEnd.second`
			`}`
			`alternating = false`
Clean up debugging, a little more test 2015-06-15 21:32:09 -04:00
More work on regex, fixed whitespace around && and operator= for vector 2015-06-14 11:13:30 -04:00			`} else if (regex_string[i] == '\\' && !escapeing) {`
			`escapeing = true`
Clean up debugging, a little more test 2015-06-15 21:32:09 -04:00
More work on regex, fixed whitespace around && and operator= for vector 2015-06-14 11:13:30 -04:00			`} else {`
			`var next = mem::new<regexState>()->construct(regex_string[i])`
			`if (alternating) {`
Swapped pointers to the other side for types to prevent ambiguity, i.e. int instead of int 2015-07-04 17:02:51 -04:00			`previous_end.for_each(fun(it: *regexState):void { it->next_states.add(next); })`
More work on regex, fixed whitespace around && and operator= for vector 2015-06-14 11:13:30 -04:00			`current_begin.add(next)`
			`current_end.add(next)`
			`} else {`
Swapped pointers to the other side for types to prevent ambiguity, i.e. int instead of int 2015-07-04 17:02:51 -04:00			`current_end.for_each(fun(it: *regexState):void { it->next_states.add(next); })`
More work on regex, fixed whitespace around && and operator= for vector 2015-06-14 11:13:30 -04:00			`previous_begin = current_begin`
			`previous_end = current_end`
			`current_begin = vector::vector(next)`
			`current_end = vector::vector(next)`
			`}`
Some more bugfixes, got regex working as well as the cpp version. (leaks memory like craaazy) 2015-06-14 18:13:52 -04:00			`escapeing = false`
			`alternating = false`
More work on regex, fixed whitespace around && and operator= for vector 2015-06-14 11:13:30 -04:00			`}`
			`}`
Some more bugfixes, got regex working as well as the cpp version. (leaks memory like craaazy) 2015-06-14 18:13:52 -04:00			`var beginAndEnd = util::make_pair(first, current_end)`
More work on regex, fixed whitespace around && and operator= for vector 2015-06-14 11:13:30 -04:00			`return beginAndEnd`
			`}`

Swapped pointers to the other side for types to prevent ambiguity, i.e. int instead of int 2015-07-04 17:02:51 -04:00			`fun long_match(to_match: *char): int { return long_match(string::string(to_match)); }`
Some bugfixes, allow overloading of [] and add that to vector and string, work on regex. Need closures before that finishes.... 2015-06-08 21:47:02 -04:00			`fun long_match(to_match: string::string): int {`
Some more bugfixes, got regex working as well as the cpp version. (leaks memory like craaazy) 2015-06-14 18:13:52 -04:00			`var next = vector::vector(begin)`
			`var longest = -1`
Some bugfixes, allow overloading of [] and add that to vector and string, work on regex. Need closures before that finishes.... 2015-06-08 21:47:02 -04:00			`for (var i = 0; i < to_match.length(); i++;) {`
			`if (next.size == 0)`
			`return longest`
Swapped pointers to the other side for types to prevent ambiguity, i.e. int instead of int 2015-07-04 17:02:51 -04:00			`if (next.any_true(fun(state: *regexState):bool { return state->is_end(); }))`
Some bugfixes, allow overloading of [] and add that to vector and string, work on regex. Need closures before that finishes.... 2015-06-08 21:47:02 -04:00			`longest = i`
Swapped pointers to the other side for types to prevent ambiguity, i.e. int instead of int 2015-07-04 17:02:51 -04:00			`//next = next.flatten_map<regexState>(fun(state: regexState): vector::vector<*regexState> { return state->match(to_match[i]); })`
			`next = next.flatten_map(fun(state: regexState): vector::vector<regexState> { return state->match(to_match[i]); })`
Some bugfixes, allow overloading of [] and add that to vector and string, work on regex. Need closures before that finishes.... 2015-06-08 21:47:02 -04:00			`}`
Swapped pointers to the other side for types to prevent ambiguity, i.e. int instead of int 2015-07-04 17:02:51 -04:00			`if (next.any_true(fun(state: *regexState):bool { return state->is_end(); }))`
Some bugfixes, allow overloading of [] and add that to vector and string, work on regex. Need closures before that finishes.... 2015-06-08 21:47:02 -04:00			`return to_match.length()`
			`return longest`
			`}`
			`}`