Files
kraken/stdlib/regex.krak
2016-04-22 19:11:11 -04:00

199 lines
7.5 KiB
Plaintext

import io
import vector
import string
import mem
import set
import util
import serialize
// Convenience overload: wraps a raw character pointer in a string::string and
// forwards to the string::string overload of regex().
fun regex(in: *char):regex {
    var pattern = string::string(in)
    return regex(pattern)
}
// Builds a regex object from the given pattern string and returns it by value.
// The object is constructed directly in its declaration via
// regex::construct(string::string).
fun regex(in: string::string):regex {
    var compiled.construct(in):regex
    return compiled
}
// A single node in the regex state graph: the character that must be consumed
// to enter this node, plus the set of nodes reachable from it.
// Two character values are used as markers by the code in this file:
//   0 - the character given to a default-constructed state
//   1 - the character is_end() looks for among the successors
obj regexState (Object) {
    var character: char
    var next_states: set::set<*regexState>

    // Initialise this state so that it is entered on charIn and has no successors.
    fun construct(charIn:char): *regexState {
        next_states.construct()
        character = charIn
        return this
    }

    // Default state: character 0 and no successors.
    fun construct(): *regexState {
        character = (0) cast char
        next_states.construct()
        return this
    }

    // Copies the character and the successor set (the pointers, not the
    // pointed-to states) from old.
    fun copy_construct(old:*regexState): void {
        next_states.copy_construct(&old->next_states)
        character = old->character
    }

    // Destroys the successor set held by this state.
    fun destruct():void {
        next_states.destruct()
    }

    // Returns the successors whose character equals input.
    fun match_char(input: char): set::set<*regexState> {
        return next_states.filter(fun(candidate:*regexState):bool { return candidate->character == input; })
    }

    // True when at least one successor carries the character value 1.
    fun is_end():bool {
        return next_states.any_true(fun(successor: *regexState):bool { return successor->character == 1; })
    }
}
// A compiled regular expression.
//
// The pattern text is kept in regexString; compile() turns it into a graph of
// regexState nodes whose entry node is `begin`. Copies made through
// copy_construct / operator= share that graph and the counter behind
// referenceCounter; the graph is deleted when the counter drops to zero.
//
// A regex built with the no-argument construct() owns no graph: both pointers
// are null, long_match() returns -1, and copying/destroying it is a no-op with
// respect to the graph.
obj regex (Object, Serializable) {
    var regexString: string::string
    var begin: *regexState
    var referenceCounter: *int

    // Empty regex: no pattern and no state graph. The pointers are explicitly
    // nulled so destruct(), copy_construct() and long_match() can tell that
    // there is nothing to release, share or walk.
    fun construct(): *regex {
        regexString.construct()
        begin = (0) cast *regexState
        referenceCounter = (0) cast *int
        return this
    }

    // Compiles regexStringIn into a fresh state graph with a share count of 1.
    fun construct(regexStringIn: string::string): *regex {
        regexString.copy_construct(&regexStringIn)
        referenceCounter = mem::new<int>()
        *referenceCounter = 1
        var beginningAndEnd = compile(regexStringIn)
        // init our begin, and the end state as the next state of each end
        begin = beginningAndEnd.first
        var end = mem::new<regexState>()->construct((1) cast char)
        beginningAndEnd.second.for_each(fun(it: *regexState): void { it->next_states.add(end); })
        return this
    }

    // True when this object holds a share counter (i.e. it was built from a
    // pattern, or copied from a regex that was).
    fun is_compiled(): bool {
        var noCounter = (0) cast *int
        return referenceCounter != noCounter
    }

    // Shares old's state graph instead of cloning it; only the pattern string
    // is actually copied. The shared counter is bumped only when old had one.
    fun copy_construct(old:*regex):void {
        regexString.copy_construct(&old->regexString)
        begin = old->begin
        referenceCounter = old->referenceCounter
        if (is_compiled()) {
            *referenceCounter += 1
        }
    }

    // Releases this object's share of the graph; the last sharer deletes every
    // reachable state and the counter itself. An empty regex only destroys its
    // pattern string.
    fun destruct():void {
        regexString.destruct()
        if (is_compiled()) {
            *referenceCounter -= 1
            if (*referenceCounter == 0) {
                util::safe_recursive_delete(begin, fun(it: *regexState): set::set<*regexState> { return it->next_states; } )
                mem::delete(referenceCounter)
            }
        }
    }

    // Only the pattern text is serialized; the graph is rebuilt on unserialize.
    fun serialize(): vector::vector<char> {
        return serialize::serialize(regexString)
    }

    // Reads a pattern string from `it` starting at pos, recompiles it into this
    // object and returns the position after the consumed data.
    fun unserialize(it: ref vector::vector<char>, pos: int): int {
        var temp = string::string()
        util::unpack(temp, pos) = serialize::unserialize<string::string>(it, pos)
        construct(temp)
        return pos
    }

    // Two regexes are equal when their pattern strings are equal.
    fun operator==(other: regex):bool {
        return regexString == other.regexString
    }

    // Drops the current share (if any) and then shares other's graph.
    fun operator=(other: regex):void {
        destruct()
        copy_construct(&other)
    }

    // Translates regex_string into a state graph.
    // Returns the pair (entry state, set of states at which the pattern may end).
    //
    // Supported syntax as implemented below: literal characters, '*', '+', '?',
    // '|', grouping with '(' ... ')' and '\' to take the next character
    // literally.
    //
    // Bookkeeping while scanning:
    //   current_begin / current_end   - entry and exit states of the most
    //                                   recently parsed piece
    //   previous_begin / previous_end - the same for the piece before it
    //   alternating                   - the last operator seen was '|'
    //   escapeing                     - the last character seen was an
    //                                   unescaped '\'
    fun compile(regex_string: string::string): util::pair<*regexState, set::set<*regexState>> {
        var first = mem::new<regexState>()->construct()
        var previous_begin = set::set<*regexState>()
        var previous_end = set::set<*regexState>()
        var current_begin = set::set(first)
        var current_end = set::set(first)
        var alternating = false
        var escapeing = false
        for (var i = 0; i < regex_string.length(); i++;) {
            if (regex_string[i] == '*' && !escapeing) {
                // loop the piece back onto itself and also make it skippable
                current_end.for_each(fun(item: *regexState) item->next_states.add_all(current_begin);)
                current_begin.add_all(previous_begin)
                current_end.add_all(previous_end)
            } else if (regex_string[i] == '+' && !escapeing) {
                // loop the piece back onto itself; it stays mandatory
                current_end.for_each(fun(item: *regexState) item->next_states.add_all(current_begin);)
            } else if (regex_string[i] == '?' && !escapeing) {
                // make the piece skippable without looping
                current_begin.add_all(previous_begin)
                current_end.add_all(previous_end)
            } else if (regex_string[i] == '|' && !escapeing) {
                alternating = true
            } else if (regex_string[i] == '(' && !escapeing) {
                // note that we don't have a ')' case, as we skip past it with our indices
                // Find the matching ')'. The scan is bounded by the pattern
                // length so an unbalanced '(' cannot index past the end.
                var perenEnd = i + 1
                var depth = 1
                for (var scan = i + 1; depth > 0 && scan < regex_string.length(); scan++;) {
                    perenEnd = scan + 1
                    if (regex_string[scan] == '(') {
                        depth++
                    } else if (regex_string[scan] == ')') {
                        depth--
                    }
                }
                // NOTE: perenEnd is one past the close peren when one was found.
                // If the group was never closed (depth still > 0) there is no
                // ')' to drop, so the group runs to the end of the pattern.
                var innerStop = perenEnd - 1
                if (depth > 0)
                    innerStop = perenEnd
                var innerBeginEnd = compile(regex_string.slice(i+1, innerStop))
                i = perenEnd-1
                if (alternating) {
                    previous_end.for_each(fun(it: *regexState):void { it->next_states.add_all(innerBeginEnd.first->next_states); } )
                    current_begin.add_all(innerBeginEnd.first->next_states)
                    current_end.add_all(innerBeginEnd.second)
                } else {
                    current_end.for_each(fun(it: *regexState):void { it->next_states.add_all(innerBeginEnd.first->next_states); } )
                    previous_begin = current_begin
                    previous_end = current_end
                    current_begin = innerBeginEnd.first->next_states
                    current_end = innerBeginEnd.second
                }
                alternating = false
            } else if (regex_string[i] == '\\' && !escapeing) {
                escapeing = true
            } else {
                // literal character (possibly an escaped operator)
                var next = mem::new<regexState>()->construct(regex_string[i])
                if (alternating) {
                    previous_end.for_each(fun(it: *regexState):void { it->next_states.add(next); })
                    current_begin.add(next)
                    current_end.add(next)
                } else {
                    current_end.for_each(fun(it: *regexState):void { it->next_states.add(next); })
                    previous_begin = current_begin
                    previous_end = current_end
                    current_begin = set::set(next)
                    current_end = set::set(next)
                }
                escapeing = false
                alternating = false
            }
        }
        var beginAndEnd = util::make_pair(first, current_end)
        return beginAndEnd
    }

    // Overloads that match against a whole C string / string::string.
    fun long_match(to_match: *char): int { return long_match(string::string(to_match)); }
    fun long_match(to_match: string::string): int return long_match(to_match.getBackingMemory(), 0, to_match.length())

    // Walks the state graph over to_match[position .. end) and returns the
    // length of the longest prefix after which an end state was reachable, or
    // -1 when no prefix (not even the empty one) matches. A regex without a
    // compiled graph never matches.
    fun long_match(to_match: *char, position: int, end: int): int {
        if (!is_compiled())
            return -1
        var next = set::set(begin)
        var longest = -1
        for (var i = 0; i < end-position; i++;) {
            // no live states left: nothing longer can match
            if (next.size() == 0)
                return longest
            // the i characters consumed so far form a complete match
            if (next.any_true(fun(state: *regexState):bool { return state->is_end(); }))
                longest = i
            next = next.flatten_map(fun(state: *regexState): set::set<*regexState> { return state->match_char(to_match[position+i]); })
        }
        // the whole range was consumed; check whether it is itself a match
        if (next.any_true(fun(state: *regexState):bool { return state->is_end(); }))
            return end-position
        return longest
    }
}