import io
import str
import util
import vec
import str
import mem
import set
import util
import serialize

// Convenience factory: build a regex from a C-style string.
fun regex(in: *char):regex {
    return regex(str::str(in))
}

// Convenience factory: build (and compile, if necessary) a regex from a str.
fun regex(in: str::str):regex {
    var out.construct(in):regex
    return out
}

// Factory for a state whose character is 0 (used by compile as a start placeholder).
fun regexState(): regexState {
    var to_ret.construct(): regexState
    return to_ret
}

// Factory for a state that accepts exactly one character.
fun regexState(charIn: char): regexState {
    var to_ret.construct(charIn): regexState
    return to_ret
}

// Factory for a state that accepts any character in the inclusive range [first, last].
fun regexState(first: char, last: char): regexState {
    var to_ret.construct(first, last): regexState
    return to_ret
}

// One node of the state machine built by regex::compile.
// A state accepts the inclusive character range [characterBegin, characterEnd]
// and stores the indices (into the owning regex's `states` vec) of its successors.
obj regexState (Object) {
    // if only one character, both are the same
    var characterBegin: char
    var characterEnd: char
    // indices of the states reachable after this one
    var next_states: set::set<int>

    fun construct(charIn:char): *regexState {
        return construct(charIn, charIn)
    }
    fun construct(charFirst:char, charSecond:char): *regexState {
        characterBegin = charFirst
        characterEnd = charSecond
        next_states.construct()
        return this
    }
    fun construct(): *regexState {
        return construct((0) cast char)
    }
    fun copy_construct(old:*regexState): void {
        characterBegin = old->characterBegin
        characterEnd = old->characterEnd
        next_states.copy_construct(&old->next_states)
    }
    fun destruct():void {
        next_states.destruct()
    }

    // For every successor whose range contains `input`, set its entry in `flags`
    // and bump `num_states`. A successor reached from several active states is
    // counted once per hit; long_match only tests the counter against zero.
    fun match_char(input: char, states: ref vec::vec<regexState>, flags: *vec::vec<bool>, num_states: *int) {
        next_states.for_each(fun(it:int) {
            if (states[it].characterBegin <= input && input <= states[it].characterEnd) {
                (*flags)[it] = true
                (*num_states)++
            }
        })
    }

    // True if any successor is the end state. The end state is the one
    // regex::construct adds with character value 1.
    fun is_end(states: ref vec::vec<regexState>):bool {
        return next_states.any_true(fun(state: int):bool { return states[state].characterBegin == 1; })
    }
}

// A regular expression matched by simulating a set of active states.
// Supported syntax (see compile): literals, '\' escapes, '(...)' groups,
// '[a-b]' ranges, and the operators '*', '+', '?' and '|'.
// Patterns without any special character skip compilation and are matched
// by direct character comparison (is_straight_string).
obj regex (Object, Serializable) {
    var regexString: str::str
    var states: vec::vec<regexState>
    // Two flag buffers, one entry per state; long_match swaps them each step.
    var flagsA: vec::vec<bool>
    var flagsB: vec::vec<bool>
    var is_straight_string: bool

    fun construct(): *regex {
        regexString.construct()
        states.construct()
        flagsA.construct()
        flagsB.construct()
        is_straight_string = false
        return this
    }

    // Store the pattern and, unless it is a plain literal, compile it into `states`.
    fun construct(regexStringIn: str::str): *regex {
        regexString.copy_construct(&regexStringIn)
        states.construct()
        is_straight_string = true
        for (var i = 0; i < regexString.length(); i++;) {
            // simple implementation doesn't count escaped characters as straight str
            if (regexString[i] == '\\' || regexString[i] == '(' || regexString[i] == ')' || regexString[i] == '[' || regexString[i] == '*'
                    || regexString[i] == '+' || regexString[i] == '?' || regexString[i] == '|') {
                is_straight_string = false
                break
            }
        }
        if (!is_straight_string) {
            var beginningAndEnd = compile(regexStringIn)
            // init our begin, and the end state as the next state of each end
            var end = states.size
            states.add(regexState((1) cast char))
            beginningAndEnd.second.for_each(fun(it: int): void { states[it].next_states.add(end); })
        }
        flagsA.construct(states.size); flagsA.size = states.size
        flagsB.construct(states.size); flagsB.size = states.size
        return this
    }

    fun copy_construct(old:*regex):void {
        regexString.copy_construct(&old->regexString)
        is_straight_string = old->is_straight_string
        states.copy_construct(&old->states)
        // the flag buffers are scratch space, so they are rebuilt rather than copied
        flagsA.construct(states.size); flagsA.size = states.size
        flagsB.construct(states.size); flagsB.size = states.size
    }

    fun destruct():void {
        regexString.destruct()
        states.destruct()
        flagsA.destruct()
        flagsB.destruct()
    }

    // Only the pattern string is serialized; the state machine is rebuilt on load.
    // NOTE(review): the <char> element type here and in unserialize is restored
    // from a garbled source - confirm against the serialize module.
    fun serialize(): vec::vec<char> {
        return serialize::serialize(regexString)
    }

    // Read the pattern string starting at `pos` and recompile; returns the new position.
    fun unserialize(it: ref vec::vec<char>, pos: int): int {
        pos = regexString.unserialize(it, pos)
        // construct(str) sets up states, flagsA and flagsB itself, so they are
        // not constructed again here.
        construct(regexString)
        return pos
    }

    // Two regexes are equal when their pattern strings are equal.
    fun operator==(other: regex):bool {
        return regexString == other.regexString
    }

    fun operator=(other: regex):void {
        destruct()
        copy_construct(&other)
    }

    // Append the states for `regex_string` to `states`.
    // Returns a pair of (index of the placeholder start state, set of indices of
    // the states the fragment can end on). Calls itself for parenthesised groups.
    // Reports an unmatched '(' or a truncated '[a-b]' range through util::error.
    fun compile(regex_string: str::str): util::pair<int, set::set<int>> {
        /*io::println(regex_string)*/
        var first = states.size; states.add(regexState())
        // begin/end sets of the element before the current one, used by '*' and '?'
        // to allow skipping the current element and by '|' to attach an alternative
        var previous_begin = set::set<int>()
        var previous_end = set::set<int>()
        var current_begin = set::set<int>(first)
        var current_end = set::set<int>(first)
        var alternating = false
        var escapeing = false

        for (var i = 0; i < regex_string.length(); i++;) {
            if (regex_string[i] == '*' && !escapeing) {
                // loop back, and also allow zero occurrences
                current_end.for_each(fun(item: int) states[item].next_states.add_all(current_begin);)
                current_begin.add_all(previous_begin)
                current_end.add_all(previous_end)

            } else if (regex_string[i] == '+' && !escapeing) {
                // loop back only
                current_end.for_each(fun(item: int) states[item].next_states.add_all(current_begin);)

            } else if (regex_string[i] == '?' && !escapeing) {
                // allow zero occurrences
                current_begin.add_all(previous_begin)
                current_end.add_all(previous_end)

            } else if (regex_string[i] == '|' && !escapeing) {
                alternating = true

            } else if (regex_string[i] == '(' && !escapeing) {
                // note that we don't have a ')' case, as we skip past it with our indices
                var perenEnd = i + 1
                for (var depth = 1; depth > 0; perenEnd++;) {
                    if (perenEnd >= regex_string.length())
                        util::error(str::str("can't find matching peren in: ") + regex_string)
                    // be careful, this isn't quite right yet
                    /*var not_non_special = perenEnd == 0 || (regex_string[perenEnd-1] != '\\' && regex_string[perenEnd-1] != '[' && (perenEnd+1 >= regex_string.length() || regex_string[perenEnd+1] != ']'))*/
                    var not_non_special = perenEnd == 0 || (regex_string[perenEnd-1] != '[' && (perenEnd+1 >= regex_string.length() || regex_string[perenEnd+1] != ']'))
                    if (regex_string[perenEnd] == '(' && not_non_special)
                        depth++
                    else if (regex_string[perenEnd] == ')' && not_non_special)
                        depth--
                }
                var innerBeginEnd = compile(regex_string.slice(i+1, perenEnd-1))
                // NOTE: perenEnd is one past the close peren
                i = perenEnd-1

                if (alternating) {
                    previous_end.for_each(fun(it: int):void { states[it].next_states.add_all(states[innerBeginEnd.first].next_states); } )
                    current_begin.add_all(states[innerBeginEnd.first].next_states)
                    current_end.add_all(innerBeginEnd.second)
                } else {
                    current_end.for_each(fun(it: int):void { states[it].next_states.add_all(states[innerBeginEnd.first].next_states); } )
                    previous_begin = current_begin
                    previous_end = current_end
                    current_begin = states[innerBeginEnd.first].next_states
                    current_end = innerBeginEnd.second
                }
                alternating = false

            } else if (regex_string[i] == '\\' && !escapeing) {
                escapeing = true

            } else {
                var next: int
                if (regex_string[i] == '[' && !escapeing) {
                    // the range reads the characters at i+1 and i+3, so both must exist
                    if (i + 3 >= regex_string.length())
                        util::error(str::str("incomplete character range in: ") + regex_string)
                    next = states.size; states.add(regexState(regex_string[i+1], regex_string[i+3]))
                    i += 4 // [a-b] is 5, i++ adds one
                } else {
                    next = states.size; states.add(regexState(regex_string[i]))
                }
                if (alternating) {
                    previous_end.for_each(fun(it: int):void { states[it].next_states.add(next); })
                    current_begin.add(next)
                    current_end.add(next)
                } else {
                    current_end.for_each(fun(it: int):void { states[it].next_states.add(next); })
                    previous_begin = current_begin
                    previous_end = current_end
                    current_begin = set::set<int>(next)
                    current_end = set::set<int>(next)
                }
                escapeing = false
                alternating = false
            }
        }
        var beginAndEnd = util::make_pair(first, current_end)
        return beginAndEnd
    }

    fun long_match(to_match: *char): int { return long_match(str::str(to_match)); }
    fun long_match(to_match: str::str): int return long_match(to_match.getBackingMemory(), 0, to_match.length())

    // Match against to_match[position, end).
    // Returns the length of the longest prefix of that span accepted by the
    // regex, or -1 if no prefix is accepted.
    fun long_match(to_match: *char, position: int, end: int): int {
        if (is_straight_string) {
            // plain literal: compare character by character
            if (regexString.length() > end-position)
                return -1
            for (var i = 0; i < regexString.length(); i++;)
                if (regexString[i] != to_match[position+i])
                    return -1
            return regexString.length();
        }

        // start with only state 0 active
        for (var i = 1; i < flagsA.size; i++;)
            flagsA[i] = false;
        flagsA[0] = true
        var num_active = 1
        var longest = -1
        var flags = &flagsA
        var next_flags = &flagsB

        for (var i = 0; i < end-position; i++;) {
            if (num_active == 0)
                return longest
            num_active = 0
            // if any active state can reach the end state, the first i characters matched
            for (var state = 0; state < flags->size; state++;) {
                if ((*flags)[state] && states[state].is_end(states)) {
                    longest = i
                    break
                }
            }
            for (var j = 0; j < next_flags->size; j++;)
                (*next_flags)[j] = false;
            for (var state = 0; state < flags->size; state++;)
                if ((*flags)[state])
                    states[state].match_char(to_match[position+i], states, next_flags, &num_active)

            var tmp = flags
            flags = next_flags
            next_flags = tmp
        }
        // all input consumed: a still-active state next to the end means the whole span matched
        for (var state = 0; state < flags->size; state++;)
            if ((*flags)[state] && states[state].is_end(states))
                return end-position
        return longest
    }
}