import io
import string
import util
import vector
import string
import mem
import set
import util
import serialize

// Convenience factory: build a regex from a C-string pattern.
fun regex(in: *char):regex {
    return regex(string::string(in))
}

// Convenience factory: build a regex from a string pattern.
fun regex(in: string::string):regex {
    var out.construct(in):regex
    return out
}

// One node of the state graph built by regex::compile.
// A state accepts any character in the inclusive range
// [characterBegin, characterEnd] and links to the states that may follow it.
obj regexState (Object) {
    // if only one character, both are the same
    var characterBegin: char
    var characterEnd: char
    var next_states: set::set<*regexState>

    // State matching exactly one character.
    fun construct(charIn:char): *regexState {
        return construct(charIn, charIn)
    }
    // State matching the inclusive range charFirst..charSecond.
    fun construct(charFirst:char, charSecond:char): *regexState {
        characterBegin = charFirst
        characterEnd = charSecond
        next_states.construct()
        return this
    }
    // Default state uses character 0; compile uses this for its dummy start state.
    fun construct(): *regexState {
        return construct((0) cast char)
    }
    // Copies the range and the set of successor pointers (the successors
    // themselves are shared, not duplicated).
    fun copy_construct(old:*regexState): void {
        characterBegin = old->characterBegin
        characterEnd = old->characterEnd
        next_states.copy_construct(&old->next_states)
    }
    fun destruct():void {
        next_states.destruct()
    }
    // Returns the successor states whose range contains input.
    fun match_char(input: char): set::set<*regexState> {
        return next_states.filter(fun(it:*regexState):bool { return it->characterBegin <= input && input <= it->characterEnd; })
    }
    // True when one of the successors is the end marker, i.e. a state whose
    // characterBegin is 1 (regex::construct appends such a state to every
    // end state of the compiled graph).
    fun is_end():bool {
        return next_states.any_true(fun(state: *regexState):bool { return state->characterBegin == 1; })
    }
}

// A compiled regular expression.
// Patterns without any special character are kept as plain strings and
// compared directly; all others are compiled into a graph of regexState
// nodes that is shared between copies through referenceCounter.
obj regex (Object, Serializable) {
    var regexString: string::string
    // start of the state graph; only set when !is_straight_string
    var begin: *regexState
    // number of regex objects sharing begin; only set when !is_straight_string
    var referenceCounter: *int
    var is_straight_string: bool

    // Empty regex. Marked as a straight string so that copy_construct and
    // destruct never touch the (unset) begin / referenceCounter pointers.
    fun construct(): *regex {
        regexString.construct()
        is_straight_string = true
        return this
    }
    // Builds the regex for regexStringIn, compiling a state graph only when
    // the pattern contains a special character.
    fun construct(regexStringIn: string::string): *regex {
        regexString.copy_construct(&regexStringIn)
        is_straight_string = true
        for (var i = 0; i < regexString.length(); i++;) {
            // simple implementation doesn't count escaped characters as straight string
            if (regexString[i] == '\\' || regexString[i] == '(' || regexString[i] == ')' || regexString[i] == '[' || regexString[i] == '*' || regexString[i] == '+' || regexString[i] == '?' || regexString[i] == '|') {
                is_straight_string = false
                break
            }
        }
        if (!is_straight_string) {
            referenceCounter = mem::new<int>()
            *referenceCounter = 1
            var beginningAndEnd = compile(regexStringIn)
            // init our begin, and the end state as the next state of each end
            begin = beginningAndEnd.first
            var end = mem::new<regexState>()->construct((1) cast char)
            beginningAndEnd.second.for_each(fun(it: *regexState): void { it->next_states.add(end); })
        }
        return this
    }
    // Copies the pattern string; a compiled graph is shared with old and the
    // shared counter is incremented rather than the graph being duplicated.
    fun copy_construct(old:*regex):void {
        regexString.copy_construct(&old->regexString)
        is_straight_string = old->is_straight_string
        if (!is_straight_string) {
            begin = old->begin
            referenceCounter = old->referenceCounter
            *referenceCounter += 1
        }
    }
    // Releases the pattern string and drops one reference on the shared graph;
    // the last owner deletes every reachable state and the counter itself.
    fun destruct():void {
        regexString.destruct()
        if (!is_straight_string) {
            *referenceCounter -= 1
            if (*referenceCounter == 0) {
                util::safe_recursive_delete(begin, fun(it: *regexState): set::set<*regexState> { return it->next_states; } )
                mem::delete(referenceCounter)
            }
        }
    }
    // Only the pattern string is serialized; the graph is rebuilt on load.
    fun serialize(): vector::vector<char> {
        return serialize::serialize(regexString)
    }
    // Reads the pattern string starting at pos, re-runs construct on it and
    // returns the position following the consumed data.
    fun unserialize(it: ref vector::vector<char>, pos: int): int {
        var temp = string::string()
        util::unpack(temp, pos) = serialize::unserialize<string::string>(it, pos)
        construct(temp)
        return pos
    }
    // Two regexes are equal when their pattern strings are equal.
    fun operator==(other: regex):bool {
        return regexString == other.regexString
    }
    fun operator=(other: regex):void {
        destruct()
        copy_construct(&other)
    }
    // Compiles regex_string into a state graph.
    // Returns a pair of (dummy start state, set of states the pattern can end in).
    // The dummy start state matches nothing itself; its next_states are the
    // real entry points of the pattern.
    //
    // While scanning, current_begin/current_end describe the entry and exit
    // states of the most recent item, and previous_begin/previous_end those of
    // the item before it, so that postfix operators can loop back or skip.
    fun compile(regex_string: string::string): util::pair<*regexState, set::set<*regexState>> {
        /*io::println(regex_string)*/
        var first = mem::new<regexState>()->construct()
        var previous_begin = set::set<*regexState>()
        var previous_end = set::set<*regexState>()
        var current_begin = set::set(first)
        var current_end = set::set(first)
        var alternating = false
        var escapeing = false

        for (var i = 0; i < regex_string.length(); i++;) {
            if (regex_string[i] == '*' && !escapeing) {
                // zero or more: loop back to the item's start, and let the item be skipped
                current_end.for_each(fun(item: *regexState) item->next_states.add_all(current_begin);)
                current_begin.add_all(previous_begin)
                current_end.add_all(previous_end)
            } else if (regex_string[i] == '+' && !escapeing) {
                // one or more: loop back only
                current_end.for_each(fun(item: *regexState) item->next_states.add_all(current_begin);)
            } else if (regex_string[i] == '?' && !escapeing) {
                // zero or one: skip only
                current_begin.add_all(previous_begin)
                current_end.add_all(previous_end)
            } else if (regex_string[i] == '|' && !escapeing) {
                // the next item is attached in parallel with the current one
                alternating = true
            } else if (regex_string[i] == '(' && !escapeing) {
                // note that we don't have a ')' case, as we skip past it with our indicies
                var perenEnd = i + 1
                for (var depth = 1; depth > 0; perenEnd++;) {
                    if (perenEnd >= regex_string.length())
                        util::error(string::string("can't find matching peren in: ") + regex_string)
                    // be careful, this isn't quite right yet
                    /*var not_non_special = perenEnd == 0 || (regex_string[perenEnd-1] != '\\' && regex_string[perenEnd-1] != '[' && (perenEnd+1 >= regex_string.length() || regex_string[perenEnd+1] != ']'))*/
                    // a paren directly after '[' or directly before ']' is not counted
                    var not_non_special = perenEnd == 0 || (regex_string[perenEnd-1] != '[' && (perenEnd+1 >= regex_string.length() || regex_string[perenEnd+1] != ']'))
                    if (regex_string[perenEnd] == '(' && not_non_special)
                        depth++
                    else if (regex_string[perenEnd] == ')' && not_non_special)
                        depth--
                }
                var innerBeginEnd = compile(regex_string.slice(i+1, perenEnd-1))
                // NOTE: perenEnd is one past the close peren
                i = perenEnd-1

                if (alternating) {
                    previous_end.for_each(fun(it: *regexState):void { it->next_states.add_all(innerBeginEnd.first->next_states); } )
                    current_begin.add_all(innerBeginEnd.first->next_states)
                    current_end.add_all(innerBeginEnd.second)
                } else {
                    current_end.for_each(fun(it: *regexState):void { it->next_states.add_all(innerBeginEnd.first->next_states); } )
                    previous_begin = current_begin
                    previous_end = current_end
                    current_begin = innerBeginEnd.first->next_states
                    current_end = innerBeginEnd.second
                }
                alternating = false
            } else if (regex_string[i] == '\\' && !escapeing) {
                // the following character is taken literally
                escapeing = true
            } else {
                var next: *regexState
                if (regex_string[i] == '[' && !escapeing) {
                    // all five characters of "[a-b]" must be present before indexing into them
                    if (i + 4 >= regex_string.length()) {
                        util::error(string::string("incomplete character range in: ") + regex_string)
                    }
                    next = mem::new<regexState>()->construct(regex_string[i+1], regex_string[i+3])
                    i += 4 // [a-b] is 5, i++ adds one
                } else {
                    next = mem::new<regexState>()->construct(regex_string[i])
                }
                if (alternating) {
                    previous_end.for_each(fun(it: *regexState):void { it->next_states.add(next); })
                    current_begin.add(next)
                    current_end.add(next)
                } else {
                    current_end.for_each(fun(it: *regexState):void { it->next_states.add(next); })
                    previous_begin = current_begin
                    previous_end = current_end
                    current_begin = set::set(next)
                    current_end = set::set(next)
                }
                escapeing = false
                alternating = false
            }
        }
        var beginAndEnd = util::make_pair(first, current_end)
        return beginAndEnd
    }

    // Length of the longest prefix of to_match accepted by this regex, or -1 if none.
    fun long_match(to_match: *char): int { return long_match(string::string(to_match)); }
    fun long_match(to_match: string::string): int return long_match(to_match.getBackingMemory(), 0, to_match.length())
    // Matches against to_match[position .. end).
    // Returns the length of the longest accepted prefix of that window, or -1
    // when no prefix is accepted.
    fun long_match(to_match: *char, position: int, end: int): int {
        if (is_straight_string) {
            // plain pattern: direct character comparison, all or nothing
            if (regexString.length() > end-position)
                return -1
            for (var i = 0; i < regexString.length(); i++;)
                if (regexString[i] != to_match[position+i])
                    return -1
            return regexString.length();
        }
        /*var next = set::set(begin)*/
        var next.construct(): set::set<*regexState>
        next.add(begin)
        var longest = -1
        for (var i = 0; i < end-position; i++;) {
            // no live states left: nothing longer can match
            if (next.size() == 0)
                return longest
            // some live state can finish here, so the first i characters match
            if (next.any_true(fun(state: *regexState):bool { return state->is_end(); }))
                longest = i
            next = next.flatten_map(fun(state: *regexState): set::set<*regexState> { return state->match_char(to_match[position+i]); })
        }
        // whole window consumed: check whether it is accepted in full
        if (next.any_true(fun(state: *regexState):bool { return state->is_end(); }))
            return end-position
        return longest
    }
}