271 lines
11 KiB
Plaintext
271 lines
11 KiB
Plaintext
import io
|
|
import string
|
|
import util
|
|
import vector
|
|
import string
|
|
import mem
|
|
import set
|
|
import util
|
|
import serialize
|
|
|
|
// Convenience factory: wraps the character pointer in a string::string and
// hands it to the string::string overload of regex().
fun regex(in: *char):regex {
    var pattern = string::string(in)
    return regex(pattern)
}
|
|
// Factory: constructs a regex object from the given pattern string and
// returns it by value.
fun regex(in: string::string):regex {
    var compiled.construct(in):regex
    return compiled
}
|
|
// Factory: returns a regexState built with its no-argument constructor.
fun regexState(): regexState {
    var blank.construct(): regexState
    return blank
}
|
|
// Factory: returns a regexState built from a single character.
fun regexState(charIn: char): regexState {
    var single.construct(charIn): regexState
    return single
}
|
|
// Factory: returns a regexState built from a (first, last) character pair.
fun regexState(first: char, last: char): regexState {
    var span.construct(first, last): regexState
    return span
}
|
|
// One node of the matcher's state graph. A state accepts any input character
// in the inclusive range [characterBegin, characterEnd] and records, by index,
// which states may follow it.
obj regexState (Object) {
    // Inclusive character range; a single-character state stores the same
    // value in both fields.
    var characterBegin: char
    var characterEnd: char
    // Indices of successor states; they are used to index the state vector
    // passed to match_char / is_end.
    var next_states: set::set<int>

    // Single-character state: forwards to the range constructor with both
    // bounds equal to charIn.
    fun construct(charIn:char): *regexState {
        return construct(charIn, charIn)
    }

    // Range state: starts with an empty successor set and stores the bounds.
    fun construct(charFirst:char, charSecond:char): *regexState {
        next_states.construct()
        characterBegin = charFirst
        characterEnd = charSecond
        return this
    }

    // No-argument state: same as the single-character constructor with the
    // character value 0.
    fun construct(): *regexState {
        return construct((0) cast char)
    }

    // Copies both bounds and the successor set from old.
    fun copy_construct(old:*regexState): void {
        next_states.copy_construct(&old->next_states)
        characterBegin = old->characterBegin
        characterEnd = old->characterEnd
    }

    // Releases the successor set; the char fields need no cleanup.
    fun destruct():void {
        next_states.destruct()
    }

    // For every successor whose range contains input, sets that successor's
    // entry in *flags and increments *num_states.
    //   input      - the character being consumed
    //   states     - state vector that the successor indices refer to
    //   flags      - output: one bool per state, set to true for each hit
    //   num_states - output: bumped once per hit (a successor reached from
    //                several active states is counted each time)
    fun match_char(input: char, states: ref vector::vector<regexState>, flags: *vector::vector<bool>, num_states: *int) {
        next_states.for_each(fun(successor:int) {
            if (input >= states[successor].characterBegin && input <= states[successor].characterEnd) {
                (*flags)[successor] = true
                (*num_states)++
            }
        })
    }

    // True when at least one successor has characterBegin equal to 1, the
    // value this test treats as the end marker.
    fun is_end(states: ref vector::vector<regexState>):bool {
        return next_states.any_true(fun(successor: int):bool { return states[successor].characterBegin == 1; })
    }
}
|
|
|
|
// A regular expression compiled into a vector of regexState nodes and matched
// by tracking the set of active states one input character at a time.
//
// Pattern syntax handled by compile(): literals, '\' escapes, '*', '+', '?',
// '|', '(...)' groups and five-character '[a-b]' ranges. Patterns containing
// none of the special characters checked in construct() are not compiled at
// all; long_match compares them character by character instead.
obj regex (Object, Serializable) {
    // The pattern text as supplied; this is the only field that is serialized.
    var regexString: string::string
    // Compiled states. Stays empty for straight-string patterns.
    var states: vector::vector<regexState>
    // Two "is state i active" scratch vectors, sized to states.size, that
    // long_match swaps between on every consumed character.
    var flagsA: vector::vector<bool>
    var flagsB: vector::vector<bool>
    // True when the pattern can be matched by plain comparison.
    var is_straight_string: bool

    // Builds an empty regex. states is left empty while is_straight_string is
    // false, so long_match's unconditional write to flagsA[0] has nothing to
    // index on an object built this way.
    fun construct(): *regex {
        regexString.construct()
        states.construct()
        flagsA.construct()
        flagsB.construct()
        is_straight_string = false
        return this
    }

    // Builds a regex from regexStringIn, compiling it unless it is a straight
    // string. Returns this.
    fun construct(regexStringIn: string::string): *regex {
        // FIX: the argument had been mangled to "®exStringIn" (an HTML-entity
        // decode of "&reg"); the intended expression is the address of the parameter.
        regexString.copy_construct(&regexStringIn)
        states.construct()
        is_straight_string = true
        for (var i = 0; i < regexString.length(); i++;) {
            // simple implementation doesn't count escaped characters as straight string
            if (regexString[i] == '\\' || regexString[i] == '(' || regexString[i] == ')' || regexString[i] == '[' || regexString[i] == '*' || regexString[i] == '+' || regexString[i] == '?' || regexString[i] == '|') {
                is_straight_string = false
                break
            }
        }
        if (!is_straight_string) {
            var beginningAndEnd = compile(regexStringIn)
            // Append the end marker (a state whose character is 1, which is
            // what regexState::is_end looks for) and make it a successor of
            // every end state the compile returned.
            var end = states.size
            states.add(regexState((1) cast char))
            beginningAndEnd.second.for_each(fun(it: int): void { states[it].next_states.add(end); })
        }
        // construct(n) is followed by an explicit size assignment so the
        // vectors can be indexed by state number straight away.
        flagsA.construct(states.size); flagsA.size = states.size
        flagsB.construct(states.size); flagsB.size = states.size
        return this
    }

    // Copies the pattern, the straight-string flag and the compiled states;
    // the scratch flag vectors are created fresh rather than copied.
    fun copy_construct(old:*regex):void {
        regexString.copy_construct(&old->regexString)
        is_straight_string = old->is_straight_string
        states.copy_construct(&old->states)
        flagsA.construct(states.size); flagsA.size = states.size
        flagsB.construct(states.size); flagsB.size = states.size
    }

    // Destructs every member that owns storage.
    fun destruct():void {
        regexString.destruct()
        states.destruct()
        flagsA.destruct()
        flagsB.destruct()
    }

    // Serializes only the pattern text; unserialize rebuilds the rest from it.
    fun serialize(): vector::vector<char> {
        return serialize::serialize(regexString)
    }

    // Reads the pattern text from it starting at pos, rebuilds the regex from
    // it, and returns the position after the consumed bytes.
    fun unserialize(it: ref vector::vector<char>, pos: int): int {
        pos = regexString.unserialize(it, pos)
        // FIX: construct(regexString) already constructs states, flagsA and
        // flagsB. The previous code also constructed states beforehand and
        // the flag vectors again afterwards, overwriting live vectors without
        // destructing them.
        // NOTE(review): construct() copy-constructs regexString over the value
        // that was just unserialized; whether that orphans a buffer depends on
        // string::string internals - confirm.
        construct(regexString)
        return pos
    }

    // Two regexes are equal when their pattern text is equal.
    fun operator==(other: regex):bool {
        return regexString == other.regexString
    }

    // Assignment: tear down the current contents, then copy from other.
    fun operator=(other: regex):void {
        destruct()
        copy_construct(&other)
    }

    // Compiles regex_string, appending the states it needs to this->states.
    // Calls itself recursively for parenthesised groups.
    //
    // Returns a pair:
    //   first  - index of a fresh placeholder state added for this
    //            (sub)pattern; its next_states are the (sub)pattern's entry states
    //   second - the set of state indices the (sub)pattern can end on
    //
    // While scanning, current_begin/current_end describe the element just
    // parsed and previous_begin/previous_end the element before it, which is
    // what the quantifiers and '|' need to splice edges.
    fun compile(regex_string: string::string): util::pair<int, set::set<int>> {
        /*io::println(regex_string)*/
        var first = states.size; states.add(regexState())
        var previous_begin = set::set<int>()
        var previous_end = set::set<int>()
        var current_begin = set::set(first)
        var current_end = set::set(first)
        var alternating = false
        var escapeing = false

        for (var i = 0; i < regex_string.length(); i++;) {
            if (regex_string[i] == '*' && !escapeing) {
                // loop back: every end of the element may re-enter the element
                current_end.for_each(fun(item: int) states[item].next_states.add_all(current_begin);)
                // and skip: whatever preceded the element may stand in for it
                current_begin.add_all(previous_begin)
                current_end.add_all(previous_end)

            } else if (regex_string[i] == '+' && !escapeing) {
                // loop back only; the element stays mandatory
                current_end.for_each(fun(item: int) states[item].next_states.add_all(current_begin);)

            } else if (regex_string[i] == '?' && !escapeing) {
                // skip only
                current_begin.add_all(previous_begin)
                current_end.add_all(previous_end)

            } else if (regex_string[i] == '|' && !escapeing) {
                // the next element is merged into the current one instead of
                // being chained after it
                alternating = true
            } else if (regex_string[i] == '(' && !escapeing) {
                // note that we don't have a ')' case, as we skip past it with our indices
                var perenEnd = i + 1
                for (var depth = 1; depth > 0; perenEnd++;) {
                    if (perenEnd >= regex_string.length())
                        util::error(string::string("can't find matching peren in: ") + regex_string)
                    // be careful, this isn't quite right yet
                    /*var not_non_special = perenEnd == 0 || (regex_string[perenEnd-1] != '\\' && regex_string[perenEnd-1] != '[' && (perenEnd+1 >= regex_string.length() || regex_string[perenEnd+1] != ']'))*/
                    // a paren directly after '[' or directly before ']' is
                    // taken as a range bound rather than as grouping
                    var not_non_special = perenEnd == 0 || (regex_string[perenEnd-1] != '[' && (perenEnd+1 >= regex_string.length() || regex_string[perenEnd+1] != ']'))
                    if (regex_string[perenEnd] == '(' && not_non_special)
                        depth++
                    else if (regex_string[perenEnd] == ')' && not_non_special)
                        depth--
                }
                var innerBeginEnd = compile(regex_string.slice(i+1, perenEnd-1))
                // NOTE: perenEnd is one past the close peren
                i = perenEnd-1

                if (alternating) {
                    previous_end.for_each(fun(it: int):void { states[it].next_states.add_all(states[innerBeginEnd.first].next_states); } )
                    current_begin.add_all(states[innerBeginEnd.first].next_states)
                    current_end.add_all(innerBeginEnd.second)
                } else {
                    current_end.for_each(fun(it: int):void { states[it].next_states.add_all(states[innerBeginEnd.first].next_states); } )
                    previous_begin = current_begin
                    previous_end = current_end
                    current_begin = states[innerBeginEnd.first].next_states
                    current_end = innerBeginEnd.second
                }
                alternating = false

            } else if (regex_string[i] == '\\' && !escapeing) {
                // the following character is taken literally
                escapeing = true
            } else {
                var next: int
                if (regex_string[i] == '[' && !escapeing) {
                    // FIX: the range bounds are read from i+1 and i+3; refuse a
                    // pattern that ends before them instead of indexing past
                    // the end of the string.
                    if (i + 3 >= regex_string.length())
                        util::error(string::string("incomplete character range in: ") + regex_string)
                    next = states.size; states.add(regexState(regex_string[i+1], regex_string[i+3]))
                    i += 4 // [a-b] is 5, i++ adds one
                } else {
                    next = states.size; states.add(regexState(regex_string[i]))
                }
                if (alternating) {
                    previous_end.for_each(fun(it: int):void { states[it].next_states.add(next); })
                    current_begin.add(next)
                    current_end.add(next)
                } else {
                    current_end.for_each(fun(it: int):void { states[it].next_states.add(next); })
                    previous_begin = current_begin
                    previous_end = current_end
                    current_begin = set::set(next)
                    current_end = set::set(next)
                }
                escapeing = false
                alternating = false
            }
        }
        var beginAndEnd = util::make_pair(first, current_end)
        return beginAndEnd
    }

    // Convenience overloads: match against a whole string.
    fun long_match(to_match: *char): int { return long_match(string::string(to_match)); }
    fun long_match(to_match: string::string): int return long_match(to_match.getBackingMemory(), 0, to_match.length())

    // Matches this regex against to_match[position .. end).
    // Returns the number of characters matched starting at position (the
    // largest count at which an active state could reach the end marker), or
    // -1 when nothing matched.
    fun long_match(to_match: *char, position: int, end: int): int {
        if (is_straight_string) {
            // plain comparison; either the full pattern matches or nothing does
            if (regexString.length() > end-position)
                return -1
            for (var i = 0; i < regexString.length(); i++;)
                if (regexString[i] != to_match[position+i])
                    return -1
            return regexString.length();
        }
        /*var next = set::set(begin)*/
        // only state 0 is active at the start
        for (var i = 1; i < flagsA.size; i++;)
            flagsA[i] = false;
        flagsA[0] = true
        var num_active = 1
        var longest = -1
        var flags = &flagsA
        var next_flags = &flagsB
        for (var i = 0; i < end-position; i++;) {
            if (num_active == 0)
                return longest
            num_active = 0
            // i characters have been consumed; remember i if the regex could
            // stop here
            for (var state = 0; state < flags->size; state++;) {
                if ((*flags)[state] && states[state].is_end(states)) {
                    longest = i
                    break
                }
            }
            for (var j = 0; j < next_flags->size; j++;)
                (*next_flags)[j] = false;
            for (var state = 0; state < flags->size; state++;)
                if ((*flags)[state])
                    states[state].match_char(to_match[position+i], states, next_flags, &num_active)
            // swap the two flag vectors for the next character
            var tmp = flags
            flags = next_flags
            next_flags = tmp
        }
        // all input consumed: a match here covers the entire range
        for (var state = 0; state < flags->size; state++;)
            if ((*flags)[state] && states[state].is_end(states))
                return end-position
        return longest
    }
}
|
|
|