Regex rewrite for big speed gain, some extras (--no-c-compile), -O2 is default now
This commit is contained in:
13
kraken.krak
13
kraken.krak
@@ -45,8 +45,9 @@ fun main(argc: int, argv: **char):int {
|
|||||||
}
|
}
|
||||||
var input_file_offset = 1
|
var input_file_offset = 1
|
||||||
var interpret_instead = false
|
var interpret_instead = false
|
||||||
var opt_str = string("-O3")
|
var opt_str = string("-O2")
|
||||||
var line_ctrl = false
|
var line_ctrl = false
|
||||||
|
var compile_c = true
|
||||||
var positional_args = vector<string>()
|
var positional_args = vector<string>()
|
||||||
var flags = set<string>()
|
var flags = set<string>()
|
||||||
for (var i = 1; i < argc; i++;) {
|
for (var i = 1; i < argc; i++;) {
|
||||||
@@ -57,6 +58,8 @@ fun main(argc: int, argv: **char):int {
|
|||||||
opt_str = arg_str
|
opt_str = arg_str
|
||||||
} else if (arg_str == "-g") {
|
} else if (arg_str == "-g") {
|
||||||
line_ctrl = true
|
line_ctrl = true
|
||||||
|
} else if (arg_str == "--no-c-compile") {
|
||||||
|
compile_c = false
|
||||||
} else if (arg_str.length() > 2 && arg_str.first() == '-') {
|
} else if (arg_str.length() > 2 && arg_str.first() == '-') {
|
||||||
flags.add(arg_str.slice(1,-1))
|
flags.add(arg_str.slice(1,-1))
|
||||||
} else {
|
} else {
|
||||||
@@ -164,9 +167,11 @@ fun main(argc: int, argv: **char):int {
|
|||||||
var c_output_pair = c_generator.generate_c(importer.name_ast_map, importer.ast_pass.ast_to_syntax)
|
var c_output_pair = c_generator.generate_c(importer.name_ast_map, importer.ast_pass.ast_to_syntax)
|
||||||
var kraken_c_output_name = kraken_file_name + ".c"
|
var kraken_c_output_name = kraken_file_name + ".c"
|
||||||
write_file(kraken_c_output_name, c_output_pair.first)
|
write_file(kraken_c_output_name, c_output_pair.first)
|
||||||
var compile_string = "cc -g " + opt_str + " -Wno-int-to-pointer-cast -Wno-pointer-to-int-cast -std=c99 " + c_output_pair.second + " " + kraken_c_output_name + " -o " + executable_name
|
if (compile_c) {
|
||||||
printlnerr(compile_string)
|
var compile_string = "cc -g " + opt_str + " -Wno-int-to-pointer-cast -Wno-pointer-to-int-cast -std=c99 " + c_output_pair.second + " " + kraken_c_output_name + " -o " + executable_name
|
||||||
system(compile_string)
|
printlnerr(compile_string)
|
||||||
|
system(compile_string)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return 0
|
return 0
|
||||||
|
|||||||
@@ -912,7 +912,12 @@ obj ast_transformation (Object) {
|
|||||||
}
|
}
|
||||||
if (!possible_value)
|
if (!possible_value)
|
||||||
error(node, concat_symbol_tree(node) + ": HAS NO POSSIBLE FUNCTION OR FUNCTION TEMPLATE SOLUTIONS\nlooking for: " +
|
error(node, concat_symbol_tree(node) + ": HAS NO POSSIBLE FUNCTION OR FUNCTION TEMPLATE SOLUTIONS\nlooking for: " +
|
||||||
concat_symbol_tree(node->children[0]) + "(" + searching_for.function.reduce(fun(n:*type, s:string):string return s+","+n->to_string();, string()) + ")")
|
concat_symbol_tree(node->children[0]) + "(" + searching_for.function.reduce(fun(n:*type, s:string):string {
|
||||||
|
if (n)
|
||||||
|
return s+","+n->to_string()
|
||||||
|
else
|
||||||
|
return s+",null"
|
||||||
|
}, string()) + ")")
|
||||||
return possible_value
|
return possible_value
|
||||||
} else if (node->children.size == 2) {
|
} else if (node->children.size == 2) {
|
||||||
var template_inst = get_node("template_inst", node)
|
var template_inst = get_node("template_inst", node)
|
||||||
|
|||||||
@@ -15,12 +15,23 @@ fun regex(in: string::string):regex {
|
|||||||
var out.construct(in):regex
|
var out.construct(in):regex
|
||||||
return out
|
return out
|
||||||
}
|
}
|
||||||
|
fun regexState(): regexState {
|
||||||
|
var to_ret.construct(): regexState
|
||||||
|
return to_ret
|
||||||
|
}
|
||||||
|
fun regexState(charIn: char): regexState {
|
||||||
|
var to_ret.construct(charIn): regexState
|
||||||
|
return to_ret
|
||||||
|
}
|
||||||
|
fun regexState(first: char, last: char): regexState {
|
||||||
|
var to_ret.construct(first, last): regexState
|
||||||
|
return to_ret
|
||||||
|
}
|
||||||
obj regexState (Object) {
|
obj regexState (Object) {
|
||||||
// if only one character, both are the same
|
// if only one character, both are the same
|
||||||
var characterBegin: char
|
var characterBegin: char
|
||||||
var characterEnd: char
|
var characterEnd: char
|
||||||
var next_states: set::set<*regexState>
|
var next_states: set::set<int>
|
||||||
fun construct(charIn:char): *regexState {
|
fun construct(charIn:char): *regexState {
|
||||||
return construct(charIn, charIn)
|
return construct(charIn, charIn)
|
||||||
}
|
}
|
||||||
@@ -41,26 +52,37 @@ obj regexState (Object) {
|
|||||||
fun destruct():void {
|
fun destruct():void {
|
||||||
next_states.destruct()
|
next_states.destruct()
|
||||||
}
|
}
|
||||||
fun match_char(input: char): set::set<*regexState> {
|
fun match_char(input: char, states: ref vector::vector<regexState>, flags: *vector::vector<bool>, num_states: *int) {
|
||||||
return next_states.filter(fun(it:*regexState):bool { return it->characterBegin <= input && input <= it->characterEnd; })
|
next_states.for_each(fun(it:int) {
|
||||||
|
if (states[it].characterBegin <= input && input <= states[it].characterEnd) {
|
||||||
|
(*flags)[it] = true
|
||||||
|
(*num_states)++
|
||||||
|
}
|
||||||
|
})
|
||||||
}
|
}
|
||||||
fun is_end():bool {
|
fun is_end(states: ref vector::vector<regexState>):bool {
|
||||||
return next_states.any_true(fun(state: *regexState):bool { return state->characterBegin == 1; })
|
return next_states.any_true(fun(state: int):bool { return states[state].characterBegin == 1; })
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
obj regex (Object, Serializable) {
|
obj regex (Object, Serializable) {
|
||||||
var regexString: string::string
|
var regexString: string::string
|
||||||
var begin: *regexState
|
var states: vector::vector<regexState>
|
||||||
var referenceCounter: *int
|
var flagsA: vector::vector<bool>
|
||||||
|
var flagsB: vector::vector<bool>
|
||||||
var is_straight_string: bool
|
var is_straight_string: bool
|
||||||
|
|
||||||
fun construct(): *regex {
|
fun construct(): *regex {
|
||||||
regexString.construct()
|
regexString.construct()
|
||||||
|
states.construct()
|
||||||
|
flagsA.construct()
|
||||||
|
flagsB.construct()
|
||||||
|
is_straight_string = false
|
||||||
return this
|
return this
|
||||||
}
|
}
|
||||||
fun construct(regexStringIn: string::string): *regex {
|
fun construct(regexStringIn: string::string): *regex {
|
||||||
regexString.copy_construct(&regexStringIn)
|
regexString.copy_construct(&regexStringIn)
|
||||||
|
states.construct()
|
||||||
is_straight_string = true
|
is_straight_string = true
|
||||||
for (var i = 0; i < regexString.length(); i++;) {
|
for (var i = 0; i < regexString.length(); i++;) {
|
||||||
// simple implementation doesn't count escaped characters as straight string
|
// simple implementation doesn't count escaped characters as straight string
|
||||||
@@ -70,44 +92,40 @@ obj regex (Object, Serializable) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (!is_straight_string) {
|
if (!is_straight_string) {
|
||||||
referenceCounter = mem::new<int>()
|
|
||||||
*referenceCounter = 1
|
|
||||||
var beginningAndEnd = compile(regexStringIn)
|
var beginningAndEnd = compile(regexStringIn)
|
||||||
// init our begin, and the end state as the next state of each end
|
// init our begin, and the end state as the next state of each end
|
||||||
begin = beginningAndEnd.first
|
var end = states.size
|
||||||
var end = mem::new<regexState>()->construct((1) cast char)
|
states.add(regexState((1) cast char))
|
||||||
beginningAndEnd.second.for_each(fun(it: *regexState): void { it->next_states.add(end); })
|
beginningAndEnd.second.for_each(fun(it: int): void { states[it].next_states.add(end); })
|
||||||
}
|
}
|
||||||
|
flagsA.construct(states.size); flagsA.size = states.size
|
||||||
|
flagsB.construct(states.size); flagsB.size = states.size
|
||||||
return this
|
return this
|
||||||
}
|
}
|
||||||
|
|
||||||
fun copy_construct(old:*regex):void {
|
fun copy_construct(old:*regex):void {
|
||||||
regexString.copy_construct(&old->regexString)
|
regexString.copy_construct(&old->regexString)
|
||||||
is_straight_string = old->is_straight_string
|
is_straight_string = old->is_straight_string
|
||||||
if (!is_straight_string) {
|
states.copy_construct(&old->states)
|
||||||
begin = old->begin
|
flagsA.construct(states.size); flagsA.size = states.size
|
||||||
referenceCounter = old->referenceCounter
|
flagsB.construct(states.size); flagsB.size = states.size
|
||||||
*referenceCounter += 1
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
fun destruct():void {
|
fun destruct():void {
|
||||||
regexString.destruct()
|
regexString.destruct()
|
||||||
if (!is_straight_string) {
|
states.destruct()
|
||||||
*referenceCounter -= 1
|
flagsA.destruct()
|
||||||
if (*referenceCounter == 0) {
|
flagsB.destruct()
|
||||||
util::safe_recursive_delete(begin, fun(it: *regexState): set::set<*regexState> { return it->next_states; } )
|
|
||||||
mem::delete(referenceCounter)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
fun serialize(): vector::vector<char> {
|
fun serialize(): vector::vector<char> {
|
||||||
return serialize::serialize(regexString)
|
return serialize::serialize(regexString)
|
||||||
}
|
}
|
||||||
fun unserialize(it: ref vector::vector<char>, pos: int): int {
|
fun unserialize(it: ref vector::vector<char>, pos: int): int {
|
||||||
var temp = string::string()
|
pos = regexString.unserialize(it, pos)
|
||||||
util::unpack(temp, pos) = serialize::unserialize<string::string>(it, pos)
|
states.construct()
|
||||||
construct(temp)
|
construct(regexString)
|
||||||
|
flagsA.construct(states.size); flagsA.size = states.size
|
||||||
|
flagsB.construct(states.size); flagsB.size = states.size
|
||||||
return pos
|
return pos
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -120,11 +138,11 @@ obj regex (Object, Serializable) {
|
|||||||
copy_construct(&other)
|
copy_construct(&other)
|
||||||
}
|
}
|
||||||
|
|
||||||
fun compile(regex_string: string::string): util::pair<*regexState, set::set<*regexState>> {
|
fun compile(regex_string: string::string): util::pair<int, set::set<int>> {
|
||||||
/*io::println(regex_string)*/
|
/*io::println(regex_string)*/
|
||||||
var first = mem::new<regexState>()->construct()
|
var first = states.size; states.add(regexState())
|
||||||
var previous_begin = set::set<*regexState>()
|
var previous_begin = set::set<int>()
|
||||||
var previous_end = set::set<*regexState>()
|
var previous_end = set::set<int>()
|
||||||
var current_begin = set::set(first)
|
var current_begin = set::set(first)
|
||||||
var current_end = set::set(first)
|
var current_end = set::set(first)
|
||||||
var alternating = false
|
var alternating = false
|
||||||
@@ -132,12 +150,12 @@ obj regex (Object, Serializable) {
|
|||||||
|
|
||||||
for (var i = 0; i < regex_string.length(); i++;) {
|
for (var i = 0; i < regex_string.length(); i++;) {
|
||||||
if (regex_string[i] == '*' && !escapeing) {
|
if (regex_string[i] == '*' && !escapeing) {
|
||||||
current_end.for_each(fun(item: *regexState) item->next_states.add_all(current_begin);)
|
current_end.for_each(fun(item: int) states[item].next_states.add_all(current_begin);)
|
||||||
current_begin.add_all(previous_begin)
|
current_begin.add_all(previous_begin)
|
||||||
current_end.add_all(previous_end)
|
current_end.add_all(previous_end)
|
||||||
|
|
||||||
} else if (regex_string[i] == '+' && !escapeing) {
|
} else if (regex_string[i] == '+' && !escapeing) {
|
||||||
current_end.for_each(fun(item: *regexState) item->next_states.add_all(current_begin);)
|
current_end.for_each(fun(item: int) states[item].next_states.add_all(current_begin);)
|
||||||
|
|
||||||
} else if (regex_string[i] == '?' && !escapeing) {
|
} else if (regex_string[i] == '?' && !escapeing) {
|
||||||
current_begin.add_all(previous_begin)
|
current_begin.add_all(previous_begin)
|
||||||
@@ -145,7 +163,6 @@ obj regex (Object, Serializable) {
|
|||||||
|
|
||||||
} else if (regex_string[i] == '|' && !escapeing) {
|
} else if (regex_string[i] == '|' && !escapeing) {
|
||||||
alternating = true
|
alternating = true
|
||||||
|
|
||||||
} else if (regex_string[i] == '(' && !escapeing) {
|
} else if (regex_string[i] == '(' && !escapeing) {
|
||||||
// note that we don't have a ')' case, as we skip past it with our indicies
|
// note that we don't have a ')' case, as we skip past it with our indicies
|
||||||
var perenEnd = i + 1
|
var perenEnd = i + 1
|
||||||
@@ -165,35 +182,34 @@ obj regex (Object, Serializable) {
|
|||||||
i = perenEnd-1
|
i = perenEnd-1
|
||||||
|
|
||||||
if (alternating) {
|
if (alternating) {
|
||||||
previous_end.for_each(fun(it: *regexState):void { it->next_states.add_all(innerBeginEnd.first->next_states); } )
|
previous_end.for_each(fun(it: int):void { states[it].next_states.add_all(states[innerBeginEnd.first].next_states); } )
|
||||||
current_begin.add_all(innerBeginEnd.first->next_states)
|
current_begin.add_all(states[innerBeginEnd.first].next_states)
|
||||||
current_end.add_all(innerBeginEnd.second)
|
current_end.add_all(innerBeginEnd.second)
|
||||||
} else {
|
} else {
|
||||||
current_end.for_each(fun(it: *regexState):void { it->next_states.add_all(innerBeginEnd.first->next_states); } )
|
current_end.for_each(fun(it: int):void { states[it].next_states.add_all(states[innerBeginEnd.first].next_states); } )
|
||||||
previous_begin = current_begin
|
previous_begin = current_begin
|
||||||
previous_end = current_end
|
previous_end = current_end
|
||||||
current_begin = innerBeginEnd.first->next_states
|
current_begin = states[innerBeginEnd.first].next_states
|
||||||
current_end = innerBeginEnd.second
|
current_end = innerBeginEnd.second
|
||||||
}
|
}
|
||||||
alternating = false
|
alternating = false
|
||||||
|
|
||||||
} else if (regex_string[i] == '\\' && !escapeing) {
|
} else if (regex_string[i] == '\\' && !escapeing) {
|
||||||
escapeing = true
|
escapeing = true
|
||||||
|
|
||||||
} else {
|
} else {
|
||||||
var next: *regexState
|
var next: int
|
||||||
if (regex_string[i] == '[' && !escapeing) {
|
if (regex_string[i] == '[' && !escapeing) {
|
||||||
next = mem::new<regexState>()->construct(regex_string[i+1], regex_string[i+3])
|
next = states.size; states.add(regexState(regex_string[i+1], regex_string[i+3]))
|
||||||
i += 4 // [a-b] is 5, i++ adds one
|
i += 4 // [a-b] is 5, i++ adds one
|
||||||
} else {
|
} else {
|
||||||
next = mem::new<regexState>()->construct(regex_string[i])
|
next = states.size; states.add(regexState(regex_string[i]))
|
||||||
}
|
}
|
||||||
if (alternating) {
|
if (alternating) {
|
||||||
previous_end.for_each(fun(it: *regexState):void { it->next_states.add(next); })
|
previous_end.for_each(fun(it: int):void { states[it].next_states.add(next); })
|
||||||
current_begin.add(next)
|
current_begin.add(next)
|
||||||
current_end.add(next)
|
current_end.add(next)
|
||||||
} else {
|
} else {
|
||||||
current_end.for_each(fun(it: *regexState):void { it->next_states.add(next); })
|
current_end.for_each(fun(it: int):void { states[it].next_states.add(next); })
|
||||||
previous_begin = current_begin
|
previous_begin = current_begin
|
||||||
previous_end = current_end
|
previous_end = current_end
|
||||||
current_begin = set::set(next)
|
current_begin = set::set(next)
|
||||||
@@ -219,18 +235,35 @@ obj regex (Object, Serializable) {
|
|||||||
return regexString.length();
|
return regexString.length();
|
||||||
}
|
}
|
||||||
/*var next = set::set(begin)*/
|
/*var next = set::set(begin)*/
|
||||||
var next.construct(): set::set<*regexState>
|
for (var i = 1; i < flagsA.size; i++;)
|
||||||
next.add(begin)
|
flagsA[i] = false;
|
||||||
|
flagsA[0] = true
|
||||||
|
var num_active = 1
|
||||||
var longest = -1
|
var longest = -1
|
||||||
|
var flags = &flagsA
|
||||||
|
var next_flags = &flagsB
|
||||||
for (var i = 0; i < end-position; i++;) {
|
for (var i = 0; i < end-position; i++;) {
|
||||||
if (next.size() == 0)
|
if (num_active == 0)
|
||||||
return longest
|
return longest
|
||||||
if (next.any_true(fun(state: *regexState):bool { return state->is_end(); }))
|
num_active = 0
|
||||||
longest = i
|
for (var state = 0; state < flags->size; state++;) {
|
||||||
next = next.flatten_map(fun(state: *regexState): set::set<*regexState> { return state->match_char(to_match[position+i]); })
|
if ((*flags)[state] && states[state].is_end(states)) {
|
||||||
|
longest = i
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for (var j = 0; j < next_flags->size; j++;)
|
||||||
|
(*next_flags)[j] = false;
|
||||||
|
for (var state = 0; state < flags->size; state++;)
|
||||||
|
if ((*flags)[state])
|
||||||
|
states[state].match_char(to_match[position+i], states, next_flags, &num_active)
|
||||||
|
var tmp = flags
|
||||||
|
flags = next_flags
|
||||||
|
next_flags = tmp
|
||||||
}
|
}
|
||||||
if (next.any_true(fun(state: *regexState):bool { return state->is_end(); }))
|
for (var state = 0; state < flags->size; state++;)
|
||||||
return end-position
|
if ((*flags)[state] && states[state].is_end(states))
|
||||||
|
return end-position
|
||||||
return longest
|
return longest
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user