//[of]:description //[c]Tests if a string matches with a regular expression. //[cf] //[of]:imports import "base/types" import "base/memory-allocator" import "collection/collection" import "collection/vector" import "text/string-buffer" import "text/string" //[cf] //[of]:definitions //[c] typedef char map = [256] byte equ max groups = 32 //[c] public struct regex private nodes: local vector states: local vector rules: local vector groups: int group mask: dword group starts: [max groups] string group stops: [max groups] string initial: state final: state match beginning: bool match ending: bool firsts: char map // for compilation only is case sensitive: bool end //[cf] //[of]:error codes public enum regex code regex ok regex missing cpar regex empty regex invalid range end //[c] //[cf] //[c] //[of]:initialize - release //[of]:initialize //[c] public func initialize (m: regex) initialize (states (m), 256) initialize (nodes (m), 256) initialize (rules (m), 256) groups (m) = 0 match beginning (m) = false match ending (m) = false end //[cf] //[of]:release //[c] public func release (m: regex) delete all (m) release (states (m)) release (nodes (m)) release (rules (m)) end //[cf] //[cf] //[of]:compiling //[of]:compile (source,case sensitive) //[c]Compile a regular expression //[c] public func compile (m: regex, s: string, is case sensitive: bool) is case sensitive (m) = is case sensitive delete all (m) remove all (nodes (m)) remove all (states (m)) remove all (rules (m)) groups (m) = 0 group mask (m) = 1 match beginning (m) = false match ending (m) = false def p = s if p[] == $^ match beginning (m) = true p += 1 end def parser: local regex parser source (parser) = p // 'firsts' is cleared before returning so an compile // error will make fail any search without an extra // test. So the object is valid and the match function // can be used even if the compilation fails. initialize (firsts (m)) def node = new or node (m) def code = get union (m, parser, node) if code <> regex ok return code end def s1 = new state (m) def s2 = new state (m) expand (m, node, s1, s2) initial (m) = s1 final (m) = s2 // compute firsts scan (m, s1) return code end //[c] //[c]Structures: //[c] struct regex parser source: string node: node end //[c] //[c]Subfunctions: //[of]:scan //[c]Scan all chars accessibles from this state: all epsilon transitions //[c]are followed recursively. //[c] func scan (m: regex, s: state) : void if mark (s) return end mark (s) = true each (rules (s)) ? r equ rule = r: rule if epsilon (rule) scan (m, state (rule)) else def i = 0 def p = chars (rule) def q = firsts (m) while i < 256 q [i] |= p [i] i += 1 end end end end //[cf] //[of]:expand //[c] func expand (m: regex, node: node, s1: state, s2: state) : void switch (type (node)) case nt rules equ rn = node : rule node add rule (s1, s2, rule (rn)) case nt or equ on = node : or node def mask = group mask (m) group start (s1) |= mask group stop (s2) |= mask group mask (m) <<= 1 groups (m) += 1 each (sequences (on)) ? ns equ nodes = ns : node def ss1 = s1 each (nodes) ? n equ subnode = n : node def ss2 = is last (subnode) -> s2, new state (m) expand (m, subnode, ss1, ss2) ss1 = ss2 end end case nt zero or many equ rn = node : rep node def t1 = new state (m) def t2 = new state (m) expand (m, node (rn), t1, t2) add epsilon (m, s1, s2) add epsilon (m, s1, t1) add epsilon (m, t2, s2) add epsilon (m, t2, t1) case nt one or many equ rn = node : rep node def t1 = new state (m) def t2 = new state (m) expand (m, node (rn), t1, t2) add epsilon (m, s1, t1) add epsilon (m, t2, s2) add epsilon (m, t2, t1) case nt zero or one equ rn = node : rep node expand (m, node (rn), s1, s2) add epsilon (m, s1, s2) end end //[cf] //[of]:get union func get union (m: regex, parser: regex parser, or node: or node) : regex code def sequences = sequences (or node) repeat def nodes: local collection def code = get sequence (m, parser, nodes) if code <> regex ok return code end def node = first (nodes) if is nil (node) return regex empty end add (sequences, node) def p = source (parser) if p[] <> $| break end source (parser) = p + 1 end return regex ok end //[cf] //[of]:get sequence //[c] func get sequence (m: regex, parser: regex parser, nodes: collection) : regex code initialize (nodes) repeat def code = get node (m, parser) if code <> regex ok return code end def node = node (parser) if is nil (node) break end add (nodes, node) end return regex ok end //[cf] //[of]:get node func get node (m: regex, parser: regex parser) def p = source (parser) def node = nil : node def c = p[] switch c //[of]: case nul char, ), | case nul char, $), $| // empty //[cf] //[of]: case $ case $$ p += 1 if is nul (p[]) match ending (m) = true else node = new char node (m, c) end //[cf] //[of]: case ( case $( def or node = new or node (m) source (parser) = p+1 def code = get union (m, parser, or node) if code <> regex ok return code end p = source (parser) if p[] <> $) return regex missing cpar end node = or node p += 1 //[cf] //[of]: case [ case $[ def rule = new char rule (m) def rule node = new rule node (m, rule) p += 1 p = get ranges (m, rule, p) if is nil (p) || p[] <> $] source (parser) = p return regex invalid range end node = rule node p += 1 //[cf] //[of]: case \\ case $\ p += 1 c = get escape char (p[]) node = new char node (m, c) p += 1 //[cf] //[of]: case . case $. node = new rule node (m, new any rule (m)) p += 1 //[cf] //[of]: else else node = new char node (m, c) p += 1 //[cf] end // Check repeaters if not nil (node) c = p[] if c == $* p += 1 node = new repeat node (m, node, nt zero or many) elsif c == $+ p += 1 node = new repeat node (m, node, nt one or many) elsif c == $? p += 1 node = new repeat node (m, node, nt zero or one) end end source (parser) = p node (parser) = node return regex ok end //[c] //[of]:get ranges func get ranges (m: regex, rule: rule, s: string) def p = s def invert = false if p[] == $^ invert = true p += 1 end repeat def c = p [] if is nul (c) || c == $] break end p += 1 def d = p [] if c == $\ && d <> nul char c = get escape char (d) p += 1 d = p [] end if d <> $- || p [1] == $] set (rule, c, is case sensitive (m)) else p += 1 d = p [] if d == $\ && p [1] <> nul char d = get escape char (d) p += 1 end if d <> nul char set (rule, c, d, is case sensitive (m)) p += 1 end end end if invert invert (rule) end return p end //[cf] //[of]:new char node //[c] func new char node (m: regex, c: char) return new rule node (m, new char rule (m, c)) end //[cf] //[cf] //[cf] //[cf] //[of]:matching //[of]:match (s) //[c]Returns the end position or nil if no match //[c] public func match (m: regex, string: string) // quick test if firsts (m) [string[0]:byte:int] == 0:byte return nil end // test without group (faster) if is nil (test without group (m, string)) return nil end // test with groups (slower) return test with groups (m, string) end //[c] equ stack size = 1024 //[c] public func test without group (m: regex, string: string) def stack: [stack size] local scan def top = stack + stack size def sp = top def final = final (m) def s = initial (m) def p = string repeat if s == final return p end def c = p[] each (rules (s)) ? r equ rule = r : rule if epsilon (rule) if sp == stack return nil // stack overflow end sp -= 1 s (sp []) = state (rule) p (sp []) = p elsif match (rule, c) if sp == stack return nil // stack overflow end sp -= 1 s (sp []) = state (rule) p (sp []) = p + 1 end end // pop next if sp == top break end s = s (sp []) p = p (sp []) sp += 1 end return nil end //[c] public func test with groups (m: regex, string: string) reset groups (m) def stack: [stack size] local scan def top = stack + stack size def sp = top def final = final (m) def s = initial (m) def p = string repeat if group start (s) <> 0 set group start (m, s, p) end if group stop (s) <> 0 set group stop (m, s, p) end if s == final return p end def c = p[] each (rules (s)) ? r equ rule = r : rule if epsilon (rule) if sp == stack return nil // stack overflow end sp -= 1 s (sp []) = state (rule) p (sp []) = p elsif match (rule, c) if sp == stack return nil // stack overflow end sp -= 1 s (sp []) = state (rule) p (sp []) = p + 1 end end // pop next if sp == top break end s = s (sp []) p = p (sp []) sp += 1 end return nil end //[c] //[c]Structures: //[c] struct scan s: state p: string end //[c] //[c]Subfunctions: //[of]:set group start func set group start (m: regex, s: state, p: string) def i = 0 def mask = group start (s) while mask <> 0 if (mask & 1) <> 0 group starts (m) [i] = p end i += 1 mask >>= 1 end end //[cf] //[of]:set group stop func set group stop (m: regex, s: state, p: string) def i = 0 def mask = group stop (s) while mask <> 0 if (mask & 1) <> 0 group stops (m) [i] = p end i += 1 mask >>= 1 end end //[cf] //[of]:reset groups //[c] func reset groups (m: regex) def i = 0 def n = number of groups (m) while i < n group starts (m) [i] = nil group stops (m) [i] = nil i += 1 end end //[cf] //[cf] //[cf] //[of]:accessing //[of]:number of groups //[c]Returns the number of groups //[c] //[c]The number of groups is known after compilation. //[c] public func number of groups (m: regex) return groups (m) end //[cf] //[of]:append (index, s) //[c]Appends the content of the i-th group to the string buffer //[c] //[c]The method can be invoked after the first call to match(). //[c]This method is valid as long as the source string is not modified. //[c] public func append (m: regex, index: int, s: string buffer) def start = group starts (m) [index] : string def stop = group stops (m) [index] : string if not nil (start) && not nil (stop) append (s, start, stop - start) end end //[cf] //[of]:size (index) //[c]Returns the size of the i-th group //[c] public func size (m: regex, index: int) def start = group starts (m) [index] : string def stop = group stops (m) [index] : string if not nil (start) && not nil (stop) return stop - start end return 0 end //[cf] //[cf] //[of]:testing //[of]:must match beginning //[c]Returns true if the expression must starts at the beginning of the line //[c] //[c]This is not handled by this component, this is the responsibility //[c]of the caller to handle begin and end matches because the end //[c]of line can be \n as well as a nul char and the beginning of the //[c]line should be optimized by the caller (avoid scanning all the line). //[c] public equ must match beginning (m: regex) = match beginning (m) //[cf] //[of]:must match ending //[c]Returns true if the expression must ends at the end of the line //[c] //[c]This is not handled by this component, this is the responsibility //[c]of the caller to handle begin and end matches because the end //[c]of line can be \n as well as a nul char and the beginning of the //[c]line should be optimized by the caller (avoid scanning all the line). //[c] public equ must match ending (m: regex) = match ending (m) //[cf] //[of]:is empty //[c]Returns true if the regular expression matches an empty string //[c] public func is empty (m: regex) return not nil (match (m, empty string)) end //[cf] //[cf] //[c] //[of]:private //[of]:regex //[of]:delete all //[c]Deletes all states, nodes and rules //[c] func delete all (m: regex) each (states (m)) ? s delete (s : state) end each (nodes (m)) ? n delete (n : node) end each (rules (m)) ? r delete (r : rule) end end //[cf] //[cf] //[of]:state //[of]:definition struct state rules: local collection group start: int group stop: int mark: bool end //[cf] //[c] //[of]:new state //[c] func new state (r: regex) def s = allocate memory (sizeof local state): state initialize (rules (s)) group start (s) = 0 group stop (s) = 0 mark (s) = false add (states (r), s) return s end //[cf] //[of]:delete //[c] func delete (s: state) free memory (s) end //[cf] //[c] //[of]:add rule (s1, s2, rule) func add rule (s1: state, s2: state, rule: rule) add (rules (s1), rule) state (rule) = s2 end //[cf] //[of]:add epsilon (s1, s2) func add epsilon (m: regex, s1: state, s2: state) def rule = new epsilon rule (m) add (rules (s1), rule) state (rule) = s2 end //[cf] //[cf] //[of]:rule //[of]:definition //[c] struct rule: local element epsilon: bool chars: char map state: state end //[cf] //[c] //[of]:new rule //[c] func new rule (m: regex) def r = allocate memory (sizeof local rule): rule add (rules (m), r) return r end //[cf] //[of]:new epsilon rule //[c] func new epsilon rule (m: regex) def r = new rule (m) epsilon (r) = true return r end //[cf] //[of]:new char rule //[c] func new char rule (m: regex) def r = new rule (m) epsilon (r) = false initialize (chars (r)) return r end //[cf] //[of]:new any rule //[c] func new any rule (m: regex) def r = new char rule (m) invert (r) return r end //[cf] //[of]:new char rule (c) //[c] func new char rule (m: regex, c: char) def r = new char rule (m) set (r, c, is case sensitive (m)) return r end //[cf] //[of]:delete //[c] func delete (rule: rule) free memory (rule) end //[cf] //[c] //[of]:set (c) //[c] func set (r: rule, c: char, is case sensitive: bool) chars (r) [c:byte:int] = 1:byte if ~ is case sensitive if c >= $a && c <= $z chars (r) [(c - $a + $A):byte:int] = 1:byte elsif c >= $A && c <= $Z chars (r) [(c - $A + $a):byte:int] = 1:byte end end end //[cf] //[of]:set (c1, c2) //[c] func set (r: rule, c1: char, c2: char, is case sensitive: bool) def c = c1 while c <= c2 set (r, c, is case sensitive) c += \1 end end //[cf] //[of]:invert //[c] func invert (r: rule) def i = 1 while i < 256 chars (r) [i] = 1:byte - chars (r) [i] i += 1 end end //[cf] //[c] //[of]:match (c) //[c] equ match (r: rule, c: char) = chars (r) [c:byte:int] <> 0:byte //[cf] //[cf] //[c] //[of]:node //[of]:definition struct node : local element type: node type end //[cf] //[of]:constants enum node type nt rules nt or nt zero or many nt one or many nt zero or one end //[cf] //[of]:delete func delete (m: node) if type (m) == nt or release (sequences (m : or node)) end free memory (m) end //[cf] //[cf] //[of]:rule node //[of]:definition struct rule node : local node rule: rule end //[cf] //[of]:create func new rule node (m: regex, rule: rule) def n = allocate memory (sizeof local rule node) : rule node type (n) = nt rules rule (n) = rule add (nodes (m), n) return n end //[cf] //[cf] //[of]:or node //[of]:definition struct or node : local node sequences: local vector end //[cf] //[of]:create func new or node (m: regex) def n = allocate memory (sizeof local or node) : or node type (n) = nt or initialize (sequences (n)) add (nodes (m), n) return n end //[cf] //[cf] //[of]:rep node //[of]:definition struct rep node : local node node: node end //[cf] //[of]:create func new repeat node (m: regex, node: node, type: node type) def n = allocate memory (sizeof local rep node) : rep node type (n) = type node (n) = node add (nodes (m), n) return n end //[cf] //[cf] //[c] //[of]:utility functions //[of]:initialize (char map) func initialize (m: []byte) def p = m def i = 0 while i < 256 p [i] = 0:byte i += 1 end end //[cf] //[of]:get escape char (c) //[c]Returns the char for an escape sequence \x //[c] func get escape char (x: char) def c: char switch x case nul char c = $\ case $a c = \a case $b c = \b case $n c = \n case $f c = \f case $r c = \r case $t c = \t case $v c = \v else c = x end return c end //[cf] //[cf] //[cf]