Discussion Session 4¶
Language of Regular Expressions¶
The language of all regular expressions can be described by the following grammar (BNF)
reg ::= CHAR reg ::= . reg ::=reg
reg
reg ::=reg
|reg
reg ::=reg
* reg ::=reg
? reg ::= (reg
)
The following grammar doesn’t have ambiguity and sets different precedences for
operators |
, space (invisible operator), and *
.
reg ::=seq
|reg
// Alt reg ::=seq
seq ::=block
seq // Cat seq ::=block
block ::=atom
* // Star block ::=atom
? // Opt block ::=atom
atom ::= CHAR // Char atom ::= . // Any atom ::= (reg
) // Paren
Question
Can the language of regular expressions be described by a regular expression?
Example of Abstract Syntax Tree¶
{"Char": ["a"]} # a
{"Alt": [ # ab|c
{"Cat": [
{"Char": ["a"]},
{"Char": ["b"]}]},
{"Char": ["c"]}]}
{"Star": [{"Char": ["a"]}]} # a*
"Any" # .
Match Regular Expression against String¶
Simple implementation based on Search: regmatch.py
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 | # Author: Zhiqiang Ren
# Date: 02/18/2015
def reg_match(reg, exp):
node = (exp, reg, [])
return search([node])
def search(nodes):
while (not nodes_is_empty(nodes)):
node = nodes_get_node(nodes)
# print "node is", node
if (node_is_sol(node)):
return True
new_nodes = node_get_next(node)
nodes = nodes_add(nodes[1:], new_nodes)
return False
# implement nodes as a stack
def nodes_is_empty(nodes):
return len(nodes) == 0
def nodes_get_node(nodes):
return nodes[0]
def nodes_add(nodes, new_nodes):
return nodes + new_nodes
# node: (exp, cur_reg, rest_regs)
def node_is_sol(node):
(exp, cur_reg, rest_regs) = node
if (exp == "" and cur_reg == None and rest_regs == []):
return True
else:
return False
def node_get_next(node):
(exp, cur_reg, rest_regs) = node
if (cur_reg == None):
return []
if (type(cur_reg) == str): # Terminal
if ("Any" == cur_reg):
if (len(exp) <= 0):
return []
else:
if (len(rest_regs) > 0):
return [(exp[1:], rest_regs[0], rest_regs[1:])]
else:
return [(exp[1:], None, [])]
else:
raise NotImplementedError(cur_reg + " is not supported")
elif (type(cur_reg) == dict):
[(label, children)] = cur_reg.items()
if ("Char" == label):
ch = children[0]
if (len(exp) <= 0):
return []
else:
if (exp[0] == ch):
if (len(rest_regs) > 0):
return [(exp[1:], rest_regs[0], rest_regs[1:])]
else:
return [(exp[1:], None, [])]
else:
return []
elif ("Cat" == label):
reg1 = children[0]
reg2 = children[1]
return [(exp, reg1, [reg2] + rest_regs)]
elif ("Alt" == label):
reg1 = children[0]
reg2 = children[1]
return [(exp, reg1, rest_regs), (exp, reg2, rest_regs)]
elif ("Opt" == label):
reg = children[0]
if (len(rest_regs) > 0):
new_node = (exp, rest_regs[0], rest_regs[1:])
else:
new_node = (exp, None, rest_regs[1:])
return [(exp, reg, rest_regs), new_node]
elif ("Star" == label):
reg = children[0]
num = children[1]
if (num == len(exp)):
return []
# empty
if (len(rest_regs) > 0):
node1 = (exp, rest_regs[0], rest_regs[1:])
else:
node1 = (exp, None, rest_regs[1:])
new_star = {"Star": [reg, len(exp)]}
node2 = (exp, reg, [new_star] + rest_regs)
return [node1, node2]
else:
raise NotImplementedError(label + " is not supported")
else:
raise NotImplementedError(str(type(cur_reg)) + " is not supported")
if __name__ == "__main__":
reg_a = {"Char": ["a"]}
reg_b = {"Char": ["b"]}
reg_c = {"Char": ["c"]}
reg_ab = {"Cat": [reg_a, reg_b]}
reg_a_b = {"Alt": [reg_a, reg_b]}
reg_ao = {"Opt": [reg_a]}
reg_any = "Any"
reg_as = {"Star": [reg_a, -1]}
reg_ass = {"Star": [reg_as, -1]}
# ret = reg_match(reg_a, "a")
# print "=== ret is ", ret
# ret = reg_match(reg_b, "b")
# print "=== ret is ", ret
# ret = reg_match(reg_ab, "ab")
# print "=== is ", ret
# ret = reg_match(reg_a_b, "a")
# print "=== is ", ret
# ret = reg_match(reg_a_b, "b")
# print "=== is ", ret
# ret = reg_match(reg_any, "ba")
# print "=== is ", ret
# ret = reg_match(reg_ao, "a")
# print "=== is ", ret
# ret = reg_match(reg_as, "")
# print "=== is ", ret
# ret = reg_match(reg_ass, "aaaaa")
# print "=== is ", ret
# (a|b)*
reg1 = {"Star": [reg_a_b, -1]}
# (a|b)*c
reg2 = {"Cat": [reg1, reg_c]}
# ((a|b)*c)*
reg3 = {"Star": [reg2, -1]}
ret = reg_match(reg3, "aabaabaccccaaaaaaaabbccc")
print "=== is ", ret
|