# Discussion Session 4¶

## Language of Regular Expressions¶

The language of all regular expressions can be described by the following grammar (BNF)

reg ::=  CHAR
reg ::=  .
reg ::=  reg reg
reg ::=  reg | reg
reg ::=  reg *
reg ::=  reg ?
reg ::=  ( reg )


The following grammar doesn’t have ambiguity and sets different precedences for operators |, space (invisible operator), and *.

reg   ::=  seq | reg    // Alt
reg   ::=  seq
seq   ::=  block seq      // Cat
seq   ::=  block
block ::=  atom *       // Star
block ::=  atom ?       // Opt
block ::=  atom
atom  ::=  CHAR            // Char
atom  ::=  .               // Any
atom  ::=  ( reg )       // Paren


Question

Can the language of regular expressions be described by a regular expression?

## Example of Abstract Syntax Tree¶

{"Char": ["a"]}    # a

{"Alt": [          # ab|c
{"Cat": [
{"Char": ["a"]},
{"Char": ["b"]}]},
{"Char": ["c"]}]}

{"Star": [{"Char": ["a"]}]}  # a*

"Any"        # .


## Match Regular Expression against String¶

Simple implementation based on Search: regmatch.py

  1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 # Author: Zhiqiang Ren # Date: 02/18/2015 def reg_match(reg, exp): node = (exp, reg, []) return search([node]) def search(nodes): while (not nodes_is_empty(nodes)): node = nodes_get_node(nodes) # print "node is", node if (node_is_sol(node)): return True new_nodes = node_get_next(node) nodes = nodes_add(nodes[1:], new_nodes) return False # implement nodes as a stack def nodes_is_empty(nodes): return len(nodes) == 0 def nodes_get_node(nodes): return nodes[0] def nodes_add(nodes, new_nodes): return nodes + new_nodes # node: (exp, cur_reg, rest_regs) def node_is_sol(node): (exp, cur_reg, rest_regs) = node if (exp == "" and cur_reg == None and rest_regs == []): return True else: return False def node_get_next(node): (exp, cur_reg, rest_regs) = node if (cur_reg == None): return [] if (type(cur_reg) == str): # Terminal if ("Any" == cur_reg): if (len(exp) <= 0): return [] else: if (len(rest_regs) > 0): return [(exp[1:], rest_regs[0], rest_regs[1:])] else: return [(exp[1:], None, [])] else: raise NotImplementedError(cur_reg + " is not supported") elif (type(cur_reg) == dict): [(label, children)] = cur_reg.items() if ("Char" == label): ch = children[0] if (len(exp) <= 0): return [] else: if (exp[0] == ch): if (len(rest_regs) > 0): return [(exp[1:], rest_regs[0], rest_regs[1:])] else: return [(exp[1:], None, [])] else: return [] elif ("Cat" == label): reg1 = children[0] reg2 = children[1] return [(exp, reg1, [reg2] + rest_regs)] elif ("Alt" == label): reg1 = children[0] reg2 = children[1] return [(exp, reg1, rest_regs), (exp, reg2, rest_regs)] elif ("Opt" == label): reg = children[0] if (len(rest_regs) > 0): new_node = (exp, rest_regs[0], rest_regs[1:]) else: new_node = (exp, None, rest_regs[1:]) return [(exp, reg, rest_regs), new_node] elif ("Star" == label): reg = children[0] num = children[1] if (num == len(exp)): return [] # empty if (len(rest_regs) > 0): node1 = (exp, rest_regs[0], rest_regs[1:]) else: node1 = (exp, None, rest_regs[1:]) new_star = {"Star": [reg, len(exp)]} node2 = (exp, reg, [new_star] + rest_regs) return [node1, node2] else: raise NotImplementedError(label + " is not supported") else: raise NotImplementedError(str(type(cur_reg)) + " is not supported") if __name__ == "__main__": reg_a = {"Char": ["a"]} reg_b = {"Char": ["b"]} reg_c = {"Char": ["c"]} reg_ab = {"Cat": [reg_a, reg_b]} reg_a_b = {"Alt": [reg_a, reg_b]} reg_ao = {"Opt": [reg_a]} reg_any = "Any" reg_as = {"Star": [reg_a, -1]} reg_ass = {"Star": [reg_as, -1]} # ret = reg_match(reg_a, "a") # print "=== ret is ", ret # ret = reg_match(reg_b, "b") # print "=== ret is ", ret # ret = reg_match(reg_ab, "ab") # print "=== is ", ret # ret = reg_match(reg_a_b, "a") # print "=== is ", ret # ret = reg_match(reg_a_b, "b") # print "=== is ", ret # ret = reg_match(reg_any, "ba") # print "=== is ", ret # ret = reg_match(reg_ao, "a") # print "=== is ", ret # ret = reg_match(reg_as, "") # print "=== is ", ret # ret = reg_match(reg_ass, "aaaaa") # print "=== is ", ret # (a|b)* reg1 = {"Star": [reg_a_b, -1]} # (a|b)*c reg2 = {"Cat": [reg1, reg_c]} # ((a|b)*c)* reg3 = {"Star": [reg2, -1]} ret = reg_match(reg3, "aabaabaccccaaaaaaaabbccc") print "=== is ", ret