Discussion Session 4

Language of Regular Expressions

The language of all regular expressions can be described by the following grammar (BNF)

reg ::=  CHAR
reg ::=  .
reg ::=  reg reg
reg ::=  reg | reg
reg ::=  reg *
reg ::=  reg ?
reg ::=  ( reg )

The following grammar doesn’t have ambiguity and sets different precedences for operators |, space (invisible operator), and *.

reg   ::=  seq | reg    // Alt
reg   ::=  seq
seq   ::=  block seq      // Cat
seq   ::=  block
block ::=  atom *       // Star
block ::=  atom ?       // Opt
block ::=  atom
atom  ::=  CHAR            // Char
atom  ::=  .               // Any
atom  ::=  ( reg )       // Paren

Question

Can the language of regular expressions be described by a regular expression?

Example of Abstract Syntax Tree

{"Char": ["a"]}    # a

{"Alt": [          # ab|c
    {"Cat": [
        {"Char": ["a"]},
        {"Char": ["b"]}]},
    {"Char": ["c"]}]}

{"Star": [{"Char": ["a"]}]}  # a*

"Any"        # .

Match Regular Expression against String

Simple implementation based on Search: regmatch.py

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
# Author: Zhiqiang Ren
# Date: 02/18/2015

def reg_match(reg, exp):
    node = (exp, reg, [])
    return search([node])

def search(nodes):
    while (not nodes_is_empty(nodes)):
        node = nodes_get_node(nodes)
        # print "node is", node

        if (node_is_sol(node)):
            return True

        new_nodes = node_get_next(node)
        nodes = nodes_add(nodes[1:], new_nodes)

    return False
    

# implement nodes as a stack
def nodes_is_empty(nodes):
    return len(nodes) == 0

def nodes_get_node(nodes):
    return nodes[0]

def nodes_add(nodes, new_nodes):
    return nodes + new_nodes


# node: (exp, cur_reg, rest_regs)
def node_is_sol(node):
    (exp, cur_reg, rest_regs) = node
    if (exp == "" and cur_reg == None and rest_regs == []):
        return True
    else:
        return False

def node_get_next(node):
    (exp, cur_reg, rest_regs) = node
    if (cur_reg == None):
        return []

    if (type(cur_reg) == str):  # Terminal
        if ("Any" == cur_reg):
            if (len(exp) <= 0):
                return []
            else:
                if (len(rest_regs) > 0):
                    return [(exp[1:], rest_regs[0], rest_regs[1:])]
                else:
                    return [(exp[1:], None, [])]
        else:
            raise NotImplementedError(cur_reg + " is not supported")

    elif (type(cur_reg) == dict):
        [(label, children)] = cur_reg.items()
        if ("Char" == label):
            ch = children[0]
            if (len(exp) <= 0):
                return []
            else:
                if (exp[0] == ch):
                    if (len(rest_regs) > 0):
                        return [(exp[1:], rest_regs[0], rest_regs[1:])]
                    else:
                        return [(exp[1:], None, [])]
                else:
                    return []

        elif ("Cat" == label):
            reg1 = children[0]
            reg2 = children[1]
            return [(exp, reg1, [reg2] + rest_regs)]

        elif ("Alt" == label):
            reg1 = children[0]
            reg2 = children[1]

            return [(exp, reg1, rest_regs), (exp, reg2, rest_regs)]

        elif ("Opt" == label):
            reg = children[0]

            if (len(rest_regs) > 0):
                new_node = (exp, rest_regs[0], rest_regs[1:])
            else:
                new_node = (exp, None, rest_regs[1:])
            
            return [(exp, reg, rest_regs), new_node]

        elif ("Star" == label):
            reg = children[0]
            num = children[1]

            if (num == len(exp)):
                return []

            # empty
            if (len(rest_regs) > 0):
                node1 = (exp, rest_regs[0], rest_regs[1:])
            else:
                node1 = (exp, None, rest_regs[1:])

            new_star = {"Star": [reg, len(exp)]}
            node2 = (exp, reg, [new_star] + rest_regs)

            return [node1, node2]
        else:
            raise NotImplementedError(label + " is not supported")

    else:
        raise NotImplementedError(str(type(cur_reg)) + " is not supported")


if __name__ == "__main__":
    reg_a = {"Char": ["a"]}
    reg_b = {"Char": ["b"]}
    reg_c = {"Char": ["c"]}
    reg_ab = {"Cat": [reg_a, reg_b]}
    reg_a_b = {"Alt": [reg_a, reg_b]}
    reg_ao = {"Opt": [reg_a]}
    reg_any = "Any"

    reg_as = {"Star": [reg_a, -1]}
    reg_ass = {"Star": [reg_as, -1]}

    # ret = reg_match(reg_a, "a")
    # print "=== ret is ", ret
    # ret = reg_match(reg_b, "b")
    # print "=== ret is ", ret
    # ret = reg_match(reg_ab, "ab")
    # print "=== is ", ret
    # ret = reg_match(reg_a_b, "a")
    # print "=== is ", ret
    # ret = reg_match(reg_a_b, "b")
    # print "=== is ", ret
    # ret = reg_match(reg_any, "ba")
    # print "=== is ", ret
    # ret = reg_match(reg_ao, "a")
    # print "=== is ", ret
    # ret = reg_match(reg_as, "")
    # print "=== is ", ret
    # ret = reg_match(reg_ass, "aaaaa")
    # print "=== is ", ret

    # (a|b)*
    reg1 = {"Star": [reg_a_b, -1]}
    # (a|b)*c
    reg2 = {"Cat": [reg1, reg_c]}
    # ((a|b)*c)*
    reg3 = {"Star": [reg2, -1]}
    ret = reg_match(reg3, "aabaabaccccaaaaaaaabbccc")
    print "=== is ", ret