📜  为正则表达式C(A + B)+构建DFA的程序(1)

📅  最后修改于: 2023-12-03 15:36:01.444000             🧑  作者: Mango

介绍

本程序使用Python语言,为正则表达式C(A + B)+构建DFA。在该表达式中,括号内的"+"表示"或"(并运算),括号后的"+"表示"重复一次或多次",因此它匹配以C开头、后面跟着一个或多个A或B的字符串。程序首先根据正则表达式构建出对应的带ε转移的NFA,然后通过子集构造算法把NFA转换为DFA。

代码实现
import itertools

class State:
    """One DFA state: a printable label plus the subset of states it stands for."""

    def __init__(self, id, states):
        # Label is the letter 'Q' followed by the state number, e.g. 'Q0'.
        self.id = f"Q{id!s}"
        # The subset of states this DFA state represents.
        self.states = states

class DFA:
    """Plain container holding the five components of a DFA."""

    def __init__(self, states, inputs, trans, start, accepts):
        # states:  collection of all DFA states
        # inputs:  collection of all input symbols
        # trans:   transition function
        self.states, self.inputs, self.trans = states, inputs, trans
        # start:   the start state
        # accepts: the accepting states
        self.start, self.accepts = start, accepts

# Characters with a syntactic role; any other non-blank character is an input symbol.
_REGEX_OPERATORS = frozenset('+.*()')


def _parse_regexp(regexp):
    """Parse ``regexp`` into a nested-tuple syntax tree.

    Grammar (whitespace is ignored)::

        union    := sequence ('+' sequence)*
        sequence := postfix (['.'] postfix)*      # '.' is optional
        postfix  := atom ('*' | '+')*             # '+' only when postfix
        atom     := SYMBOL | '(' union ')'

    A '+' is the binary "or" operator when the token after it starts an
    operand (a symbol or '('); otherwise it is the postfix "one or more"
    operator.  So ``A + B`` is a choice and ``(A + B)+`` a repetition.

    Returns:
        ``(tree, alphabet)`` where ``tree`` is built from the nodes
        ``('symbol', ch)``, ``('sequence', [nodes])``, ``('choices', [nodes])``,
        ``('closure', node)`` and ``('plus', node)``, and ``alphabet`` is the
        set of symbols that occur in the expression.

    Raises:
        ValueError: if the expression is empty or malformed.
    """
    tokens = [ch for ch in regexp if not ch.isspace()]
    alphabet = set()
    pos = 0

    def peek(offset=0):
        index = pos + offset
        return tokens[index] if index < len(tokens) else None

    def starts_operand(token):
        return token is not None and (token == '(' or token not in _REGEX_OPERATORS)

    def syntax_error(detail):
        return ValueError(f'Invalid syntax: {regexp!r} ({detail})')

    def parse_union():
        nonlocal pos
        branches = [parse_sequence()]
        # Any '+' still visible here is binary: postfix ones were consumed below.
        while peek() == '+':
            pos += 1
            branches.append(parse_sequence())
        return branches[0] if len(branches) == 1 else ('choices', branches)

    def parse_sequence():
        nonlocal pos
        parts = [parse_postfix()]
        while True:
            if peek() == '.':
                pos += 1  # explicit concatenation operator
            elif not starts_operand(peek()):
                break
            parts.append(parse_postfix())
        return parts[0] if len(parts) == 1 else ('sequence', parts)

    def parse_postfix():
        nonlocal pos
        node = parse_atom()
        while True:
            token = peek()
            if token == '*':
                node = ('closure', node)
            elif token == '+' and not starts_operand(peek(1)):
                node = ('plus', node)
            else:
                return node
            pos += 1

    def parse_atom():
        nonlocal pos
        token = peek()
        if token is None:
            raise syntax_error('unexpected end of expression')
        pos += 1
        if token == '(':
            node = parse_union()
            if peek() != ')':
                raise syntax_error("missing ')'")
            pos += 1
            return node
        if token in _REGEX_OPERATORS:
            raise syntax_error(f'unexpected {token!r}')
        alphabet.add(token)
        return ('symbol', token)

    tree = parse_union()
    if pos != len(tokens):
        raise syntax_error(f'unexpected {tokens[pos]!r}')
    return tree, alphabet


def _build_nfa(tree):
    """Turn a syntax tree from ``_parse_regexp`` into an epsilon-NFA.

    Every sub-expression becomes a fragment with exactly one entry and one
    exit state; only the outermost fragment's entry/exit are returned, so
    inner fragments never leak into the automaton's start or accept states.

    Returns:
        ``(transitions, start, accept)``; ``transitions`` is a set of
        ``(source, label, target)`` triples where ``label`` is ``None`` for an
        epsilon move, and states are integers numbered from 1.
    """
    transitions = set()
    ids = itertools.count(1)

    def build(node):
        kind, payload = node

        if kind == 'symbol':
            start, accept = next(ids), next(ids)
            transitions.add((start, payload, accept))
            return start, accept

        if kind == 'sequence':
            start, accept = build(payload[0])
            for part in payload[1:]:
                part_start, part_accept = build(part)
                transitions.add((accept, None, part_start))
                accept = part_accept
            return start, accept

        start, accept = next(ids), next(ids)

        if kind == 'choices':
            for branch in payload:
                branch_start, branch_accept = build(branch)
                transitions.add((start, None, branch_start))
                transitions.add((branch_accept, None, accept))
            return start, accept

        # 'closure' (zero or more) and 'plus' (one or more) share the loop-back
        # edge; only 'closure' may skip the inner fragment altogether.
        inner_start, inner_accept = build(payload)
        transitions.add((start, None, inner_start))
        transitions.add((inner_accept, None, accept))
        transitions.add((inner_accept, None, inner_start))
        if kind == 'closure':
            transitions.add((start, None, accept))
        return start, accept

    start, accept = build(tree)
    return transitions, start, accept


def compile(regexp):
    """Build a DFA for ``regexp`` (regex -> epsilon-NFA -> subset construction).

    Supported syntax: single-character symbols, parentheses, concatenation
    (by juxtaposition or an explicit '.'), binary '+' for choice, postfix '*'
    for zero-or-more and postfix '+' for one-or-more.  Whitespace is ignored.
    Example: ``compile('C(A + B)+')`` accepts a 'C' followed by one or more
    'A'/'B' characters.

    NOTE: the name shadows the builtin ``compile``; it is kept because it is
    this module's public entry point.

    Args:
        regexp: the regular expression as a string.

    Returns:
        A ``DFA`` whose states are labels such as ``'Q0'``, whose ``inputs`` is
        the sorted list of symbols, and whose ``trans`` maps
        ``(state_label, symbol)`` to the next state label.  Missing keys mean
        "no transition"; no explicit dead state is created.

    Raises:
        ValueError: if ``regexp`` is empty or syntactically invalid.
    """
    tree, alphabet = _parse_regexp(regexp)
    transitions, nfa_start, nfa_accept = _build_nfa(tree)

    # Index the transitions once so closure/move are dictionary lookups
    # instead of scans over every NFA transition.
    epsilon_edges = {}
    symbol_edges = {}
    for source, label, target in transitions:
        if label is None:
            epsilon_edges.setdefault(source, set()).add(target)
        else:
            symbol_edges.setdefault((source, label), set()).add(target)

    def e_closure(states):
        """Return the epsilon closure of ``states`` as a frozenset."""
        closure = set(states)
        pending = list(closure)
        while pending:
            for target in epsilon_edges.get(pending.pop(), ()):
                if target not in closure:
                    closure.add(target)
                    pending.append(target)
        return frozenset(closure)

    # Sorted so that state numbering does not depend on set iteration order.
    symbols = sorted(alphabet)

    dfa_start = State(0, e_closure([nfa_start]))
    dfa_states = [dfa_start]
    known = {dfa_start.states: dfa_start}  # NFA subset -> DFA state
    dfa_trans = {}

    worklist = [dfa_start]
    while worklist:
        state = worklist.pop()
        for symbol in symbols:
            moved = set()
            for nfa_state in state.states:
                moved.update(symbol_edges.get((nfa_state, symbol), ()))
            if not moved:
                continue  # no transition on this symbol
            subset = e_closure(moved)
            target = known.get(subset)
            if target is None:
                target = State(len(dfa_states), subset)
                known[subset] = target
                dfa_states.append(target)
                worklist.append(target)
            dfa_trans[(state.id, symbol)] = target.id

    # A DFA state accepts iff its subset contains the NFA's single accept state.
    dfa_accepts = [s.id for s in dfa_states if nfa_accept in s.states]

    return DFA([s.id for s in dfa_states],
               symbols,
               dfa_trans,
               dfa_start.id,
               dfa_accepts)
示例

我们以C(A + B)+为例,构建DFA,并用graphviz将其绘制出来。

from graphviz import Digraph

def draw(dfa):
    """Build and return a left-to-right graphviz ``Digraph`` for ``dfa``.

    Accepting states are declared with a double circle, the start state is
    declared as a node, and one labelled edge is added for every
    (state, symbol) pair that has an entry in ``dfa.trans``.
    """
    graph = Digraph(comment='DFA')
    graph.attr(rankdir='LR')

    for accepting in dfa.accepts:
        graph.node(accepting, shape='doublecircle')
    graph.node(dfa.start)

    for source in dfa.states:
        for symbol in dfa.inputs:
            try:
                target = dfa.trans[(source, symbol)]
            except KeyError:
                # No transition defined for this pair: draw nothing.
                continue
            graph.edge(source, target, label=symbol)

    return graph

# Example: build the DFA for C(A + B)+ and turn it into a graphviz graph.
# NOTE(review): the Digraph returned by draw() is neither bound nor rendered
# here; presumably this relies on an interactive session (e.g. a notebook)
# echoing the last expression -- confirm, or call .render()/.view() on it.
dfa = compile('C(A + B)+')
draw(dfa)