📜  有限自动机模式搜索

📅  最后修改于: 2021-04-27 22:19:00             🧑  作者: Mango

给定文本 txt[0..n-1] 和模式 pat[0..m-1],编写一个函数 search(char pat[], char txt[]),打印 pat[] 在 txt[] 中出现的所有位置。您可以假设 n > m。

例子:

输入:txt[] = "THIS IS A TEST TEXT",pat[] = "TEST"
输出:在索引 10 处找到模式

输入:txt[] = "AABAACAADAABAABA",pat[] = "AABA"
输出:在索引 0 处找到模式
      在索引 9 处找到模式
      在索引 12 处找到模式

模式搜索

模式搜索是计算机科学中的一个重要问题。当我们在记事本、Word 文件、浏览器或数据库中搜索字符串时,都会使用模式搜索算法来显示搜索结果。

在之前的文章中,我们讨论了以下算法:

朴素算法(Naive)
KMP算法
Rabin Karp算法

在这篇文章中,我们将讨论基于有限自动机(FA)的模式搜索算法。在基于FA的算法中,我们对模式进行预处理,并构建一个表示有限自动机的二维数组。FA的构建是该算法的主要难点;一旦建立了FA,搜索就很简单。搜索时,我们只需从自动机的初始状态和文本的第一个字符开始。在每一步中,我们读取文本的下一个字符,在已构建的FA中查找下一个状态,然后转移到该状态。如果到达最终状态,就说明在文本中找到了模式。搜索过程的时间复杂度为O(n)。
在讨论FA构造之前,让我们看一下以下ACACAGA模式的FA。


上图代表ACACAGA模式的图形和表格表示。

FA中的状态数为 M + 1,其中 M 是模式的长度。构造FA的关键是:对每个可能的字符,求出从当前状态出发的下一个状态。给定字符 x 和状态 k,我们考察字符串 "pat[0..k-1]x",即模式字符 pat[0]、pat[1]、…、pat[k-1] 与字符 x 的拼接。思路是求出模式的最长前缀的长度,使得该前缀同时也是 "pat[0..k-1]x" 的后缀;这个长度就是下一个状态。例如,让我们看看如何在上图中求出当前状态 5 在字符 'C' 下的下一个状态。我们需要考察字符串 "pat[0..4]C",即 "ACACAC"。模式中同时是 "ACACAC" 后缀的最长前缀是 "ACAC",长度为 4。因此,从状态 5 读入字符 'C' 后的下一个状态为 4。

在以下代码中,computeTF() 负责构造FA。computeTF() 的时间复杂度为 O(m^3 * NO_OF_CHARS),其中 m 是模式的长度,NO_OF_CHARS 是字母表的大小(模式和文本中可能出现的字符总数)。该实现从可能成为 "pat[0..k-1]x" 后缀的最长前缀开始,依次尝试更短的前缀。存在时间复杂度为 O(m * NO_OF_CHARS) 的更好的FA构造方法(提示:可以借鉴KMP算法中 lps 数组的构造思路),我们会在下一篇关于模式搜索的文章中介绍。

C
// C program for Finite Automata Pattern searching
// Algorithm
#include <stdio.h>
#include <string.h>
#define NO_OF_CHARS 256
  
/*
 * Returns the automaton state reached from `state` after reading character x.
 *
 *   pat   - the pattern; pat[0..M-1] is read
 *   M     - length of the pattern
 *   state - current state, 0..M (number of pattern characters matched so far)
 *   x     - input character given as an unsigned byte value (0..255)
 *
 * The result is the length of the longest prefix of pat that is also a
 * suffix of "pat[0..state-1]x", or 0 when no such prefix exists.
 */
int getNextState(char *pat, int M, int state, int x)
{
    int ns, i;

    // Pattern bytes are read through unsigned char: where plain char is
    // signed, a byte >= 0x80 would be negative and could never equal x.

    // If x is the next character of the pattern, simply advance one state
    if (state < M && x == (unsigned char)pat[state])
        return state+1;

    // Try candidate prefix lengths from the largest possible value down
    // and stop at the first prefix that is also a suffix
    for (ns = state; ns > 0; ns--)
    {
        // The candidate prefix must end with x
        if ((unsigned char)pat[ns-1] != x)
            continue;

        // The remaining ns-1 characters must equal the tail of pat[0..state-1]
        for (i = 0; i < ns-1; i++)
            if (pat[i] != pat[state-ns+1+i])
                break;
        if (i == ns-1)
            return ns;
    }

    return 0;
}
  
/* Fills TF, the transition table of the finite automaton for pat:
   TF[s][c] receives the state entered from state s on character code c.
   Rows 0..M and columns 0..NO_OF_CHARS-1 are written. */
void computeTF(char *pat, int M, int TF[][NO_OF_CHARS])
{
    int row = 0;
    while (row <= M)
    {
        int *cells = TF[row];
        int ch;
        for (ch = 0; ch != NO_OF_CHARS; ch++)
            cells[ch] = getNextState(pat, M, row, ch);
        row++;
    }
}
  
/*
 * Prints the starting index of every occurrence of pat in txt.
 *
 *   pat - NUL-terminated pattern
 *   txt - NUL-terminated text to scan
 *
 * Builds the transition table with computeTF() and then feeds txt through
 * it one character at a time. An empty pattern prints nothing.
 */
void search(char *pat, char *txt)
{
    int M = strlen(pat);
    int N = strlen(txt);

    // With an empty pattern state == M would hold on every step and a
    // bogus index would be printed for each text character
    if (M == 0)
        return;

    // One row per state (0..M), one column per byte value; this is a
    // variable-length array in automatic storage
    int TF[M+1][NO_OF_CHARS];

    computeTF(pat, M, TF);

    // Process txt over FA.
    int i, state=0;
    for (i = 0; i < N; i++)
    {
        // Index with unsigned char: where plain char is signed, a byte
        // >= 0x80 would be a negative index and read outside the table
        state = TF[state][(unsigned char)txt[i]];
        if (state == M)
            printf ("\n Pattern found at index %d",
                                           i-M+1);
    }
}
  
// Demo entry point: searches a sample text for a sample pattern
int main()
{
    char pattern[] = "AABA";
    char text[] = "AABAACAADAABAAABAA";

    search(pattern, text);
    return 0;
}


CPP
// CPP program for Finite Automata Pattern searching 
// Algorithm 
#include <iostream>
#include <string>
using namespace std;
#define NO_OF_CHARS 256 
  
/// Returns the automaton state reached from `state` after reading character x.
///
/// @param pat   the pattern (taken by reference; it is only read)
/// @param M     length of the pattern
/// @param state current state, 0..M (number of pattern characters matched)
/// @param x     input character given as an unsigned byte value (0..255)
/// @return length of the longest prefix of pat that is also a suffix of
///         "pat[0..state-1]x", or 0 when no such prefix exists
int getNextState(const std::string& pat, int M, int state, int x)
{
    // Pattern bytes are read through unsigned char: where plain char is
    // signed, a byte >= 0x80 would be negative and could never equal x.

    // If x is the next character of the pattern, simply advance one state
    if (state < M && x == static_cast<unsigned char>(pat[state]))
        return state + 1;

    // Try candidate prefix lengths from the largest possible value down
    // and stop at the first prefix that is also a suffix
    for (int ns = state; ns > 0; --ns)
    {
        // The candidate prefix must end with x
        if (static_cast<unsigned char>(pat[ns - 1]) != x)
            continue;

        // The remaining ns-1 characters must equal the tail of pat[0..state-1]
        int i = 0;
        while (i < ns - 1 && pat[i] == pat[state - ns + 1 + i])
            ++i;
        if (i == ns - 1)
            return ns;
    }

    return 0;
}
  
/* This function builds the TF table which represents4 
    Finite Automata for a given pattern */
void computeTF(string pat, int M, int TF[][NO_OF_CHARS]) 
{ 
    int state, x; 
    for (state = 0; state <= M; ++state) 
        for (x = 0; x < NO_OF_CHARS; ++x) 
            TF[state][x] = getNextState(pat, M, state, x); 
} 
  
/* Prints all occurrences of pat in txt */
void search(string pat, string txt) 
{ 
    int M = pat.size(); 
    int N = txt.size(); 
  
    int TF[M+1][NO_OF_CHARS]; 
  
    computeTF(pat, M, TF); 
  
    // Process txt over FA. 
    int i, state=0; 
    for (i = 0; i < N; i++) 
    { 
        state = TF[state][txt[i]]; 
        if (state == M) 
            cout<<" Pattern found at index "<< i-M+1<


Java
// Java program for Finite Automata Pattern
// searching Algorithm
class GFG {

    /** Number of columns in the transition table: one per char code 0..255. */
    static final int NO_OF_CHARS = 256;

    /**
     * Returns the automaton state reached from {@code state} after reading x.
     *
     * @param pat   the pattern
     * @param M     length of the pattern
     * @param state current state, 0..M (number of pattern characters matched)
     * @param x     code of the input character
     * @return length of the longest prefix of pat that is also a suffix of
     *         "pat[0..state-1]x", or 0 when no such prefix exists
     */
    static int getNextState(char[] pat, int M,
                             int state, int x)
    {
        // If x is the next character of the pattern, simply advance one state
        if (state < M && x == pat[state])
            return state + 1;

        // Try candidate prefix lengths from the largest possible value down
        // and stop at the first prefix that is also a suffix
        for (int ns = state; ns > 0; ns--)
        {
            // The candidate prefix must end with x
            if (pat[ns-1] != x)
                continue;

            // The remaining ns-1 characters must equal the tail of pat[0..state-1]
            int i = 0;
            while (i < ns-1 && pat[i] == pat[state-ns+1+i])
                i++;
            if (i == ns-1)
                return ns;
        }

        return 0;
    }

    /**
     * Fills TF, the transition table of the finite automaton for pat:
     * TF[state][x] receives the state entered from state on char code x.
     * Rows 0..M and columns 0..NO_OF_CHARS-1 are written.
     */
    static void computeTF(char[] pat, int M, int TF[][])
    {
        for (int state = 0; state <= M; ++state)
            for (int x = 0; x < NO_OF_CHARS; ++x)
                TF[state][x] = getNextState(pat, M, state, x);
    }

    /**
     * Prints the starting index of every occurrence of pat in txt.
     * An empty pattern prints nothing.
     *
     * @param pat the pattern to look for
     * @param txt the text to scan
     */
    static void search(char[] pat, char[] txt)
    {
        int M = pat.length;
        int N = txt.length;

        // With an empty pattern state == M would hold on every step and a
        // bogus index would be printed for each text character
        if (M == 0)
            return;

        int[][] TF = new int[M+1][NO_OF_CHARS];

        computeTF(pat, M, TF);

        // Process txt over FA.
        int state = 0;
        for (int i = 0; i < N; i++)
        {
            char c = txt[i];
            // The table only has columns for codes below NO_OF_CHARS; a
            // larger code would index past the row, so that transition is
            // computed directly instead
            state = (c < NO_OF_CHARS) ? TF[state][c]
                                      : getNextState(pat, M, state, c);
            if (state == M)
                System.out.println("Pattern found "
                          + "at index " + (i-M+1));
        }
    }

    // Driver code
    public static void main(String[] args)
    {
        char[] txt = "AABAACAADAABAAABAA".toCharArray();
        char[] pat = "AABA".toCharArray();
        search(pat, txt);
    }
}
  
// This code is contributed by debjitdbb.


Python
# Python program for Finite Automata 
# Pattern searching Algorithm
  
NO_OF_CHARS = 256
  
def getNextState(pat, M, state, x):
    '''
    calculate the next state 
    '''
  
    # If the character c is same as next character 
      # in pattern, then simply increment state
  
    if state < M and x == ord(pat[state]):
        return state+1
  
    i=0
    # ns stores the result which is next state
  
    # ns finally contains the longest prefix 
     # which is also suffix in "pat[0..state-1]c"
  
     # Start from the largest possible value and 
      # stop when you find a prefix which is also suffix
    for ns in range(state,0,-1):
        if ord(pat[ns-1]) == x:
            while(i


C#
// C# program for Finite Automata Pattern 
// searching Algorithm 
using System;
  
class GFG
{
    // Number of columns in the transition table: one per char code 0..255
    public const int NO_OF_CHARS = 256;

    // Returns the automaton state reached from `state` after reading the
    // character with code x: the length of the longest prefix of pat that
    // is also a suffix of "pat[0..state-1]x", or 0 when there is none.
    // M is the length of the pattern; state is in 0..M.
    public static int getNextState(char[] pat, int M,
                                   int state, int x)
    {
        // If x is the next character of the pattern, simply advance one state
        if (state < M && pat[state] == x)
        {
            return state + 1;
        }

        // Try candidate prefix lengths from the largest possible value
        // down and stop at the first prefix that is also a suffix
        for (int ns = state; ns > 0; ns--)
        {
            // The candidate prefix must end with x
            if (pat[ns - 1] != x)
            {
                continue;
            }

            // The remaining ns-1 characters must equal the tail of pat[0..state-1]
            int i = 0;
            while (i < ns - 1 && pat[i] == pat[state - ns + 1 + i])
            {
                i++;
            }
            if (i == ns - 1)
            {
                return ns;
            }
        }

        return 0;
    }

    // Fills TF, the transition table of the finite automaton for pat:
    // TF[state][x] receives the state entered from state on char code x.
    // Rows 0..M and columns 0..NO_OF_CHARS-1 are written.
    public static void computeTF(char[] pat,
                                 int M, int[][] TF)
    {
        for (int state = 0; state <= M; ++state)
        {
            for (int x = 0; x < NO_OF_CHARS; ++x)
            {
                TF[state][x] = getNextState(pat, M, state, x);
            }
        }
    }

    // Prints the starting index of every occurrence of pat in txt.
    // An empty pattern prints nothing.
    public static void search(char[] pat,
                              char[] txt)
    {
        int M = pat.Length;
        int N = txt.Length;

        // With an empty pattern state == M would hold on every step and a
        // bogus index would be printed for each text character
        if (M == 0)
        {
            return;
        }

        int[][] TF = RectangularArrays.ReturnRectangularIntArray(M + 1,
                                                          NO_OF_CHARS);

        computeTF(pat, M, TF);

        // Process txt over FA.
        int state = 0;
        for (int i = 0; i < N; i++)
        {
            char c = txt[i];
            // The table only has columns for codes below NO_OF_CHARS; a
            // larger code would index past the row, so that transition is
            // computed directly instead
            state = (c < NO_OF_CHARS) ? TF[state][c]
                                      : getNextState(pat, M, state, c);
            if (state == M)
            {
                Console.WriteLine("Pattern found " +
                                  "at index " + (i - M + 1));
            }
        }
    }

    public static class RectangularArrays
    {
        // Allocates size1 rows, each a fresh int array of length size2
        public static int[][] ReturnRectangularIntArray(int size1,
                                                        int size2)
        {
            int[][] newArray = new int[size1][];
            for (int array1 = 0; array1 < size1; array1++)
            {
                newArray[array1] = new int[size2];
            }

            return newArray;
        }
    }

    // Driver code
    public static void Main(string[] args)
    {
        char[] txt = "AABAACAADAABAAABAA".ToCharArray();
        char[] pat = "AABA".ToCharArray();
        search(pat, txt);
    }
}
  
// This code is contributed by Shrikant13


输出:

Pattern found at index 0
  Pattern found at index 9
  Pattern found at index 13

参考:
Thomas H. Cormen、Charles E. Leiserson、Ronald L. Rivest、Clifford Stein 著《算法导论》(Introduction to Algorithms)