📜  用于模式搜索的Rabin-Karp算法

📅  最后修改于: 2021-04-29 15:56:58             🧑  作者: Mango

给定文本txt [0..n-1]和模式pat [0..m-1] ,编写一个函数search(char pat [],char txt []) ,将所有出现的pat []都打印在txt中[] 。您可以假设n> m。
例子:

Input:  txt[] = "THIS IS A TEST TEXT"
        pat[] = "TEST"
Output: Pattern found at index 10

Input:  txt[] =  "AABAACAADAABAABA"
        pat[] =  "AABA"
Output: Pattern found at index 0
        Pattern found at index 9
        Pattern found at index 12

模式搜索

幼稚的字符串匹配算法可将模式一一滑动。各滑动之后,在当前的换档,并且如果所有字符它逐一检查字符匹配,则打印该匹配。
像朴素算法一样,Rabin-Karp算法也可以使图案一个接一个地滑动。但是与Naive算法不同,Rabin Karp算法将模式的哈希值与文本的当前子字符串的哈希值进行匹配,如果哈希值匹配,则仅开始匹配各个字符。因此Rabin Karp算法需要计算以下字符串的哈希值。
1)模式本身。
2)长度为m的文本的所有子字符串。

由于我们需要为文本大小为m的所有子字符串有效地计算哈希值,因此我们必须具有具有以下属性的哈希函数。
下一个班次的哈希值必须可以根据当前哈希值和文本中的下一个字符有效地计算得出,或者可以说hash(txt [s + 1 .. s + m])必须可以根据hash(txt [s .. s ]有效计算得出+ m-1])txt [s + m],hash(txt [s + 1 .. s + m]) = rehash(txt [s + m],hash(txt [s .. s + m- 1])),并且重新哈希必须为O(1)操作。
Rabin和Karp建议的哈希函数计算一个整数值。用于字符串的整数值是一个字符串的数值。

例如,如果所有可能的字符都为1到10,则“ 122”的数值将为122。可能的字符数大于10(通常为256),并且图案长度可能很大。因此,数值实际上不能存储为整数。因此,使用模块化算法来计算数值,以确保可以将哈希值存储在整数变量中(可以适合存储字)。要进行重新哈希处理,我们需要删除最高有效位数,并在哈希值中添加新的最低有效位数。使用以下公式可进行重新哈希处理。

hash(txt [s + 1 .. s + m])=(d(hash(txt [s .. s + m-1] – txt [s] * h)+ txt [s + m])mod q
hash(txt [s .. s + m-1]) :移位s上的哈希值。
hash(txt [s + 1 .. s + m]) :下一个移位(或移位s +1)的哈希值
d:在字母字符数
q :质数
h:d ^(m-1)

上面的表达式如何工作?

C++
/* Following program is a C++ implementation of Rabin Karp 
Algorithm given in the CLRS book */
#include 
using namespace std;
  
// d is the number of characters in the input alphabet 
#define d 256 
  
/* pat -> pattern 
    txt -> text 
    q -> A prime number 
*/
void search(char pat[], char txt[], int q) 
{ 
    int M = strlen(pat); 
    int N = strlen(txt); 
    int i, j; 
    int p = 0; // hash value for pattern 
    int t = 0; // hash value for txt 
    int h = 1; 
  
    // The value of h would be "pow(d, M-1)%q" 
    for (i = 0; i < M - 1; i++) 
        h = (h * d) % q; 
  
    // Calculate the hash value of pattern and first 
    // window of text 
    for (i = 0; i < M; i++) 
    { 
        p = (d * p + pat[i]) % q; 
        t = (d * t + txt[i]) % q; 
    } 
  
    // Slide the pattern over text one by one 
    for (i = 0; i <= N - M; i++) 
    { 
  
        // Check the hash values of current window of text 
        // and pattern. If the hash values match then only 
        // check for characters on by one 
        if ( p == t ) 
        { 
            /* Check for characters one by one */
            for (j = 0; j < M; j++) 
            { 
                if (txt[i+j] != pat[j]) 
                    break; 
            } 
  
            // if p == t and pat[0...M-1] = txt[i, i+1, ...i+M-1] 
            if (j == M) 
                cout<<"Pattern found at index "<< i<


C
/* Following program is a C implementation of Rabin Karp
Algorithm given in the CLRS book */
#include
#include
  
// d is the number of characters in the input alphabet
#define d 256
  
/* pat -> pattern
    txt -> text
    q -> A prime number
*/
void search(char pat[], char txt[], int q)
{
    int M = strlen(pat);
    int N = strlen(txt);
    int i, j;
    int p = 0; // hash value for pattern
    int t = 0; // hash value for txt
    int h = 1;
  
    // The value of h would be "pow(d, M-1)%q"
    for (i = 0; i < M-1; i++)
        h = (h*d)%q;
  
    // Calculate the hash value of pattern and first
    // window of text
    for (i = 0; i < M; i++)
    {
        p = (d*p + pat[i])%q;
        t = (d*t + txt[i])%q;
    }
  
    // Slide the pattern over text one by one
    for (i = 0; i <= N - M; i++)
    {
  
        // Check the hash values of current window of text
        // and pattern. If the hash values match then only
        // check for characters on by one
        if ( p == t )
        {
            /* Check for characters one by one */
            for (j = 0; j < M; j++)
            {
                if (txt[i+j] != pat[j])
                    break;
            }
  
            // if p == t and pat[0...M-1] = txt[i, i+1, ...i+M-1]
            if (j == M)
                printf("Pattern found at index %d \n", i);
        }
  
        // Calculate hash value for next window of text: Remove
        // leading digit, add trailing digit
        if ( i < N-M )
        {
            t = (d*(t - txt[i]*h) + txt[i+M])%q;
  
            // We might get negative value of t, converting it
            // to positive
            if (t < 0)
            t = (t + q);
        }
    }
}
  
/* Driver Code */
int main()
{
    char txt[] = "GEEKS FOR GEEKS";
    char pat[] = "GEEK";
    
      // A prime number
    int q = 101; 
    
      // function call
    search(pat, txt, q);
    return 0;
}


Java
// Following program is a Java implementation 
// of Rabin Karp Algorithm given in the CLRS book
  
public class Main 
{
    // d is the number of characters in the input alphabet
    public final static int d = 256;
      
    /* pat -> pattern
        txt -> text
        q -> A prime number
    */
    static void search(String pat, String txt, int q)
    {
        int M = pat.length();
        int N = txt.length();
        int i, j;
        int p = 0; // hash value for pattern
        int t = 0; // hash value for txt
        int h = 1;
      
        // The value of h would be "pow(d, M-1)%q"
        for (i = 0; i < M-1; i++)
            h = (h*d)%q;
      
        // Calculate the hash value of pattern and first
        // window of text
        for (i = 0; i < M; i++)
        {
            p = (d*p + pat.charAt(i))%q;
            t = (d*t + txt.charAt(i))%q;
        }
      
        // Slide the pattern over text one by one
        for (i = 0; i <= N - M; i++)
        {
      
            // Check the hash values of current window of text
            // and pattern. If the hash values match then only
            // check for characters on by one
            if ( p == t )
            {
                /* Check for characters one by one */
                for (j = 0; j < M; j++)
                {
                    if (txt.charAt(i+j) != pat.charAt(j))
                        break;
                }
      
                // if p == t and pat[0...M-1] = txt[i, i+1, ...i+M-1]
                if (j == M)
                    System.out.println("Pattern found at index " + i);
            }
      
            // Calculate hash value for next window of text: Remove
            // leading digit, add trailing digit
            if ( i < N-M )
            {
                t = (d*(t - txt.charAt(i)*h) + txt.charAt(i+M))%q;
      
                // We might get negative value of t, converting it
                // to positive
                if (t < 0)
                t = (t + q);
            }
        }
    }
      
    /* Driver Code */
    public static void main(String[] args)
    {
        String txt = "GEEKS FOR GEEKS";
        String pat = "GEEK";
            
          // A prime number
        int q = 101; 
        
          // Function Call
        search(pat, txt, q);
    }
}
  
// This code is contributed by nuclode


Python
# Following program is the python implementation of
# Rabin Karp Algorithm given in CLRS book
  
# d is the number of characters in the input alphabet
d = 256
  
# pat  -> pattern
# txt  -> text
# q    -> A prime number
  
def search(pat, txt, q):
    M = len(pat)
    N = len(txt)
    i = 0
    j = 0
    p = 0    # hash value for pattern
    t = 0    # hash value for txt
    h = 1
  
    # The value of h would be "pow(d, M-1)%q"
    for i in xrange(M-1):
        h = (h*d)%q
  
    # Calculate the hash value of pattern and first window
    # of text
    for i in xrange(M):
        p = (d*p + ord(pat[i]))%q
        t = (d*t + ord(txt[i]))%q
  
    # Slide the pattern over text one by one
    for i in xrange(N-M+1):
        # Check the hash values of current window of text and
        # pattern if the hash values match then only check
        # for characters on by one
        if p==t:
            # Check for characters one by one
            for j in xrange(M):
                if txt[i+j] != pat[j]:
                    break
                else: j+=1
  
            # if p == t and pat[0...M-1] = txt[i, i+1, ...i+M-1]
            if j==M:
                print "Pattern found at index " + str(i)
  
        # Calculate hash value for next window of text: Remove
        # leading digit, add trailing digit
        if i < N-M:
            t = (d*(t-ord(txt[i])*h) + ord(txt[i+M]))%q
  
            # We might get negative values of t, converting it to
            # positive
            if t < 0:
                t = t+q
  
# Driver Code
txt = "GEEKS FOR GEEKS"
pat = "GEEK"
  
# A prime number
q = 101 
  
# Function Call
search(pat,txt,q)
  
# This code is contributed by Bhavya Jain


C#
// Following program is a C# implementation 
// of Rabin Karp Algorithm given in the CLRS book 
using System;
public class GFG 
{ 
    // d is the number of characters in the input alphabet 
    public readonly static int d = 256; 
      
    /* pat -> pattern 
        txt -> text 
        q -> A prime number 
    */
    static void search(String pat, String txt, int q) 
    { 
        int M = pat.Length; 
        int N = txt.Length; 
        int i, j; 
        int p = 0; // hash value for pattern 
        int t = 0; // hash value for txt 
        int h = 1; 
      
        // The value of h would be "pow(d, M-1)%q" 
        for (i = 0; i < M-1; i++) 
            h = (h*d)%q; 
      
        // Calculate the hash value of pattern and first 
        // window of text 
        for (i = 0; i < M; i++) 
        { 
            p = (d*p + pat[i])%q; 
            t = (d*t + txt[i])%q; 
        } 
      
        // Slide the pattern over text one by one 
        for (i = 0; i <= N - M; i++) 
        { 
      
            // Check the hash values of current window of text 
            // and pattern. If the hash values match then only 
            // check for characters on by one 
            if ( p == t ) 
            { 
                /* Check for characters one by one */
                for (j = 0; j < M; j++) 
                { 
                    if (txt[i+j] != pat[j]) 
                        break; 
                } 
      
                // if p == t and pat[0...M-1] = txt[i, i+1, ...i+M-1] 
                if (j == M) 
                    Console.WriteLine("Pattern found at index " + i); 
            } 
      
            // Calculate hash value for next window of text: Remove 
            // leading digit, add trailing digit 
            if ( i < N-M ) 
            { 
                t = (d*(t - txt[i]*h) + txt[i+M])%q; 
      
                // We might get negative value of t, converting it 
                // to positive 
                if (t < 0) 
                t = (t + q); 
            } 
        } 
    } 
      
    /* Driver Code */
    public static void Main() 
    { 
        String txt = "GEEKS FOR GEEKS"; 
        String pat = "GEEK"; 
        
          // A prime number 
        int q = 101;
        
          // Function Call
        search(pat, txt, q); 
    } 
} 
  
// This code is contributed by PrinciRaj19992


PHP
 pattern
   txt -> text
   q -> A prime number
*/
function search($pat, $txt, $q)
{
    $M = strlen($pat);
    $N = strlen($txt);
    $i; $j;
    $p = 0; // hash value 
            // for pattern
    $t = 0; // hash value 
            // for txt
    $h = 1;
    $d =1;
  
    // The value of h would
    // be "pow(d, M-1)%q"
    for ($i = 0; $i < $M - 1; $i++)
        $h = ($h * $d) % $q;
  
    // Calculate the hash value
    // of pattern and first
    // window of text
    for ($i = 0; $i < $M; $i++)
    {
        $p = ($d * $p + $pat[$i]) % $q;
        $t = ($d * $t + $txt[$i]) % $q;
    }
  
    // Slide the pattern over
    // text one by one
    for ($i = 0; $i <= $N - $M; $i++)
    {
  
        // Check the hash values of 
        // current window of text
        // and pattern. If the hash
        // values match then only
        // check for characters on
        // by one
        if ($p == $t)
        {
            // Check for characters
            // one by one
            for ($j = 0; $j < $M; $j++)
            {
                if ($txt[$i + $j] != $pat[$j])
                    break;
            }
  
            // if p == t and pat[0...M-1] = 
            // txt[i, i+1, ...i+M-1]
            if ($j == $M)
                echo "Pattern found at index ",
                                      $i, "\n";
        }
  
        // Calculate hash value for 
        // next window of text: 
        // Remove leading digit,
        // add trailing digit
        if ($i < $N - $M)
        {
            $t = ($d * ($t - $txt[$i] * 
                        $h) + $txt[$i + 
                             $M]) % $q;
  
            // We might get negative 
            // value of t, converting
            // it to positive
            if ($t < 0)
            $t = ($t + $q);
        }
    }
}
  
// Driver Code
$txt = "GEEKS FOR GEEKS";
$pat = "GEEK";
  
// A prime number
$q = 101;
  
// Function Call
search($pat, $txt, $q);
  
// This code is contributed
// by ajit
?>


输出:

Pattern found at index 0
Pattern found at index 10

时间复杂度:
Rabin-Karp算法的平均运行时间和最佳情况运行时间为O(n + m),但最坏情况运行时间为O(nm)。当模式和文本的所有字符都与txt []的所有子字符串的哈希值与pat []的哈希值匹配时,会发生Rabin-Karp算法的最坏情况。例如,pat [] =“ AAA”和txt [] =“ AAAAAAA”。