Aho-Corasick演算法



用於模式匹配的Aho-Corasick演算法

Aho-Corasick演算法是一種字典匹配演算法,它可以線上性時間內找到給定文字中所有模式集的所有出現。它由Alfred V. Aho和Margaret J. Corasick於1975年開發,廣泛應用於惡意軟體檢測、文字分析和自然語言處理等應用。

Aho-Corasick演算法是如何工作的?

Aho-Corasick演算法只需要遍歷文字一次即可搜尋所有模式,並且不會進行任何不必要的回溯。它可以輕鬆處理不同長度的多個關鍵字,也可以輕鬆處理重疊匹配。

該演算法藉助Trie資料結構跟蹤已搜尋的模式。Trie是一種主要用於儲存字串的樹形資料結構。

讓我們透過一個例子來理解:

Input:
set of patterns = {their, there, any, bye} 
main string = "isthereanyanswerokgoodbye"
Output:
Word there location: 2
Word any location: 7
Word bye location: 22
Pattern Aho-Corasick

Aho-Corasick演算法包含以下步驟:

  • 預處理
  • 搜尋/匹配

預處理階段

在預處理步驟中,我們從關鍵字構建有限狀態機(Trie)。Trie的每個節點代表關鍵字的一個字首,每個可能的擴充套件字首都有一條用字元標記的邊。Trie的根節點代表空字首,最後一個節點標記為最終節點。

預處理步驟進一步分為三個子步驟:

  • 跳轉階段 - 此階段根據模式的字元定義狀態之間的轉換。它用二維陣列表示。

  • 失敗階段 - 它定義了當發生不匹配時狀態之間的轉換。它用一維陣列表示。

  • 輸出階段 - 在此階段,演算法儲存在給定狀態下結束的所有模式的索引。它也由一維陣列表示。

搜尋/匹配階段

搜尋步驟是透過從左到右掃描文字,並根據文字中的字元在Trie中跟隨邊和失敗連結來完成的。每當我們到達最終節點時,我們都會報告文字中相應關鍵字的匹配。

示例

以下示例演示了Aho-Corasick演算法在不同程式語言中的工作原理。

#include <stdio.h>
#include <string.h>
#define MAXS 500    // sum of the length of all patterns
#define MAXC 26     // as 26 letters in alphabet
int output[MAXS];
int fail[MAXS];
int gotoMat[MAXS][MAXC];
int buildTree(char* array[], int size) {
   for(int i = 0; i<MAXS; i++)
      output[i] = 0;  // all element of output array is 0
   for(int i = 0; i<MAXS; i++)
      fail[i] = -1;  // all element of failure array is -1
   for(int i = 0; i<MAXS; i++)
      for(int j = 0; j<MAXC; j++)
         gotoMat[i][j] = -1;  // all element of goto matrix is -1
   // initial state
   int state = 1;    
   // make trie for all pattern in array
   for (int i = 0; i < size; i++) {    
      char* word = array[i];
      int presentState = 0;
      // adding pattern
      for (int j = 0; j < strlen(word); ++j) {    
         int ch = word[j] - 'a';
         if (gotoMat[presentState][ch] == -1)    
            // increasing state
            gotoMat[presentState][ch] = state++;   
         presentState = gotoMat[presentState][ch];
      }
      // adding current word in the output
      output[presentState] |= (1 << i); 
   }
   // if ch is not directly connected to root node
   for (int ch = 0; ch < MAXC; ++ch)   
      if (gotoMat[0][ch] == -1)
         gotoMat[0][ch] = 0;
   // node goes to previous state when fails
   for (int ch = 0; ch < MAXC; ++ch) {    
      if (gotoMat[0][ch] != 0) {
         fail[gotoMat[0][ch]] = 0;
         // adding next level node to the queue
         int q[MAXS], front = 0, rear = 0;
         q[rear++] = gotoMat[0][ch];
         while (front != rear) {
            // removing front node
            int state = q[front++];   
            for (int ch = 0; ch <= MAXC; ++ch) {
               // if goto state is present
               if (gotoMat[state][ch] != -1) {    
                  int failure = fail[state];
                  // find deepest node with proper suffix
                  while (gotoMat[failure][ch] == -1)    
                     failure = fail[failure];
                  failure = gotoMat[failure][ch];
                  fail[gotoMat[state][ch]] = failure;
                  // Merging output values
                  output[gotoMat[state][ch]] |= output[failure];  
                  // adding next level node to the queue
                  q[rear++] = gotoMat[state][ch];    
               }
            }
         }
      }
   }
   return state;
}
int getNextState(int presentState, char nextChar) {
   int answer = presentState;
   int ch = nextChar - 'a'; //subtract ascii of 'a'
   // if go to is not found, use failure function
   while (gotoMat[answer][ch] == -1) 
      answer = fail[answer];
   return gotoMat[answer][ch];
}
void patternSearch(char* arr[], int size, char* text) {
   buildTree(arr, size);  // make the trie structure
   int presentState = 0;  // make current state as 0
   // to find all occurances of pattern
   for (int i = 0; i < strlen(text); i++) {    
      presentState = getNextState(presentState, text[i]);
      // matching found and print words
      for (int j = 0; j < size; ++j) {  
         if (output[presentState] & (1 << j)) {
           printf("Word %s location: %zu\n", arr[j], i - strlen(arr[j]) + 1);
         }
      }
   }
}
int main() {
   char* arr[] = {"their", "there", "answer", "any", "bye"};
   char* text = "isthereanyanswerokgoodbye";
   int k = sizeof(arr)/sizeof(arr[0]);
   patternSearch(arr, k, text);
   return 0;
}
#include <iostream>
#include <queue>
#define MAXS 500    // sum of the length of all patterns
#define MAXC 26     // as 26 letters in alphabet
using namespace std;
int output[MAXS];
int fail[MAXS];
int gotoMat[MAXS][MAXC];
int buildTree(string array[], int size) {
   for(int i = 0; i<MAXS; i++)
      output[i] = 0;  // all element of output array is 0
   for(int i = 0; i<MAXS; i++)
      fail[i] = -1;  // all element of failure array is -1
   for(int i = 0; i<MAXS; i++)
      for(int j = 0; j<MAXC; j++)
         gotoMat[i][j] = -1;  // all element of goto matrix is -1
   // initial state
   int state = 1;    
   // make trie for all pattern in array
   for (int i = 0; i < size; i++) {    
      string word = array[i];
      int presentState = 0;
      // adding pattern
      for (int j = 0; j < word.size(); ++j) {    
         int ch = word[j] - 'a';
         if (gotoMat[presentState][ch] == -1)    
            // increasing state
            gotoMat[presentState][ch] = state++;   
         presentState = gotoMat[presentState][ch];
      }
      // adding current word in the output
      output[presentState] |= (1 << i); 
   }
   // if ch is not directly connected to root node
   for (int ch = 0; ch < MAXC; ++ch)   
      if (gotoMat[0][ch] == -1)
         gotoMat[0][ch] = 0;
   queue<int> q;
   // node goes to previous state when fails
   for (int ch = 0; ch < MAXC; ++ch) {    
      if (gotoMat[0][ch] != 0) {
         fail[gotoMat[0][ch]] = 0;
         q.push(gotoMat[0][ch]);
      }
   }
   while (q.size()) {
      // removing front node
      int state = q.front();   
      q.pop();
      for (int ch = 0; ch <= MAXC; ++ch) {
         // if goto state is present
         if (gotoMat[state][ch] != -1) {    
            int failure = fail[state];
            // find deepest node with proper suffix
            while (gotoMat[failure][ch] == -1)    
               failure = fail[failure];
               failure = gotoMat[failure][ch];
               fail[gotoMat[state][ch]] = failure;
               // Merging output values
               output[gotoMat[state][ch]] |= output[failure];  
               // adding next level node to the queue
               q.push(gotoMat[state][ch]);    
         }
      }
   }
   return state;
}
int getNextState(int presentState, char nextChar) {
   int answer = presentState;
   int ch = nextChar - 'a'; //subtract ascii of 'a'
   // if go to is not found, use failure function
   while (gotoMat[answer][ch] == -1) 
      answer = fail[answer];
   return gotoMat[answer][ch];
}
void patternSearch(string arr[], int size, string text) {
   buildTree(arr, size);  // make the trie structure
   int presentState = 0;  // make current state as 0
   // to find all occurances of pattern
   for (int i = 0; i < text.size(); i++) {    
      presentState = getNextState(presentState, text[i]);
      // matching found and print words
      for (int j = 0; j < size; ++j) {  
         if (output[presentState] & (1 << j)) {
            cout << "Word " << arr[j] << " location: " << i - arr[j].size() + 1 << endl;
         }
      }
   }
}
int main() {
   string arr[] = {"their", "there", "answer", "any", "bye"};
   string text = "isthereanyanswerokgoodbye";
   int k = sizeof(arr)/sizeof(arr[0]);
   patternSearch(arr, k, text);
   return 0;
}
import java.util.*;
public class Main {
   static final int MAXS = 500; // sum of the length of all patterns
   static final int MAXC = 26;  // as 26 letters in alphabet
   static int[] output = new int[MAXS];
   static int[] fail = new int[MAXS];
   static int[][] gotoMat = new int[MAXS][MAXC];
   // method to construct trie
   static int buildTree(String[] array, int size) {
      for(int i = 0; i<MAXS; i++)
         output[i] = 0;  // all element of output array is 0
      for(int i = 0; i<MAXS; i++)
         fail[i] = -1;  // all element of failure array is -1
      for(int i = 0; i<MAXS; i++)
         for(int j = 0; j<MAXC; j++)
            gotoMat[i][j] = -1;  // all element of goto matrix is -1
        // initial state
        int state = 1;    
        // make trie for all pattern in array
        for (int i = 0; i < size; i++) {    
            String word = array[i];
            int presentState = 0;
            // adding pattern
            for (int j = 0; j < word.length(); ++j) {    
                int ch = word.charAt(j) - 'a';
                if (gotoMat[presentState][ch] == -1)    
                    // increasing state
                    gotoMat[presentState][ch] = state++;   
                presentState = gotoMat[presentState][ch];
            }
            // adding current word in the output
            output[presentState] |= (1 << i); 
        }
        // if ch is not directly connected to root node
        for (int ch = 0; ch < MAXC; ++ch)   
            if (gotoMat[0][ch] == -1)
                gotoMat[0][ch] = 0;
        Queue<Integer> q = new LinkedList<>();
        // node goes to previous state when fails
        for (int ch = 0; ch < MAXC; ++ch) {    
            if (gotoMat[0][ch] != 0) {
                fail[gotoMat[0][ch]] = 0;
                q.add(gotoMat[0][ch]);
            }
        }
        while (!q.isEmpty()) {
            // removing front node
            state = q.poll();   
            for (int ch = 0; ch < MAXC; ++ch) {
                // if goto state is present
                if (gotoMat[state][ch] != -1) {    
                    int failure = fail[state];
                    // find deepest node with proper suffix
                    while (gotoMat[failure][ch] == -1)    
                        failure = fail[failure];
                    failure = gotoMat[failure][ch];
                    fail[gotoMat[state][ch]] = failure;
                    // Merging output values
                    output[gotoMat[state][ch]] |= output[failure];  
                    // adding next level node to the queue
                    q.add(gotoMat[state][ch]);    
                }
            }
        }
        return state;
    }
    static int getNextState(int presentState, char nextChar) {
        int answer = presentState;
        int ch = nextChar - 'a'; //subtract ascii of 'a'
        // if go to is not found, use failure function
        while (gotoMat[answer][ch] == -1) 
            answer = fail[answer];
        return gotoMat[answer][ch];
    }
    static void patternSearch(String[] arr, int size, String text) {
        buildTree(arr, size);  // make the trie structure
        int presentState = 0;  // make current state as 0
        // to find all occurances of pattern
        for (int i = 0; i < text.length(); i++) {    
            presentState = getNextState(presentState, text.charAt(i));
            // matching found and print words
            for (int j = 0; j < size; ++j) {  
                if ((output[presentState] & (1 << j)) != 0) {
                    System.out.println("Word " + arr[j] + " location: " + (i - arr[j].length() + 1));
                }
            }
        }
    }
    public static void main(String[] args) {
        String[] arr = {"their", "there", "answer", "any", "bye"};
        String text = "isthereanyanswerokgoodbye";
        int k = arr.length;
        patternSearch(arr, k, text);
    }
}
from collections import deque
MAXS = 500    # sum of the length of all patterns
MAXC = 26     # as 26 letters in alphabet
output = [0]*MAXS
fail = [-1]*MAXS
gotoMat = [[-1]*MAXC for _ in range(MAXS)]
# function to construct trie
def buildTree(array):
    global output, fail, gotoMat
    size = len(array)
    # initial state
    state = 1    
    # make trie for all pattern in array
    for i in range(size):    
        word = array[i]
        presentState = 0
        # adding pattern
        for j in range(len(word)):    
            ch = ord(word[j]) - ord('a')
            if gotoMat[presentState][ch] == -1:    
                # increasing state
                gotoMat[presentState][ch] = state   
                state += 1
            presentState = gotoMat[presentState][ch]
        # adding current word in the output
        output[presentState] |= (1 << i) 
    # if ch is not directly connected to root node
    for ch in range(MAXC):   
        if gotoMat[0][ch] == -1:
            gotoMat[0][ch] = 0
    q = deque()
    # node goes to previous state when fails
    for ch in range(MAXC):    
        if gotoMat[0][ch] != 0:
            fail[gotoMat[0][ch]] = 0
            q.append(gotoMat[0][ch])
    while q:
        # removing front node
        state = q.popleft()   
        for ch in range(MAXC):
            # if goto state is present
            if gotoMat[state][ch] != -1:    
                failure = fail[state]
                # find deepest node with proper suffix
                while gotoMat[failure][ch] == -1:    
                    failure = fail[failure]
                failure = gotoMat[failure][ch]
                fail[gotoMat[state][ch]] = failure
                # Merging output values
                output[gotoMat[state][ch]] |= output[failure]  
                # adding next level node to the queue
                q.append(gotoMat[state][ch])    
    return state

def getNextState(presentState, nextChar):
    answer = presentState
    ch = ord(nextChar) - ord('a') #subtract ascii of 'a'
    # if go to is not found, use failure function
    while gotoMat[answer][ch] == -1: 
        answer = fail[answer]
    return gotoMat[answer][ch]

def patternSearch(arr, text):
    buildTree(arr)  # make the trie structure
    presentState = 0  # make current state as 0
    size = len(arr)
    # to find all occurances of pattern
    for i in range(len(text)):    
        presentState = getNextState(presentState, text[i])
        # matching found and print words
        for j in range(size):  
            if (output[presentState] & (1 << j)) != 0:
                print(f"Word {arr[j]} location: {i - len(arr[j]) + 1}")

def main():
    arr = ["their", "there", "answer", "any", "bye"]
    text = "isthereanyanswerokgoodbye"
    patternSearch(arr, text)

if __name__ == "__main__":
    main()

輸出

Word there location: 2
Word any location: 7
Word answer location: 10
Word bye location: 22
廣告