- 資料結構與演算法
- DSA - 首頁
- DSA - 概述
- DSA - 環境設定
- DSA - 演算法基礎
- DSA - 漸進分析
- 資料結構
- DSA - 資料結構基礎
- DSA - 資料結構和型別
- DSA - 陣列資料結構
- 連結串列
- DSA - 連結串列資料結構
- DSA - 雙向連結串列資料結構
- DSA - 迴圈連結串列資料結構
- 棧與佇列
- DSA - 棧資料結構
- DSA - 表示式解析
- DSA - 佇列資料結構
- 搜尋演算法
- DSA - 搜尋演算法
- DSA - 線性搜尋演算法
- DSA - 二分搜尋演算法
- DSA - 插值搜尋
- DSA - 跳躍搜尋演算法
- DSA - 指數搜尋
- DSA - 斐波那契搜尋
- DSA - 子列表搜尋
- DSA - 雜湊表
- 排序演算法
- DSA - 排序演算法
- DSA - 氣泡排序演算法
- DSA - 插入排序演算法
- DSA - 選擇排序演算法
- DSA - 歸併排序演算法
- DSA - 希爾排序演算法
- DSA - 堆排序
- DSA - 桶排序演算法
- DSA - 計數排序演算法
- DSA - 基數排序演算法
- DSA - 快速排序演算法
- 圖資料結構
- DSA - 圖資料結構
- DSA - 深度優先遍歷
- DSA - 廣度優先遍歷
- DSA - 生成樹
- 樹資料結構
- DSA - 樹資料結構
- DSA - 樹遍歷
- DSA - 二叉搜尋樹
- DSA - AVL樹
- DSA - 紅黑樹
- DSA - B樹
- DSA - B+樹
- DSA - 伸展樹
- DSA - Trie樹
- DSA - 堆資料結構
- 遞迴
- DSA - 遞迴演算法
- DSA - 使用遞迴的漢諾塔
- DSA - 使用遞迴的斐波那契數列
- 分治法
- DSA - 分治法
- DSA - 最大-最小問題
- DSA - Strassen矩陣乘法
- DSA - Karatsuba演算法
- 貪心演算法
- DSA - 貪心演算法
- DSA - 旅行商問題(貪心法)
- DSA - Prim最小生成樹
- DSA - Kruskal最小生成樹
- DSA - Dijkstra最短路徑演算法
- DSA - 地圖著色演算法
- DSA - 分數揹包問題
- DSA - 帶截止日期的作業排序
- DSA - 最優合併模式演算法
- 動態規劃
- DSA - 動態規劃
- DSA - 矩陣鏈乘法
- DSA - Floyd-Warshall演算法
- DSA - 0-1揹包問題
- DSA - 最長公共子序列演算法
- DSA - 旅行商問題(動態規劃法)
- 近似演算法
- DSA - 近似演算法
- DSA - 頂點覆蓋演算法
- DSA - 集合覆蓋問題
- DSA - 旅行商問題(近似演算法)
- 隨機化演算法
- DSA - 隨機化演算法
- DSA - 隨機化快速排序演算法
- DSA - Karger最小割演算法
- DSA - Fisher-Yates洗牌演算法
- DSA有用資源
- DSA - 問答
- DSA - 快速指南
- DSA - 有用資源
- DSA - 討論
Aho-Corasick演算法
用於模式匹配的Aho-Corasick演算法
Aho-Corasick演算法是一種字典匹配演算法,它可以線上性時間內找到給定文字中所有模式集的所有出現。它由Alfred V. Aho和Margaret J. Corasick於1975年開發,廣泛應用於惡意軟體檢測、文字分析和自然語言處理等應用。
Aho-Corasick演算法是如何工作的?
Aho-Corasick演算法只需要遍歷文字一次即可搜尋所有模式,並且不會進行任何不必要的回溯。它可以輕鬆處理不同長度的多個關鍵字,也可以輕鬆處理重疊匹配。
該演算法藉助Trie資料結構跟蹤已搜尋的模式。Trie是一種主要用於儲存字串的樹形資料結構。
讓我們透過一個例子來理解:
Input:
set of patterns = {their, there, any, bye}
main string = "isthereanyanswerokgoodbye"
Output:
Word there location: 2
Word any location: 7
Word bye location: 22
Aho-Corasick演算法包含以下步驟:
- 預處理
- 搜尋/匹配
預處理階段
在預處理步驟中,我們從關鍵字構建有限狀態機(Trie)。Trie的每個節點代表關鍵字的一個字首,每個可能的擴充套件字首都有一條用字元標記的邊。Trie的根節點代表空字首,最後一個節點標記為最終節點。
預處理步驟進一步分為三個子步驟:
跳轉階段 - 此階段根據模式的字元定義狀態之間的轉換。它用二維陣列表示。
失敗階段 - 它定義了當發生不匹配時狀態之間的轉換。它用一維陣列表示。
輸出階段 - 在此階段,演算法儲存在給定狀態下結束的所有模式的索引。它也由一維陣列表示。
搜尋/匹配階段
搜尋步驟是透過從左到右掃描文字,並根據文字中的字元在Trie中跟隨邊和失敗連結來完成的。每當我們到達最終節點時,我們都會報告文字中相應關鍵字的匹配。
示例
以下示例演示了Aho-Corasick演算法在不同程式語言中的工作原理。
#include <stdio.h>
#include <string.h>
#define MAXS 500 // sum of the length of all patterns
#define MAXC 26 // as 26 letters in alphabet
int output[MAXS];
int fail[MAXS];
int gotoMat[MAXS][MAXC];
int buildTree(char* array[], int size) {
for(int i = 0; i<MAXS; i++)
output[i] = 0; // all element of output array is 0
for(int i = 0; i<MAXS; i++)
fail[i] = -1; // all element of failure array is -1
for(int i = 0; i<MAXS; i++)
for(int j = 0; j<MAXC; j++)
gotoMat[i][j] = -1; // all element of goto matrix is -1
// initial state
int state = 1;
// make trie for all pattern in array
for (int i = 0; i < size; i++) {
char* word = array[i];
int presentState = 0;
// adding pattern
for (int j = 0; j < strlen(word); ++j) {
int ch = word[j] - 'a';
if (gotoMat[presentState][ch] == -1)
// increasing state
gotoMat[presentState][ch] = state++;
presentState = gotoMat[presentState][ch];
}
// adding current word in the output
output[presentState] |= (1 << i);
}
// if ch is not directly connected to root node
for (int ch = 0; ch < MAXC; ++ch)
if (gotoMat[0][ch] == -1)
gotoMat[0][ch] = 0;
// node goes to previous state when fails
for (int ch = 0; ch < MAXC; ++ch) {
if (gotoMat[0][ch] != 0) {
fail[gotoMat[0][ch]] = 0;
// adding next level node to the queue
int q[MAXS], front = 0, rear = 0;
q[rear++] = gotoMat[0][ch];
while (front != rear) {
// removing front node
int state = q[front++];
for (int ch = 0; ch <= MAXC; ++ch) {
// if goto state is present
if (gotoMat[state][ch] != -1) {
int failure = fail[state];
// find deepest node with proper suffix
while (gotoMat[failure][ch] == -1)
failure = fail[failure];
failure = gotoMat[failure][ch];
fail[gotoMat[state][ch]] = failure;
// Merging output values
output[gotoMat[state][ch]] |= output[failure];
// adding next level node to the queue
q[rear++] = gotoMat[state][ch];
}
}
}
}
}
return state;
}
int getNextState(int presentState, char nextChar) {
int answer = presentState;
int ch = nextChar - 'a'; //subtract ascii of 'a'
// if go to is not found, use failure function
while (gotoMat[answer][ch] == -1)
answer = fail[answer];
return gotoMat[answer][ch];
}
void patternSearch(char* arr[], int size, char* text) {
buildTree(arr, size); // make the trie structure
int presentState = 0; // make current state as 0
// to find all occurances of pattern
for (int i = 0; i < strlen(text); i++) {
presentState = getNextState(presentState, text[i]);
// matching found and print words
for (int j = 0; j < size; ++j) {
if (output[presentState] & (1 << j)) {
printf("Word %s location: %zu\n", arr[j], i - strlen(arr[j]) + 1);
}
}
}
}
int main() {
char* arr[] = {"their", "there", "answer", "any", "bye"};
char* text = "isthereanyanswerokgoodbye";
int k = sizeof(arr)/sizeof(arr[0]);
patternSearch(arr, k, text);
return 0;
}
#include <iostream>
#include <queue>
#define MAXS 500 // sum of the length of all patterns
#define MAXC 26 // as 26 letters in alphabet
using namespace std;
int output[MAXS];
int fail[MAXS];
int gotoMat[MAXS][MAXC];
int buildTree(string array[], int size) {
for(int i = 0; i<MAXS; i++)
output[i] = 0; // all element of output array is 0
for(int i = 0; i<MAXS; i++)
fail[i] = -1; // all element of failure array is -1
for(int i = 0; i<MAXS; i++)
for(int j = 0; j<MAXC; j++)
gotoMat[i][j] = -1; // all element of goto matrix is -1
// initial state
int state = 1;
// make trie for all pattern in array
for (int i = 0; i < size; i++) {
string word = array[i];
int presentState = 0;
// adding pattern
for (int j = 0; j < word.size(); ++j) {
int ch = word[j] - 'a';
if (gotoMat[presentState][ch] == -1)
// increasing state
gotoMat[presentState][ch] = state++;
presentState = gotoMat[presentState][ch];
}
// adding current word in the output
output[presentState] |= (1 << i);
}
// if ch is not directly connected to root node
for (int ch = 0; ch < MAXC; ++ch)
if (gotoMat[0][ch] == -1)
gotoMat[0][ch] = 0;
queue<int> q;
// node goes to previous state when fails
for (int ch = 0; ch < MAXC; ++ch) {
if (gotoMat[0][ch] != 0) {
fail[gotoMat[0][ch]] = 0;
q.push(gotoMat[0][ch]);
}
}
while (q.size()) {
// removing front node
int state = q.front();
q.pop();
for (int ch = 0; ch <= MAXC; ++ch) {
// if goto state is present
if (gotoMat[state][ch] != -1) {
int failure = fail[state];
// find deepest node with proper suffix
while (gotoMat[failure][ch] == -1)
failure = fail[failure];
failure = gotoMat[failure][ch];
fail[gotoMat[state][ch]] = failure;
// Merging output values
output[gotoMat[state][ch]] |= output[failure];
// adding next level node to the queue
q.push(gotoMat[state][ch]);
}
}
}
return state;
}
int getNextState(int presentState, char nextChar) {
int answer = presentState;
int ch = nextChar - 'a'; //subtract ascii of 'a'
// if go to is not found, use failure function
while (gotoMat[answer][ch] == -1)
answer = fail[answer];
return gotoMat[answer][ch];
}
void patternSearch(string arr[], int size, string text) {
buildTree(arr, size); // make the trie structure
int presentState = 0; // make current state as 0
// to find all occurances of pattern
for (int i = 0; i < text.size(); i++) {
presentState = getNextState(presentState, text[i]);
// matching found and print words
for (int j = 0; j < size; ++j) {
if (output[presentState] & (1 << j)) {
cout << "Word " << arr[j] << " location: " << i - arr[j].size() + 1 << endl;
}
}
}
}
int main() {
string arr[] = {"their", "there", "answer", "any", "bye"};
string text = "isthereanyanswerokgoodbye";
int k = sizeof(arr)/sizeof(arr[0]);
patternSearch(arr, k, text);
return 0;
}
import java.util.*;
public class Main {
static final int MAXS = 500; // sum of the length of all patterns
static final int MAXC = 26; // as 26 letters in alphabet
static int[] output = new int[MAXS];
static int[] fail = new int[MAXS];
static int[][] gotoMat = new int[MAXS][MAXC];
// method to construct trie
static int buildTree(String[] array, int size) {
for(int i = 0; i<MAXS; i++)
output[i] = 0; // all element of output array is 0
for(int i = 0; i<MAXS; i++)
fail[i] = -1; // all element of failure array is -1
for(int i = 0; i<MAXS; i++)
for(int j = 0; j<MAXC; j++)
gotoMat[i][j] = -1; // all element of goto matrix is -1
// initial state
int state = 1;
// make trie for all pattern in array
for (int i = 0; i < size; i++) {
String word = array[i];
int presentState = 0;
// adding pattern
for (int j = 0; j < word.length(); ++j) {
int ch = word.charAt(j) - 'a';
if (gotoMat[presentState][ch] == -1)
// increasing state
gotoMat[presentState][ch] = state++;
presentState = gotoMat[presentState][ch];
}
// adding current word in the output
output[presentState] |= (1 << i);
}
// if ch is not directly connected to root node
for (int ch = 0; ch < MAXC; ++ch)
if (gotoMat[0][ch] == -1)
gotoMat[0][ch] = 0;
Queue<Integer> q = new LinkedList<>();
// node goes to previous state when fails
for (int ch = 0; ch < MAXC; ++ch) {
if (gotoMat[0][ch] != 0) {
fail[gotoMat[0][ch]] = 0;
q.add(gotoMat[0][ch]);
}
}
while (!q.isEmpty()) {
// removing front node
state = q.poll();
for (int ch = 0; ch < MAXC; ++ch) {
// if goto state is present
if (gotoMat[state][ch] != -1) {
int failure = fail[state];
// find deepest node with proper suffix
while (gotoMat[failure][ch] == -1)
failure = fail[failure];
failure = gotoMat[failure][ch];
fail[gotoMat[state][ch]] = failure;
// Merging output values
output[gotoMat[state][ch]] |= output[failure];
// adding next level node to the queue
q.add(gotoMat[state][ch]);
}
}
}
return state;
}
static int getNextState(int presentState, char nextChar) {
int answer = presentState;
int ch = nextChar - 'a'; //subtract ascii of 'a'
// if go to is not found, use failure function
while (gotoMat[answer][ch] == -1)
answer = fail[answer];
return gotoMat[answer][ch];
}
static void patternSearch(String[] arr, int size, String text) {
buildTree(arr, size); // make the trie structure
int presentState = 0; // make current state as 0
// to find all occurances of pattern
for (int i = 0; i < text.length(); i++) {
presentState = getNextState(presentState, text.charAt(i));
// matching found and print words
for (int j = 0; j < size; ++j) {
if ((output[presentState] & (1 << j)) != 0) {
System.out.println("Word " + arr[j] + " location: " + (i - arr[j].length() + 1));
}
}
}
}
public static void main(String[] args) {
String[] arr = {"their", "there", "answer", "any", "bye"};
String text = "isthereanyanswerokgoodbye";
int k = arr.length;
patternSearch(arr, k, text);
}
}
from collections import deque
MAXS = 500 # sum of the length of all patterns
MAXC = 26 # as 26 letters in alphabet
output = [0]*MAXS
fail = [-1]*MAXS
gotoMat = [[-1]*MAXC for _ in range(MAXS)]
# function to construct trie
def buildTree(array):
global output, fail, gotoMat
size = len(array)
# initial state
state = 1
# make trie for all pattern in array
for i in range(size):
word = array[i]
presentState = 0
# adding pattern
for j in range(len(word)):
ch = ord(word[j]) - ord('a')
if gotoMat[presentState][ch] == -1:
# increasing state
gotoMat[presentState][ch] = state
state += 1
presentState = gotoMat[presentState][ch]
# adding current word in the output
output[presentState] |= (1 << i)
# if ch is not directly connected to root node
for ch in range(MAXC):
if gotoMat[0][ch] == -1:
gotoMat[0][ch] = 0
q = deque()
# node goes to previous state when fails
for ch in range(MAXC):
if gotoMat[0][ch] != 0:
fail[gotoMat[0][ch]] = 0
q.append(gotoMat[0][ch])
while q:
# removing front node
state = q.popleft()
for ch in range(MAXC):
# if goto state is present
if gotoMat[state][ch] != -1:
failure = fail[state]
# find deepest node with proper suffix
while gotoMat[failure][ch] == -1:
failure = fail[failure]
failure = gotoMat[failure][ch]
fail[gotoMat[state][ch]] = failure
# Merging output values
output[gotoMat[state][ch]] |= output[failure]
# adding next level node to the queue
q.append(gotoMat[state][ch])
return state
def getNextState(presentState, nextChar):
answer = presentState
ch = ord(nextChar) - ord('a') #subtract ascii of 'a'
# if go to is not found, use failure function
while gotoMat[answer][ch] == -1:
answer = fail[answer]
return gotoMat[answer][ch]
def patternSearch(arr, text):
buildTree(arr) # make the trie structure
presentState = 0 # make current state as 0
size = len(arr)
# to find all occurances of pattern
for i in range(len(text)):
presentState = getNextState(presentState, text[i])
# matching found and print words
for j in range(size):
if (output[presentState] & (1 << j)) != 0:
print(f"Word {arr[j]} location: {i - len(arr[j]) + 1}")
def main():
arr = ["their", "there", "answer", "any", "bye"]
text = "isthereanyanswerokgoodbye"
patternSearch(arr, text)
if __name__ == "__main__":
main()
輸出
Word there location: 2 Word any location: 7 Word answer location: 10 Word bye location: 22
廣告