Kasai 演算法



Kasai 演算法 用於根據給定的字尾陣列和文字構建最長公共字首 (LCP) 陣列。構建 LCP 陣列後,可以有效地在給定文字中搜索模式。我們已經討論了幾種可以有效解決模式匹配問題的演算法,包括 KMP 演算法、Boyer-Moore 演算法和 Rabin-Karp 演算法。在本教程中,我們將探討 Kasai 演算法。

Kasai 演算法的工作原理?

要理解Kasai 演算法,我們首先需要學習該演算法的兩個核心概念:

  • 字尾陣列 - 這是一個數組,按字典順序儲存給定文字中所有後綴的起始索引。

  • LCP 陣列 - 顧名思義,兩個字串的最長公共字首 (簡稱 LCP) 是最長的既是兩個字串字首的字串。

Kasai 演算法基於以下觀察結果:

如果從位置ij 開始的兩個字尾的 LCP 為k,則從i+1j+1 開始的字尾的 LCP 至少為k-1,除非其中一個是字尾陣列中的最後一個字尾。這是因為在移除第一個字元後,字尾中字元的相對順序保持不變,除非它們到達文字的結尾。因此,我們可以利用此屬性在後綴陣列的線性掃描中計算 LCP 值,從第一個字尾開始,並將當前 LCP 值儲存在變數k 中。

每當我們移動到下一個字尾對時,我們將k減 1,然後只要位置i+kj+k 處的字元匹配,就遞增k。為了找到下一個字尾對,我們使用一個反向陣列,它將每個字尾索引對映到其在後綴陣列中的位置。

讓我們考慮 Kasai 演算法的輸入輸出場景:

Input:
string: "AABAAABCEDBABCDDEBC" 
Output:
Suffix Array: 0 1 9 3 8 2 14 10 4 11 5 15 7 12 13 6 
Common Prefix Array: 1 2 3 0 4 1 2 2 0 1 1 1 1 0 1 0 

示例

以下示例在不同的程式語言中實際演示了 Kasai 演算法。

#include<stdio.h> 
#include<string.h> 
#include<stdlib.h> 
// Defining a structure to represent suffix
struct suffixes {
   int index; 
   int rank[2];  
};
// function to compare two suffixes    
int compare(const void* a, const void* b) { 
   struct suffixes* suf1 = (struct suffixes*)a;
   struct suffixes* suf2 = (struct suffixes*)b;
   // If first rank is same
   if(suf1->rank[0] == suf2->rank[0]) { 
      // Compare second rank    
      return (suf1->rank[1] < suf2->rank[1]) ? -1 : 1;
   }else {
      return (suf1->rank[0] < suf2->rank[0]) ? -1 : 1;
   }
}
// function to build suffix array
int* createSuffArray(char* orgnlString, int n) { 
   struct suffixes suffArray[n]; 
   for (int i = 0; i < n; i++) {
      suffArray[i].index = i;
      // Rank based on character itself     
      suffArray[i].rank[0] = orgnlString[i] - 'a'; 
      // Next rank is next character
      suffArray[i].rank[1] = ((i+1)<n)?(orgnlString[i+1]-'a'):-1; 
   }
   // Sorting the suffixes 
   qsort(suffArray, n, sizeof(struct suffixes), compare); 
   int index[n];
   for (int k = 4; k < 2*n; k = k*2) {     
      int currRank = 0;
      int prevRank = suffArray[0].rank[0];
      suffArray[0].rank[0] = currRank;
      index[suffArray[0].index] = 0;
      // to assign rank and index values to first suffix    
      for (int i = 1; i < n; i++) { 
         if (suffArray[i].rank[0] == prevRank && suffArray[i].rank[1] == suffArray[i-1].rank[1]) {
            prevRank = suffArray[i].rank[0];
            // If same as previous rank, assign the same new rank
            suffArray[i].rank[0] = currRank; 
         } else{   
            prevRank = suffArray[i].rank[0];
            // increment rank and assign
            suffArray[i].rank[0] = ++currRank; 
         }
         index[suffArray[i].index] = i;
      }
      for (int i = 0; i < n; i++) {   
         int nextIndex = suffArray[i].index + k/2;
         suffArray[i].rank[1] = (nextIndex < n)? suffArray[index[nextIndex]].rank[0]: -1;
      }
      qsort(suffArray, n, sizeof(struct suffixes), compare); 
   }
   // to store indexes of all sorted suffixes
   int* suffixVector = (int*)malloc(n * sizeof(int)); 
   for (int i = 0; i < n; i++)
      suffixVector[i] = suffArray[i].index;    
      return  suffixVector; 
}
// applying Kasai's algorithm to build LCP array
int* kasaiAlgorithm(char* orgnlString, int* suffixVector, int n) { 
   // To store lcp array 
   int* longPrefix = (int*)malloc(n * sizeof(int));  
   // To store inverse of suffix array elements
   int* suffixInverse = (int*)malloc(n * sizeof(int)); 
   // to fill values in suffixInverse[] array
   for (int i=0; i < n; i++)
      suffixInverse[suffixVector[i]] = i;     
   int k = 0;
   for (int i=0; i<n; i++) {    
      if (suffixInverse[i] == n-1) {    
         k = 0;
         continue;
      }
      int j = suffixVector[suffixInverse[i]+1];
      while (i+k<n && j+k<n && orgnlString[i+k]==orgnlString[j+k]) 
         k++;
      longPrefix[suffixInverse[i]] = k;   
      if (k>0)
         k--;  
   }
   return longPrefix; 
}
void displayArray(int* vec, int n) { 
   for (int i = 0; i < n; i++)
      printf("%d ", vec[i]); 
   printf("\n");
}
int main() { 
   char orgnlString[] = "AAABCAEAAABCBDDAAAABC"; 
   int n = strlen(orgnlString);
   int* suffArray = createSuffArray(orgnlString, n); 
   printf("Suffix Array is: \n"); 
   displayArray(suffArray, n);
   // calling function to build LCP array
   int* prefixCommn = kasaiAlgorithm(orgnlString, suffArray, n); 
    // Print the LCP array
   printf("Common Prefix Array is: \n");
   displayArray(prefixCommn, n);
   return 0;
}
#include<iostream> 
#include<vector> 
#include<algorithm> 
using namespace std; 
// Defining a structure to represent suffix
struct suffixes {
   int index; 
   int rank[2];  
};
// function to compare two suffixes    
bool compare(suffixes suf1, suffixes suf2) { 
   // If first rank is same
   if(suf1.rank[0] == suf2.rank[0]) { 
      // Compare second rank    
      if(suf1.rank[1] < suf2.rank[1]) 
         return true;
      else
         return false;
   }else {
      if(suf1.rank[0] < suf2.rank[0]) 
         return true;
      else
         return false;
   }
}
// function to build suffix array
vector<int> createSuffArray(string orgnlString) { 
   int n = orgnlString.size(); 
   suffixes suffArray[n]; 
   for (int i = 0; i < n; i++) {
      suffArray[i].index = i;
      // Rank based on character itself     
      suffArray[i].rank[0] = orgnlString[i] - 'a'; 
      // Next rank is next character
      suffArray[i].rank[1] = ((i+1)<n)?(orgnlString[i+1]-'a'):-1; 
   }
   // Sorting the suffixes 
   sort(suffArray, suffArray+n, compare); 
   int index[n];

   for (int k = 4; k < 2*n; k = k*2) {     
      int currRank = 0;
      int prevRank = suffArray[0].rank[0];
      suffArray[0].rank[0] = currRank;
      index[suffArray[0].index] = 0;
      // to assign rank and index values to first suffix    
      for (int i = 1; i < n; i++) { 
         if (suffArray[i].rank[0] == prevRank && suffArray[i].rank[1] == suffArray[i-1].rank[1]) {
            prevRank = suffArray[i].rank[0];
            // If same as previous rank, assign the same new rank
            suffArray[i].rank[0] = currRank; 
         } else{   
            prevRank = suffArray[i].rank[0];
            // increment rank and assign
            suffArray[i].rank[0] = ++currRank; 
         }
         index[suffArray[i].index] = i;
      }
      for (int i = 0; i < n; i++) {   
         int nextIndex = suffArray[i].index + k/2;
         suffArray[i].rank[1] = (nextIndex < n)? suffArray[index[nextIndex]].rank[0]: -1;
      }
      sort(suffArray, suffArray+n, compare); 
   }
   // to store indexes of all sorted suffixes
   vector<int>suffixVector; 
   for (int i = 0; i < n; i++)
      suffixVector.push_back(suffArray[i].index);    
      return  suffixVector; 
}
// applying Kasai's algorithm to build LCP array
vector<int> kasaiAlgorithm(string orgnlString, vector<int> suffixVector) { 
   int n = suffixVector.size();
   // To store lcp array 
   vector<int> longPrefix(n, 0);  
   // To store inverse of suffix array elements
   vector<int> suffixInverse(n, 0); 
   // to fill values in suffixInverse[] array
   for (int i=0; i < n; i++)
      suffixInverse[suffixVector[i]] = i;     
   int k = 0;
   for (int i=0; i<n; i++) {    
      if (suffixInverse[i] == n-1) {    
         k = 0;
         continue;
      }
      int j = suffixVector[suffixInverse[i]+1];
      while (i+k<n && j+k<n && orgnlString[i+k]==orgnlString[j+k]) 
         k++;
      longPrefix[suffixInverse[i]] = k;   
      if (k>0)
         k--;  
   }
   return longPrefix; 
}
void displayArray(vector<int> vec) { 
   vector<int>::iterator it;
   for (it = vec.begin(); it < vec.end() ; it++)
      cout << *it << " "; 
   cout << endl;
}
int main() { 
   string orgnlString = "AAABCAEAAABCBDDAAAABC"; 
   vector<int>suffArray = createSuffArray(orgnlString); 
   int n = suffArray.size();
   cout<< "Suffix Array is: "<<endl; 
   displayArray(suffArray);
   // calling function to build LCP array
   vector<int>prefixCommn = kasaiAlgorithm(orgnlString, suffArray); 
    // Print the LCP array
   cout<< "Common Prefix Array is: "<<endl;
   displayArray(prefixCommn);
}
import java.util.Arrays;
public class Main {
    // Defining a class to represent suffix
    static class suffixes {
        int index;
        int[] rank = new int[2];
    }
    // method to compare two suffixes  
    static int compare(suffixes suf1, suffixes suf2) {
        // If first rank is same
        if (suf1.rank[0] == suf2.rank[0]) {
            // Compare second rank 
            if (suf1.rank[1] < suf2.rank[1])
                return -1;
            else
                return 1;
        } else {
            if (suf1.rank[0] < suf2.rank[0])
                return -1;
            else
                return 1;
        }
    }
    // method to build suffix array
    static int[] createSuffArray(String orgnlString) {
        int n = orgnlString.length();
        suffixes[] suffArray = new suffixes[n];
        for (int i = 0; i < n; i++) {
            suffArray[i] = new suffixes();
            suffArray[i].index = i;
            // Rank based on character itself     
            suffArray[i].rank[0] = orgnlString.charAt(i) - 'a';
            // Next rank is next character
            suffArray[i].rank[1] = ((i + 1) < n) ? (orgnlString.charAt(i + 1) - 'a') : -1;
        }
        // Sorting the suffixes
        Arrays.sort(suffArray, Main::compare);
        int[] index = new int[n];
        for (int k = 4; k < 2 * n; k = k * 2) {
            int currRank = 0;
            int prevRank = suffArray[0].rank[0];
            suffArray[0].rank[0] = currRank;
            index[suffArray[0].index] = 0;
            // to assign rank and index values to first suffix 
            for (int i = 1; i < n; i++) {
                if (suffArray[i].rank[0] == prevRank && suffArray[i].rank[1] == suffArray[i - 1].rank[1]) {
                    prevRank = suffArray[i].rank[0];
                    // If same as previous rank, assign the same new rank
                    suffArray[i].rank[0] = currRank;
                } else {
                    prevRank = suffArray[i].rank[0];
                    // increment rank and assign
                    suffArray[i].rank[0] = ++currRank;
                }
                index[suffArray[i].index] = i;
            }
            for (int i = 0; i < n; i++) {
                int nextIndex = suffArray[i].index + k / 2;
                suffArray[i].rank[1] = (nextIndex < n) ? suffArray[index[nextIndex]].rank[0] : -1;
            }
            Arrays.sort(suffArray, Main::compare);
        }
        // to store indexes of all sorted suffixes
        int[] suffixVector = new int[n];
        for (int i = 0; i < n; i++)
            suffixVector[i] = suffArray[i].index;
        return suffixVector;
    }
    // applying Kasai's algorithm to build LCP array
    static int[] kasaiAlgorithm(String orgnlString, int[] suffixVector) {
        int n = suffixVector.length;
        // To store lcp array
        int[] longPrefix = new int[n];
        // To store inverse of suffix array elements
        int[] suffixInverse = new int[n];
        // to fill values in suffixInverse[] array
        for (int i = 0; i < n; i++)
            suffixInverse[suffixVector[i]] = i;
        int k = 0;
        for (int i = 0; i < n; i++) {
            if (suffixInverse[i] == n - 1) {
                k = 0;
                continue;
            }
            int j = suffixVector[suffixInverse[i] + 1];
            while (i + k < n && j + k < n && orgnlString.charAt(i + k) == orgnlString.charAt(j + k))
                k++;
            longPrefix[suffixInverse[i]] = k;
            if (k > 0)
                k--;
        }
        return longPrefix;
    }
    static void displayArray(int[] vec) {
        for (int i : vec)
            System.out.print(i + " ");
        System.out.println();
    }
    public static void main(String[] args) {
        String orgnlString = "AAABCAEAAABCBDDAAAABC";
        int[] suffArray = createSuffArray(orgnlString);
        System.out.println("Suffix Array is: ");
        displayArray(suffArray);
        // calling method to build LCP array
        int[] prefixCommn = kasaiAlgorithm(orgnlString, suffArray);
        // Print the LCP array
        System.out.println("Common Prefix Array is: ");
        displayArray(prefixCommn);
    }
}
# Defining a class to represent suffix
class Suffix:
    def __init__(self):
        self.index = 0
        self.rank = [0, 0]
# function to compare two suffixes
def compare(a, b):
    if a.rank[0] == b.rank[0]:
        if a.rank[1] < b.rank[1]:
            return -1
        else:
            return 1
    else:
        if a.rank[0] < b.rank[0]:
            return -1
        else:
            return 1
# function to build suffix array
def createSuffArray(orgnlString):
    n = len(orgnlString)
    suffArray = [Suffix() for _ in range(n)]
    for i in range(n):
        suffArray[i].index = i
        suffArray[i].rank[0] = ord(orgnlString[i]) - ord('a')
        suffArray[i].rank[1] = ord(orgnlString[i + 1]) - ord('a') if ((i + 1) < n) else -1
    suffArray = sorted(suffArray, key=lambda x: (x.rank[0], x.rank[1]))
    ind = [0]*n
    k = 4
    while k < 2*n:
        rank = 0
        prev_rank = suffArray[0].rank[0]
        suffArray[0].rank[0] = rank
        ind[suffArray[0].index] = 0
        for i in range(1, n):
            if suffArray[i].rank[0] == prev_rank and suffArray[i].rank[1] == suffArray[i - 1].rank[1]:
                prev_rank = suffArray[i].rank[0]
                suffArray[i].rank[0] = rank
            else:
                prev_rank = suffArray[i].rank[0]
                rank += 1
                suffArray[i].rank[0] = rank
            ind[suffArray[i].index] = i
        for i in range(n):
            nextIndex = suffArray[i].index + k//2
            suffArray[i].rank[1] = suffArray[ind[nextIndex]].rank[0] if (nextIndex < n) else -1
        suffArray = sorted(suffArray, key=lambda x: (x.rank[0], x.rank[1]))
        k *= 2
    suffixVector = [0]*n
    for i in range(n):
        suffixVector[i] = suffArray[i].index
    return suffixVector
# applying Kasai's algorithm to build LCP array
def kasaiAlgorithm(orgnlString, suffixVector):
    n = len(suffixVector)
    longPrefix = [0]*n
    suffixInverse = [0]*n
    for i in range(n):
        suffixInverse[suffixVector[i]] = i
    k = 0
    for i in range(n):
        if suffixInverse[i] == n - 1:
            k = 0
            continue
        j = suffixVector[suffixInverse[i] + 1]
        while i + k < n and j + k < n and orgnlString[i + k] == orgnlString[j + k]:
            k += 1
        longPrefix[suffixInverse[i]] = k
        if k > 0:
            k -= 1
    return longPrefix

# Function to print an array
def displayArray(vec):
    for i in vec:
        print(i, end=' ')
    print()
def main():
    orgnlString = "AAABCAEAAABCBDDAAAABC"
    suffArray = createSuffArray(orgnlString)
    print("Suffix Array is: ")
    displayArray(suffArray)
    prefixCommn = kasaiAlgorithm(orgnlString, suffArray)
    print("Common Prefix Array is: ")
    displayArray(prefixCommn)

if __name__ == "__main__":
    main()

輸出

Suffix Array is: 
15 0 7 16 17 1 8 2 9 18 5 19 3 10 12 4 11 20 14 13 6 
Common Prefix Array is: 
3 5 5 2 4 4 4 3 3 3 0 2 2 2 0 1 1 1 1 0 0 
廣告