字尾陣列演算法



字尾陣列是一種資料結構,用於儲存給定字串的所有後綴的字典序。它對於各種字串處理問題非常有用,例如模式匹配、搜尋、查詢最長公共字首等等。這個陣列可以快速找到模式在一個較大的文字中的位置。

字尾陣列如何工作?

假設文字為“Carpet”。要構建其後綴陣列,請按照以下步驟操作:

  • 生成給定文字的所有後綴。在這種情況下,可能的字尾可能是“carpet”、“arpet”、“rpet”、“pet”、“et”、“t”。

  • 對所有後綴進行排序。所有按排序順序排列的字尾為“arpet”、“carpet”、“et”、“pet”、“rpet”和“t”。

  • 因此,字尾陣列如下:[1, 0, 4, 3, 2, 5]。

Suffix Array

要將此後綴陣列用於模式匹配,我們可以在排序後的字尾上執行二分搜尋,以查詢以模式開頭的字尾的範圍。例如,讓我們取上述字串“Carpet”,我們想找到模式“ar”,我們可以將其與字尾陣列中的中間字尾“pet”進行比較。

由於“ar”小於此後綴,因此我們可以丟棄字尾陣列的右半部分,並在左半部分繼續二分搜尋。最終,我們會發現以“ar”開頭的字尾在原始字串中的位置為“1”。

讓我們看看字尾陣列的輸入輸出場景:

Input:
string: "AABABCEDBABCDEB" 
Output:
Pattern found at index: 3
Pattern found at index: 9
Pattern found at index: 1 

示例

以下示例說明了字尾陣列在模式匹配中的工作原理。

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
// Structure of suffixes
struct Suffix {
   int index;
   char suff[100];
};
// Function to compare two suffixes for sorting
int strCompare(const void* a, const void* b) {
   struct Suffix* s1 = (struct Suffix*)a;
   struct Suffix* s2 = (struct Suffix*)b;
   return strcmp(s1->suff, s2->suff);
}
// Function to fill the suffix array
int* fillSuffixArray(char* txt, int n) {
   struct Suffix* suffixes = (struct Suffix*) malloc(n * sizeof(struct Suffix));
   // Store suffixes and their indexes in an array 
   for (int i = 0; i < n; i++) {
      suffixes[i].index = i;
      strncpy(suffixes[i].suff, &(txt[i]), n - i);
      suffixes[i].suff[n-i] = '\0';
   }
   // Sort the suffixes 
   qsort(suffixes, n, sizeof(struct Suffix), strCompare);
   // Store indexes of all sorted suffixes in the suffix array
   int* suffixArr = (int*) malloc(n * sizeof(int));
   for (int i = 0; i < n; i++)
      suffixArr[i] = suffixes[i].index;

   // Deallocate the dynamic memory
   free(suffixes);
   return suffixArr;
}
// binary search on suffix array and find all occurrences of pattern
void suffixArraySearch(char* pat, char* txt, int* suffixArr, int n) {
   int m = strlen(pat);
   // binary search for pattern in text using the built suffix array
   int l = 0, r = n - 1;
   while (l <= r) {
      int mid = l + (r - l) / 2;
      char substr[100];
      strncpy(substr, &(txt[suffixArr[mid]]), m);
      substr[m] = '\0';
      int res = strncmp(pat, substr, m);
      if (res == 0) {
         printf("Pattern found at index: %d\n", suffixArr[mid]);
         // Move to the left of mid
         int temp = mid - 1;
         while (temp >= 0 && strncmp(pat, &(txt[suffixArr[temp]]), m) == 0) {
            printf("Pattern found at index: %d\n", suffixArr[temp]);
            temp--;
         }
         // Move to the right of mid
         temp = mid + 1;
         while (temp < n && strncmp(pat, &(txt[suffixArr[temp]]), m) == 0) {
            printf("Pattern found at index: %d\n", suffixArr[temp]);
            temp++;
         }
         return;
      }
      if (res < 0) r = mid - 1;
      else l = mid + 1;
    }
    printf("Pattern not found\n");
}
int main() {
   char txt[] = "AAAABCAEAAABCBDDAAAABC";
   // pattern to be searched
   char pat[] = "AAABC";
   int n = strlen(txt);
   int* suffixArr = fillSuffixArray(txt, n);
   suffixArraySearch(pat, txt, suffixArr, n);
   free(suffixArr);  
   return 0;
}
#include <iostream>
#include <algorithm>
#include <cstring>
using namespace std;
// Structure of suffixes
struct Suffix {
   int index;
   string suff;
};
// Function to compare two suffixes for sorting
bool strCompare(Suffix a, Suffix b) {
   return a.suff < b.suff;
}
// Function to fill the suffix array
int* fillSuffixArray(string txt, int n) {
   Suffix* suffixes = new Suffix[n];
   // Storing suffixes and indexes in an array 
   for (int i = 0; i < n; i++) {
      suffixes[i].index = i;
      suffixes[i].suff = txt.substr(i, n - i);
   }
   // Sorting the suffixes 
   sort(suffixes, suffixes+n, strCompare);
   // Store indexes of all sorted suffixes in suffix array
   int* suffixArr = new int[n];
   for (int i = 0; i < n; i++)
      suffixArr[i] = suffixes[i].index;
   // Deallocate the dynamic memory
   delete[] suffixes;
   return suffixArr;
}
// binary search on the suffix array and find all occurrences of pattern
void suffixArraySearch(string pat, string txt, int* suffixArr, int n) {
   int m = pat.length();
   // binary search for the pattern 
   int l = 0, r = n - 1;
   while (l <= r) {
      int mid = l + (r - l) / 2;
      string substr = txt.substr(suffixArr[mid], m);
      if (pat == substr) {
         cout << "Pattern found at index: " << suffixArr[mid] << endl;
         // Move to the left of mid
         int temp = mid - 1;
         while (temp >= 0 && txt.substr(suffixArr[temp], m) == pat) {
            cout << "Pattern found at index: " << suffixArr[temp] << endl;
            temp--;
         }
         // Move to the right of mid
         temp = mid + 1;
         while (temp < n && txt.substr(suffixArr[temp], m) == pat) {
            cout << "Pattern found at index: " << suffixArr[temp] << endl;
            temp++;
         }
         return;
      }
      if (pat < substr) r = mid - 1;
      else l = mid + 1;
   }
   cout << "Pattern not found" << endl;
}
int main() {
   string txt = "AAAABCAEAAABCBDDAAAABC";
   // pattern to be searched
   string pat = "AAABC";
   int n = txt.length();
   int* suffixArr = fillSuffixArray(txt, n);
   suffixArraySearch(pat, txt, suffixArr, n);
   delete[] suffixArr;  
   return 0;
} 
import java.util.Arrays;
public class Main {
   // Structure of suffixes
   static class SuffixCmpr implements Comparable<SuffixCmpr> {
      int index;
      String suff;
      // Constructor
      public SuffixCmpr(int index, String suff) {
         this.index = index;
         this.suff = suff;
      }
      // to sort suffixes alphabetically
      public int compareTo(SuffixCmpr other) {
         return this.suff.compareTo(other.suff);
      }
   }
   // method to build a suffix array
   public static int[] fillsuffixArray(String s) {
      int n = s.length();
      SuffixCmpr[] suffixes = new SuffixCmpr[n];
      // Create and sort suffixes
      for (int i = 0; i < n; i++) {
         suffixes[i] = new SuffixCmpr(i, s.substring(i));
      }
      Arrays.sort(suffixes);
      // Store indexes of all sorted suffixes
      int[] fillsuffixArray = new int[n];
      for (int i = 0; i < n; i++) {
         fillsuffixArray[i] = suffixes[i].index;
      }
      return fillsuffixArray;
   }
   // method to search a pattern in a text using suffix array
   public static void suffixArraySearch(String pattern, String txt, int[] suffArr) {
      int n = txt.length();
      int m = pattern.length();
      // binary search for the pattern in the text using suffix array
      int l = 0, r = n - 1;
      while (l <= r) {
         int mid = l + (r - l) / 2;
         int res = pattern.compareTo(txt.substring(suffArr[mid], Math.min(suffArr[mid] + m, n)));
         if (res == 0) {
            System.out.println("Pattern found at index: " + suffArr[mid]);
            // Move to previous suffix in the sorted array
            int temp = mid - 1;
            while (temp >= 0 && txt.substring(suffArr[temp], Math.min(suffArr[temp] + m, n)).equals(pattern)) {
                System.out.println("Pattern found at index: " + suffArr[temp]);
                temp--;
            }
            // Move to next suffix in the sorted array
            temp = mid + 1;
            while (temp < n && txt.substring(suffArr[temp], Math.min(suffArr[temp] + m, n)).equals(pattern)) {
                System.out.println("Pattern found at index: " + suffArr[temp]);
                temp++;
            }
            return;
         }
         if (res < 0) r = mid - 1;
         else l = mid + 1;
      }
      System.out.println("Pattern not found");
   }
   public static void main(String[] args) {
      String txt = "AAAABCAEAAABCBDDAAAABC";
      String pattern = "AAABC";
      // Filling the suffix array
      int[] suffArr = fillsuffixArray(txt);
      // Calling method to Search pattern in suffix array
      suffixArraySearch(pattern, txt, suffArr);
   }
}
def fill_suffix_array(txt):
    # Array of tuples, each tuple stores the index and suffix
    suffixes = [(i, txt[i:]) for i in range(len(txt))]
    # Sort the suffixes
    suffixes.sort(key=lambda x: x[1])
    # Return the list of indices after sorting
    return [suff[0] for suff in suffixes]

def suffixArraySearch(pat, txt, suffix_arr):
    n = len(txt)
    m = len(pat)
    # Iterate over the suffix array
    for i in range(n):
        if txt[suffix_arr[i]:min(suffix_arr[i] + m, n)] == pat:
            print(f"Pattern found at index: {suffix_arr[i]}")

def main():
    txt = "AAAABCAEAAABCBDDAAAABC"
    pat = "AAABC"
    suffix_arr = fill_suffix_array(txt)
    suffixArraySearch(pat, txt, suffix_arr)

if __name__ == "__main__":
    main()

輸出

Pattern found at index: 8
Pattern found at index: 1
Pattern found at index: 17

字尾陣列的複雜度

使用字尾陣列進行模式匹配的優點是它只需要 O(m log n) 時間,其中 m 是模式的長度,n 是字串的長度。缺點是首先構建字尾陣列需要 O(n log n) 時間和 O(n) 空間。

廣告