feat(strings): add professional suffix array and LCP implementation

idrisibrahimerten · idrisibrahimerten · commit 36bef763f012 · 2025-07-03T13:09:34.000+03:00
diff --git a/strings/suffix_array.py b/strings/suffix_array.py
@@ -0,0 +1,106 @@
+'''
+suffix_array.py
+
+Professional implementation of Suffix Array and LCP (Longest Common Prefix) array in Python.
+
+Features:
+- O(n log^2 n) construction using the prefix-doubling method (comparison sort per round)
+- Kasai's algorithm for LCP array in O(n)
+- Detailed docstrings and complexity analysis
+- Standalone usage example and simple unit tests
+
+Author: Idris Ibrahim Erten
+License: MIT
+'''  
+
+def build_suffix_array(s: str) -> list[int]:
+    """
+    Build the suffix array of ``s`` with the prefix-doubling algorithm.
+
+    Parameters:
+    s (str): Input string; may be empty and may contain any character.
+
+    Returns:
+    list[int]: Start indices of the suffixes of ``s`` in lexicographic order
+
+    Complexity:
+    O(n log^2 n) time (a comparison sort in each of the O(log n) rounds)
+    and O(n) space.
+
+    >>> build_suffix_array('banana')
+    [5, 3, 1, 0, 4, 2]
+    """
+    n = len(s)
+    # Round 0: rank every suffix by its first character.
+    ranks = [ord(c) for c in s]
+    sa = list(range(n))
+    tmp = [0] * n
+    k = 1
+    # Invariant: ranks order the suffixes by their first k characters.
+    while k < n:
+        # Key of suffix i = (rank of first k chars, rank of next k chars);
+        # -1 marks "past the end", so a shorter suffix sorts first.
+        keys = [(ranks[i], ranks[i + k] if i + k < n else -1) for i in range(n)]
+        sa.sort(key=keys.__getitem__)
+        # Re-rank: equal keys share a rank, otherwise the rank increases.
+        tmp[sa[0]] = 0
+        for prev, curr in zip(sa, sa[1:]):
+            tmp[curr] = tmp[prev] + (keys[curr] != keys[prev])
+        ranks, tmp = tmp, ranks  # reuse lists to save memory
+        k <<= 1
+        # All ranks distinct: the order is final, stop early.
+        if ranks[sa[-1]] == n - 1:
+            break
+    return sa
+
+
+def build_lcp_array(s: str, sa: list[int]) -> list[int]:
+    """
+    Builds the LCP (Longest Common Prefix) array using Kasai's algorithm.
+
+    Parameters:
+    s (str): Original string
+    sa (list[int]): Suffix array of s, as returned by build_suffix_array(s)
+
+    Returns:
+    list[int]: List of len(sa) - 1 values (empty when sa is empty) where
+    entry i is the length of the longest common prefix of the suffixes
+    starting at sa[i] and sa[i + 1]
+
+    Complexity:
+    O(n) time and O(n) space.
+    """
+    n = len(s)
+    # Inverse of the suffix array: pos[i] is the rank of the suffix at i
+    pos = [0] * len(sa)
+    for rank, start in enumerate(sa):
+        pos[start] = rank
+    lcp = [0] * len(sa)
+    k = 0  # matched length carried over from the previous text position
+    for i in range(n):
+        if pos[i] == 0:  # smallest suffix: no predecessor to compare with
+            k = 0
+            continue
+        j = sa[pos[i] - 1]
+        while i + k < n and j + k < n and s[i + k] == s[j + k]:
+            k += 1
+        lcp[pos[i]] = k
+        k = max(k - 1, 0)  # dropping one leading char loses at most 1
+    return lcp[1:]
+
+
+if __name__ == '__main__':
+    # Example usage: print both arrays for a few sample strings
+    test_strings = ['banana', 'abracadabra', 'mississippi']
+    for s in test_strings:
+        sa = build_suffix_array(s)
+        lcp = build_lcp_array(s, sa)
+        print(f"String: {s}")
+        print(f"Suffix Array: {sa}")
+        print(f"LCP Array   : {lcp}\n")
+
+    # Self-checks; sorted suffixes of 'banana': a, ana, anana, banana, na, nana
+    s, expected_sa = 'banana', [5, 3, 1, 0, 4, 2]
+    assert build_suffix_array(s) == expected_sa, 'SA test failed'
+    assert build_lcp_array(s, expected_sa) == [1, 3, 0, 0, 2], 'LCP test failed'
+    print('All tests passed!')