Skip to content

Commit 36bef76

Browse files
feat(strings): add professional suffix array and LCP implementation
1 parent c3d4b9e commit 36bef76

File tree

1 file changed

+106
-0
lines changed

1 file changed

+106
-0
lines changed

strings/suffix_array.py

Lines changed: 106 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,106 @@
1+
'''
2+
suffix_array.py
3+
4+
Professional implementation of Suffix Array and LCP (Longest Common Prefix) array in Python.
5+
6+
Features:
7+
- O(n log^2 n) construction using the prefix-doubling method (one comparison sort per doubling round)
8+
- Kasai's algorithm for LCP array in O(n)
9+
- Detailed docstrings and complexity analysis
10+
- Standalone usage example and simple unit tests
11+
12+
Author: Idris Ibrahim Erten
13+
License: MIT
14+
'''
15+
16+
def build_suffix_array(s: str) -> list[int]:
    """
    Build the suffix array of ``s`` using the prefix-doubling algorithm.

    Suffixes are ranked by their first ``k`` characters; each round combines
    the rank of a suffix with the rank of the suffix ``k`` positions later to
    obtain a ranking by the first ``2k`` characters, until all ranks are
    distinct.

    Parameters:
        s (str): Input string (may be empty).

    Returns:
        list[int]: Starting indices of the suffixes of ``s`` in
        lexicographic order. Empty for an empty string.

    Complexity:
        O(n log^2 n) time -- at most O(log n) doubling rounds, each doing one
        comparison sort -- and O(n) extra space.
    """
    # Append a sentinel with the smallest possible code point so that every
    # real suffix is compared against a terminated string.
    text = s + '\0'
    n = len(text)

    # Initial ranking by character code (orders suffixes by their first char).
    ranks = [ord(c) for c in text]
    sa = list(range(n))
    tmp = [0] * n

    k = 1
    while k < n:
        # Sort key of every suffix for this round: (rank of first k chars,
        # rank of the following k chars). A suffix that runs off the end gets
        # -1, which is below every real rank, so shorter sorts first.
        keys = [
            (ranks[i], ranks[i + k] if i + k < n else -1)
            for i in range(n)
        ]
        sa.sort(key=keys.__getitem__)

        # Re-rank: suffixes with equal keys share a rank, otherwise the rank
        # increases by one along the sorted order.
        tmp[sa[0]] = 0
        for idx in range(1, n):
            prev, curr = sa[idx - 1], sa[idx]
            tmp[curr] = tmp[prev] + (1 if keys[curr] != keys[prev] else 0)

        ranks, tmp = tmp, ranks  # reuse the two buffers instead of reallocating
        k <<= 1

        # All ranks distinct => the order is final; stop early.
        if ranks[sa[-1]] == n - 1:
            break

    # The lone sentinel suffix always sorts first; drop it.
    return sa[1:]
55+
56+
57+
def build_lcp_array(s: str, sa: list[int]) -> list[int]:
    """
    Build the LCP (Longest Common Prefix) array using Kasai's algorithm.

    Parameters:
        s (str): Original string.
        sa (list[int]): Suffix array of ``s`` (a permutation of
            ``range(len(s))`` in lexicographic suffix order).

    Returns:
        list[int]: A list of ``len(sa) - 1`` values (empty when ``sa`` has
        fewer than two entries). Entry ``i`` is the length of the longest
        common prefix of the suffixes starting at ``sa[i]`` and ``sa[i + 1]``,
        i.e. of each pair of neighbours in sorted order.

    Complexity:
        O(n) time and O(n) space.
    """
    n = len(sa)
    length = len(s)

    # Inverse of the suffix array: rank[i] is the sorted position of the
    # suffix that starts at index i.
    rank = [0] * n
    for position, start in enumerate(sa):
        rank[start] = position

    # lcp[r] holds LCP(suffix at sa[r], suffix at sa[r - 1]); lcp[0] has no
    # predecessor and is sliced off before returning.
    lcp = [0] * n
    k = 0
    for i in range(length):
        r = rank[i]
        if r == 0:
            # Smallest suffix has no predecessor to compare with.
            k = 0
            continue
        j = sa[r - 1]
        # Resume from k: the suffix at i shares at least k characters with
        # its predecessor because the suffix at i - 1 shared k + 1.
        while i + k < length and j + k < length and s[i + k] == s[j + k]:
            k += 1
        lcp[r] = k
        if k:
            k -= 1  # dropping the first character loses at most one match
    return lcp[1:]
90+
91+
92+
if __name__ == '__main__':
    # Example usage and simple tests
    test_strings = ['banana', 'abracadabra', 'mississippi']
    for s in test_strings:
        sa = build_suffix_array(s)
        lcp = build_lcp_array(s, sa)
        print(f"String: {s}")
        print(f"Suffix Array: {sa}")
        print(f"LCP Array : {lcp}\n")

        # Cross-check both arrays against straightforward brute force.
        naive_sa = sorted(range(len(s)), key=lambda i, text=s: text[i:])
        assert sa == naive_sa, 'SA test failed'
        naive_lcp = []
        for a, b in zip(sa, sa[1:]):
            common = 0
            while (a + common < len(s) and b + common < len(s)
                   and s[a + common] == s[b + common]):
                common += 1
            naive_lcp.append(common)
        assert lcp == naive_lcp, 'LCP test failed'

    # Assertions for correctness against hand-computed values
    s = 'banana'
    expected_sa = [5, 3, 1, 0, 4, 2]  # indices of sorted suffixes
    expected_lcp = [1, 3, 0, 0, 2]  # LCP of each adjacent pair in expected_sa
    assert build_suffix_array(s) == expected_sa, 'SA test failed'
    assert build_lcp_array(s, expected_sa) == expected_lcp, 'LCP test failed'
    print('All tests passed!')

0 commit comments

Comments
 (0)