Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 23 additions & 0 deletions exercises/advanced/suffix_automaton/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
# Suffix Automaton (SAM)

This exercise implements a Suffix Automaton and small utilities.

## What this provides
- `suffix_automaton.py`: implementation and helper functions.
- `examples.py`: run `python examples.py` to see sample outputs.
- `test_suffix_automaton.py`: unit tests (pytest).

## Algorithms & Problems solvable
- Check if a string `t` is a substring of `s`
- Count distinct substrings of `s` (linear time)
- Longest common substring between `s` and `t`
- Count number of occurrences of `t` in `s` (using endpos propagation)

## Complexity
- Construction: O(n) expected time, where n = len(s) (transitions are stored in per-state hash maps)
- Queries (contains/occurrences/LCS): O(|t|)

## How to run
```bash
python examples.py
pytest -q
```
16 changes: 16 additions & 0 deletions exercises/advanced/suffix_automaton/examples.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# exercises/advanced/suffix_automaton/examples.py
from suffix_automaton import build_from_string

def demo():
    """Build a suffix automaton for a sample word and print each query result."""
    text = "abracadabra"
    automaton = build_from_string(text)

    print("Original:", text)
    print("Contains 'cada'?", automaton.contains("cada"))
    print("Contains 'xyz'?", automaton.contains("xyz"))
    print("Distinct substrings:", automaton.count_distinct_substrings())

    # longest_common_substring returns a (length, example) pair; print both.
    lcs_length, lcs_text = automaton.longest_common_substring("cadabra")
    print("LCS with 'cadabra':", lcs_length, lcs_text)

    print("Occurrences of 'abra':", automaton.occurrences("abra"))


# Run the demo only when executed as a script, not when imported.
if __name__ == "__main__":
    demo()
162 changes: 162 additions & 0 deletions exercises/advanced/suffix_automaton/suffix_automaton.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,162 @@
# exercises/advanced/suffix_automaton/suffix_automaton.py
from collections import defaultdict, deque
from typing import Dict, List, Tuple

class SuffixAutomaton:
    """Suffix automaton (SAM) of a single string, with substring queries.

    Public API:
    - build(s): (re)build the automaton for ``s`` from scratch.
    - sa_extend(c): append one character to the indexed string.
    - contains(t): check whether ``t`` is a substring of the indexed string.
    - count_distinct_substrings(): count distinct non-empty substrings.
    - longest_common_substring(t): length and one example of the longest
      substring shared with ``t``.
    - occurrences(t): number of occurrences of ``t`` in the indexed string.

    State 0 is the root (empty string). Transitions are kept in per-state
    dicts. Occurrence counts are derived by pushing end-position counts up
    the suffix links; they are recomputed lazily whenever the automaton has
    been extended since the last propagation, so mixing ``sa_extend`` and
    ``occurrences`` stays correct.
    """

    class State:
        """One automaton state (an equivalence class of substrings)."""

        __slots__ = ("len", "link", "next", "occ", "base_occ")

        def __init__(self):
            # Length of the longest substring represented by this state.
            self.len = 0
            # Index of the suffix-link target; -1 only for the root.
            self.link = -1
            # Outgoing transitions: character -> state index.
            self.next: Dict[str, int] = {}
            # Occurrence count; meaningful once counts have been propagated.
            self.occ = 0
            # Own contribution before propagation: 1 for a state created for
            # a new end position, 0 for the root and for clones.
            self.base_occ = 0

    def __init__(self):
        self._init_sam()

    def _init_sam(self):
        """Reset the automaton to a single root state."""
        self.states: List[SuffixAutomaton.State] = [SuffixAutomaton.State()]
        self.last = 0
        # True when states were added after the last occurrence propagation.
        self._occ_stale = False

    def sa_extend(self, c: str):
        """Extend the automaton by character c (single char string)."""
        states = self.states
        cur = len(states)
        new_state = SuffixAutomaton.State()
        new_state.len = states[self.last].len + 1
        # This state corresponds to a new end position.
        new_state.base_occ = new_state.occ = 1
        states.append(new_state)

        # Add the transition on c to every suffix state that lacks one.
        p = self.last
        while p != -1 and c not in states[p].next:
            states[p].next[c] = cur
            p = states[p].link

        if p == -1:
            new_state.link = 0
        else:
            q = states[p].next[c]
            if states[p].len + 1 == states[q].len:
                new_state.link = q
            else:
                # q also represents longer strings than the one reached from
                # p, so split off a clone holding only the shorter ones.
                clone = len(states)
                cloned = SuffixAutomaton.State()
                cloned.len = states[p].len + 1
                cloned.next = states[q].next.copy()
                cloned.link = states[q].link
                # base_occ stays 0: a clone owns no end position of its own.
                states.append(cloned)
                while p != -1 and states[p].next.get(c) == q:
                    states[p].next[c] = clone
                    p = states[p].link
                states[q].link = new_state.link = clone

        self.last = cur
        self._occ_stale = True

    def build(self, s: str):
        """Build the automaton for string s, discarding any previous content."""
        self._init_sam()
        for ch in s:
            self.sa_extend(ch)
        # Propagate eagerly so the first occurrences() query is cheap.
        self._propagate_occurrences()

    def _propagate_occurrences(self):
        """Recompute every state's occurrence count from scratch.

        Each state starts from its own ``base_occ`` and the counts are then
        pushed along suffix links from longer states to shorter ones.
        Starting from ``base_occ`` makes repeated calls safe (no
        double counting).
        """
        states = self.states
        for st in states:
            st.occ = st.base_occ

        # Bucket state indices by length (counting sort, linear in #states).
        max_len = max((st.len for st in states), default=0)
        buckets: List[List[int]] = [[] for _ in range(max_len + 1)]
        for idx, st in enumerate(states):
            buckets[st.len].append(idx)

        # A suffix link always points to a strictly shorter state, so visiting
        # by decreasing length finalizes a state before it is pushed upward.
        for same_len in reversed(buckets):
            for idx in same_len:
                st = states[idx]
                if st.link != -1:
                    states[st.link].occ += st.occ

        self._occ_stale = False

    def _walk(self, t: str) -> int:
        """Follow t from the root; return the reached state index or -1."""
        cur = 0
        for ch in t:
            nxt = self.states[cur].next.get(ch)
            if nxt is None:
                return -1
            cur = nxt
        return cur

    def contains(self, t: str) -> bool:
        """Return True if t is a substring of the indexed string."""
        return self._walk(t) != -1

    def count_distinct_substrings(self) -> int:
        """Return the number of distinct non-empty substrings.

        Uses the standard formula sum over states of len[v] - len[link[v]];
        the root (link == -1, len == 0) contributes nothing.
        """
        states = self.states
        return sum(
            st.len - states[st.link].len for st in states if st.link != -1
        )

    def longest_common_substring(self, t: str) -> Tuple[int, str]:
        """Find the longest substring shared by the indexed string and t.

        Returns (length, substring_example); (0, "") when nothing is shared.
        On ties the match ending earliest in t is returned.
        """
        states = self.states
        v = 0  # current state
        cur_len = 0  # length of the current match ending at position i
        best = 0
        best_end = 0  # index in t where the best match ends
        for i, ch in enumerate(t):
            # Shorten the current match via suffix links until ch can follow.
            # Non-root states always have a valid link, so v never hits -1.
            while v != 0 and ch not in states[v].next:
                v = states[v].link
                cur_len = states[v].len
            nxt = states[v].next.get(ch)
            if nxt is None:
                # Even the root has no transition on ch: restart empty.
                v = 0
                cur_len = 0
            else:
                v = nxt
                cur_len += 1
            if cur_len > best:
                best = cur_len
                best_end = i
        if best == 0:
            return 0, ""
        return best, t[best_end - best + 1: best_end + 1]

    def occurrences(self, t: str) -> int:
        """Return the number of occurrences of t in the indexed string.

        Counts are re-propagated first if the automaton was extended since
        the last propagation, so this also works after direct sa_extend()
        calls. Returns 0 when t is not a substring.
        """
        idx = self._walk(t)
        if idx == -1:
            return 0
        if self._occ_stale:
            self._propagate_occurrences()
        return self.states[idx].occ

# A minimal CLI-style helper (importable)
def build_from_string(s: str) -> SuffixAutomaton:
    """Create a SuffixAutomaton, build it for s, and return it."""
    automaton = SuffixAutomaton()
    automaton.build(s)
    return automaton
37 changes: 37 additions & 0 deletions exercises/advanced/suffix_automaton/test_suffix_automaton.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
# exercises/advanced/suffix_automaton/test_suffix_automaton.py
import pytest
from suffix_automaton import build_from_string

def test_contains_and_occurrences():
    """Check substring membership and occurrence counts on a periodic string."""
    automaton = build_from_string("ababab")

    for present in ("aba", "bab"):
        assert automaton.contains(present)
    assert not automaton.contains("baa")

    # "aba" starts at indices 0 and 2; "ab" starts at indices 0, 2 and 4.
    assert automaton.occurrences("aba") == 2
    assert automaton.occurrences("ab") == 3

def test_distinct_substrings_small():
    """A run of one repeated letter has exactly one substring per length."""
    automaton = build_from_string("aaa")
    # The distinct substrings are "a", "aa" and "aaa".
    expected_count = 3
    assert automaton.count_distinct_substrings() == expected_count

def test_lcs_example():
    """The LCS of two rotations of the same text has the expected length."""
    source = "abcdxyz"
    other = "xyzabcd"
    automaton = build_from_string(source)

    length, substr = automaton.longest_common_substring(other)

    assert length == 4
    # Check the returned example is consistent with the reported length
    # and really occurs in both inputs.
    assert len(substr) == length
    assert substr in source and substr in other

def test_no_common():
    """Strings with no shared character yield length 0 and an empty example."""
    automaton = build_from_string("abc")
    length, substr = automaton.longest_common_substring("zzz")
    assert length == 0
    assert substr == ""