Skip to content

Commit e43536e

Browse files
authored
Merge pull request #104 from BrianLusina/feat/similar-string-groups
feat(strings): similar string groups
2 parents a90f0fc + 69e2998 commit e43536e

17 files changed

+339
-0
lines changed

DIRECTORY.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -729,6 +729,8 @@
729729
* [Test Reverse Vowels](https://github.com/BrianLusina/PythonSnips/blob/master/pystrings/reverse_vowels/test_reverse_vowels.py)
730730
* Reverse Words
731731
* [Test Reverse Words](https://github.com/BrianLusina/PythonSnips/blob/master/pystrings/reverse_words/test_reverse_words.py)
732+
* Similar String Groups
733+
* [Test Similar String Groups](https://github.com/BrianLusina/PythonSnips/blob/master/pystrings/similar_string_groups/test_similar_string_groups.py)
732734
* Spreadsheet Encoding
733735
* [Test Spreadsheet Encode](https://github.com/BrianLusina/PythonSnips/blob/master/pystrings/spreadsheet_encoding/test_spreadsheet_encode.py)
734736
* String Compression

datastructures/__init__.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
from datastructures.sets import DisjointSetUnion, UnionFind
2+
3+
__all__ = [
4+
"DisjointSetUnion",
5+
"UnionFind"
6+
]

datastructures/sets/__init__.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
from datastructures.sets.union_find import DisjointSetUnion, UnionFind
2+
3+
__all__ = ["DisjointSetUnion", "UnionFind"]
Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
1+
class DisjointSetUnion:
    """Union-Find (Disjoint Set Union) over the integers ``0 .. size - 1``.

    Uses path compression in ``find`` and union by rank in ``union``, and
    keeps a running count of how many disjoint sets remain.

    Attributes set by ``__init__``:
        root:  ``root[i]`` is the parent of element ``i``; an element whose
               parent is itself is the representative of its set.
        rank:  per-element rank, consulted by ``union`` to decide which
               representative becomes the parent of the other.
        count: current number of disjoint sets.
    """

    def __init__(self, size: int):
        """Create ``size`` elements, each in its own set.

        Raises:
            ValueError: if ``size`` is zero or negative.
        """
        if size <= 0:
            raise ValueError("Size must be a positive integer.")
        self.root = list(range(size))
        self.rank = [1] * size  # For union by rank
        self.count = size  # Number of disjoint sets

    def find(self, i: int) -> int:
        """Return the representative (root) of the set containing ``i``.

        Implemented iteratively so the result never depends on the
        interpreter's recursion limit.
        """
        # First pass: walk up to the representative.
        representative = i
        while self.root[representative] != representative:
            representative = self.root[representative]

        # Second pass (path compression): point every node on the walked
        # path directly at the representative.
        while self.root[i] != representative:
            parent = self.root[i]
            self.root[i] = representative
            i = parent

        return representative

    def union(self, i: int, j: int) -> bool:
        """Merge the sets containing ``i`` and ``j``.

        Returns:
            True if a merge occurred, False if both elements were already
            in the same set.
        """
        root_i = self.find(i)
        root_j = self.find(j)

        if root_i == root_j:
            return False

        # Union by rank: the representative with the lower rank is attached
        # under the one with the higher rank; on a tie root_i wins and its
        # rank grows by one.
        if self.rank[root_i] > self.rank[root_j]:
            self.root[root_j] = root_i
        elif self.rank[root_i] < self.rank[root_j]:
            self.root[root_i] = root_j
        else:
            self.root[root_j] = root_i
            self.rank[root_i] += 1

        self.count -= 1
        return True

    def get_count(self) -> int:
        """Return the current number of disjoint sets."""
        return self.count
46+
47+
48+
class UnionFind:
    """A minimal Union-Find data structure with path compression.

    Elements are the integers ``0 .. size - 1``. ``parent[x]`` is the parent
    of ``x``; an element that is its own parent is the representative of its
    set.
    """

    def __init__(self, size: int):
        """Create ``size`` elements, each in its own set.

        Raises:
            ValueError: if ``size`` is zero or negative.
        """
        if size <= 0:
            raise ValueError("Size must be a positive integer.")
        self.parent = list(range(size))

    def find(self, x: int) -> int:
        """Return the representative (root) of the set containing ``x``.

        ``union`` uses no rank/size heuristic, so a sequence of unions can
        build a parent chain as long as the number of elements. The lookup
        is therefore iterative: a recursive walk would raise RecursionError
        on such chains.
        """
        # First pass: walk up to the representative.
        representative = x
        while self.parent[representative] != representative:
            representative = self.parent[representative]

        # Second pass (path compression): re-point every node on the walked
        # path directly at the representative.
        while self.parent[x] != representative:
            next_node = self.parent[x]
            self.parent[x] = representative
            x = next_node

        return representative

    def union(self, x: int, y: int) -> bool:
        """Merge the sets containing ``x`` and ``y``.

        The representative of ``y``'s set is attached under the
        representative of ``x``'s set.

        Returns:
            True if a merge occurred, False if already in the same set.
        """
        root_x = self.find(x)
        root_y = self.find(y)
        if root_x == root_y:
            return False
        self.parent[root_y] = root_x
        return True
Lines changed: 113 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,113 @@
1+
# Similar String Groups
2+
3+
Two strings x and y are considered similar if they are either exactly the same or can be made identical by swapping
4+
exactly two characters at different positions in string x.
5+
6+
We define a similarity group as a set of strings where each string is similar to at least one other string in the group (a string that is similar to no other string forms a group on its own).
7+
A string doesn't need to be directly similar to every other string in the group — it just needs to be connected to them
8+
through a chain of similarities.
9+
10+
Given a list of strings strs, where each string is an anagram of the others, your task is to determine how many such
11+
similarity groups exist in the list.
12+
13+
Constraints:
14+
15+
- 1 ≤ strs.length ≤ 300
16+
- 1 ≤ strs[i].length ≤ 300
17+
- strs[i] consists of lowercase letters only.
18+
- All words in strs have the same length and are anagrams of each other.
19+
20+
---
21+
22+
## Examples
23+
24+
![Example 1](./images/similar_string_groups_example_1.png)
25+
![Example 2](./images/similar_string_groups_example_2.png)
26+
![Example 3](./images/similar_string_groups_example_3.png)
27+
28+
---
29+
30+
## Solution
31+
32+
This problem can be seen as a graph connectivity challenge. Each string is a node, and an edge exists between two nodes
33+
if their corresponding strings are similar. Our goal is to count how many connected groups (components) exist in this
34+
graph.
35+
36+
We solve this problem using the Union-Find (Disjoint Set Union) data structure to efficiently group similar strings.
37+
Initially, each string is placed in its own group. We then iterate over all possible pairs of strings. For each pair at
38+
indexes i and j, we check whether the two strings are similar — that is, either exactly the same or differ at exactly
39+
two positions (meaning one swap can make them equal). If they are similar and currently belong to different groups
40+
(i.e., their roots in the Union-Find structure are different), we perform a union operation to merge their groups.
41+
Repeating this across all string pairs gradually reduces the number of distinct groups. Finally, we count the number of
42+
unique roots in the Union-Find structure, which represents the number of similar string groups.
43+
44+
Here’s the step-by-step explanation of the solution:
45+
46+
1. Initialize n = len(strs).
47+
2. Create a Union-Find (DSU) structure with n elements, where each element is its own parent.
48+
3. Define a function areSimilar(s1, s2) that returns TRUE if both strings s1 and s2 are similar according to the given
49+
condition:
50+
- Initialize an empty list diff = [] to track differences.
51+
- Loop through both strings in parallel using zip.
52+
- If characters differ at any position, record the mismatch in diff.
53+
- Exit early and return FALSE as soon as more than 2 differences are found.
54+
- After the loop is completed, evaluate the result:
55+
- len(diff) == 0 means the strings are identical.
56+
- len(diff) == 2 and diff[0] == diff[1][::-1] means there are exactly two differences and the character pairs are
57+
mirror images of each other.
58+
59+
4. Loop over all pairs (i, j) such that 0 ≤ i < j < n.
60+
5. For each pair, use the areSimilar function to check if strs[i] and strs[j] are similar.
61+
6. If similar, use find(i) and find(j) to get their root parents.
62+
7. If the roots differ, merge them using union(i, j).
63+
8. After processing all pairs, iterate over all indexes i from 0 to n - 1 and find their root parents using find(i).
64+
9. Add each root to a set to track unique groups.
65+
10. Return the size of the set as the number of similarity groups.
66+
67+
Let’s look at the following illustration to get a better understanding of the solution:
68+
69+
![Solution 1](./images/similar_string_groups_solution_1.png)
70+
![Solution 2](./images/similar_string_groups_solution_2.png)
71+
![Solution 3](./images/similar_string_groups_solution_3.png)
72+
![Solution 4](./images/similar_string_groups_solution_4.png)
73+
![Solution 5](./images/similar_string_groups_solution_5.png)
74+
![Solution 6](./images/similar_string_groups_solution_6.png)
75+
![Solution 7](./images/similar_string_groups_solution_7.png)
76+
77+
### Time Complexity
78+
Let's break the time complexity down into two major components:
79+
80+
#### **Comparing all pairs of strings**
81+
82+
To check if two strings are similar, we compare them character by character, which takes _O(m)_ where m is the length
83+
of each string. Given there are n strings and we compare all possible pairs of strings, there are O(n²) comparisons.
84+
Therefore, the total time spent on comparisons is O(n²∗m).
85+
86+
#### **Union-Find operations (find and union)**
87+
88+
For each similar pair, we perform a find and possibly a union operation. With path compression and union by rank, each operation takes amortized
89+
O(α(n)) time, where α(n) is nearly constant in practice. Since there are up to O(n²) similar pairs, the total time for
90+
Union-Find operations is O(n²∗α(n)).
91+
92+
The comparison step dominates the time complexity, as m (the string length) is typically much larger than α(n), which
93+
grows very slowly. Therefore, the overall time complexity is O(n²∗m).
94+
95+
### Space Complexity
96+
97+
The space complexity of the algorithm comes from the following components:
98+
99+
#### **Union-Find parent array**:
100+
101+
Requires O(n) space to store the parent of each node (one per string).
102+
103+
#### **Temporary storage in areSimilar() function**:
104+
105+
Uses O(1) space — a constant-sized list to track the positions where the two strings differ. Since at most 2 differences
106+
are allowed, space usage remains constant.
107+
108+
#### **Set to store unique groups (roots)**:
109+
110+
Requires O(n) space in the worst case, when all strings are in separate groups and each has a unique root.
111+
112+
The total space complexity is O(n), as all other components (e.g., temporary storage and sets) do not exceed linear
113+
space relative to the input size.
Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,81 @@
1+
from typing import List
2+
from datastructures import DisjointSetUnion, UnionFind
3+
4+
5+
def num_similar_groups(strs: List[str]) -> int:
    """Count the similarity groups in ``strs`` using ``DisjointSetUnion``.

    Two strings are similar when they are identical or differ in exactly two
    positions whose characters are swapped. Every pair of strings is
    compared; similar pairs are merged, and the number of disjoint sets left
    is the answer.

    Args:
        strs: the words to group. Per the problem constraints all words have
            the same length; the length of the first word is used for every
            comparison.

    Returns:
        The number of similarity groups, or 0 for an empty list.
    """
    strs_len = len(strs)
    if strs_len == 0:
        return 0

    # All strings have the same length, per constraints
    word_len = len(strs[0])

    def is_similar(s1: str, s2: str) -> bool:
        """Return True for identical strings (0 diffs) or one swap (2 diffs)."""
        positions_that_differ = []
        for k in range(word_len):
            if s1[k] != s2[k]:
                positions_that_differ.append(k)
                # More than 2 differences can never be fixed by one swap.
                if len(positions_that_differ) > 2:
                    return False

        if len(positions_that_differ) == 2:
            first, second = positions_that_differ
            return s1[first] == s2[second] and s1[second] == s2[first]

        # 0 or 1 differences remain: only identical strings are similar.
        return not positions_that_differ

    # One set per string; the structure tracks how many sets remain.
    uf = DisjointSetUnion(strs_len)

    # Compare every unordered pair once. union() only decrements the set
    # count when the two strings were in different groups.
    for i in range(strs_len):
        for j in range(i + 1, strs_len):
            if is_similar(strs[i], strs[j]):
                uf.union(i, j)

    return uf.get_count()
54+
55+
# Helper: Decide if two strings are similar
56+
def are_similar(s1: str, s2: str) -> bool:
    """Return True if ``s1`` and ``s2`` are identical or one swap apart.

    The strings are one swap apart when they differ in exactly two positions
    and the mismatched character pairs mirror each other, e.g. ('a', 'b') at
    one position and ('b', 'a') at the other.

    Strings of different lengths are never similar; without this check
    ``zip`` would silently ignore the extra tail of the longer string.
    """
    if len(s1) != len(s2):
        return False

    diff = []
    for a, b in zip(s1, s2):
        if a != b:
            diff.append((a, b))
            # More than 2 mismatches can never be fixed by a single swap.
            if len(diff) > 2:
                return False

    return not diff or (len(diff) == 2 and diff[0] == diff[1][::-1])
67+
68+
def num_similar_groups_2(strs: List[str]) -> int:
    """Count the similarity groups in ``strs`` using ``UnionFind``.

    Every unordered pair of strings is checked with ``are_similar``; similar
    pairs are merged. The answer is the number of distinct representatives
    once all pairs have been processed.

    Args:
        strs: the words to group.

    Returns:
        The number of similarity groups, or 0 for an empty list.
    """
    n = len(strs)
    if n == 0:
        return 0

    uf = UnionFind(n)

    for i in range(n):
        for j in range(i + 1, n):
            if are_similar(strs[i], strs[j]):
                uf.union(i, j)

    # Each distinct representative corresponds to one group.
    roots = {uf.find(i) for i in range(n)}
    return len(roots)
66 KB
Loading
62.3 KB
Loading
71.4 KB
Loading
47.4 KB
Loading

0 commit comments

Comments
 (0)