Skip to content
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions DIRECTORY.md
Original file line number Diff line number Diff line change
Expand Up @@ -729,6 +729,8 @@
* [Test Reverse Vowels](https://github.com/BrianLusina/PythonSnips/blob/master/pystrings/reverse_vowels/test_reverse_vowels.py)
* Reverse Words
* [Test Reverse Words](https://github.com/BrianLusina/PythonSnips/blob/master/pystrings/reverse_words/test_reverse_words.py)
* Similar String Groups
* [Test Similar String Groups](https://github.com/BrianLusina/PythonSnips/blob/master/pystrings/similar_string_groups/test_similar_string_groups.py)
* Spreadsheet Encoding
* [Test Spreadsheet Encode](https://github.com/BrianLusina/PythonSnips/blob/master/pystrings/spreadsheet_encoding/test_spreadsheet_encode.py)
* String Compression
Expand Down
6 changes: 6 additions & 0 deletions datastructures/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
from datastructures.sets import DisjointSetUnion, UnionFind

# Names re-exported as the public API of the `datastructures` package.
__all__ = [
"DisjointSetUnion",
"UnionFind"
]
3 changes: 3 additions & 0 deletions datastructures/sets/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
from datastructures.sets.union_find import DisjointSetUnion, UnionFind

# Names re-exported as the public API of the `datastructures.sets` package.
__all__ = ["DisjointSetUnion", "UnionFind"]
74 changes: 74 additions & 0 deletions datastructures/sets/union_find/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
class DisjointSetUnion:
    """Disjoint Set Union (Union-Find) over the integers ``0 .. size - 1``.

    Combines union by rank with path compression and keeps a running count
    of how many disjoint sets remain.
    """

    def __init__(self, size: int):
        """Create ``size`` singleton sets.

        Args:
            size: Number of elements; must be positive.

        Raises:
            ValueError: If ``size`` is zero or negative.
        """
        if size <= 0:
            raise ValueError("Size must be a positive integer.")
        self.count = size  # number of disjoint sets currently tracked
        self.rank = [1] * size  # per-root rank used to decide merge direction
        self.root = list(range(size))  # root[k] is the parent of k

    def find(self, i: int) -> int:
        """Return the representative of the set that contains ``i``.

        Every node visited on the way up is re-pointed directly at the
        representative (path compression).
        """
        # First pass: climb until a self-parented node (the representative).
        representative = i
        while self.root[representative] != representative:
            representative = self.root[representative]

        # Second pass: re-point each node on the path at the representative.
        node = i
        while self.root[node] != representative:
            parent = self.root[node]
            self.root[node] = representative
            node = parent

        return representative

    def union(self, i: int, j: int) -> bool:
        """Merge the sets containing ``i`` and ``j``.

        Returns:
            True if two distinct sets were merged, False if ``i`` and ``j``
            already shared a set.
        """
        keeper, absorbed = self.find(i), self.find(j)
        if keeper == absorbed:
            return False

        # Always hang the lower-ranked root beneath the higher-ranked one.
        if self.rank[keeper] < self.rank[absorbed]:
            keeper, absorbed = absorbed, keeper

        self.root[absorbed] = keeper
        # Equal ranks: the surviving root's rank grows by one.
        if self.rank[keeper] == self.rank[absorbed]:
            self.rank[keeper] += 1

        self.count -= 1
        return True

    def get_count(self) -> int:
        """Return how many disjoint sets currently exist."""
        return self.count


class UnionFind:
    """A minimal Union-Find data structure with path compression.

    Elements are the integers ``0 .. size - 1``. ``union`` performs no rank
    or size balancing, so parent chains can become long; ``find`` is
    therefore iterative so that it never hits Python's recursion limit.
    """

    def __init__(self, size: int):
        """Initialize ``size`` elements, each in its own set.

        Args:
            size: Number of elements; must be positive.

        Raises:
            ValueError: If ``size`` is zero or negative.
        """
        if size <= 0:
            raise ValueError("Size must be a positive integer.")
        self.parent = list(range(size))  # parent[k] == k marks a root

    def find(self, x: int) -> int:
        """Return the representative (root) of the set containing ``x``.

        Every node on the path from ``x`` to the root is re-pointed directly
        at the root (path compression).
        """
        # Locate the root without recursion: unbalanced unions can build a
        # chain as long as the number of elements.
        root = x
        while self.parent[root] != root:
            root = self.parent[root]

        # Compress the path just walked.
        while self.parent[x] != root:
            next_node = self.parent[x]
            self.parent[x] = root
            x = next_node

        return root

    def union(self, x: int, y: int) -> bool:
        """Merge the sets containing ``x`` and ``y``.

        The root of ``y``'s set is attached beneath the root of ``x``'s set.

        Returns:
            True if a merge occurred, False if both were already in the same
            set.
        """
        root_x = self.find(x)
        root_y = self.find(y)
        if root_x == root_y:
            return False
        self.parent[root_y] = root_x
        return True
113 changes: 113 additions & 0 deletions pystrings/similar_string_groups/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
# Similar String Groups

Two strings x and y are considered similar if they are either exactly the same or can be made identical by swapping
two characters (at distinct positions) in string x.

We define a similarity group as a set of strings where each string is similar to at least one other string in the group.
A string doesn't need to be directly similar to every other string in the group — it just needs to be connected to them
through a chain of similarities.

Given a list of strings strs, where each string is an anagram of the others, your task is to determine how many such
similarity groups exist in the list.

Constraints:

- 1 ≤ strs.length ≤ 300
- 1 ≤ strs[i].length ≤ 300
- strs[i] consists of lowercase letters only.
- All words in strs have the same length and are anagrams of each other.

---

## Examples

![Example 1](./images/similar_string_groups_example_1.png)
![Example 2](./images/similar_string_groups_example_2.png)
![Example 3](./images/similar_string_groups_example_3.png)

---

## Solution

This problem can be seen as a graph connectivity challenge. Each string is a node, and an edge exists between two nodes
if their corresponding strings are similar. Our goal is to count how many connected groups (components) exist in this
graph.

We solve this problem using the Union-Find (Disjoint Set Union) data structure to efficiently group similar strings.
Initially, each string is placed in its own group. We then iterate over all possible pairs of strings. For each pair at
indexes i and j, we check whether the two strings are similar — that is, either exactly the same or differ at exactly
two positions (meaning one swap can make them equal). If they are similar and currently belong to different groups
(i.e., their roots in the Union-Find structure are different), we perform a union operation to merge their groups.
Repeating this across all string pairs gradually reduces the number of distinct groups. Finally, we count the number of
unique roots in the Union-Find structure, which represents the number of similar string groups.

Here’s the step-by-step explanation of the solution:

1. Initialize n = len(strs).
2. Create a Union-Find (DSU) structure with n elements, where each element is its own parent.
3. Define a function areSimilar(s1, s2) that returns TRUE if both strings s1 and s2 are similar according to the given
   condition:
   - Initialize an empty list diff = [] to track differences.
   - Loop through both strings in parallel using zip.
   - If characters differ at any position, record the mismatch in diff.
   - Exit early and return FALSE as soon as more than 2 differences are found.
   - After the loop is completed, evaluate the result:
     - len(diff) == 0 means the strings are identical.
     - len(diff) == 2 and diff[0] == diff[1][::-1] means there are exactly two differences and the character pairs are
       mirror images of each other, so a single swap makes the strings equal.

4. Loop over all pairs (i, j) such that 0 ≤ i < j < n.
5. For each pair, use the areSimilar function to check if strs[i] and strs[j] are similar.
6. If similar, use find(i) and find(j) to get their root parents.
7. If the roots differ, merge them using union(i, j).
8. After processing all pairs, iterate over all indexes i from 0 to n - 1 and find their root parents using find(i).
9. Add each root to a set to track unique groups.
10. Return the size of the set as the number of similarity groups.

Let’s look at the following illustration to get a better understanding of the solution:

![Solution 1](./images/similar_string_groups_solution_1.png)
![Solution 2](./images/similar_string_groups_solution_2.png)
![Solution 3](./images/similar_string_groups_solution_3.png)
![Solution 4](./images/similar_string_groups_solution_4.png)
![Solution 5](./images/similar_string_groups_solution_5.png)
![Solution 6](./images/similar_string_groups_solution_6.png)
![Solution 7](./images/similar_string_groups_solution_7.png)

### Time Complexity
Let's break the time complexity down into two major components:

#### **Comparing all pairs of strings**

To check if two strings are similar, we compare them character by character, which takes _O(m)_ where m is the length
of each string. Given there are n strings and we compare all possible pairs of strings, there are O(n²) comparisons.
Therefore, the total time spent on comparisons is O(n²∗m).

#### **Union-Find operations (find and union)**

For each similar pair, we perform a find and possibly a union operation. With path compression combined with union by
rank, each operation takes amortized O(α(n)) time, where α(n) is the inverse Ackermann function, which is nearly
constant in practice. Since there are up to O(n²) similar pairs, the total time for Union-Find operations is O(n²∗α(n)).

The comparison step dominates the time complexity, as m (the string length) is typically much larger than α(n), which
grows very slowly. Therefore, the overall time complexity is O(n²∗m).

### Space Complexity

The space complexity of the algorithm comes from the following components:

#### **Union-Find parent array**:

Requires O(n) space to store the parent of each node (one per string).

#### **Temporary storage in areSimilar() function**:

Uses O(1) space — a constant-sized list to track the positions where the two strings differ. Since at most 2 differences
are allowed, space usage remains constant.

#### **Set to store unique groups (roots)**:

Requires O(n) space in the worst case, when all strings are in separate groups and each has a unique root.

The total space complexity is O(n), as all other components (e.g., temporary storage and sets) do not exceed linear
space relative to the input size.
77 changes: 77 additions & 0 deletions pystrings/similar_string_groups/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
from typing import List
from datastructures import DisjointSetUnion, UnionFind


def num_similar_groups(strs: List[str]) -> int:
    """Count the similarity groups among ``strs``.

    Two strings are similar when they are identical or when swapping two
    characters of one yields the other. Similarity is treated as an edge
    between strings, and a group is a connected component of that graph.

    Args:
        strs: The strings to group. They are expected to share one length
            (per the problem constraints); comparison is positional and
            stops at the shorter string.

    Returns:
        The number of similarity groups, or 0 for an empty list.
    """
    if not strs:
        return 0

    def is_similar(s1: str, s2: str) -> bool:
        """Return True if ``s1`` equals ``s2`` or one swap makes them equal."""
        mismatches = []
        for pos, (left, right) in enumerate(zip(s1, s2)):
            if left != right:
                # A third mismatch can never be repaired by a single swap.
                if len(mismatches) == 2:
                    return False
                mismatches.append(pos)

        if not mismatches:
            return True
        if len(mismatches) != 2:
            return False

        # Exactly two mismatches: they must be each other's mirror image.
        first, second = mismatches
        return s1[first] == s2[second] and s1[second] == s2[first]

    total = len(strs)
    # One set per string; every successful union lowers the set count by one.
    uf = DisjointSetUnion(total)

    for i in range(total):
        for j in range(i + 1, total):
            if is_similar(strs[i], strs[j]):
                uf.union(i, j)

    # The remaining number of disjoint sets is the number of groups.
    return uf.get_count()

# Helper: Decide if two strings are similar
def are_similar(s1, s2):
    """Return True if ``s1`` and ``s2`` match, or differ by exactly one swap.

    The inputs are compared position by position. They count as similar
    when no position differs, or when exactly two positions differ and the
    mismatched character pairs are mirror images of each other.
    """
    mismatched = []
    for left, right in zip(s1, s2):
        if left == right:
            continue
        # A third mismatch rules out a single swap; stop immediately.
        if len(mismatched) == 2:
            return False
        mismatched.append((left, right))

    if not mismatched:
        return True
    if len(mismatched) != 2:
        return False

    (first_left, first_right), (second_left, second_right) = mismatched
    return first_left == second_right and first_right == second_left

def num_similar_groups_2(strs: List[str]) -> int:
    """Count the similarity groups among ``strs`` by counting distinct roots.

    Every pair of strings judged similar by ``are_similar`` is merged in a
    ``DisjointSetUnion``; the answer is the number of distinct
    representatives afterwards.

    Args:
        strs: The strings to group.

    Returns:
        The number of similarity groups, or 0 for an empty list.
    """
    # DisjointSetUnion rejects a non-positive size, so an empty input is
    # answered here instead of surfacing as a ValueError.
    if not strs:
        return 0

    n = len(strs)
    uf = DisjointSetUnion(n)

    for i in range(n):
        for j in range(i + 1, n):
            if are_similar(strs[i], strs[j]):
                uf.union(i, j)

    # Each distinct representative corresponds to one group.
    roots = {uf.find(i) for i in range(n)}
    return len(roots)
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
60 changes: 60 additions & 0 deletions pystrings/similar_string_groups/test_similar_string_groups.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
import unittest
from . import num_similar_groups, num_similar_groups_2


class SimilarStringGroupsTestCase(unittest.TestCase):
    """Checks ``num_similar_groups`` against known group counts."""

    def test_1(self):
        # Five 4-letter strings expected to form three groups.
        words = ["jhki", "kijh", "jkhi", "kihj", "ijhk"]
        self.assertEqual(3, num_similar_groups(words))

    def test_2(self):
        # Six 3-letter strings expected to collapse into one group.
        words = ["abc", "acb", "bac", "bca", "cab", "cba"]
        self.assertEqual(1, num_similar_groups(words))

    def test_3(self):
        # Four 4-letter strings expected to form two groups.
        words = ["abcd", "abdc", "acbd", "bdca"]
        self.assertEqual(2, num_similar_groups(words))

    def test_4(self):
        # Six long strings expected to form two groups.
        words = [
            "fgtdvepeqcfajhlzkwlpuhrwfcueqfbs",
            "fgcdvppeqcfajhlzkwluehrwftuefqbs",
            "fgtdvepeqcfajhlzkwlpuhrwfcuefqbs",
            "fgcdvepeqcfajhlzkwluphrwftuefqbs",
            "fgldvepeqcfajhlzkwcuphrwftuefqbs",
            "fgtdvefeqcpajhlzkwlpuhrwfcuefqbs",
        ]
        self.assertEqual(2, num_similar_groups(words))


class SimilarStringGroups2TestCase(unittest.TestCase):
    """Checks ``num_similar_groups_2`` against known group counts."""

    def test_1(self):
        # Five 4-letter strings expected to form three groups.
        words = ["jhki", "kijh", "jkhi", "kihj", "ijhk"]
        self.assertEqual(3, num_similar_groups_2(words))

    def test_2(self):
        # Six 3-letter strings expected to collapse into one group.
        words = ["abc", "acb", "bac", "bca", "cab", "cba"]
        self.assertEqual(1, num_similar_groups_2(words))

    def test_3(self):
        # Four 4-letter strings expected to form two groups.
        words = ["abcd", "abdc", "acbd", "bdca"]
        self.assertEqual(2, num_similar_groups_2(words))

    def test_4(self):
        # Six long strings expected to form two groups.
        words = [
            "fgtdvepeqcfajhlzkwlpuhrwfcueqfbs",
            "fgcdvppeqcfajhlzkwluehrwftuefqbs",
            "fgtdvepeqcfajhlzkwlpuhrwfcuefqbs",
            "fgcdvepeqcfajhlzkwluphrwftuefqbs",
            "fgldvepeqcfajhlzkwcuphrwftuefqbs",
            "fgtdvefeqcpajhlzkwlpuhrwfcuefqbs",
        ]
        self.assertEqual(2, num_similar_groups_2(words))


# Run the test cases when this file is executed directly as a script.
if __name__ == '__main__':
    unittest.main()
Loading