-
Notifications
You must be signed in to change notification settings - Fork 2
feat(strings): similar string groups #104
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
Merged
Changes from 3 commits
Commits
Show all changes
5 commits
Select commit
Hold shift + click to select a range
bc4c62e
feat(strings, datastructures): similar string groups with disjoint se…
BrianLusina aa47a80
updating DIRECTORY.md
a1cdbce
refactor(datastructures, union-find): add type hints and doc comments
BrianLusina dc02f1e
refactor(strings, similar string groups): is similar string function
BrianLusina 69e2998
refactor(strings, similar-string-groups): add length check
BrianLusina File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,6 @@ | ||
| from datastructures.sets import DisjointSetUnion, UnionFind | ||
|
|
||
| __all__ = [ | ||
| "DisjointSetUnion", | ||
| "UnionFind" | ||
| ] |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,3 @@ | ||
| from datastructures.sets.union_find import DisjointSetUnion, UnionFind | ||
|
|
||
| __all__ = ["DisjointSetUnion", "UnionFind"] |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,74 @@ | ||
class DisjointSetUnion:
    """Union-Find (Disjoint Set Union) with path compression and union by rank.

    Elements are the integers ``0 .. size - 1``; each one starts in its own
    singleton set.
    """

    def __init__(self, size: int):
        """Initializes the data structure with 'size' elements, each in its own set.

        Raises:
            ValueError: If ``size`` is zero or negative.
        """
        if size <= 0:
            raise ValueError("Size must be a positive integer.")
        self.root = list(range(size))
        self.rank = [1] * size  # For union by rank
        self.count = size  # Number of disjoint sets

    def find(self, i: int) -> int:
        """Finds the representative (root) of the set containing element 'i'.

        Raises:
            IndexError: If ``i`` is not in ``0 .. size - 1``. Negative values
                are rejected explicitly because list indexing would otherwise
                wrap them around and silently alias another element.
        """
        if not 0 <= i < len(self.root):
            raise IndexError(
                f"Element {i} is out of range for {len(self.root)} elements."
            )

        # First pass: walk up the parent links until the root is reached.
        representative = i
        while self.root[representative] != representative:
            representative = self.root[representative]

        # Second pass (path compression): point every node on the path
        # directly at the root.
        while self.root[i] != representative:
            parent = self.root[i]
            self.root[i] = representative
            i = parent

        return representative

    def union(self, i: int, j: int) -> bool:
        """
        Merges the sets containing elements 'i' and 'j'.
        Returns True if a merge occurred, False if they were already in the same set.
        """
        root_i = self.find(i)
        root_j = self.find(j)

        if root_i == root_j:
            return False

        # Union by rank: attach the root with the smaller rank under the root
        # with the larger rank. On a tie, root_j goes under root_i and
        # root_i's rank grows by one.
        if self.rank[root_i] > self.rank[root_j]:
            self.root[root_j] = root_i
        elif self.rank[root_i] < self.rank[root_j]:
            self.root[root_i] = root_j
        else:
            self.root[root_j] = root_i
            self.rank[root_i] += 1

        self.count -= 1
        return True

    def get_count(self) -> int:
        """Returns the current number of disjoint sets."""
        return self.count
|
|
||
|
|
||
class UnionFind:
    """A minimal Union-Find data structure with path compression.

    Elements are the integers ``0 .. size - 1``. No rank or size balancing is
    performed, so parent chains can grow as long as the number of elements;
    ``find`` is therefore iterative rather than recursive.
    """

    def __init__(self, size: int):
        """Initializes the data structure with 'size' elements.

        Raises:
            ValueError: If ``size`` is zero or negative.
        """
        if size <= 0:
            raise ValueError("Size must be a positive integer.")
        self.parent = list(range(size))

    def find(self, x: int) -> int:
        """Finds the representative (root) of the set containing element 'x'.

        Raises:
            IndexError: If ``x`` is not in ``0 .. size - 1``. Negative values
                are rejected explicitly because list indexing would otherwise
                wrap them around and silently alias another element.
        """
        if not 0 <= x < len(self.parent):
            raise IndexError(
                f"Element {x} is out of range for {len(self.parent)} elements."
            )

        # First pass: walk up the parent links until the root is reached.
        # A loop is used so that a long chain cannot exhaust the call stack.
        root = x
        while self.parent[root] != root:
            root = self.parent[root]

        # Second pass (path compression): re-point every visited node
        # directly at the root.
        while self.parent[x] != root:
            next_node = self.parent[x]
            self.parent[x] = root
            x = next_node

        return root

    def union(self, x: int, y: int) -> bool:
        """
        Merges the sets containing elements 'x' and 'y'.
        Returns True if a merge occurred, False if already in same set.
        """
        root_x = self.find(x)
        root_y = self.find(y)
        if root_x == root_y:
            return False
        # The root of y's set is always attached under the root of x's set.
        self.parent[root_y] = root_x
        return True
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,113 @@ | ||
| # Similar String Groups | ||
|
|
||
| Two strings x and y are considered similar if they are either exactly the same or can be made identical by swapping | ||
| the characters at two different positions in string x (a single swap). | ||
|
|
||
| We define a similarity group as a set of strings where each string is similar to at least one other string in the group | ||
| (a string that is similar to no other string forms a group on its own). | ||
| A string doesn't need to be directly similar to every other string in the group — it just needs to be connected to them | ||
| through a chain of similarities. | ||
|
|
||
| Given a list of strings strs, where each string is an anagram of the others, your task is to determine how many such | ||
| similarity groups exist in the list. | ||
|
|
||
| Constraints: | ||
|
|
||
| - 1 ≤ strs.length ≤ 300 | ||
| - 1 ≤ strs[i].length ≤ 300 | ||
| - strs[i] consists of lowercase letters only. | ||
| - All words in strs have the same length and are anagrams of each other. | ||
|
|
||
| --- | ||
|
|
||
| ## Examples | ||
|
|
||
|  | ||
|  | ||
|  | ||
|
|
||
| --- | ||
|
|
||
| ## Solution | ||
|
|
||
| This problem can be seen as a graph connectivity challenge. Each string is a node, and an edge exists between two nodes | ||
| if their corresponding strings are similar. Our goal is to count how many connected groups (components) exist in this | ||
| graph. | ||
|
|
||
| We solve this problem using the Union-Find (Disjoint Set Union) data structure to efficiently group similar strings. | ||
| Initially, each string is placed in its own group. We then iterate over all possible pairs of strings. For each pair at | ||
| indexes i and j, we check whether the two strings are similar — that is, either exactly the same or differ at exactly | ||
| two positions (meaning one swap can make them equal). If they are similar and currently belong to different groups | ||
| (i.e., their roots in the Union-Find structure are different), we perform a union operation to merge their groups. | ||
| Repeating this across all string pairs gradually reduces the number of distinct groups. Finally, we count the number of | ||
| unique roots in the Union-Find structure, which represents the number of similar string groups. | ||
|
|
||
| Here’s the step-by-step explanation of the solution: | ||
|
|
||
| 1. Initialize n = len(strs). | ||
| 2. Create a Union-Find (DSU) structure with n elements, where each element is its own parent. | ||
| 3. Define a function areSimilar(s1, s2) that returns TRUE if both strings s1 and s2 are similar according to the given | ||
| condition: | ||
| - Initialize an empty list diff = [] to track differences. | ||
| - Loop through both strings in parallel using zip. | ||
| - If characters differ at any position, record the mismatch in diff. | ||
| - Early exit if more than 2 differences and return FALSE. | ||
| - After the loop is completed, evaluate the result: | ||
| - len(diff) == 0 means the strings are identical. | ||
| - len(diff) == 2 and diff[0] == diff[1][::-1] means there are exactly two differences and the character pairs are | ||
| mirror images of each other. | ||
|
|
||
| 4. Loop over all pairs (i, j) such that 0 ≤ i < j < n. | ||
| 5. For each pair, use the areSimilar function to check if strs[i] and strs[j] are similar. | ||
| 6. If similar, use find(i) and find(j) to get their root parents. | ||
| 7. If the roots differ, merge them using union(i, j). | ||
| 8. After processing all pairs, iterate over all indexes i from 0 to n - 1 and find their root parents using find(i). | ||
| 9. Add each root to a set to track unique groups. | ||
| 10. Return the size of the set as the number of similarity groups. | ||
|
|
||
| Let’s look at the following illustration to get a better understanding of the solution: | ||
|
|
||
|  | ||
|  | ||
|  | ||
|  | ||
|  | ||
|  | ||
|  | ||
|
|
||
| ### Time Complexity | ||
| Let's break the time complexity down into two major components: | ||
|
|
||
| #### **Comparing all pairs of strings** | ||
|
|
||
| To check if two strings are similar, we compare them character by character, which takes _O(m)_ where m is the length | ||
| of each string. Given there are n strings and we compare all possible pairs of strings, there are O(n²) comparisons. | ||
| Therefore, the total time spent on comparisons is O(n²∗m). | ||
|
|
||
| #### **Union-Find operations (find and union)** | ||
|
|
||
| For each similar pair, we perform a find and possibly a union operation. With path compression and union by rank, each | ||
| operation takes amortized O(α(n)) time, where α(n) is the inverse Ackermann function, which is nearly constant in | ||
| practice. Since there are up to O(n²) similar pairs, the total time for Union-Find operations is O(n²∗α(n)). | ||
|
|
||
| The comparison step dominates the time complexity, as m (the string length) is typically much larger than α(n), which | ||
| grows very slowly. Therefore, the overall time complexity is O(n²∗m). | ||
|
|
||
| ### Space Complexity | ||
|
|
||
| The space complexity of the algorithm comes from the following components: | ||
|
|
||
| #### **Union-Find parent array**: | ||
|
|
||
| Requires O(n) space to store the parent of each node (one per string). | ||
|
|
||
| #### **Temporary storage in areSimilar() function**: | ||
|
|
||
| Uses O(1) space — a constant-sized list to track the positions where the two strings differ. Since at most 2 differences | ||
| are allowed, space usage remains constant. | ||
|
|
||
| #### **Set to store unique groups (roots)**: | ||
|
|
||
| Requires O(n) space in the worst case, when all strings are in separate groups and each has a unique root. | ||
|
|
||
| The total space complexity is O(n), as all other components (e.g., temporary storage and sets) do not exceed linear | ||
| space relative to the input size. |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,77 @@ | ||
| from typing import List | ||
| from datastructures import DisjointSetUnion, UnionFind | ||
|
|
||
|
|
||
def num_similar_groups(strs: List[str]) -> int:
    """Count the similarity groups among ``strs``.

    Two strings are similar when they are identical or when swapping two
    positions of one yields the other. Strings belong to the same group when
    a chain of pairwise-similar strings connects them.

    Args:
        strs: The strings to group.

    Returns:
        The number of groups, or 0 when ``strs`` is empty.
    """
    if not strs:
        return 0

    def is_similar(s1: str, s2: str) -> bool:
        """
        Checks if two strings are similar.
        Similar means identical (0 diffs) or 1 swap (2 diffs).
        """
        # Strings of different lengths can never be made equal by a swap.
        if len(s1) != len(s2):
            return False

        positions_that_differ = []
        for k, (a, b) in enumerate(zip(s1, s2)):
            if a != b:
                positions_that_differ.append(k)
                # Optimization: more than 2 differences can't be one swap.
                if len(positions_that_differ) > 2:
                    return False

        if not positions_that_differ:
            return True
        if len(positions_that_differ) != 2:
            # A single mismatch cannot be repaired by swapping.
            return False

        i, j = positions_that_differ
        return s1[i] == s2[j] and s1[j] == s2[i]

    strs_len = len(strs)

    # One element per string; initially every string is its own group.
    uf = DisjointSetUnion(strs_len)

    # Iterate over all unique pairs of strings and merge similar ones.
    # union() only decrements the group count when the two strings were in
    # different groups.
    for i in range(strs_len):
        for j in range(i + 1, strs_len):
            if is_similar(strs[i], strs[j]):
                uf.union(i, j)

    # The final count of disjoint sets is the number of groups
    return uf.get_count()
|
|
||
| # Helper: Decide if two strings are similar | ||
def are_similar(s1: str, s2: str) -> bool:
    """Return True if ``s1`` and ``s2`` are identical or one swap apart.

    The strings are compared position by position. They are similar when no
    position differs, or when exactly two positions differ and the two
    mismatching character pairs mirror each other (``(a, b)`` and ``(b, a)``).
    Strings of different lengths are never similar.
    """
    # zip() stops at the shorter input, so without this check a string and
    # one of its proper prefixes would compare as identical.
    if len(s1) != len(s2):
        return False

    diff = []
    for a, b in zip(s1, s2):
        if a != b:
            diff.append((a, b))
            # More than two mismatches cannot be fixed by a single swap.
            if len(diff) > 2:
                return False

    return not diff or (len(diff) == 2 and diff[0] == diff[1][::-1])
|
|
||
def num_similar_groups_2(strs: List[str]) -> int:
    """Count the similarity groups among ``strs`` by collecting unique roots.

    Every pair of strings is checked with ``are_similar``; similar pairs are
    merged in a ``DisjointSetUnion``. The answer is the number of distinct
    roots once all pairs have been processed.

    Args:
        strs: The strings to group.

    Returns:
        The number of groups, or 0 when ``strs`` is empty.
    """
    n = len(strs)
    # DisjointSetUnion rejects a size of 0, so handle the empty input here.
    if n == 0:
        return 0

    uf = DisjointSetUnion(n)

    for i in range(n):
        for j in range(i + 1, n):
            if are_similar(strs[i], strs[j]):
                uf.union(i, j)

    # Each distinct root identifies one group.
    roots = {uf.find(i) for i in range(n)}
    return len(roots)
Binary file added
BIN
+66 KB
pystrings/similar_string_groups/images/similar_string_groups_example_1.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added
BIN
+62.3 KB
pystrings/similar_string_groups/images/similar_string_groups_example_2.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added
BIN
+71.4 KB
pystrings/similar_string_groups/images/similar_string_groups_example_3.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added
BIN
+47.4 KB
pystrings/similar_string_groups/images/similar_string_groups_solution_1.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added
BIN
+74.1 KB
pystrings/similar_string_groups/images/similar_string_groups_solution_2.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added
BIN
+107 KB
pystrings/similar_string_groups/images/similar_string_groups_solution_3.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added
BIN
+104 KB
pystrings/similar_string_groups/images/similar_string_groups_solution_4.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added
BIN
+91.5 KB
pystrings/similar_string_groups/images/similar_string_groups_solution_5.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added
BIN
+117 KB
pystrings/similar_string_groups/images/similar_string_groups_solution_6.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added
BIN
+98.6 KB
pystrings/similar_string_groups/images/similar_string_groups_solution_7.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
60 changes: 60 additions & 0 deletions
60
pystrings/similar_string_groups/test_similar_string_groups.py
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,60 @@ | ||
| import unittest | ||
| from . import num_similar_groups, num_similar_groups_2 | ||
|
|
||
|
|
||
class SimilarStringGroupsTestCase(unittest.TestCase):
    """Checks num_similar_groups against fixed inputs and group counts."""

    def test_1(self):
        words = ["jhki", "kijh", "jkhi", "kihj", "ijhk"]
        self.assertEqual(3, num_similar_groups(words))

    def test_2(self):
        words = ["abc", "acb", "bac", "bca", "cab", "cba"]
        self.assertEqual(1, num_similar_groups(words))

    def test_3(self):
        words = ["abcd", "abdc", "acbd", "bdca"]
        self.assertEqual(2, num_similar_groups(words))

    def test_4(self):
        words = [
            "fgtdvepeqcfajhlzkwlpuhrwfcueqfbs",
            "fgcdvppeqcfajhlzkwluehrwftuefqbs",
            "fgtdvepeqcfajhlzkwlpuhrwfcuefqbs",
            "fgcdvepeqcfajhlzkwluphrwftuefqbs",
            "fgldvepeqcfajhlzkwcuphrwftuefqbs",
            "fgtdvefeqcpajhlzkwlpuhrwfcuefqbs",
        ]
        self.assertEqual(2, num_similar_groups(words))
|
|
||
|
|
||
class SimilarStringGroups2TestCase(unittest.TestCase):
    """Checks num_similar_groups_2 against fixed inputs and group counts."""

    def test_1(self):
        words = ["jhki", "kijh", "jkhi", "kihj", "ijhk"]
        self.assertEqual(3, num_similar_groups_2(words))

    def test_2(self):
        words = ["abc", "acb", "bac", "bca", "cab", "cba"]
        self.assertEqual(1, num_similar_groups_2(words))

    def test_3(self):
        words = ["abcd", "abdc", "acbd", "bdca"]
        self.assertEqual(2, num_similar_groups_2(words))

    def test_4(self):
        words = [
            "fgtdvepeqcfajhlzkwlpuhrwfcueqfbs",
            "fgcdvppeqcfajhlzkwluehrwftuefqbs",
            "fgtdvepeqcfajhlzkwlpuhrwfcuefqbs",
            "fgcdvepeqcfajhlzkwluphrwftuefqbs",
            "fgldvepeqcfajhlzkwcuphrwftuefqbs",
            "fgtdvefeqcpajhlzkwlpuhrwfcuefqbs",
        ]
        self.assertEqual(2, num_similar_groups_2(words))
|
|
||
|
|
||
| if __name__ == '__main__': | ||
| unittest.main() |
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.