diff --git a/DIRECTORY.md b/DIRECTORY.md index 84bba95c..37c48120 100644 --- a/DIRECTORY.md +++ b/DIRECTORY.md @@ -7,8 +7,6 @@ * [Test Intersection](https://github.com/BrianLusina/PythonSnips/blob/master/algorithms/arrays/intersection/test_intersection.py) * [Test Intersection One](https://github.com/BrianLusina/PythonSnips/blob/master/algorithms/arrays/intersection/test_intersection_one.py) * [Test Intersection Two](https://github.com/BrianLusina/PythonSnips/blob/master/algorithms/arrays/intersection/test_intersection_two.py) - * Majority Element - * [Test Majority Element](https://github.com/BrianLusina/PythonSnips/blob/master/algorithms/arrays/majority_element/test_majority_element.py) * Non Constructible Change * [Test Non Constructible Change](https://github.com/BrianLusina/PythonSnips/blob/master/algorithms/arrays/non_constructible_change/test_non_constructible_change.py) * Optimal Task Assignment @@ -70,6 +68,8 @@ * [Test Min Distance](https://github.com/BrianLusina/PythonSnips/blob/master/algorithms/dynamic_programming/min_distance/test_min_distance.py) * Unique Paths * [Test Unique Paths](https://github.com/BrianLusina/PythonSnips/blob/master/algorithms/dynamic_programming/unique_paths/test_unique_paths.py) + * Word Break + * [Test Word Break](https://github.com/BrianLusina/PythonSnips/blob/master/algorithms/dynamic_programming/word_break/test_word_break.py) * Fast And Slow * Circular Array Loop * [Test Circular Array Loop](https://github.com/BrianLusina/PythonSnips/blob/master/algorithms/fast_and_slow/circular_array_loop/test_circular_array_loop.py) @@ -83,12 +83,22 @@ * Frog Position After T Seconds * [Test Frog Position After T Seconds](https://github.com/BrianLusina/PythonSnips/blob/master/algorithms/graphs/frog_position_after_t_seconds/test_frog_position_after_t_seconds.py) * Greedy + * Gas Stations + * [Test Gas Stations](https://github.com/BrianLusina/PythonSnips/blob/master/algorithms/greedy/gas_stations/test_gas_stations.py) + * Majority Element + 
* [Test Majority Element](https://github.com/BrianLusina/PythonSnips/blob/master/algorithms/greedy/majority_element/test_majority_element.py) * Min Arrows * [Test Find Min Arrows](https://github.com/BrianLusina/PythonSnips/blob/master/algorithms/greedy/min_arrows/test_find_min_arrows.py) * Huffman * [Decoding](https://github.com/BrianLusina/PythonSnips/blob/master/algorithms/huffman/decoding.py) * [Encoding](https://github.com/BrianLusina/PythonSnips/blob/master/algorithms/huffman/encoding.py) * Intervals + * Insert Interval + * [Test Insert Interval](https://github.com/BrianLusina/PythonSnips/blob/master/algorithms/intervals/insert_interval/test_insert_interval.py) + * Meeting Rooms + * [Test Min Meeting Rooms](https://github.com/BrianLusina/PythonSnips/blob/master/algorithms/intervals/meeting_rooms/test_min_meeting_rooms.py) + * Merge Intervals + * [Test Merge Intervals](https://github.com/BrianLusina/PythonSnips/blob/master/algorithms/intervals/merge_intervals/test_merge_intervals.py) * Task Scheduler * [Test Task Scheduler](https://github.com/BrianLusina/PythonSnips/blob/master/algorithms/intervals/task_scheduler/test_task_scheduler.py) * Josephus Circle @@ -330,6 +340,9 @@ * [Node](https://github.com/BrianLusina/PythonSnips/blob/master/datastructures/trees/ternary/node.py) * [Test Ternary Tree Paths](https://github.com/BrianLusina/PythonSnips/blob/master/datastructures/trees/ternary/test_ternary_tree_paths.py) * Trie + * Alphabet Trie + * [Alphabet Trie](https://github.com/BrianLusina/PythonSnips/blob/master/datastructures/trees/trie/alphabet_trie/alphabet_trie.py) + * [Alphabet Trie Node](https://github.com/BrianLusina/PythonSnips/blob/master/datastructures/trees/trie/alphabet_trie/alphabet_trie_node.py) * Suffix * [Suffix Tree](https://github.com/BrianLusina/PythonSnips/blob/master/datastructures/trees/trie/suffix/suffix_tree.py) * [Suffix Tree Node](https://github.com/BrianLusina/PythonSnips/blob/master/datastructures/trees/trie/suffix/suffix_tree_node.py) 
@@ -478,8 +491,6 @@ * [Test Candy](https://github.com/BrianLusina/PythonSnips/blob/master/puzzles/arrays/candy/test_candy.py) * Container With Most Water * [Test Container With Most Water](https://github.com/BrianLusina/PythonSnips/blob/master/puzzles/arrays/container_with_most_water/test_container_with_most_water.py) - * Gas Stations - * [Test Gas Stations](https://github.com/BrianLusina/PythonSnips/blob/master/puzzles/arrays/gas_stations/test_gas_stations.py) * H Index * [Test H Index](https://github.com/BrianLusina/PythonSnips/blob/master/puzzles/arrays/h_index/test_h_index.py) * Increasing Triplet Subsequence @@ -569,8 +580,6 @@ * [Hidden Cubic Numbers](https://github.com/BrianLusina/PythonSnips/blob/master/puzzles/hidden_cubic_numbers/hidden_cubic_numbers.py) * Matrix In Spiral Form * [Test Make Spiral](https://github.com/BrianLusina/PythonSnips/blob/master/puzzles/matrix_in_spiral_form/test_make_spiral.py) - * Meeting Rooms - * [Test Min Meeting Rooms](https://github.com/BrianLusina/PythonSnips/blob/master/puzzles/meeting_rooms/test_min_meeting_rooms.py) * Minimize The Absolute Difference * [Test Minimize Absolute Difference](https://github.com/BrianLusina/PythonSnips/blob/master/puzzles/minimize_the_absolute_difference/test_minimize_absolute_difference.py) * Next Permutation @@ -824,7 +833,6 @@ * [Test Min Window Substring](https://github.com/BrianLusina/PythonSnips/blob/master/tests/algorithms/sliding_window/test_min_window_substring.py) * Sorting * [Test Counting Sort](https://github.com/BrianLusina/PythonSnips/blob/master/tests/algorithms/sorting/test_counting_sort.py) - * [Test Merge Intervals](https://github.com/BrianLusina/PythonSnips/blob/master/tests/algorithms/sorting/test_merge_intervals.py) * Strings * [Test Validate Ipv4](https://github.com/BrianLusina/PythonSnips/blob/master/tests/algorithms/strings/test_validate_ipv4.py) * [Test Bubble Sort](https://github.com/BrianLusina/PythonSnips/blob/master/tests/algorithms/test_bubble_sort.py) diff --git 
a/algorithms/dynamic_programming/word_break/README.md b/algorithms/dynamic_programming/word_break/README.md new file mode 100644 index 00000000..c0fb918b --- /dev/null +++ b/algorithms/dynamic_programming/word_break/README.md @@ -0,0 +1,405 @@ +# Word Break + +You are given a string, s, and an array of strings, word_dict, representing a dictionary. Your task is to add spaces to +s to break it up into a sequence of valid words from word_dict. We are required to return an array of all possible +sequences of words (sentences). The order in which the sentences are listed is not significant. + +> Note: The same dictionary word may be reused multiple times in the segmentation. + +## Constraints + +- 1 <= s.length <= 20 +- 1 <= word_dict.length <= 1000 +- 1 <= word_dict[i].length <= 10 +- s and word_dict[i] consist of only lowercase English letters. +- All the strings in word_dict are unique. + +## Topics + +- Array +- Hash Table +- String +- Dynamic Programming +- Backtracking +- Trie +- Memoization + +## Solutions + +1. [Naive Approach](#naive-approach) +2. [Backtracking](#backtracking) +3. [Dynamic Programming - tabulation](#optimized-approach-using-dynamic-programming---tabulation) +4. [Dynamic Programming - memoization](#dynamic-programming---memoization) +5. [Trie Optimization](#trie-optimization) + +### Naive Approach + +The naive approach to solve this problem is to use a traditional recursive strategy in which we take each prefix of the +input string, s, and compare it to each word in the dictionary. If it matches, we take the string’s suffix and repeat +the process. + +Here is how the algorithm works: + +1. **Base case**: If the string is empty, there are no characters in the string that are left to process, so there’ll + be no sentences that can be formed. Hence, we return an empty array. +2. Otherwise, the string will not be empty, so we’ll iterate every word of the dictionary and check whether or not the + string starts with the current dictionary word. 
This ensures that only valid word combinations are considered: + - If it doesn’t start with the current dictionary word, no valid combinations can be formed from this word, so we + move on to the next dictionary word. + - If it does start with the current dictionary word, we have two options: + - If the length of the current dictionary word is equal to the length of the string, it means the entire string + can be formed from the current dictionary word. In this case, the string s is directly added to the result without + any further processing. + - **Recursive case**: Otherwise, the length of the current dictionary word will be less than the length of the + string. This means that the string can be broken down further. Therefore, we make a recursive call to evaluate + the remaining portion (suffix) of the string. + + - We’ll then concatenate the prefix and the result of the suffix computed by the recursive call above and store it in + the result. + +3. After all possible combinations have been explored, we return the result. + +The time complexity of this solution is O(k^n * m), where k is the number of words in the dictionary, `n` is the length +of the string, and `m` is the length of the longest word in the dictionary. + +The space complexity is O(k^n * n), where k is the number of words in the dictionary and `n` is the length of the string. + +### Backtracking + +Initially, we might think of a brute-force approach where we systematically explore all possible ways to break the +string into words from the dictionary. This leads us to the backtracking strategy, where we recursively try to form +words from the string and add them to a current sentence if they are in the dictionary. If the current prefix doesn't +lead to a valid solution, we backtrack by removing the last added word and trying the next possible word. This ensures +we explore all possible segmentations of the string. 
+ +At each step, we consider all possible end indices for substrings starting from the current index. For each substring, +we check if it exists in the dictionary. If the substring is a valid word, we append it to the current sentence and +recursively call the function with the updated index, which is the end index of the substring plus one. + +If we reach the end of the string, it means we have found a valid segmentation, and we can add the current sentence to +the results. However, if we encounter a substring that is not a valid word, we backtrack by returning from that +recursive call and trying the next possible end index. + +The backtracking approach will be inefficient due to the large number of recursive calls, especially for longer strings. +To increase efficiency, we will convert the word dictionary into a set for constant-time lookups. However, the overall +time complexity remains high because we explore all possible partitions. + +The process is visualized below: + +![Backtracking Solution](./images/solution/word_break_backtracking_solution_1.png) + +#### Algorithm + +- Convert the `word_dict` array into an unordered set `word_set` for efficient lookups. +- Initialize an empty array `results` to store valid sentences. +- Initialize an empty string currentSentence to keep track of the sentence being constructed. +- Call the `backtrack` function with the input string `s`, `word_set`, `current_sentence`, `results`, and a starting + index set to 0, the beginning of the input string. + - Base case: If the `start_index` is equal to the length of the string, add the `current_sentence` to `results` and + return as it means that `current_sentence` represents a valid sentence. + - Iterate over possible `end_index` values from `start_index` + 1 to the end of the string. + - Extract the substring word from startIndex to `end_index - 1`. + - If word is found in `word_set`: + - Store the current currentSentence in `original_sentence`. 
+ - Append word to `current_sentence` (with a space if needed). + - Recursively call `backtrack` with the updated `current_sentence` and `end_index`. + - Reset `current_sentence` to its original value (`original_sentence`) to backtrack and try the next `end_index`. + - Return from the backtrack function. +- Return results. + +#### Complexity Analysis + +Let n be the length of the input string. + +##### Time complexity: O(n⋅2^n) + +The algorithm explores all possible ways to break the string into words. In the worst case, where each character can be +treated as a word, the recursion tree has 2^n leaf nodes, resulting in an exponential time complexity. For each leaf +node, O(n) work is performed, so the overall complexity is O(n⋅2^n). + +##### Space complexity: O(2^n) + +The recursion stack can grow up to a depth of n, where each recursive call consumes additional space for storing the +current state. + +Since each position in the string can be a split point or not, and for n positions, there are 2^n possible combinations +of splits. Thus, in the worst case, each combination generates a different sentence that needs to be stored, leading to +exponential space complexity. + +### Optimized approach using dynamic programming - tabulation + +Since the recursive solution to this problem is very costly, let’s see if we can reduce this cost in any way. Dynamic +programming helps us avoid recomputing the same subproblems. Therefore, let’s analyze our recursive solution to see if +it has the properties needed for conversion to dynamic programming. + +- **Optimal substructure**: Given an input string ,s, that we want to break up into dictionary words, we find the first + word that matches a word from the dictionary, and then repeat the process for the remaining, shorter input string. + This means that, to solve the problem for input `q`, we need to solve the same problem for `p`, where `p` is at + least one character shorter than `q`. 
Therefore, this problem obeys the optimal substructure property. + +- **Overlapping subproblems**: The algorithm solves the same subproblems repeatedly. Consider input string “ancookbook” + and the dictionary [“an”, “book”, “cook”, “cookbook”]. The following is the partial call tree for the naive recursive + solution: + ```text + "ancookbook" + / \ + "ancookbook" "cookbook" + / \ / \ + "cookbook" ... "book" ... + ``` + +From the tree above, it can be seen that the subproblem “cookbook” is evaluated twice. To take advantage of these +opportunities for optimization, we will use bottom-up dynamic programming, also known as the tabulation approach. This +is an iterative method of solving dynamic programming problems. The idea is that if a prefix of the input string matches +any word `w` in the dictionary, we can split the string into two parts: the matching word and the suffix of the input +string. We start from an empty prefix which is the base case. The prefix would eventually develop into the complete +input string. + +> The tabulation approach is often more efficient than backtracking and memoization in terms of time and space complexity +because it avoids the overhead of recursive calls and stack usage. It also eliminates the need for a separate +memoization map, as the table itself serves as the storage for the subproblem solutions. + +Here’s how the algorithm works: + +- We initialize an empty lookup table, dp, of length, n+1, where dp[i] will correspond to the prefix of length i. This + table will be used to store the solutions to previously solved subproblems. It will have the following properties: + - The first entry of the table will represent a prefix of length 0 , i.e., an empty string “”. + - The rest of the entries will represent the other prefixes of the string s. For example, the input string “vegan” + will have the prefixes “v”, “ve”, “veg”, “vega”, and “vegan”. 
+ - Each entry of the table will contain an array containing the sentences that can be formed from the respective prefix. + At this point, all the arrays are empty. +- For the base case, we add an empty string to the array corresponding to the first entry of the dp table. This is + because the only sentence that can be formed from an empty string is an empty string itself. +- Next, we traverse the input string by breaking it into its prefixes by including a single character, one at a time, + in each iteration. + - For the current prefix, we initialize an array, temp, that will store the valid sentences formed from that prefix. + Let’s suppose that the input string is “vegan”, and that the current prefix is “vega”. + - For all possible suffixes of the current prefix, we check if the suffix exists in the given dictionary. In our + example, this would mean checking the dictionary for the suffixes “vega”, “ega”, “ga”, and “a”. For each suffix, it + will either match a dictionary word, or not: + - If it does, we know that the suffix is a valid word from the dictionary and can be used as part of the solution. + Therefore, in the dp table, we retrieve all the possible sentences for the prefix to the left of this suffix. + Supposing that the current suffix of “vega” is “a”, and that “a” is present in the dictionary, we would retrieve + all the sentences already found for “veg”. This means that we reuse the solutions of the subproblem smaller than + the current subproblem. Now, we form new sentences for the current prefix by appending a space character and the + current suffix (which is a valid dictionary word) to each of the retrieved sentences. Supposing that the valid + sentences for the subproblem “veg” are “v eg”, and “ve g”, we will add these new sentences for the current + subproblem, “vega”: “veg a”, “v eg a”, and “ve g a”. We add the new sentences to the temp array of this prefix. 
+ - If the suffix is not present in the dictionary, no sentences can be made from the current prefix, so the temp + array of that prefix remains empty. + - We repeat the above steps for all suffixes of the current prefix. + - We set the entry corresponding to the current prefix in the dp table equal to the temp array. +- We repeat the steps above for all prefixes of the input string. +- After all the prefixes have been evaluated, the last entry of the dp table will be an array containing all the + sentences formed from the largest prefix, i.e., the complete string. Therefore, we return this array. + +#### Solution summary + +To recap, the solution to this problem can be divided into the following six main steps: + +1. We create a 2D table where each entry corresponds to a prefix of the input string. At this point, each entry contains + an empty array. +2. We iterate over all prefixes of the input string. For each prefix, we iterate over all of its suffixes. +3. For each suffix, we check whether it’s a valid word, i.e., whether it’s present in the provided dictionary. +4. If the suffix is a valid word, we combine it with all valid sentences from the corresponding entry (in the table) of + the prefix to the left of it. +5. We store the array of all possible sentences that can be formed using the current prefix in the corresponding entry + of the table. +6. After processing all prefixes of the input string, we return the array in the last entry of our table. 
+ +![Solution 1](images/solution/word_break_dynamic_programming_tabulation_solution_1.png) +![Solution 2](images/solution/word_break_dynamic_programming_tabulation_solution_2.png) +![Solution 3](images/solution/word_break_dynamic_programming_tabulation_solution_3.png) +![Solution 4](images/solution/word_break_dynamic_programming_tabulation_solution_4.png) +![Solution 5](images/solution/word_break_dynamic_programming_tabulation_solution_5.png) +![Solution 6](images/solution/word_break_dynamic_programming_tabulation_solution_6.png) +![Solution 7](images/solution/word_break_dynamic_programming_tabulation_solution_7.png) +![Solution 8](images/solution/word_break_dynamic_programming_tabulation_solution_8.png) +![Solution 9](images/solution/word_break_dynamic_programming_tabulation_solution_9.png) +![Solution 10](images/solution/word_break_dynamic_programming_tabulation_solution_10.png) +![Solution 11](images/solution/word_break_dynamic_programming_tabulation_solution_11.png) +![Solution 12](images/solution/word_break_dynamic_programming_tabulation_solution_12.png) +![Solution 13](images/solution/word_break_dynamic_programming_tabulation_solution_13.png) + +#### Time Complexity + +The time complexity of this solution is O(n^2 * v), where n is the length of the string `s` and v is the number of valid +combinations + +#### Space Complexity + +The space complexity is O(n * v), where n is the length of the string and v is the number of valid combinations stored in +the `dp` array. + +### Dynamic Programming - Memoization + +We can improve the efficiency of the backtracking method by using Memoization, which stores the results of subproblems +to avoid recalculating them. + +We use a depth-first search (DFS) function that recursively breaks the string into words. However, before performing a +recursive call, we check if the results for the current substring have already been computed and stored in a memoization +map (typically a dictionary or hash table). 
+ If the results of the current substring are found in the memoization map, we can directly return them without further +computation. If not, we proceed with the recursive call, computing the results and storing them in the memoization map +before returning them. + +By memoizing the results, we can reduce the number of computations by ensuring that each substring is processed only +once in average cases. + +#### Algorithm + +1. Convert the `wordDict` array into an unordered set `wordSet` for efficient lookups. +2. Initialize an empty unordered map `memoization` to store the results of subproblems. +3. Call the `dfs` function with the input string s, wordSet, and memoization. + - Check if the answer for the current `remainingStr` (the remaining part of the string to be processed) is already + in `memoization`. If so, return it. + - Base Case: If `remainingStr` is empty, it means that all characters have been processed. An empty string represents + a valid sentence so return an array containing the empty string. + - Initialize an empty array `results`. + - Iterate from 1 to the length of `remainingStr`: + - Extract the substring `currentWord` from 0 to i to check if it is a valid word. + - If `currentWord` is found in `wordSet`: + - Recursively call `dfs` with `remainingStr.substr(i)`, wordSet, and memoization. + - Append `currentWord` and the recursive results to `results` (with a space if needed) to form valid sentences. + - Store the `results` for `remainingStr` in memoization. + - Return `results`. + +#### Complexity + +Let n be the length of the input string. + +##### Time complexity: O(n⋅2^n) + +While memoization avoids redundant computations, it does not change the overall number of subproblems that need to be +solved. In the worst case, there are still unique 2^n possible substrings that need to be explored, leading to an +exponential time complexity. For each subproblem, O(n) work is performed, so the overall complexity is O(n⋅2^n). 
+ +##### Space complexity: O(n⋅2^n) + +The recursion stack can grow up to a depth of n, where each recursive call consumes additional space for storing the +current state. + +The memoization map needs to store the results for all possible substrings, which can be up to 2^n substrings of size n +in the worst case, resulting in an exponential space complexity. + +### Trie Optimization + +While the previous approaches focus on optimizing the search and computation process, we can also consider leveraging +efficient data structures to enhance the word lookup process. This leads us to the trie-based approach, which uses a +trie data structure to store the word dictionary, allowing efficient word lookup and prefix matching. + +The trie, also known as a prefix tree, is a tree-based data structure where each node represents a character in a word, +and the path from the root to a leaf node represents a complete word. This structure is particularly useful for problems +involving word segmentation because it allows for efficient prefix matching. + +Here, we first build a trie from the dictionary words. Each word is represented as a path in the trie, where each node +corresponds to a character in the word. + +By using the trie, we can quickly determine whether a substring can form a valid word without having to perform linear +searches or set lookups. This reduces the search space and improves the efficiency of the algorithm. + +In this approach, instead of recursively exploring the remaining substring and using memoization, we iterate from the +end of the input string to the beginning (in reverse order). For each starting index (startIdx), we attempt to find +valid sentences that can be formed from that index by iterating through the string and checking if the current substring +forms a valid word using the trie data structure. +When a valid word is encountered in the trie, we append it to the list of valid sentences for the current starting index. 
+If the current valid word is not the last word in the sentence, we combine it with the valid sentences formed from the +next index (endIdx + 1), which are retrieved from the dp dictionary. + +The valid sentences for each starting index are stored in the dp dictionary, ensuring that previously computed results +are reused. By using tabulation and storing the valid sentences for each starting index, we avoid redundant computations +and achieve significant time and space efficiency improvements compared to the standard backtracking method with +memoization. + +The trie-based approach offers advantages in terms of efficient word lookup and prefix matching, making it particularly +suitable for problems involving word segmentation or string manipulation. However, it comes with the additional overhead +of constructing and maintaining the trie data structure, which can be more memory-intensive for large dictionaries. + +#### Algorithm + +##### Initialize TrieNode Structure + +- Each TrieNode has two properties: + - isEnd: A boolean value indicating if the node marks the end of a word. + - children: An array of size 26 (for lowercase English letters) to store pointers to child nodes. +- The constructor initializes isEnd to false and all elements in children to null. + +##### Trie Class + +- The Trie class has a root pointer of type TrieNode. +- The constructor initializes the root with a new TrieNode object. +- The insert function: +- Takes a string word as input. +- Starts from the root node. +- For each character c in the word: + - Calculate the index corresponding to the character. + - If the child node at the calculated index doesn't exist, create a new TrieNode and assign it to that index. + - Move to the child node. +- After processing all characters, mark the current node's isEnd as true + +##### `wordBreak` Function + +- Create a Trie object. +- Insert all words from wordDict into the trie using the insert function. 
+- Initialize a map dp to store the results of subproblems. +- Iterate from the end of the string s to the beginning (in reverse order). +- For each starting index startIdx: + - Initialize a vector validSentences to store valid sentences starting from startIdx. + - Initialize a current_node pointer to the root of the trie. + - Iterate from startIdx to the end of the string. + - For each character c in the string: + - Calculate the index corresponding to c. + - Check if the child node at the calculated index exists in the trie. + - If the child node doesn't exist, break out of the inner loop. This means that the current substring cannot form + a valid word, so there is no need to continue checking the remaining characters. + - Move to the child node. + - Check if the current node's isEnd is true, indicating a valid word. + - If a valid word is found: + - Extract the current word from the string using substr. + - If it's the last word in the sentence (endIdx is the last index): + - Add the current word to validSentences. + - If it's not the last word: + - Retrieve the valid sentences formed by the remaining substring from dp[endIdx + 1]. + - Combine the current word with each sentence and add it to validSentences. + - Store the validSentences for the current startIdx in dp. +- Return the valid sentences stored in dp[0], which represents the valid sentences formed from the entire string. + +#### Complexity Analysis + +Let n be the length of the input string. + +##### Time complexity: O(n⋅2^n) + +Even though the trie-based approach uses an efficient data structure for word lookup, it still needs to explore all +possible ways to break the string into words. In the worst case, there are 2^n unique possible partitions, leading to +an exponential time complexity. O(n) work is performed for each partition, so the overall complexity is O(n⋅2^n). 
+ +##### Space complexity: O(n⋅2^n) + +The trie data structure itself can have a maximum of 2^n nodes in the worst case, where each character in the string +represents a separate word. Additionally, the tabulation map used in this approach can also store up to 2^n strings of +size n, resulting in an overall exponential space complexity. + +---- + +### Further Thoughts on Complexity Analysis + +The complexity of this problem cannot be reduced from n⋅2^n; the worst-case scenario will still be (n⋅2^n). However, +using dynamic programming (DP) will make it a bit more efficient than backtracking overall because of the below test case. + +Consider the input "aaaaaa", with wordDict = ["a", "aa", "aaa", "aaaa", "aaaaa", "aaaaaa"]. + +Every possible partition is a valid sentence, and there are 2^(n−1) such partitions. The algorithms cannot perform +better than this since they must generate all valid sentences. The cost of iterating over cached results will be +exponential, as every possible partition will be cached, resulting in the same runtime as regular backtracking. +Likewise, the space complexity will also be O(n⋅2^n) for the same reason—every partition is stored in memory. + +Another way to explain why the worst-case complexity is O(n⋅2^n) for all the algorithms is that, given an array of +length n, there are n+1 ways/intervals to partition it into two parts. Each interval has two choices: to split or not +to split. In the worst case, we will have to check all possibilities, which results in a time complexity of O(n⋅2^(n+1)), +which simplifies to O(n⋅2^n). This analysis is extremely similar to palindrome partitioning. + +Overall, this question is interesting because of the nature of this complexity. In an interview setting, if an +interviewer asks this question, the most expected solutions would be Backtracking and Trie, as they become natural +choices for the conditions and outputs we need. 
from typing import Dict, List, Set


def word_break_trie(s: str, word_dict: List[str]) -> List[str]:
    """
    Adds spaces to s to break it up into every sequence of valid words from word_dict.

    Words are stored in an AlphabetTrie (letters a-z only) and a map caches the
    valid sentences that begin at each index, so every suffix is solved once.

    Complexity:
        Time: O(n*2^n): where n is the length of the string
        Space: O(n*2^n): where n is the length of the string

    Args:
        s: The input string
        word_dict: The dictionary of words (English letters a-z only)
    Returns:
        List of valid sentences
    """
    # Imported lazily so the pure-DP/backtracking functions in this module
    # remain importable even when the datastructures package is unavailable.
    from datastructures.trees.trie import AlphabetTrie

    # build the Trie from the word dictionary
    trie = AlphabetTrie()
    for word in word_dict:
        trie.insert(word)

    # results[start_idx] -> every valid sentence covering s[start_idx:]
    results: Dict[int, List[str]] = {}

    # solve suffixes from the shortest (empty) to the longest (whole string)
    for start_idx in range(len(s), -1, -1):
        # valid sentences starting from start_idx
        valid_sentences: List[str] = []

        # walk the trie from its root along s[start_idx:]
        current_node = trie.root

        for end_idx in range(start_idx, len(s)):
            char = s[end_idx].lower()

            # A character outside a-z can never be in the trie; bail out here
            # instead of letting the ord() offset index outside children[0..25].
            if not "a" <= char <= "z":
                break
            index = ord(char) - ord("a")

            # stop as soon as the path s[start_idx..end_idx] leaves the trie
            if not current_node.children[index]:
                break
            current_node = current_node.children[index]

            # a marked node means s[start_idx..end_idx] is a dictionary word
            if current_node.is_end_of_word:
                current_word = s[start_idx : end_idx + 1]

                if end_idx == len(s) - 1:
                    # the word reaches the end of s: it is a sentence by itself
                    valid_sentences.append(current_word)
                else:
                    # prepend the word to every sentence of the remaining suffix
                    for sentence in results.get(end_idx + 1, []):
                        valid_sentences.append(f"{current_word} {sentence}")

        # cache the valid sentences for the current start index
        results[start_idx] = valid_sentences

    # sentences formed from the entire string
    return results.get(0, [])


def word_break_dp_tabulation(s: str, word_dict: List[str]) -> List[str]:
    """
    Adds spaces to s to break it up into every sequence of valid words from word_dict.

    Bottom-up dynamic programming over prefixes: dp[i] holds every valid
    sentence that covers the prefix s[:i].

    Complexity:
        Time: O(n*2^n): where n is the length of the string
        Space: O(n*2^n): where n is the length of the string

    Args:
        s: The input string
        word_dict: The dictionary of words
    Returns:
        List of valid sentences
    """
    # a set gives O(1) membership tests instead of an O(len(word_dict))
    # list scan for every suffix checked in the inner loop
    words: Set[str] = set(word_dict)

    # dp table of size len(s) + 1; the empty prefix forms one empty sentence
    dp: List[List[str]] = [[] for _ in range(len(s) + 1)]
    dp[0] = [""]

    # for each prefix of the input string, repeat the process
    for i in range(1, len(s) + 1):
        prefix = s[:i]

        # valid sentences formed from the current prefix being checked
        temp: List[str] = []

        # split the prefix into prefix[:j] + suffix for every possible j
        for j in range(i):
            suffix = prefix[j:]

            # the suffix must be a dictionary word to extend the sentences
            # already computed for the shorter prefix dp[j]
            if suffix in words:
                for substring in dp[j]:
                    # strip() drops the leading space added onto the base case
                    temp.append((substring + " " + suffix).strip())
        dp[i] = temp

    # all the sentences formed from the complete string s
    return dp[len(s)]


def word_break_dp_tabulation_2(s: str, word_dict: List[str]) -> List[str]:
    """
    Adds spaces to s to break it up into every sequence of valid words from word_dict.

    Suffix-oriented tabulation: dp[start_idx] holds every valid sentence that
    covers the suffix s[start_idx:], filled from the end of s backwards.

    Complexity:
        Time: O(n*2^n): where n is the length of the string
        Space: O(n*2^n): where n is the length of the string

    Args:
        s: The input string
        word_dict: The dictionary of words
    Returns:
        List of valid sentences
    """
    # a set gives O(1) membership tests instead of an O(len(word_dict))
    # list scan for every substring checked in the inner loop
    words: Set[str] = set(word_dict)

    # dp[start_idx] -> every valid sentence covering s[start_idx:]
    dp: Dict[int, List[str]] = {}

    # iterate from the end of the string to the beginning
    for start_idx in range(len(s), -1, -1):
        # valid sentences starting from start_idx
        valid_sentences: List[str] = []

        for end_idx in range(start_idx, len(s)):
            # candidate word s[start_idx..end_idx]
            current_word = s[start_idx : end_idx + 1]

            if current_word in words:
                if end_idx == len(s) - 1:
                    # the word reaches the end of s: a sentence by itself
                    valid_sentences.append(current_word)
                else:
                    # prepend the word to every sentence of the remaining suffix
                    for sentence in dp.get(end_idx + 1, []):
                        valid_sentences.append(f"{current_word} {sentence}")

        # cache the valid sentences for the current start index
        dp[start_idx] = valid_sentences

    # all the sentences formed from the complete string s
    return dp.get(0, [])


def word_break_dp_memoization(s: str, word_dict: List[str]) -> List[str]:
    """
    Adds spaces to s to break it up into every sequence of valid words from word_dict.

    Top-down dynamic programming: a depth-first search over the remaining
    suffix, with a memo mapping each already-solved suffix to its sentences.

    Complexity:
        Time: O(n*2^n): where n is the length of the string
        Space: O(n*2^n): where n is the length of the string

    Args:
        s: The input string
        word_dict: The dictionary of words
    Returns:
        List of valid sentences
    """
    word_set: Set[str] = set(word_dict)
    memoization: Dict[str, List[str]] = {}

    def dfs(
        remaining_str: str, words_set: Set[str], memo: Dict[str, List[str]]
    ) -> List[str]:
        """
        Depth-first search to find all possible word combinations.

        Args:
            remaining_str(str): the remaining string to search through
            words_set(set): set of dictionary words used to build sentences
            memo(dict): cache of results for already-processed suffixes
        Returns:
            list: possible word combinations
        """
        # reuse the result if this suffix was already solved
        if remaining_str in memo:
            return memo[remaining_str]

        # base case: an empty suffix contributes one empty sentence
        if not remaining_str:
            return [""]

        results: List[str] = []
        for i in range(1, len(remaining_str) + 1):
            current_word = remaining_str[:i]
            # the leading chunk must be a valid dictionary word
            if current_word in words_set:
                for next_word in dfs(remaining_str[i:], words_set, memo):
                    # The conditional stays outside the f-string: nesting
                    # same-type quotes inside an f-string only parses on
                    # Python 3.12+ (PEP 701) and is a SyntaxError before that.
                    results.append(
                        f"{current_word} {next_word}" if next_word else current_word
                    )

        # memoize the results for the current suffix
        memo[remaining_str] = results
        return results

    return dfs(s, word_set, memoization)


def word_break_backtrack(s: str, word_dict: List[str]) -> List[str]:
    """
    Adds spaces to s to break it up into every sequence of valid words from word_dict.

    Uses plain backtracking: grow a sentence word by word and undo the last
    choice when a branch cannot cover the rest of the string.

    Complexity:
        Time: O(n*2^n) worst case: every partition can be a valid sentence
        Space: O(n*2^n): all sentences are materialised in the result

    Args:
        s: The input string
        word_dict: The dictionary of words
    Returns:
        List of valid sentences
    """
    # convert word dict into a set for O(1) lookups
    word_set = set(word_dict)
    results: List[str] = []

    def backtrack(
        sentence: str,
        words_set: Set[str],
        current_sentence: List[str],
        result: List[str],
        start_index: int,
    ) -> None:
        """Extend current_sentence with words covering sentence[start_index:]."""
        # the whole string is covered: record the finished sentence
        if start_index == len(sentence):
            result.append(" ".join(current_sentence))
            return

        # try every possible end index for the next word
        for end_index in range(start_index + 1, len(sentence) + 1):
            word = sentence[start_index:end_index]
            # only recurse when the chunk is a dictionary word
            if word in words_set:
                current_sentence.append(word)
                backtrack(sentence, words_set, current_sentence, result, end_index)
                # undo the choice before trying a longer word
                current_sentence.pop()

    backtrack(s, word_set, [], results, 0)
    return results
b/algorithms/dynamic_programming/word_break/images/solution/word_break_dynamic_programming_tabulation_solution_10.png new file mode 100644 index 00000000..39acc4d6 Binary files /dev/null and b/algorithms/dynamic_programming/word_break/images/solution/word_break_dynamic_programming_tabulation_solution_10.png differ diff --git a/algorithms/dynamic_programming/word_break/images/solution/word_break_dynamic_programming_tabulation_solution_11.png b/algorithms/dynamic_programming/word_break/images/solution/word_break_dynamic_programming_tabulation_solution_11.png new file mode 100644 index 00000000..c7aec9f1 Binary files /dev/null and b/algorithms/dynamic_programming/word_break/images/solution/word_break_dynamic_programming_tabulation_solution_11.png differ diff --git a/algorithms/dynamic_programming/word_break/images/solution/word_break_dynamic_programming_tabulation_solution_12.png b/algorithms/dynamic_programming/word_break/images/solution/word_break_dynamic_programming_tabulation_solution_12.png new file mode 100644 index 00000000..1d64d638 Binary files /dev/null and b/algorithms/dynamic_programming/word_break/images/solution/word_break_dynamic_programming_tabulation_solution_12.png differ diff --git a/algorithms/dynamic_programming/word_break/images/solution/word_break_dynamic_programming_tabulation_solution_13.png b/algorithms/dynamic_programming/word_break/images/solution/word_break_dynamic_programming_tabulation_solution_13.png new file mode 100644 index 00000000..4c7fd967 Binary files /dev/null and b/algorithms/dynamic_programming/word_break/images/solution/word_break_dynamic_programming_tabulation_solution_13.png differ diff --git a/algorithms/dynamic_programming/word_break/images/solution/word_break_dynamic_programming_tabulation_solution_2.png b/algorithms/dynamic_programming/word_break/images/solution/word_break_dynamic_programming_tabulation_solution_2.png new file mode 100644 index 00000000..b3bdb555 Binary files /dev/null and 
b/algorithms/dynamic_programming/word_break/images/solution/word_break_dynamic_programming_tabulation_solution_2.png differ diff --git a/algorithms/dynamic_programming/word_break/images/solution/word_break_dynamic_programming_tabulation_solution_3.png b/algorithms/dynamic_programming/word_break/images/solution/word_break_dynamic_programming_tabulation_solution_3.png new file mode 100644 index 00000000..691c10c4 Binary files /dev/null and b/algorithms/dynamic_programming/word_break/images/solution/word_break_dynamic_programming_tabulation_solution_3.png differ diff --git a/algorithms/dynamic_programming/word_break/images/solution/word_break_dynamic_programming_tabulation_solution_4.png b/algorithms/dynamic_programming/word_break/images/solution/word_break_dynamic_programming_tabulation_solution_4.png new file mode 100644 index 00000000..07fa608d Binary files /dev/null and b/algorithms/dynamic_programming/word_break/images/solution/word_break_dynamic_programming_tabulation_solution_4.png differ diff --git a/algorithms/dynamic_programming/word_break/images/solution/word_break_dynamic_programming_tabulation_solution_5.png b/algorithms/dynamic_programming/word_break/images/solution/word_break_dynamic_programming_tabulation_solution_5.png new file mode 100644 index 00000000..5f6aaecc Binary files /dev/null and b/algorithms/dynamic_programming/word_break/images/solution/word_break_dynamic_programming_tabulation_solution_5.png differ diff --git a/algorithms/dynamic_programming/word_break/images/solution/word_break_dynamic_programming_tabulation_solution_6.png b/algorithms/dynamic_programming/word_break/images/solution/word_break_dynamic_programming_tabulation_solution_6.png new file mode 100644 index 00000000..8a826ea0 Binary files /dev/null and b/algorithms/dynamic_programming/word_break/images/solution/word_break_dynamic_programming_tabulation_solution_6.png differ diff --git 
a/algorithms/dynamic_programming/word_break/images/solution/word_break_dynamic_programming_tabulation_solution_7.png b/algorithms/dynamic_programming/word_break/images/solution/word_break_dynamic_programming_tabulation_solution_7.png new file mode 100644 index 00000000..d9cd2f01 Binary files /dev/null and b/algorithms/dynamic_programming/word_break/images/solution/word_break_dynamic_programming_tabulation_solution_7.png differ diff --git a/algorithms/dynamic_programming/word_break/images/solution/word_break_dynamic_programming_tabulation_solution_8.png b/algorithms/dynamic_programming/word_break/images/solution/word_break_dynamic_programming_tabulation_solution_8.png new file mode 100644 index 00000000..53feab47 Binary files /dev/null and b/algorithms/dynamic_programming/word_break/images/solution/word_break_dynamic_programming_tabulation_solution_8.png differ diff --git a/algorithms/dynamic_programming/word_break/images/solution/word_break_dynamic_programming_tabulation_solution_9.png b/algorithms/dynamic_programming/word_break/images/solution/word_break_dynamic_programming_tabulation_solution_9.png new file mode 100644 index 00000000..8155b70f Binary files /dev/null and b/algorithms/dynamic_programming/word_break/images/solution/word_break_dynamic_programming_tabulation_solution_9.png differ diff --git a/algorithms/dynamic_programming/word_break/test_word_break.py b/algorithms/dynamic_programming/word_break/test_word_break.py new file mode 100644 index 00000000..25894ccd --- /dev/null +++ b/algorithms/dynamic_programming/word_break/test_word_break.py @@ -0,0 +1,94 @@ +import unittest +from typing import List +from parameterized import parameterized +from algorithms.dynamic_programming.word_break import ( + word_break_trie, + word_break_dp_tabulation, + word_break_dp_tabulation_2, + word_break_backtrack, + word_break_dp_memoization, +) + +WORD_BREAK_TEST_DATA = [ + ( + "magiclly", + ["ag", "al", "icl", "mag", "magic", "ly", "lly"], + ["mag icl ly", "magic lly"], + 
), + ( + "raincoats", + ["rain", "oats", "coat", "s", "rains", "oat", "coats", "c"], + ["rain c oats", "rain c oat s", "rain coats", "rain coat s"], + ), + ( + "highway", + ["crash", "cream", "high", "highway", "low", "way"], + ["highway", "high way"], + ), + ("robocat", ["rob", "cat", "robo", "bo", "b"], ["robo cat"]), + ( + "cocomomo", + ["co", "mo", "coco", "momo"], + ["co co momo", "co co mo mo", "coco momo", "coco mo mo"], + ), + ( + "catsanddog", + ["cat", "cats", "and", "sand", "dog"], + ["cats and dog", "cat sand dog"], + ), + ( + "pineapplepenapple", + ["apple", "pen", "applepen", "pine", "pineapple"], + ["pine apple pen apple", "pineapple pen apple", "pine applepen apple"], + ), + ("catsandog", ["cats", "dog", "sand", "and", "cat"], []), +] + + +class WordBreakTestCases(unittest.TestCase): + @parameterized.expand(WORD_BREAK_TEST_DATA) + def test_word_break_trie(self, s: str, word_dict: List[str], expected: List[str]): + actual = word_break_trie(s, word_dict) + actual.sort() + expected.sort() + self.assertListEqual(expected, actual) + + @parameterized.expand(WORD_BREAK_TEST_DATA) + def test_word_break_dp_tabulation( + self, s: str, word_dict: List[str], expected: List[str] + ): + actual = word_break_dp_tabulation(s, word_dict) + actual.sort() + expected.sort() + self.assertListEqual(expected, actual) + + @parameterized.expand(WORD_BREAK_TEST_DATA) + def test_word_break_dp_tabulation_2( + self, s: str, word_dict: List[str], expected: List[str] + ): + actual = word_break_dp_tabulation_2(s, word_dict) + actual.sort() + expected.sort() + self.assertListEqual(expected, actual) + + @parameterized.expand(WORD_BREAK_TEST_DATA) + def test_word_break_backtrack( + self, s: str, word_dict: List[str], expected: List[str] + ): + actual = word_break_backtrack(s, word_dict) + actual.sort() + expected.sort() + self.assertListEqual(expected, actual) + + @parameterized.expand(WORD_BREAK_TEST_DATA) + def test_word_break_dp_memoization( + self, s: str, word_dict: List[str], 
expected: List[str] + ): + actual = word_break_dp_memoization(s, word_dict) + actual.sort() + expected.sort() + self.assertListEqual(expected, actual) + + +if __name__ == "__main__": + unittest.main() diff --git a/algorithms/greedy/gas_stations/test_gas_stations.py b/algorithms/greedy/gas_stations/test_gas_stations.py index 12a0c52a..67b1b7e2 100644 --- a/algorithms/greedy/gas_stations/test_gas_stations.py +++ b/algorithms/greedy/gas_stations/test_gas_stations.py @@ -5,14 +5,17 @@ class CanCompleteCircuitTestCase(unittest.TestCase): - @parameterized.expand([ - ([1,2,3,4,5],[3,4,5,1,2],3), - ([2,3,4],[3,4,3],-1), - ([1, 2],[2,1],1), - ]) + @parameterized.expand( + [ + ([1, 2, 3, 4, 5], [3, 4, 5, 1, 2], 3), + ([2, 3, 4], [3, 4, 3], -1), + ([1, 2], [2, 1], 1), + ] + ) def test_can_complete_circuit(self, gas: List[int], cost: List[int], expected: int): actual = can_complete_circuit(gas, cost) self.assertEqual(expected, actual) + if __name__ == "__main__": unittest.main() diff --git a/datastructures/trees/trie/__init__.py b/datastructures/trees/trie/__init__.py index 346e45bd..57400daf 100644 --- a/datastructures/trees/trie/__init__.py +++ b/datastructures/trees/trie/__init__.py @@ -2,5 +2,14 @@ from datastructures.trees.trie.trie import Trie from datastructures.trees.trie.suffix.suffix_tree_node import SuffixTreeNode from datastructures.trees.trie.suffix.suffix_tree import SuffixTree +from datastructures.trees.trie.alphabet_trie.alphabet_trie import AlphabetTrie +from datastructures.trees.trie.alphabet_trie.alphabet_trie_node import AlphabetTrieNode -__all__ = ["Trie", "TrieNode", "SuffixTreeNode", "SuffixTree"] +__all__ = [ + "Trie", + "TrieNode", + "SuffixTreeNode", + "SuffixTree", + "AlphabetTrie", + "AlphabetTrieNode", +] diff --git a/datastructures/trees/trie/alphabet_trie/README.md b/datastructures/trees/trie/alphabet_trie/README.md new file mode 100644 index 00000000..68e19080 --- /dev/null +++ b/datastructures/trees/trie/alphabet_trie/README.md @@ -0,0 +1,3 @@ 
+# Alphabet Trie + +This is a trie implementation for an alphabet of size 26 (a-z). diff --git a/datastructures/trees/trie/alphabet_trie/__init__.py b/datastructures/trees/trie/alphabet_trie/__init__.py new file mode 100644 index 00000000..c503c896 --- /dev/null +++ b/datastructures/trees/trie/alphabet_trie/__init__.py @@ -0,0 +1,7 @@ +from datastructures.trees.trie.alphabet_trie.alphabet_trie import AlphabetTrie +from datastructures.trees.trie.alphabet_trie.alphabet_trie_node import AlphabetTrieNode + +__all__ = [ + "AlphabetTrie", + "AlphabetTrieNode", +] diff --git a/datastructures/trees/trie/alphabet_trie/alphabet_trie.py b/datastructures/trees/trie/alphabet_trie/alphabet_trie.py new file mode 100644 index 00000000..f7ddece6 --- /dev/null +++ b/datastructures/trees/trie/alphabet_trie/alphabet_trie.py @@ -0,0 +1,17 @@ +from datastructures.trees.trie.alphabet_trie.alphabet_trie_node import AlphabetTrieNode + + +class AlphabetTrie: + def __init__(self): + self.root = AlphabetTrieNode() + + def insert(self, word: str) -> None: + if not word or not all('a' <= char.lower() <= 'z' for char in word): + raise ValueError("Word must contain only English letters (a-z)") + node = self.root + for char in word: + index = ord(char.lower()) - ord("a") + if not node.children[index]: + node.children[index] = AlphabetTrieNode() + node = node.children[index] + node.is_end_of_word = True diff --git a/datastructures/trees/trie/alphabet_trie/alphabet_trie_node.py b/datastructures/trees/trie/alphabet_trie/alphabet_trie_node.py new file mode 100644 index 00000000..4824ea84 --- /dev/null +++ b/datastructures/trees/trie/alphabet_trie/alphabet_trie_node.py @@ -0,0 +1,7 @@ +from typing import List, Optional + + +class AlphabetTrieNode: + def __init__(self): + self.children: List[Optional[AlphabetTrieNode]] = [None] * 26 + self.is_end_of_word: bool = False