Skip to content

Commit 761f7d1

Browse files
committed
feat(compression): Add Burrows-Wheeler Transform (BWT) and Move-to-Front (MTF)
1 parent 48ba1ae commit 761f7d1

File tree

4 files changed

+535
-0
lines changed

4 files changed

+535
-0
lines changed
Lines changed: 225 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,225 @@
1+
package com.thealgorithms.compression;
2+
3+
import java.util.Arrays;
4+
import java.util.HashMap;
5+
import java.util.Map;
6+
7+
/**
 * Implementation of the Burrows-Wheeler Transform (BWT) and its inverse.
 *
 * <p>BWT is a reversible data transformation that rearranges a string into runs of
 * similar characters. It is not a compression algorithm by itself, but it improves
 * the compressibility of data for subsequent stages such as Move-to-Front encoding
 * and Run-Length Encoding.
 *
 * <p>Conceptually the forward transform:
 * <ol>
 *   <li>considers every cyclic rotation of the input string,</li>
 *   <li>sorts these rotations lexicographically (by UTF-16 code unit),</li>
 *   <li>emits the last column of the sorted rotation matrix, and</li>
 *   <li>records the row at which the original string appears.</li>
 * </ol>
 *
 * <p><b>End-of-string marker:</b> callers are encouraged to terminate the input with a
 * unique marker (typically {@code '$'}) that does not occur elsewhere in the text and
 * is lexicographically smaller than every other character. This makes all rotations
 * distinct, which is the conventional setting for BWT.
 *
 * <p><b>Complexity:</b>
 * <ul>
 *   <li>Forward transform: O(n^2 log n) time in the worst case (O(n log n) rotation
 *       comparisons of up to n characters each) and O(n) extra memory, because the
 *       rotations are compared in place instead of being materialised as strings.</li>
 *   <li>Inverse transform: sorting the first column plus an O(n) walk over the
 *       LF-mapping.</li>
 * </ul>
 *
 * <p><b>Example:</b>
 * <pre>
 * Input:  "banana$"
 * Output: BWTResult("annb$aa", 4)
 *   - "annb$aa" is the transformed string (similar characters are grouped)
 *   - 4 is the row of the original string among the sorted rotations
 * </pre>
 *
 * @see <a href="https://en.wikipedia.org/wiki/Burrows%E2%80%93Wheeler_transform">Burrows-Wheeler transform (Wikipedia)</a>
 */
public final class BurrowsWheelerTransform {

    private BurrowsWheelerTransform() {
    }

    /**
     * Immutable container for the result of the forward BWT.
     *
     * <p>Holds the transformed string and the row of the original string in the
     * sorted rotation matrix; both values are needed by
     * {@link #inverseTransform(String, int)}.
     */
    public static class BWTResult {
        /** The transformed string (last column of the sorted rotation matrix). */
        public final String transformed;

        /** The row of the original string in the sorted rotation matrix. */
        public final int originalIndex;

        /**
         * Creates a result holder.
         *
         * @param transformed the transformed string (L-column)
         * @param originalIndex the row of the original string in the sorted rotations
         */
        public BWTResult(String transformed, int originalIndex) {
            this.transformed = transformed;
            this.originalIndex = originalIndex;
        }

        /**
         * Two results are equal when they are of the same class and carry the same
         * index and the same transformed string. A {@code null} transformed string is
         * only equal to another {@code null} transformed string.
         */
        @Override
        public boolean equals(Object obj) {
            if (this == obj) {
                return true;
            }
            if (obj == null || getClass() != obj.getClass()) {
                return false;
            }
            BWTResult other = (BWTResult) obj;
            if (originalIndex != other.originalIndex) {
                return false;
            }
            // The constructor does not reject null, so the comparison must be null-safe.
            return transformed == null ? other.transformed == null : transformed.equals(other.transformed);
        }

        @Override
        public int hashCode() {
            // Same formula as before for non-null strings; null hashes as 0 instead of throwing.
            int textHash = transformed == null ? 0 : transformed.hashCode();
            return 31 * textHash + originalIndex;
        }

        @Override
        public String toString() {
            return "BWTResult[transformed=" + transformed + ", originalIndex=" + originalIndex + "]";
        }
    }

    /**
     * Performs the forward Burrows-Wheeler Transform on the input string.
     *
     * <p>All cyclic rotations of {@code text} are ordered lexicographically and the
     * last character of each rotation, taken in that order, forms the output. The
     * rotations are represented by their start offsets into a single character array,
     * so no rotation string is ever allocated.
     *
     * <p>If several rotations are identical to {@code text} (periodic input such as
     * {@code "abab"}), the reported index is the last such row in sorted order.
     *
     * @param text the input string to transform; {@code null} is treated like the
     *             empty string
     * @return a {@link BWTResult} containing the transformed string (L-column) and the
     *         row of the original string in the sorted rotation matrix;
     *         {@code BWTResult("", -1)} when {@code text} is {@code null} or empty
     */
    public static BWTResult transform(String text) {
        if (text == null || text.isEmpty()) {
            return new BWTResult("", -1);
        }

        final char[] chars = text.toCharArray();
        final int n = chars.length;

        // Each rotation is identified by its start offset in chars.
        Integer[] order = new Integer[n];
        for (int i = 0; i < n; i++) {
            order[i] = i;
        }

        // Object sorts are stable, so identical rotations keep ascending offset order.
        Arrays.sort(order, (a, b) -> compareRotations(chars, a, b));

        StringBuilder lastColumn = new StringBuilder(n);
        int originalIndex = -1;
        for (int row = 0; row < n; row++) {
            int start = order[row];
            // The last character of a rotation is the one cyclically preceding its start.
            lastColumn.append(chars[start == 0 ? n - 1 : start - 1]);
            // Rotation 0 is the original text; keep the last row that matches it.
            if (compareRotations(chars, start, 0) == 0) {
                originalIndex = row;
            }
        }

        return new BWTResult(lastColumn.toString(), originalIndex);
    }

    /**
     * Lexicographically compares two cyclic rotations of {@code chars} by UTF-16 code
     * unit, matching the ordering of {@link String#compareTo(String)} on the
     * equivalent rotation strings.
     *
     * @param chars the characters of the text being rotated
     * @param first start offset of the first rotation
     * @param second start offset of the second rotation
     * @return a negative value, zero, or a positive value if the first rotation is
     *         smaller than, equal to, or greater than the second
     */
    private static int compareRotations(char[] chars, int first, int second) {
        if (first == second) {
            return 0;
        }
        final int n = chars.length;
        int i = first;
        int j = second;
        for (int k = 0; k < n; k++) {
            char a = chars[i];
            char b = chars[j];
            if (a != b) {
                return a - b;
            }
            // Advance both cursors with wrap-around.
            if (++i == n) {
                i = 0;
            }
            if (++j == n) {
                j = 0;
            }
        }
        return 0;
    }

    /**
     * Performs the inverse Burrows-Wheeler Transform using the LF-mapping technique.
     *
     * <p>The LF-mapping (Last-First mapping) reconstructs the original string from the
     * BWT output without rebuilding the sorted rotation matrix:
     * <ol>
     *   <li>the first column is obtained by sorting the characters of the BWT string,</li>
     *   <li>the k-th occurrence of a character in the first column is linked to the
     *       k-th occurrence of the same character in the last column, and</li>
     *   <li>these links are followed for n steps starting at {@code originalIndex}.</li>
     * </ol>
     *
     * @param bwtString the transformed string (L-column) from the forward transform;
     *                  {@code null} is treated like the empty string
     * @param originalIndex the row of the original string reported by the forward
     *                      transform; {@code -1} denotes an empty original
     * @return the original, untransformed string; the empty string if
     *         {@code bwtString} is {@code null} or empty, or if {@code originalIndex}
     *         is {@code -1}
     * @throws IllegalArgumentException if {@code originalIndex} is neither {@code -1}
     *         nor a valid row index for a non-empty {@code bwtString}
     */
    public static String inverseTransform(String bwtString, int originalIndex) {
        if (bwtString == null || bwtString.isEmpty() || originalIndex == -1) {
            return "";
        }

        final int n = bwtString.length();
        if (originalIndex < 0 || originalIndex >= n) {
            throw new IllegalArgumentException("Original index must be between 0 and " + (n - 1) + ", got: " + originalIndex);
        }

        char[] lastColumn = bwtString.toCharArray();
        char[] firstColumn = lastColumn.clone();
        Arrays.sort(firstColumn);

        // For every character, the next first-column row that has not been linked yet.
        // Initially this is the row of the character's first occurrence.
        Map<Character, Integer> nextFreeRow = new HashMap<>();
        for (int i = 0; i < n; i++) {
            nextFreeRow.putIfAbsent(firstColumn[i], i);
        }

        // next[r] is the last-column row holding the same character occurrence as firstColumn[r].
        int[] next = new int[n];
        for (int i = 0; i < n; i++) {
            char c = lastColumn[i];
            int row = nextFreeRow.get(c);
            next[row] = i;
            nextFreeRow.put(c, row + 1);
        }

        // Walk the mapping, emitting one first-column character per step.
        StringBuilder originalString = new StringBuilder(n);
        int currentRow = originalIndex;
        for (int i = 0; i < n; i++) {
            originalString.append(firstColumn[currentRow]);
            currentRow = next[currentRow];
        }

        return originalString.toString();
    }
}
Lines changed: 164 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,164 @@
1+
package com.thealgorithms.compression;
2+
3+
import java.util.ArrayList;
4+
import java.util.LinkedList;
5+
import java.util.List;
6+
import java.util.stream.Collectors;
7+
8+
/**
 * Implementation of the Move-to-Front (MTF) transform and its inverse.
 *
 * <p>MTF encodes each input symbol as its current position in a dynamically
 * maintained symbol list and then moves that symbol to the front of the list. It is
 * particularly effective after the Burrows-Wheeler Transform (BWT), because BWT
 * groups equal characters together.
 *
 * <p>Runs of repeated characters become sequences of small integers (often zeros),
 * which are highly compressible by subsequent stages such as Run-Length Encoding or
 * Huffman coding. The technique is used in the bzip2 compression algorithm.
 *
 * <p><b>How it works:</b>
 * <ol>
 *   <li>Maintain a list of symbols (the alphabet), initially in a fixed order.</li>
 *   <li>For each input symbol, output its current index in the list and then move
 *       the symbol to the front of the list.</li>
 * </ol>
 * Recently used symbols therefore sit near the front and are encoded with small
 * indices, while rarely used symbols drift towards the back.
 *
 * <p><b>Complexity:</b> both directions run in O(n * m) time, where n is the input
 * length and m is the alphabet size. The working alphabet is kept in a {@code char[]};
 * each step is one linear scan or lookup plus one block shift.
 *
 * <p><b>Example:</b>
 * <pre>
 * Input:    "annb$aa"
 * Alphabet: "$abn" (initial order)
 * Output:   [1, 3, 0, 3, 3, 3, 0]
 *
 * Step by step:
 * - 'a': index 1 in [$,a,b,n] -> output 1, list becomes [a,$,b,n]
 * - 'n': index 3 in [a,$,b,n] -> output 3, list becomes [n,a,$,b]
 * - 'n': index 0 in [n,a,$,b] -> output 0, list stays   [n,a,$,b]
 * - 'b': index 3 in [n,a,$,b] -> output 3, list becomes [b,n,a,$]
 * - etc.
 *
 * A repeated character produces 0 after its first occurrence.
 * </pre>
 *
 * @see <a href="https://en.wikipedia.org/wiki/Move-to-front_transform">Move-to-front transform (Wikipedia)</a>
 */
public final class MoveToFront {

    private MoveToFront() {
    }

    /**
     * Performs the forward Move-to-Front transform.
     *
     * <p>Each character of {@code text} is replaced by its index in the current state
     * of the alphabet, after which that character is moved to index 0. If the initial
     * alphabet contains a character more than once, the first occurrence is used.
     *
     * @param text the input string to transform; if {@code null} or empty, an empty
     *             list is returned and the alphabet is not inspected
     * @param initialAlphabet the initial ordered set of symbols (e.g. {@code "$abn"});
     *                        must be non-empty when {@code text} is non-empty and must
     *                        contain every character of {@code text}
     * @return a new mutable list with one index per input character
     * @throws IllegalArgumentException if {@code text} is non-empty and
     *         {@code initialAlphabet} is {@code null} or empty, or if a character of
     *         {@code text} does not occur in {@code initialAlphabet}
     */
    public static List<Integer> transform(String text, String initialAlphabet) {
        if (text == null || text.isEmpty()) {
            return new ArrayList<>();
        }
        if (initialAlphabet == null || initialAlphabet.isEmpty()) {
            throw new IllegalArgumentException("Alphabet cannot be null or empty when text is not empty.");
        }

        // Working copy of the alphabet; index 0 is the front.
        char[] symbols = initialAlphabet.toCharArray();
        List<Integer> output = new ArrayList<>(text.length());

        for (int i = 0; i < text.length(); i++) {
            char c = text.charAt(i);
            int index = indexOf(symbols, c);
            if (index == -1) {
                throw new IllegalArgumentException("Symbol '" + c + "' not found in the initial alphabet.");
            }
            output.add(index);
            moveToFront(symbols, index);
        }
        return output;
    }

    /**
     * Performs the inverse Move-to-Front transform.
     *
     * <p>Rebuilds the original string from the indices produced by
     * {@link #transform(String, String)}. The {@code initialAlphabet} must be
     * identical, including character order, to the one used for the forward
     * transform; otherwise the decoded text will differ from the original.
     *
     * @param indices the indices produced by the forward transform; if {@code null}
     *                or empty, the empty string is returned. The elements themselves
     *                must not be {@code null}.
     * @param initialAlphabet the initial alphabet used for the forward transform; if
     *                        {@code null} or empty, the empty string is returned
     * @return the decoded string
     * @throws IllegalArgumentException if an index is negative or not smaller than
     *         the alphabet size
     */
    public static String inverseTransform(List<Integer> indices, String initialAlphabet) {
        if (indices == null || indices.isEmpty() || initialAlphabet == null || initialAlphabet.isEmpty()) {
            return "";
        }

        // Working copy of the alphabet; index 0 is the front.
        char[] symbols = initialAlphabet.toCharArray();
        StringBuilder output = new StringBuilder(indices.size());

        for (int index : indices) {
            if (index < 0 || index >= symbols.length) {
                throw new IllegalArgumentException("Index " + index + " is out of bounds for the current alphabet of size " + symbols.length + ".");
            }
            output.append(symbols[index]);
            // Mirror the forward transform so both sides keep the same alphabet state.
            moveToFront(symbols, index);
        }
        return output.toString();
    }

    /**
     * Returns the index of the first occurrence of {@code target} in {@code symbols},
     * or {@code -1} if it does not occur.
     */
    private static int indexOf(char[] symbols, char target) {
        for (int i = 0; i < symbols.length; i++) {
            if (symbols[i] == target) {
                return i;
            }
        }
        return -1;
    }

    /**
     * Moves the symbol at {@code index} to position 0, shifting the symbols that were
     * in front of it one place towards the back.
     */
    private static void moveToFront(char[] symbols, int index) {
        if (index == 0) {
            return;
        }
        char symbol = symbols[index];
        // arraycopy handles overlapping source and destination ranges correctly.
        System.arraycopy(symbols, 0, symbols, 1, index);
        symbols[0] = symbol;
    }
}

0 commit comments

Comments
 (0)