Skip to content

Commit b49cee9

Browse files
authored
Merge branch 'master' into sushant
2 parents f6c32e9 + 48ba1ae commit b49cee9

File tree

55 files changed

+6091
-293
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

55 files changed

+6091
-293
lines changed

DIRECTORY.md

Lines changed: 99 additions & 4 deletions
Large diffs are not rendered by default.

pom.xml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -82,7 +82,7 @@
8282
<plugin>
8383
<groupId>org.jacoco</groupId>
8484
<artifactId>jacoco-maven-plugin</artifactId>
85-
<version>0.8.13</version>
85+
<version>0.8.14</version>
8686
<executions>
8787
<execution>
8888
<goals>
@@ -112,14 +112,14 @@
112112
<dependency>
113113
<groupId>com.puppycrawl.tools</groupId>
114114
<artifactId>checkstyle</artifactId>
115-
<version>12.0.0</version>
115+
<version>12.1.0</version>
116116
</dependency>
117117
</dependencies>
118118
</plugin>
119119
<plugin>
120120
<groupId>com.github.spotbugs</groupId>
121121
<artifactId>spotbugs-maven-plugin</artifactId>
122-
<version>4.9.6.0</version>
122+
<version>4.9.8.1</version>
123123
<configuration>
124124
<excludeFilterFile>spotbugs-exclude.xml</excludeFilterFile>
125125
<includeTests>true</includeTests>
Lines changed: 157 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,157 @@
1+
package com.thealgorithms.compression;
2+
3+
import java.math.BigDecimal;
4+
import java.math.MathContext;
5+
import java.util.ArrayList;
6+
import java.util.Collections;
7+
import java.util.HashMap;
8+
import java.util.List;
9+
import java.util.Map;
10+
11+
/**
 * An implementation of the Arithmetic Coding algorithm.
 *
 * <p>
 * Arithmetic coding is a form of entropy encoding used in lossless data
 * compression. It encodes an entire message into a single number, a fraction n
 * where {@code 0.0 <= n < 1.0}. Unlike Huffman coding, which assigns a specific
 * bit sequence to each symbol, arithmetic coding represents the message as a
 * sub-interval of the [0, 1) interval.
 * </p>
 *
 * <p>
 * This implementation uses BigDecimal for precision to handle the shrinking
 * intervals, making it suitable for educational purposes to demonstrate the
 * core logic. Symbol probabilities are rounded to
 * {@link MathContext#DECIMAL128}; the interval arithmetic itself is exact, so
 * compression and decompression agree as long as both use the same
 * probability table.
 * </p>
 *
 * <p>
 * Time Complexity: compression performs O(n) interval updates after building
 * the probability table; decompression performs O(n*m) interval comparisons,
 * where n is the length of the input and m is the number of unique symbols.
 * The cost of each BigDecimal operation grows with the number of digits in the
 * interval bounds.
 * </p>
 *
 * <p>
 * References:
 * </p>
 * <ul>
 * <li><a href="https://en.wikipedia.org/wiki/Arithmetic_coding">Wikipedia:
 * Arithmetic coding</a></li>
 * </ul>
 */
public final class ArithmeticCoding {

    private ArithmeticCoding() {
    }

    /**
     * Compresses a string using the Arithmetic Coding algorithm.
     *
     * <p>
     * The probability table is derived from the input itself via
     * {@link #calculateProbabilities(String)}; the same table (and the input
     * length) must be supplied to {@link #decompress} to recover the text.
     * </p>
     *
     * @param uncompressed The string to be compressed.
     * @return The compressed representation as a BigDecimal number (the lower
     *         bound of the final interval).
     * @throws IllegalArgumentException if the input string is null or empty.
     */
    public static BigDecimal compress(String uncompressed) {
        if (uncompressed == null || uncompressed.isEmpty()) {
            throw new IllegalArgumentException("Input string cannot be null or empty.");
        }

        Map<Character, Symbol> probabilityTable = calculateProbabilities(uncompressed);

        BigDecimal low = BigDecimal.ZERO;
        BigDecimal high = BigDecimal.ONE;

        for (char symbol : uncompressed.toCharArray()) {
            // Narrow [low, high) to the slice that belongs to this symbol.
            BigDecimal range = high.subtract(low);
            Symbol sym = probabilityTable.get(symbol);

            // Both bounds are derived from the OLD low, so high must be updated first.
            high = low.add(range.multiply(sym.high()));
            low = low.add(range.multiply(sym.low()));
        }

        return low; // Return the lower bound of the final interval
    }

    /**
     * Decompresses a BigDecimal number back into the original string.
     *
     * <p>
     * If the value does not fall inside any symbol's range at some position,
     * decoding stops there and the characters decoded so far are returned, so
     * the result may be shorter than {@code length}.
     * </p>
     *
     * @param compressed       The compressed BigDecimal number.
     * @param length           The length of the original uncompressed string.
     * @param probabilityTable The probability table used during compression.
     * @return The original, uncompressed string.
     */
    public static String decompress(BigDecimal compressed, int length, Map<Character, Symbol> probabilityTable) {
        StringBuilder decompressed = new StringBuilder();

        // Create a sorted list of symbols for deterministic decompression, matching the
        // order used in calculateProbabilities
        List<Map.Entry<Character, Symbol>> sortedSymbols = new ArrayList<>(probabilityTable.entrySet());
        sortedSymbols.sort(Map.Entry.comparingByKey());

        BigDecimal low = BigDecimal.ZERO;
        BigDecimal high = BigDecimal.ONE;

        for (int i = 0; i < length; i++) {
            BigDecimal range = high.subtract(low);
            boolean matched = false;

            // Find which symbol the compressed value falls into
            for (Map.Entry<Character, Symbol> entry : sortedSymbols) {
                Symbol sym = entry.getValue();

                // Calculate the actual range for this symbol in the current interval
                BigDecimal symLow = low.add(range.multiply(sym.low()));
                BigDecimal symHigh = low.add(range.multiply(sym.high()));

                // Check if the compressed value falls within this symbol's range
                if (compressed.compareTo(symLow) >= 0 && compressed.compareTo(symHigh) < 0) {
                    decompressed.append(entry.getKey());

                    // Update the interval for the next iteration
                    low = symLow;
                    high = symHigh;
                    matched = true;
                    break;
                }
            }

            if (!matched) {
                // The interval did not change, so every later position would scan the
                // same ranges and miss again; stop instead of repeating the work.
                break;
            }
        }

        return decompressed.toString();
    }

    /**
     * Calculates the frequency and probability range for each character in the
     * input string in a deterministic order.
     *
     * <p>
     * Characters are processed in ascending order and each receives the
     * half-open range that starts where the previous character's range ended.
     * Probabilities are rounded to {@link MathContext#DECIMAL128}. An empty
     * input yields an empty table.
     * </p>
     *
     * @param text The input string.
     * @return A map from each character to a Symbol object containing its
     *         probability range.
     */
    public static Map<Character, Symbol> calculateProbabilities(String text) {
        Map<Character, Integer> frequencies = new HashMap<>();
        for (char c : text.toCharArray()) {
            frequencies.merge(c, 1, Integer::sum);
        }

        // Sort the characters to ensure a deterministic order for the probability table
        List<Character> sortedKeys = new ArrayList<>(frequencies.keySet());
        Collections.sort(sortedKeys);

        Map<Character, Symbol> probabilityTable = new HashMap<>();
        BigDecimal currentLow = BigDecimal.ZERO;
        BigDecimal total = BigDecimal.valueOf(text.length());

        for (char symbol : sortedKeys) {
            BigDecimal probability = BigDecimal.valueOf(frequencies.get(symbol)).divide(total, MathContext.DECIMAL128);
            BigDecimal high = currentLow.add(probability);
            probabilityTable.put(symbol, new Symbol(currentLow, high));
            currentLow = high;
        }

        return probabilityTable;
    }

    /**
     * Helper class to store the probability range [low, high) for a symbol.
     *
     * @param low  The inclusive lower bound of the symbol's range.
     * @param high The exclusive upper bound of the symbol's range.
     */
    public record Symbol(BigDecimal low, BigDecimal high) {
    }
}
Lines changed: 168 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,168 @@
1+
package com.thealgorithms.compression;
2+
3+
import java.util.ArrayList;
4+
import java.util.List;
5+
6+
/**
 * An implementation of the Lempel-Ziv 77 (LZ77) compression algorithm.
 * <p>
 * LZ77 is a lossless data compression algorithm that works by finding repeated
 * occurrences of data in a sliding window. It replaces subsequent occurrences
 * with references (offset, length) to the first occurrence within the window.
 * </p>
 * <p>
 * This implementation uses a simple sliding window and lookahead buffer approach.
 * Output format is a sequence of tuples (offset, length, next_character).
 * </p>
 * <p>
 * Time Complexity: O(n*W) in this naive implementation, where n is the input length
 * and W is the window size, due to the search for the longest match. More advanced
 * data structures (like suffix trees) can improve this.
 * </p>
 * <p>
 * References:
 * </p>
 * <ul>
 * <li><a href="https://en.wikipedia.org/wiki/LZ77_and_LZ78#LZ77">Wikipedia: LZ77</a></li>
 * </ul>
 */
public final class LZ77 {

    private static final int DEFAULT_WINDOW_SIZE = 4096;
    private static final int DEFAULT_LOOKAHEAD_BUFFER_SIZE = 16;
    private static final char END_OF_STREAM = '\u0000';

    private LZ77() {
    }

    /**
     * Represents a token in the LZ77 compressed output.
     * Stores the offset back into the window, the length of the match,
     * and the next character after the match (or END_OF_STREAM if at end).
     *
     * <p>
     * END_OF_STREAM shares its value with the NUL character, so it is only
     * interpreted as a marker on the final token of a stream; on any earlier
     * token a NUL {@code nextChar} is an ordinary literal.
     * </p>
     *
     * @param offset   Distance back from the current position to the start of the match.
     * @param length   Number of matched characters to copy.
     * @param nextChar The literal following the match, or END_OF_STREAM.
     */
    public record Token(int offset, int length, char nextChar) {
    }

    /**
     * Compresses the input text using the LZ77 algorithm.
     *
     * <p>
     * If the text ends with a literal NUL character, an extra
     * {@code (0, 0, END_OF_STREAM)} token is appended so that
     * {@link #decompress(List)} can tell the literal from the end marker.
     * </p>
     *
     * @param text The input string to compress. A null value yields an empty list.
     * @param windowSize The size of the sliding window (search buffer). Must be positive.
     * @param lookaheadBufferSize The size of the lookahead buffer. Must be positive.
     * @return A list of {@link Token} objects representing the compressed data.
     * @throws IllegalArgumentException if windowSize or lookaheadBufferSize are not positive.
     */
    public static List<Token> compress(String text, int windowSize, int lookaheadBufferSize) {
        if (text == null) {
            return new ArrayList<>();
        }
        if (windowSize <= 0 || lookaheadBufferSize <= 0) {
            throw new IllegalArgumentException("Window size and lookahead buffer size must be positive.");
        }

        List<Token> compressedOutput = new ArrayList<>();
        int textLength = text.length();
        int currentPosition = 0;
        // True when the most recent token carries a literal NUL as its next character.
        boolean lastLiteralLooksLikeMarker = false;

        while (currentPosition < textLength) {
            // Define the start of the search window
            int searchBufferStart = Math.max(0, currentPosition - windowSize);
            // Define the end of the lookahead buffer (don't go past text length).
            // The sum is computed in long so a huge buffer size cannot overflow.
            int lookaheadEnd = (int) Math.min((long) currentPosition + lookaheadBufferSize, textLength);

            int bestMatchDistance = 0;
            int bestMatchLength = 0;

            // Search for the longest match in the window; the strict comparison keeps
            // the earliest (farthest) candidate when several have the same length.
            for (int i = searchBufferStart; i < currentPosition; i++) {
                int currentMatchLength = matchLength(text, i, currentPosition, lookaheadEnd);
                if (currentMatchLength > bestMatchLength) {
                    bestMatchLength = currentMatchLength;
                    bestMatchDistance = currentPosition - i; // Calculate offset from current position
                }
            }

            // End of input is decided by position, never by the character value, so a
            // literal NUL in the text cannot be mistaken for END_OF_STREAM.
            int literalIndex = currentPosition + bestMatchLength;
            if (literalIndex < textLength) {
                char nextChar = text.charAt(literalIndex);
                compressedOutput.add(new Token(bestMatchDistance, bestMatchLength, nextChar));
                lastLiteralLooksLikeMarker = nextChar == END_OF_STREAM;
                currentPosition = literalIndex + 1;
            } else {
                // The match ran to the end of the text: there is no literal to emit.
                compressedOutput.add(new Token(bestMatchDistance, bestMatchLength, END_OF_STREAM));
                lastLiteralLooksLikeMarker = false;
                currentPosition = literalIndex;
            }
        }

        if (lastLiteralLooksLikeMarker) {
            // Explicit terminator so the preceding NUL literal is not the final token.
            compressedOutput.add(new Token(0, 0, END_OF_STREAM));
        }

        return compressedOutput;
    }

    /**
     * Measures how many characters starting at {@code currentPosition} match the
     * text starting at {@code candidateStart}, stopping at {@code lookaheadEnd}.
     */
    private static int matchLength(String text, int candidateStart, int currentPosition, int lookaheadEnd) {
        int offset = currentPosition - candidateStart;
        int matched = 0;

        // This allows for overlapping matches (e.g., "aaa" can match with offset 1)
        while (currentPosition + matched < lookaheadEnd) {
            int sourceIndex = candidateStart + matched;

            // When the source runs past the current position, wrap around using modulo
            // so the comparison mirrors what the decompressor will copy.
            if (sourceIndex >= currentPosition) {
                sourceIndex = candidateStart + (matched % offset);
            }

            if (text.charAt(sourceIndex) != text.charAt(currentPosition + matched)) {
                break;
            }
            matched++;
        }

        return matched;
    }

    /**
     * Compresses the input text using the LZ77 algorithm with default buffer sizes.
     *
     * @param text The input string to compress. A null value yields an empty list.
     * @return A list of {@link Token} objects representing the compressed data.
     */
    public static List<Token> compress(String text) {
        return compress(text, DEFAULT_WINDOW_SIZE, DEFAULT_LOOKAHEAD_BUFFER_SIZE);
    }

    /**
     * Decompresses a list of LZ77 tokens back into the original string.
     *
     * <p>
     * END_OF_STREAM is honoured only on the final token; a NUL next character on
     * any earlier token is appended as a literal.
     * </p>
     *
     * @param compressedData The list of {@link Token} objects. A null value yields an empty string.
     * @return The original, uncompressed string.
     */
    public static String decompress(List<Token> compressedData) {
        if (compressedData == null) {
            return "";
        }

        StringBuilder decompressedText = new StringBuilder();
        int remainingTokens = compressedData.size();

        for (Token token : compressedData) {
            remainingTokens--;

            // Copy matched characters from the sliding window
            if (token.length() > 0) {
                int startIndex = decompressedText.length() - token.offset();

                // Copying one character at a time handles overlapping matches
                // (length > offset): later reads see characters appended earlier.
                for (int i = 0; i < token.length(); i++) {
                    decompressedText.append(decompressedText.charAt(startIndex + i));
                }
            }

            // Append the next character unless it is the END_OF_STREAM marker, which
            // can only appear on the last token.
            boolean isEndMarker = remainingTokens == 0 && token.nextChar() == END_OF_STREAM;
            if (!isEndMarker) {
                decompressedText.append(token.nextChar());
            }
        }

        return decompressedText.toString();
    }
}

0 commit comments

Comments
 (0)