Skip to content

Commit 87cc59d

Browse files
authored
Merge pull request #55 from blacelle/IntroduceLongCompression
Introduce LongVariableByte
2 parents 16a89d7 + 3e1d40b commit 87cc59d

37 files changed

+2403
-13
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
.classpath
2+
.settings
23
.project
34
*.class
45
*.csv

pom.xml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,12 @@
4848
<version>4.13.1</version>
4949
<scope>test</scope>
5050
</dependency>
51+
<dependency>
52+
<groupId>org.roaringbitmap</groupId>
53+
<artifactId>RoaringBitmap</artifactId>
54+
<version>0.9.35</version>
55+
<scope>test</scope>
56+
</dependency>
5157
</dependencies>
5258
<issueManagement>
5359
<system>GitHub Issue Tracking</system>

src/main/java/me/lemire/integercompression/ByteIntegerCODEC.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,9 +18,9 @@ public interface ByteIntegerCODEC {
1818
* Compress data from an array to another array.
1919
*
2020
* Both inpos and outpos are modified to represent how much data was
21-
* read and written to if 12 ints (inlength = 12) are compressed to 3
21+
* read and written to. If 12 ints (inlength = 12) are compressed to 3
2222
* bytes, then inpos will be incremented by 12 while outpos will be
23-
* incremented by 3 we use IntWrapper to pass the values by reference.
23+
* incremented by 3. We use IntWrapper to pass the values by reference.
2424
*
2525
* @param in
2626
* input array

src/main/java/me/lemire/integercompression/DeltaZigzagVariableByte.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -105,7 +105,7 @@ public void uncompress(int[] inBuf, IntWrapper inPos, int inLen,
105105

106106
int ip = inPos.get();
107107
int op = outPos.get();
108-
int vbcNum = 0, vbcShift = 24; // Varialbe Byte Context.
108+
int vbcNum = 0, vbcShift = 24; // Variable Byte Context.
109109
final int inPosLast = ip + inLen;
110110
while (ip < inPosLast) {
111111
// Fetch a byte value.

src/main/java/me/lemire/integercompression/IntegerCODEC.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,9 +18,9 @@ public interface IntegerCODEC {
1818
* Compress data from an array to another array.
1919
*
2020
* Both inpos and outpos are modified to represent how much data was
21-
* read and written to if 12 ints (inlength = 12) are compressed to 3
21+
* read and written to. If 12 ints (inlength = 12) are compressed to 3
2222
* ints, then inpos will be incremented by 12 while outpos will be
23-
* incremented by 3 we use IntWrapper to pass the values by reference.
23+
* incremented by 3. We use IntWrapper to pass the values by reference.
2424
*
2525
* @param in
2626
* input array

src/main/java/me/lemire/integercompression/SkippableIntegerCODEC.java

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010

1111
/**
1212
* Interface describing a standard CODEC to compress integers. This is a
13-
* variation on the IntegerCODEC interface meant to be used for random access.
13+
* variation on the IntegerCODEC interface meant to be used for head access.
1414
*
1515
* The main difference is that we must specify the number of integers we wish to
1616
* decode. This information should be stored elsewhere.
@@ -25,8 +25,8 @@ public interface SkippableIntegerCODEC {
2525
* Compress data from an array to another array.
2626
*
2727
* Both inpos and outpos are modified to represent how much data was read
28-
* and written to if 12 ints (inlength = 12) are compressed to 3 ints, then
29-
* inpos will be incremented by 12 while outpos will be incremented by 3 we
28+
* and written to. If 12 ints (inlength = 12) are compressed to 3 ints, then
29+
* inpos will be incremented by 12 while outpos will be incremented by 3. We
3030
* use IntWrapper to pass the values by reference.
3131
*
3232
* @param in

src/main/java/me/lemire/integercompression/VariableByte.java

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -122,8 +122,11 @@ public void uncompress(int[] in, IntWrapper inpos, int inlength, int[] out,
122122
for (int v = 0, shift = 0; p < finalp;) {
123123
val = in[p];
124124
int c = (byte) (val >>> s);
125+
// Shift to next byte
125126
s += 8;
127+
// Shift to next integer if s==32
126128
p += s>>5;
129+
// wrap s back to 0 once it reaches 32
127130
s = s & 31;
128131
v += ((c & 127) << shift);
129132
if ((c & 128) == 128) {
@@ -187,8 +190,11 @@ public void headlessUncompress(int[] in, IntWrapper inpos, int inlength, int[] o
187190
for (int v = 0, shift = 0; tmpoutpos < finaloutpos;) {
188191
val = in[p];
189192
int c = val >>> s;
193+
// Shift to next byte
190194
s += 8;
195+
// Shift to next integer if s==32
191196
p += s>>5;
197+
// wrap s back to 0 once it reaches 32
192198
s = s & 31;
193199
v += ((c & 127) << shift);
194200
if ((c & 128) == 128) {
Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
/**
2+
* This code is released under the
3+
* Apache License Version 2.0 http://www.apache.org/licenses/.
4+
*
5+
* (c) Daniel Lemire, http://lemire.me/en/
6+
*/
7+
8+
package me.lemire.longcompression;
9+
10+
import me.lemire.integercompression.IntWrapper;
11+
12+
/**
13+
* Interface describing a CODEC to compress longs to bytes.
14+
*
15+
* @author Benoit Lacelle
16+
*
17+
*/
18+
public interface ByteLongCODEC {
19+
/**
20+
* Compress data from an array to another array.
21+
*
22+
* Both inpos and outpos are modified to represent how much data was
23+
* read and written to. If 12 longs (inlength = 12) are compressed to 3
24+
* bytes, then inpos will be incremented by 12 while outpos will be
25+
* incremented by 3. We use IntWrapper to pass the values by reference.
26+
*
27+
* @param in
28+
* input array
29+
* @param inpos
30+
* location in the input array
31+
* @param inlength
32+
* how many longs to compress
33+
* @param out
34+
* output array
35+
* @param outpos
36+
* where to write in the output array
37+
*/
38+
public void compress(long[] in, IntWrapper inpos, int inlength,
39+
byte[] out, IntWrapper outpos);
40+
41+
/**
42+
* Uncompress data from an array to another array.
43+
*
44+
* Both inpos and outpos parameters are modified to indicate new
45+
* positions after read/write.
46+
*
47+
* @param in
48+
* array containing data in compressed form
49+
* @param inpos
50+
* where to start reading in the array
51+
* @param inlength
52+
* length of the compressed data (ignored by some
53+
* schemes)
54+
* @param out
55+
array where to write the uncompressed output
56+
* @param outpos
57+
where to write the uncompressed output in out
58+
*/
59+
public void uncompress(byte[] in, IntWrapper inpos, int inlength,
60+
long[] out, IntWrapper outpos);
61+
62+
}
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
package me.lemire.longcompression;
2+
3+
/**
4+
* This is just like LongCODEC, except that it indicates that delta coding is
5+
* "integrated", so that you don't need a separate step for delta coding.
6+
*
7+
* @author Benoit Lacelle
8+
*/
9+
public interface IntegratedLongCODEC extends LongCODEC {
10+
11+
}
Lines changed: 189 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,189 @@
1+
package me.lemire.longcompression;
2+
3+
import java.util.Arrays;
4+
5+
import me.lemire.integercompression.BinaryPacking;
6+
import me.lemire.integercompression.Composition;
7+
import me.lemire.integercompression.IntCompressor;
8+
import me.lemire.integercompression.IntWrapper;
9+
import me.lemire.integercompression.IntegerCODEC;
10+
import me.lemire.integercompression.VariableByte;
11+
12+
/**
13+
* A {@link LongCODEC} which splits each long into a high part (first 32 bits) and a low part (last 32 bits).
14+
*
15+
* @author Benoit Lacelle
16+
*
17+
*/
18+
public class LongAs2IntsCodec implements LongCODEC {
19+
final IntegerCODEC highPartsCodec;
20+
final IntegerCODEC lowPartsCodec;
21+
22+
public LongAs2IntsCodec(IntegerCODEC highPartsCodec, IntegerCODEC lowPartsCodec) {
23+
this.highPartsCodec = highPartsCodec;
24+
this.lowPartsCodec = lowPartsCodec;
25+
}
26+
27+
/**
28+
* By default, we expect longs to be slightly above Integer.MAX_VALUE. Hence highParts are expected to be small and positive
29+
* integers. For lowParts, we rely on {@link IntCompressor} default IntegerCODEC
30+
*/
31+
public LongAs2IntsCodec() {
32+
this(new VariableByte(), new Composition(new BinaryPacking(), new VariableByte()));
33+
}
34+
35+
@Override
36+
public void compress(long[] in, IntWrapper inpos, int inlength, long[] out, IntWrapper outpos) {
37+
if (inlength == 0) {
38+
return;
39+
}
40+
41+
int[] highParts = new int[inlength];
42+
int[] lowParts = new int[inlength];
43+
44+
for (int i = 0; i < inlength; i++) {
45+
int inPosition = inpos.get() + i;
46+
47+
highParts[i] = RoaringIntPacking.high(in[inPosition]);
48+
lowParts[i] = RoaringIntPacking.low(in[inPosition]);
49+
}
50+
51+
// TODO What would be a relevant buffer size?
52+
int[] buffer = new int[inlength * 16];
53+
54+
int outPosition = outpos.get();
55+
56+
boolean hasLeftover;
57+
{
58+
// The first integer is reserved to hold the number of compressed ints
59+
IntWrapper highPartsOutPosition = new IntWrapper(1);
60+
61+
highPartsCodec.compress(highParts, new IntWrapper(), inlength, buffer, highPartsOutPosition);
62+
63+
// Record the compressedHighparts length
64+
buffer[0] = highPartsOutPosition.get() - 1;
65+
66+
for (int i = 0; i < highPartsOutPosition.get() / 2; i++) {
67+
long pack = RoaringIntPacking.pack(buffer[i * 2], buffer[i * 2 + 1]);
68+
out[outPosition++] = pack;
69+
}
70+
71+
if (1 == highPartsOutPosition.get() % 2) {
72+
// Shift the trailing integer as first in the buffer
73+
hasLeftover = true;
74+
buffer[0] = buffer[highPartsOutPosition.get() - 1];
75+
} else {
76+
hasLeftover = false;
77+
}
78+
}
79+
80+
{
81+
// The first integer is reserved to hold the number of compressed ints
82+
IntWrapper lowPartsOutPosition = new IntWrapper(1);
83+
if (hasLeftover) {
84+
// Keep the trailing int from highParts before the reserved int from lowParts compressed length
85+
lowPartsOutPosition.set(2);
86+
}
87+
88+
lowPartsCodec.compress(lowParts, new IntWrapper(0), inlength, buffer, lowPartsOutPosition);
89+
90+
// Record the compressedLowParts length
91+
buffer[hasLeftover ? 1 : 0] = lowPartsOutPosition.get() - (hasLeftover ? 2 : 1);
92+
93+
for (int i = 0; i < lowPartsOutPosition.get() / 2; i++) {
94+
long pack = RoaringIntPacking.pack(buffer[i * 2], buffer[i * 2 + 1]);
95+
out[outPosition++] = pack;
96+
}
97+
98+
if (1 == lowPartsOutPosition.get() % 2) {
99+
// The trailing integer is packed with a 0
100+
long pack = RoaringIntPacking.pack(buffer[lowPartsOutPosition.get() - 1], 0);
101+
out[outPosition++] = pack;
102+
}
103+
}
104+
105+
inpos.add(inlength);
106+
outpos.set(outPosition);
107+
}
108+
109+
/**
110+
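* inlength is only used to skip empty input, to size the scratch buffer and to advance inpos; the compressed lengths are read from the data itself. We may rely on it instead of storing the compressedLowPart length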
111+
*/
112+
@Override
113+
public void uncompress(long[] in, IntWrapper inpos, int inlength, long[] out, IntWrapper outpos) {
114+
if (inlength == 0) {
115+
return;
116+
}
117+
118+
int longIndex = inpos.get();
119+
120+
int nbCompressedHighParts = RoaringIntPacking.high(in[longIndex]);
121+
int[] compressedHighParts = new int[nbCompressedHighParts];
122+
123+
// !highPart as we just read the highPart for nbCompressedHighParts
124+
boolean highPart = false;
125+
for (int i = 0; i < nbCompressedHighParts; i++) {
126+
int nextInt;
127+
if (highPart) {
128+
nextInt = RoaringIntPacking.high(in[longIndex + (i + 1) / 2]);
129+
} else {
130+
nextInt = RoaringIntPacking.low(in[longIndex + (i + 1) / 2]);
131+
}
132+
compressedHighParts[i] = nextInt;
133+
134+
highPart = !highPart;
135+
}
136+
137+
// TODO What would be a relevant buffer size?
138+
int[] buffer = new int[inlength * 16];
139+
140+
IntWrapper highPartsOutPosition = new IntWrapper();
141+
highPartsCodec.uncompress(compressedHighParts,
142+
new IntWrapper(),
143+
compressedHighParts.length,
144+
buffer,
145+
highPartsOutPosition);
146+
int[] highParts = Arrays.copyOf(buffer, highPartsOutPosition.get());
147+
148+
// +1 as we initially read nbCompressedHighParts
149+
int intIndexNbCompressedLowParts = longIndex * 2 + 1 + nbCompressedHighParts;
150+
int nbCompressedLowParts;
151+
if (highPart) {
152+
nbCompressedLowParts = RoaringIntPacking.high(in[intIndexNbCompressedLowParts / 2]);
153+
} else {
154+
nbCompressedLowParts = RoaringIntPacking.low(in[intIndexNbCompressedLowParts / 2]);
155+
}
156+
highPart = !highPart;
157+
158+
int[] compressedLowParts = new int[nbCompressedLowParts];
159+
for (int i = 0; i < nbCompressedLowParts; i++) {
160+
int nextInt;
161+
if (highPart) {
162+
nextInt = RoaringIntPacking.high(in[(intIndexNbCompressedLowParts + 1 + i) / 2]);
163+
} else {
164+
nextInt = RoaringIntPacking.low(in[(intIndexNbCompressedLowParts + 1 + i) / 2]);
165+
}
166+
compressedLowParts[i] = nextInt;
167+
168+
highPart = !highPart;
169+
}
170+
171+
IntWrapper lowPartsOutPosition = new IntWrapper();
172+
lowPartsCodec.uncompress(compressedLowParts,
173+
new IntWrapper(),
174+
compressedLowParts.length,
175+
buffer,
176+
lowPartsOutPosition);
177+
int[] lowParts = Arrays.copyOf(buffer, lowPartsOutPosition.get());
178+
assert highParts.length == lowParts.length;
179+
180+
int outposition = outpos.get();
181+
for (int i = 0; i < highParts.length; i++) {
182+
out[outposition++] = RoaringIntPacking.pack(highParts[i], lowParts[i]);
183+
}
184+
185+
inpos.add(inlength);
186+
outpos.set(outposition);
187+
}
188+
189+
}

0 commit comments

Comments
 (0)