Skip to content

Commit 8e6ebcf

Browse files
committed
[SYSTEMDS-3779] Added benchmark 'tests' with helpers for DDCLZW vs DDC
Signed-off-by: Luka Dekanozishvili <[email protected]>
1 parent 72a439b commit 8e6ebcf

File tree

1 file changed

+293
-0
lines changed

1 file changed

+293
-0
lines changed
Lines changed: 293 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,293 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing,
13+
* software distributed under the License is distributed on an
14+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
* KIND, either express or implied. See the License for the
16+
* specific language governing permissions and limitations
17+
* under the License.
18+
*/
19+
20+
package org.apache.sysds.test.component.compress.colgroup;
21+
22+
import org.apache.sysds.runtime.compress.colgroup.ColGroupDDC;
23+
import org.apache.sysds.runtime.compress.colgroup.ColGroupDDCLZW;
24+
import org.apache.sysds.runtime.compress.colgroup.dictionary.Dictionary;
25+
import org.apache.sysds.runtime.compress.colgroup.indexes.ColIndexFactory;
26+
import org.apache.sysds.runtime.compress.colgroup.indexes.IColIndex;
27+
import org.apache.sysds.runtime.compress.colgroup.mapping.AMapToData;
28+
import org.apache.sysds.runtime.compress.colgroup.mapping.MapToFactory;
29+
import org.junit.Test;
30+
31+
import java.util.Arrays;
32+
import java.util.stream.IntStream;
33+
34+
public class ColGroupDDCLZWBenchmark {
35+
private static final int BENCHMARK_ITERATIONS = 10;
36+
37+
private static final int[] DATA_SIZES = {1, 10, 100, 1000, 10000, 100_000};
38+
39+
private static class BenchmarkResult {
40+
int dataSize;
41+
42+
long ddcMemoryBytes;
43+
long ddcCompressionTimeNs;
44+
long ddcDecompressionTimeNs;
45+
46+
long ddclzwMemoryBytes;
47+
long ddclzwCompressionTimeNs;
48+
long ddclzwDecompressionTimeNs;
49+
50+
// Comparison info
51+
double memoryReduction;
52+
double compressionSpeedup;
53+
double decompressionSpeedup;
54+
55+
void calculateMetrics() {
56+
memoryReduction = (double) ddclzwMemoryBytes / ddcMemoryBytes;
57+
compressionSpeedup = (double) ddcCompressionTimeNs / ddclzwCompressionTimeNs;
58+
decompressionSpeedup = (double) ddcDecompressionTimeNs / ddclzwDecompressionTimeNs;
59+
}
60+
61+
/// Pretty-print a colorful percent text
62+
String formatPercent(double ratio) {
63+
double percent = (100.0 * (1.0 - ratio));
64+
String ansiColor = percent > 0 ? "\u001B[32m" : "\u001B[31m";
65+
return ansiColor + String.format("%6.2f%%", percent) + "\u001B[0m";
66+
}
67+
68+
@Override
69+
public String toString() {
70+
return String.format("Size: %7d | DDC: %8d bytes | DDCLZW: %8d bytes | " +
71+
"Memory reduction: %s | De-/Compression speedup: %.2f/%.2f times", dataSize, ddcMemoryBytes,
72+
ddclzwMemoryBytes, formatPercent(memoryReduction), decompressionSpeedup, compressionSpeedup);
73+
}
74+
}
75+
76+
// Pattern generators (array)
77+
private int[] genPatternRepeating(int size, int... pattern) {
78+
int[] result = new int[size];
79+
for(int i = 0; i < size; i++) {
80+
result[i] = pattern[i % pattern.length];
81+
}
82+
return result;
83+
}
84+
85+
/**
86+
* Args (10, 5) Generates a pattern like: [0, 0, 1, 1, 2, 2, 3, 3, 4, 4]
87+
*/
88+
private int[] genPatternDistributed(int size, int nUnique) {
89+
int[] result = new int[size];
90+
int runLength = size / nUnique;
91+
int pos = 0;
92+
for(int i = 0; i < nUnique && pos < size; i++) {
93+
int endPos = Math.min(pos + runLength, size);
94+
Arrays.fill(result, pos, endPos, i);
95+
pos = endPos;
96+
}
97+
return result;
98+
}
99+
100+
private int[] genPatternRandom(int size, int nUnique, long seed) {
101+
int[] result = new int[size];
102+
java.util.Random rand = new java.util.Random(seed);
103+
for(int i = 0; i < size; i++) {
104+
result[i] = rand.nextInt(nUnique);
105+
}
106+
return result;
107+
}
108+
109+
private void printBenchmarkTitle() {
110+
String callerMethodName = StackWalker.getInstance().walk(stream -> stream.skip(1).findFirst().get())
111+
.getMethodName();
112+
113+
System.out.println();
114+
System.out.println("=".repeat(80));
115+
System.out.println("Benchmark: " + callerMethodName);
116+
System.out.println("=".repeat(80));
117+
System.out.println();
118+
}
119+
120+
private ColGroupDDC createBenchmarkDDC(int[] mapping, int nUnique, int nCols) {
121+
IColIndex colIndexes = ColIndexFactory.create(nCols);
122+
123+
double[] dictValues = new double[nUnique * nCols];
124+
for(int i = 0; i < nUnique; i++) {
125+
for(int c = 0; c < nCols; c++) {
126+
dictValues[i * nCols + c] = (i + 1) * 10.0 + c;
127+
}
128+
}
129+
Dictionary dict = Dictionary.create(dictValues);
130+
131+
AMapToData data = MapToFactory.create(mapping.length, nUnique);
132+
for(int i = 0; i < mapping.length; i++) {
133+
data.set(i, mapping[i]);
134+
}
135+
136+
return (ColGroupDDC) ColGroupDDC.create(colIndexes, dict, data, null);
137+
}
138+
139+
private BenchmarkResult runBenchmark(int[] mapping, int nUnique, int nCols) {
140+
BenchmarkResult result = new BenchmarkResult();
141+
result.dataSize = mapping.length;
142+
143+
ColGroupDDC ddc = createBenchmarkDDC(mapping, nUnique, nCols);
144+
145+
// Measure DDC memory (though the method calculates how much storage it would take if the data structure were written to disk)
146+
result.ddcMemoryBytes = ddc.getExactSizeOnDisk();
147+
148+
// Measure DDC decompression time (it's already decompressed, so measure access time)
149+
long startTime = System.nanoTime();
150+
for(int iter = 0; iter < BENCHMARK_ITERATIONS; iter++) {
151+
AMapToData mapping_copy = ddc.getMapToData();
152+
mapping_copy.getIndex(mapping.length / 2);
153+
}
154+
long endTime = System.nanoTime();
155+
result.ddcDecompressionTimeNs = (endTime - startTime) / BENCHMARK_ITERATIONS;
156+
157+
// Measure DDCLZW compression time
158+
startTime = System.nanoTime();
159+
ColGroupDDCLZW ddclzw = null;
160+
for(int iter = 0; iter < BENCHMARK_ITERATIONS; iter++) {
161+
ddclzw = (ColGroupDDCLZW) ddc.convertToDDCLZW();
162+
}
163+
endTime = System.nanoTime();
164+
result.ddclzwCompressionTimeNs = (endTime - startTime) / BENCHMARK_ITERATIONS;
165+
166+
// Measure DDCLZW memory
167+
result.ddclzwMemoryBytes = ddclzw.getExactSizeOnDisk();
168+
169+
// Measure DDCLZW decompression time
170+
startTime = System.nanoTime();
171+
for(int iter = 0; iter < BENCHMARK_ITERATIONS; iter++) {
172+
ColGroupDDC decompressed = (ColGroupDDC) ddclzw.convertToDDC();
173+
AMapToData mapping_copy = decompressed.getMapToData();
174+
mapping_copy.getIndex(mapping.length / 2);
175+
}
176+
endTime = System.nanoTime();
177+
result.ddclzwDecompressionTimeNs = (endTime - startTime) / BENCHMARK_ITERATIONS;
178+
179+
result.calculateMetrics();
180+
return result;
181+
}
182+
183+
@Test
184+
public void benchmarkRepeatingPatterns() {
185+
printBenchmarkTitle();
186+
for(int size : DATA_SIZES) {
187+
int[] mapping = genPatternRepeating(size, 0, 1, 2);
188+
BenchmarkResult result = runBenchmark(mapping, 3, 1);
189+
System.out.println(result);
190+
}
191+
}
192+
193+
@Test
194+
public void benchmarkDistributed() {
195+
printBenchmarkTitle();
196+
for(int size : DATA_SIZES) {
197+
int[] mapping = genPatternDistributed(size, 3);
198+
BenchmarkResult result = runBenchmark(mapping, 3, 1);
199+
System.out.println(result);
200+
}
201+
}
202+
203+
@Test
204+
public void benchmarkRandomData() {
205+
printBenchmarkTitle();
206+
for(int size : DATA_SIZES) {
207+
int[] mapping = genPatternRandom(size, 5, 42);
208+
BenchmarkResult result = runBenchmark(mapping, 5, 1);
209+
System.out.println(result);
210+
}
211+
}
212+
213+
@Test
214+
public void benchmarkMultiColumn() {
215+
printBenchmarkTitle();
216+
for(int size : DATA_SIZES) {
217+
int[] mapping = genPatternRepeating(size, 0, 1, 2, 1, 0);
218+
BenchmarkResult result = runBenchmark(mapping, 3, 3);
219+
System.out.println(result);
220+
}
221+
}
222+
223+
@Test
224+
public void benchmarkUniques() {
225+
printBenchmarkTitle();
226+
int size = 10000;
227+
for(int nUnique : new int[] {2, 5, 10, 20, 50}) {
228+
int[] mapping = genPatternRepeating(size, IntStream.range(0, nUnique).toArray());
229+
BenchmarkResult result = runBenchmark(mapping, nUnique, 1);
230+
System.out.println(result);
231+
}
232+
}
233+
234+
@Test
235+
public void benchmarkGetIdx() { // TODO: is this benchmark useful when the time complexity is completely different?
236+
printBenchmarkTitle();
237+
238+
final int[] DATA_SIZES_GET_IDX = {10, 50, 100};
239+
for(int size : DATA_SIZES_GET_IDX) {
240+
int[] mapping = genPatternRepeating(size, 0, 1, 2);
241+
ColGroupDDC ddc = createBenchmarkDDC(mapping, 3, 2);
242+
ColGroupDDCLZW ddclzw = (ColGroupDDCLZW) ddc.convertToDDCLZW();
243+
244+
// Benchmark DDC
245+
long startTime = System.nanoTime();
246+
for(int iter = 0; iter < BENCHMARK_ITERATIONS * 100; iter++) {
247+
ddc.getIdx(size / 2, 0);
248+
}
249+
long ddcTime = System.nanoTime() - startTime;
250+
251+
// Benchmark DDCLZW
252+
startTime = System.nanoTime();
253+
for(int iter = 0; iter < BENCHMARK_ITERATIONS * 100; iter++) {
254+
ddclzw.getIdx(size / 2, 0);
255+
}
256+
long ddclzwTime = System.nanoTime() - startTime;
257+
258+
System.out.printf("Size: %7d | DDC: %6.2f ms | DDCLZW: %6d ms | Slowdown: %.2f times\n", size,
259+
(double) ddcTime / 1_000_000, ddclzwTime / 1_000_000, (double) ddclzwTime / ddcTime);
260+
}
261+
}
262+
263+
@Test
264+
public void benchmarkSlice() {
265+
printBenchmarkTitle();
266+
267+
for(int size : DATA_SIZES) {
268+
int[] mapping = genPatternRepeating(size, 0, 1, 2);
269+
ColGroupDDC ddc = createBenchmarkDDC(mapping, 3, 1);
270+
ColGroupDDCLZW ddclzw = (ColGroupDDCLZW) ddc.convertToDDCLZW();
271+
272+
int sliceStart = size / 4;
273+
int sliceEnd = 3 * size / 4;
274+
275+
// Benchmark DDC
276+
long startTime = System.nanoTime();
277+
for(int iter = 0; iter < BENCHMARK_ITERATIONS; iter++) {
278+
ddc.sliceRows(sliceStart, sliceEnd);
279+
}
280+
long ddcTime = System.nanoTime() - startTime;
281+
282+
// Benchmark DDCLZW
283+
startTime = System.nanoTime();
284+
for(int iter = 0; iter < BENCHMARK_ITERATIONS; iter++) {
285+
ddclzw.sliceRows(sliceStart, sliceEnd);
286+
}
287+
long ddclzwTime = System.nanoTime() - startTime;
288+
289+
System.out.printf("Size: %7d | Slice[%5d:%5d] | DDC: %6d ms | DDCLZW: %6d ms | Slowdown: %.2f times\n",
290+
size, sliceStart, sliceEnd, ddcTime / 1_000_000, ddclzwTime / 1_000_000, (double) ddclzwTime / ddcTime);
291+
}
292+
}
293+
}

0 commit comments

Comments
 (0)