Skip to content

Commit ff73acc

Browse files
committed
encoding and combining update
1 parent e34286d commit ff73acc

File tree

11 files changed

+888
-295
lines changed

11 files changed

+888
-295
lines changed

src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/DictionaryFactory.java

Lines changed: 386 additions & 112 deletions
Large diffs are not rendered by default.

src/main/java/org/apache/sysds/runtime/compress/estim/encoding/ConstEncoding.java

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -19,12 +19,11 @@
1919

2020
package org.apache.sysds.runtime.compress.estim.encoding;
2121

22-
import java.util.Map;
23-
2422
import org.apache.commons.lang3.tuple.ImmutablePair;
2523
import org.apache.commons.lang3.tuple.Pair;
2624
import org.apache.sysds.runtime.compress.CompressionSettings;
2725
import org.apache.sysds.runtime.compress.estim.EstimationFactors;
26+
import org.apache.sysds.runtime.compress.utils.HashMapLongInt;
2827

2928
/** Const encoding for cases where the entire group of columns is the same value */
3029
public class ConstEncoding extends AEncode {
@@ -41,7 +40,7 @@ public IEncode combine(IEncode e) {
4140
}
4241

4342
@Override
44-
public Pair<IEncode, Map<Integer, Integer>> combineWithMap(IEncode e) {
43+
public Pair<IEncode, HashMapLongInt> combineWithMap(IEncode e) {
4544
return new ImmutablePair<>(e, null);
4645
}
4746

src/main/java/org/apache/sysds/runtime/compress/estim/encoding/DenseEncoding.java

Lines changed: 182 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -19,34 +19,40 @@
1919

2020
package org.apache.sysds.runtime.compress.estim.encoding;
2121

22-
import java.util.HashMap;
23-
import java.util.Map;
24-
2522
import org.apache.commons.lang3.tuple.ImmutablePair;
2623
import org.apache.commons.lang3.tuple.Pair;
2724
import org.apache.sysds.runtime.compress.CompressedMatrixBlock;
2825
import org.apache.sysds.runtime.compress.CompressionSettings;
2926
import org.apache.sysds.runtime.compress.DMLCompressionException;
3027
import org.apache.sysds.runtime.compress.colgroup.mapping.AMapToData;
28+
import org.apache.sysds.runtime.compress.colgroup.mapping.MapToChar;
29+
import org.apache.sysds.runtime.compress.colgroup.mapping.MapToCharPByte;
3130
import org.apache.sysds.runtime.compress.colgroup.mapping.MapToFactory;
3231
import org.apache.sysds.runtime.compress.colgroup.offset.AIterator;
3332
import org.apache.sysds.runtime.compress.estim.EstimationFactors;
33+
import org.apache.sysds.runtime.compress.utils.HashMapLongInt;
3434

3535
/**
3636
* An Encoding that contains a value on each row of the input.
3737
*/
3838
public class DenseEncoding extends AEncode {
3939

40+
private static boolean zeroWarn = false;
41+
4042
private final AMapToData map;
4143

4244
public DenseEncoding(AMapToData map) {
4345
this.map = map;
4446

4547
if(CompressedMatrixBlock.debug) {
48+
// if(!zeroWarn) {
4649
int[] freq = map.getCounts();
47-
for(int i = 0; i < freq.length; i++) {
48-
if(freq[i] == 0)
49-
throw new DMLCompressionException("Invalid counts in fact contains 0");
50+
for(int i = 0; i < freq.length && !zeroWarn; i++) {
51+
if(freq[i] == 0) {
52+
LOG.warn("Dense encoding contains zero encoding, indicating not all dictionary entries are in use");
53+
zeroWarn = true;
54+
55+
}
5056
}
5157
}
5258
}
@@ -62,7 +68,7 @@ else if(e instanceof SparseEncoding)
6268
}
6369

6470
@Override
65-
public Pair<IEncode, Map<Integer, Integer>> combineWithMap(IEncode e) {
71+
public Pair<IEncode, HashMapLongInt> combineWithMap(IEncode e) {
6672
if(e instanceof EmptyEncoding || e instanceof ConstEncoding)
6773
return new ImmutablePair<>(this, null);
6874
else if(e instanceof SparseEncoding)
@@ -106,14 +112,14 @@ private AMapToData assignSparse(SparseEncoding e) {
106112
return ret;
107113
}
108114

109-
private final Pair<IEncode, Map<Integer, Integer>> combineSparseHashMap(final AMapToData ret) {
115+
private final Pair<IEncode, HashMapLongInt> combineSparseHashMap(final AMapToData ret) {
110116
final int size = ret.size();
111-
final Map<Integer, Integer> m = new HashMap<>(size);
117+
final HashMapLongInt m = new HashMapLongInt(100);
112118
for(int r = 0; r < size; r++) {
113119
final int prev = ret.getIndex(r);
114120
final int v = m.size();
115-
final Integer mv = m.putIfAbsent(prev, v);
116-
if(mv == null)
121+
final int mv = m.putIfAbsent(prev, v);
122+
if(mv == -1)
117123
ret.set(r, v);
118124
else
119125
ret.set(r, mv);
@@ -146,27 +152,47 @@ protected DenseEncoding combineDense(final DenseEncoding other) {
146152
final int nVL = lm.getUnique();
147153
final int nVR = rm.getUnique();
148154
final int size = map.size();
149-
final int maxUnique = nVL * nVR;
150-
155+
int maxUnique = nVL * nVR;
156+
final DenseEncoding retE;
151157
final AMapToData ret = MapToFactory.create(size, maxUnique);
152-
153-
if(maxUnique > size && maxUnique > 2048) {
158+
if(maxUnique < Math.max(nVL, nVR)) {// overflow
159+
final HashMapLongInt m = new HashMapLongInt(Math.max(100, size / 100));
160+
retE = combineDenseWithHashMapLong(lm, rm, size, nVL, ret, m);
161+
}
162+
else if(maxUnique > size && maxUnique > 2048) {
154163
// i.e., there are more possible unique combinations (maxUnique) than rows.
155-
final Map<Integer, Integer> m = new HashMap<>(size);
156-
return combineDenseWithHashMap(lm, rm, size, nVL, ret, m);
164+
final HashMapLongInt m = new HashMapLongInt(Math.max(100, maxUnique / 100));
165+
retE = combineDenseWithHashMap(lm, rm, size, nVL, ret, m);
157166
}
158167
else {
159168
final AMapToData m = MapToFactory.create(maxUnique, maxUnique + 1);
160-
return combineDenseWithMapToData(lm, rm, size, nVL, ret, maxUnique, m);
169+
retE = combineDenseWithMapToData(lm, rm, size, nVL, ret, maxUnique, m);
170+
}
171+
172+
if(retE.getUnique() < 0) {
173+
String th = this.toString();
174+
String ot = other.toString();
175+
String cm = retE.toString();
176+
177+
if(th.length() > 1000)
178+
th = th.substring(0, 1000);
179+
if(ot.length() > 1000)
180+
ot = ot.substring(0, 1000);
181+
if(cm.length() > 1000)
182+
cm = cm.substring(0, 1000);
183+
throw new DMLCompressionException(
184+
"Failed to combine dense encodings correctly: Number unique values is lower than max input: \n\n" + th
185+
+ "\n\n" + ot + "\n\n" + cm);
161186
}
187+
return retE;
162188
}
163189

164-
private Pair<IEncode, Map<Integer, Integer>> combineDenseNoResize(final DenseEncoding other) {
165-
if(map == other.map) {
190+
private Pair<IEncode, HashMapLongInt> combineDenseNoResize(final DenseEncoding other) {
191+
if(map.equals(other.map)) {
166192
LOG.warn("Constructing perfect mapping, this could be optimized to skip hashmap");
167-
final Map<Integer, Integer> m = new HashMap<>(map.size());
193+
final HashMapLongInt m = new HashMapLongInt(Math.max(100, map.size() / 100));
168194
for(int i = 0; i < map.getUnique(); i++)
169-
m.put(i * i, i);
195+
m.putIfAbsent(i * (map.getUnique() + 1), i);
170196
return new ImmutablePair<>(this, m); // same object
171197
}
172198

@@ -176,40 +202,107 @@ private Pair<IEncode, Map<Integer, Integer>> combineDenseNoResize(final DenseEnc
176202
final int nVL = lm.getUnique();
177203
final int nVR = rm.getUnique();
178204
final int size = map.size();
179-
final int maxUnique = nVL * nVR;
205+
final int maxUnique = (int) Math.min((long) nVL * nVR, (long) size);
180206

181207
final AMapToData ret = MapToFactory.create(size, maxUnique);
182208

183-
final Map<Integer, Integer> m = new HashMap<>(Math.min(size, maxUnique));
209+
final HashMapLongInt m = new HashMapLongInt(Math.max(100, maxUnique / 1000));
184210
return new ImmutablePair<>(combineDenseWithHashMap(lm, rm, size, nVL, ret, m), m);
185-
186-
// there can be less unique.
187-
188-
// return new DenseEncoding(ret);
189211
}
190212

191-
private Pair<IEncode, Map<Integer, Integer>> combineSparseNoResize(final SparseEncoding other) {
213+
private Pair<IEncode, HashMapLongInt> combineSparseNoResize(final SparseEncoding other) {
192214
final AMapToData a = assignSparse(other);
193215
return combineSparseHashMap(a);
194216
}
195217

218+
protected final DenseEncoding combineDenseWithHashMapLong(final AMapToData lm, final AMapToData rm, final int size,
219+
final long nVL, final AMapToData ret, HashMapLongInt m) {
220+
if(ret instanceof MapToChar)
221+
for(int r = 0; r < size; r++)
222+
addValHashMapChar((long) lm.getIndex(r) + rm.getIndex(r) * nVL, r, m, (MapToChar) ret);
223+
else
224+
for(int r = 0; r < size; r++)
225+
addValHashMap((long) lm.getIndex(r) + rm.getIndex(r) * nVL, r, m, ret);
226+
return new DenseEncoding(ret.resize(m.size()));
227+
}
228+
196229
protected final DenseEncoding combineDenseWithHashMap(final AMapToData lm, final AMapToData rm, final int size,
197-
final int nVL, final AMapToData ret, Map<Integer, Integer> m) {
230+
final int nVL, final AMapToData ret, HashMapLongInt m) {
231+
// JIT compile instance checks.
232+
if(ret instanceof MapToChar)
233+
combineDenseWIthHashMapCharOut(lm, rm, size, nVL, (MapToChar) ret, m);
234+
else if(ret instanceof MapToCharPByte)
235+
combineDenseWIthHashMapPByteOut(lm, rm, size, nVL, (MapToCharPByte) ret, m);
236+
else
237+
combineDenseWithHashMapGeneric(lm, rm, size, nVL, ret, m);
238+
ret.setUnique(m.size());
239+
return new DenseEncoding(ret);
198240

241+
}
242+
243+
private final void combineDenseWIthHashMapPByteOut(final AMapToData lm, final AMapToData rm, final int size,
244+
final int nVL, final MapToCharPByte ret, HashMapLongInt m) {
245+
for(int r = 0; r < size; r++)
246+
addValHashMapCharByte(lm.getIndex(r) + rm.getIndex(r) * nVL, r, m, ret);
247+
}
248+
249+
private final void combineDenseWIthHashMapCharOut(final AMapToData lm, final AMapToData rm, final int size,
250+
final int nVL, final MapToChar ret, HashMapLongInt m) {
251+
if(lm instanceof MapToChar && rm instanceof MapToChar)
252+
combineDenseWIthHashMapAllChar(lm, rm, size, nVL, ret, m);
253+
else// some other combination
254+
combineDenseWIthHashMapCharOutGeneric(lm, rm, size, nVL, ret, m);
255+
}
256+
257+
private final void combineDenseWIthHashMapCharOutGeneric(final AMapToData lm, final AMapToData rm, final int size,
258+
final int nVL, final MapToChar ret, HashMapLongInt m) {
259+
for(int r = 0; r < size; r++)
260+
addValHashMapChar(lm.getIndex(r) + rm.getIndex(r) * nVL, r, m, ret);
261+
}
262+
263+
private final void combineDenseWIthHashMapAllChar(final AMapToData lm, final AMapToData rm, final int size,
264+
final int nVL, final MapToChar ret, HashMapLongInt m) {
265+
final MapToChar lmC = (MapToChar) lm;
266+
final MapToChar rmC = (MapToChar) rm;
267+
for(int r = 0; r < size; r++)
268+
addValHashMapChar(lmC.getIndex(r) + rmC.getIndex(r) * nVL, r, m, ret);
269+
270+
}
271+
272+
protected final void combineDenseWithHashMapGeneric(final AMapToData lm, final AMapToData rm, final int size,
273+
final int nVL, final AMapToData ret, HashMapLongInt m) {
199274
for(int r = 0; r < size; r++)
200275
addValHashMap(lm.getIndex(r) + rm.getIndex(r) * nVL, r, m, ret);
201-
return new DenseEncoding(ret.resize(m.size()));
202276
}
203277

204278
protected final DenseEncoding combineDenseWithMapToData(final AMapToData lm, final AMapToData rm, final int size,
205279
final int nVL, final AMapToData ret, final int maxUnique, final AMapToData m) {
280+
if(m instanceof MapToChar)
281+
return combineDenseWithMapToDataToChar(lm, rm, size, nVL, ret, maxUnique, (MapToChar) m);
282+
else
283+
return combineDenseWithMapToDataGeneric(lm, rm, size, nVL, ret, maxUnique, m);
284+
285+
}
286+
287+
protected final DenseEncoding combineDenseWithMapToDataToChar(final AMapToData lm, final AMapToData rm,
288+
final int size, final int nVL, final AMapToData ret, final int maxUnique, final MapToChar m) {
289+
int newUID = 1;
290+
for(int r = 0; r < size; r++)
291+
newUID = addValMapToDataChar(lm.getIndex(r) + rm.getIndex(r) * nVL, r, m, newUID, ret);
292+
ret.setUnique(newUID - 1);
293+
return new DenseEncoding(ret);
294+
}
295+
296+
protected final DenseEncoding combineDenseWithMapToDataGeneric(final AMapToData lm, final AMapToData rm,
297+
final int size, final int nVL, final AMapToData ret, final int maxUnique, final AMapToData m) {
206298
int newUID = 1;
207299
for(int r = 0; r < size; r++)
208300
newUID = addValMapToData(lm.getIndex(r) + rm.getIndex(r) * nVL, r, m, newUID, ret);
209-
return new DenseEncoding(ret.resize(newUID - 1));
301+
ret.setUnique(newUID - 1);
302+
return new DenseEncoding(ret);
210303
}
211304

212-
protected static int addValMapToData(final int nv, final int r, final AMapToData map, int newId,
305+
protected static int addValMapToDataChar(final int nv, final int r, final MapToChar map, int newId,
213306
final AMapToData d) {
214307
int mv = map.getIndex(nv);
215308
if(mv == 0)
@@ -218,11 +311,56 @@ protected static int addValMapToData(final int nv, final int r, final AMapToData
218311
return newId;
219312
}
220313

221-
protected static void addValHashMap(final int nv, final int r, final Map<Integer, Integer> map,
314+
protected static int addValMapToData(final int nv, final int r, final AMapToData map, int newId,
222315
final AMapToData d) {
316+
int mv = map.getIndex(nv);
317+
if(mv == 0)
318+
mv = map.setAndGet(nv, newId++);
319+
d.set(r, mv - 1);
320+
return newId;
321+
}
322+
323+
protected static void addValHashMap(final int nv, final int r, final HashMapLongInt map, final AMapToData d) {
223324
final int v = map.size();
224-
final Integer mv = map.putIfAbsent(nv, v);
225-
if(mv == null)
325+
final int mv = map.putIfAbsent(nv, v);
326+
if(mv == -1)
327+
d.set(r, v);
328+
else
329+
d.set(r, mv);
330+
}
331+
332+
protected static void addValHashMapChar(final int nv, final int r, final HashMapLongInt map, final MapToChar d) {
333+
final int v = map.size();
334+
final int mv = map.putIfAbsent(nv, v);
335+
if(mv == -1)
336+
d.set(r, v);
337+
else
338+
d.set(r, mv);
339+
}
340+
341+
protected static void addValHashMapCharByte(final int nv, final int r, final HashMapLongInt map,
342+
final MapToCharPByte d) {
343+
final int v = map.size();
344+
final int mv = map.putIfAbsent(nv, v);
345+
if(mv == -1)
346+
d.set(r, v);
347+
else
348+
d.set(r, mv);
349+
}
350+
351+
protected static void addValHashMapChar(final long nv, final int r, final HashMapLongInt map, final MapToChar d) {
352+
final int v = map.size();
353+
final int mv = map.putIfAbsent(nv, v);
354+
if(mv == -1)
355+
d.set(r, v);
356+
else
357+
d.set(r, mv);
358+
}
359+
360+
protected static void addValHashMap(final long nv, final int r, final HashMapLongInt map, final AMapToData d) {
361+
final int v = map.size();
362+
final int mv = map.putIfAbsent(nv, v);
363+
if(mv == -1)
226364
d.set(r, v);
227365
else
228366
d.set(r, mv);
@@ -237,13 +375,18 @@ public int getUnique() {
237375
public EstimationFactors extractFacts(int nRows, double tupleSparsity, double matrixSparsity,
238376
CompressionSettings cs) {
239377
int largestOffs = 0;
240-
241378
int[] counts = map.getCounts();
242379
for(int i = 0; i < counts.length; i++)
243380
if(counts[i] > largestOffs)
244381
largestOffs = counts[i];
245-
else if(counts[i] == 0)
246-
throw new DMLCompressionException("Invalid count of 0 all values should have at least one instance");
382+
else if(counts[i] == 0) {
383+
if(!zeroWarn) {
384+
LOG.warn("Invalid count of 0 all values should have at least one instance index: " + i + " of "
385+
+ counts.length);
386+
zeroWarn = true;
387+
}
388+
counts[i] = 1;
389+
}
247390

248391
if(cs.isRLEAllowed())
249392
return new EstimationFactors(map.getUnique(), nRows, largestOffs, counts, 0, nRows, map.countRuns(), false,

src/main/java/org/apache/sysds/runtime/compress/estim/encoding/EmptyEncoding.java

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -19,12 +19,11 @@
1919

2020
package org.apache.sysds.runtime.compress.estim.encoding;
2121

22-
import java.util.Map;
23-
2422
import org.apache.commons.lang3.tuple.ImmutablePair;
2523
import org.apache.commons.lang3.tuple.Pair;
2624
import org.apache.sysds.runtime.compress.CompressionSettings;
2725
import org.apache.sysds.runtime.compress.estim.EstimationFactors;
26+
import org.apache.sysds.runtime.compress.utils.HashMapLongInt;
2827

2928
/**
3029
* Empty encoding for cases where the entire group of columns is zero
@@ -41,7 +40,7 @@ public IEncode combine(IEncode e) {
4140
}
4241

4342
@Override
44-
public Pair<IEncode, Map<Integer, Integer>> combineWithMap(IEncode e) {
43+
public Pair<IEncode, HashMapLongInt> combineWithMap(IEncode e) {
4544
return new ImmutablePair<>(e, null);
4645
}
4746

src/main/java/org/apache/sysds/runtime/compress/estim/encoding/EncodingFactory.java

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -229,8 +229,16 @@ else if(alen - apos > nCol / 4) { // return a dense encoding
229229

230230
// Iteration 3 of non-zero indexes: make an Offset Encoding to know which cells are zero and which are not.
231231
// not done yet
232-
final AOffset o = OffsetFactory.createOffset(aix, apos, alen);
233-
return new SparseEncoding(d, o, m.getNumColumns());
232+
try{
233+
234+
final AOffset o = OffsetFactory.createOffset(aix, apos, alen);
235+
return new SparseEncoding(d, o, m.getNumColumns());
236+
}
237+
catch(Exception e){
238+
String mes = Arrays.toString(Arrays.copyOfRange(aix, apos, alen)) + "\n" + apos + " " + alen;
239+
mes += Arrays.toString(Arrays.copyOfRange(avals, apos, alen));
240+
throw new DMLRuntimeException(mes, e);
241+
}
234242
}
235243
}
236244

0 commit comments

Comments
 (0)