1919
2020package org .apache .sysds .runtime .compress .estim .encoding ;
2121
22- import java .util .HashMap ;
23- import java .util .Map ;
24-
2522import org .apache .commons .lang3 .tuple .ImmutablePair ;
2623import org .apache .commons .lang3 .tuple .Pair ;
2724import org .apache .sysds .runtime .compress .CompressedMatrixBlock ;
2825import org .apache .sysds .runtime .compress .CompressionSettings ;
2926import org .apache .sysds .runtime .compress .DMLCompressionException ;
3027import org .apache .sysds .runtime .compress .colgroup .mapping .AMapToData ;
28+ import org .apache .sysds .runtime .compress .colgroup .mapping .MapToChar ;
29+ import org .apache .sysds .runtime .compress .colgroup .mapping .MapToCharPByte ;
3130import org .apache .sysds .runtime .compress .colgroup .mapping .MapToFactory ;
3231import org .apache .sysds .runtime .compress .colgroup .offset .AIterator ;
3332import org .apache .sysds .runtime .compress .estim .EstimationFactors ;
33+ import org .apache .sysds .runtime .compress .utils .HashMapLongInt ;
3434
3535/**
3636 * An Encoding that contains a value on each row of the input.
3737 */
3838public class DenseEncoding extends AEncode {
3939
40+ private static boolean zeroWarn = false ;
41+
4042 private final AMapToData map ;
4143
4244 public DenseEncoding (AMapToData map ) {
4345 this .map = map ;
4446
4547 if (CompressedMatrixBlock .debug ) {
48+ // if(!zeroWarn) {
4649 int [] freq = map .getCounts ();
47- for (int i = 0 ; i < freq .length ; i ++) {
48- if (freq [i ] == 0 )
49- throw new DMLCompressionException ("Invalid counts in fact contains 0" );
50+ for (int i = 0 ; i < freq .length && !zeroWarn ; i ++) {
51+ if (freq [i ] == 0 ) {
52+ LOG .warn ("Dense encoding contains zero encoding, indicating not all dictionary entries are in use" );
53+ zeroWarn = true ;
54+
55+ }
5056 }
5157 }
5258 }
@@ -62,7 +68,7 @@ else if(e instanceof SparseEncoding)
6268 }
6369
6470 @ Override
65- public Pair <IEncode , Map < Integer , Integer > > combineWithMap (IEncode e ) {
71+ public Pair <IEncode , HashMapLongInt > combineWithMap (IEncode e ) {
6672 if (e instanceof EmptyEncoding || e instanceof ConstEncoding )
6773 return new ImmutablePair <>(this , null );
6874 else if (e instanceof SparseEncoding )
@@ -106,14 +112,14 @@ private AMapToData assignSparse(SparseEncoding e) {
106112 return ret ;
107113 }
108114
109- private final Pair <IEncode , Map < Integer , Integer > > combineSparseHashMap (final AMapToData ret ) {
115+ private final Pair <IEncode , HashMapLongInt > combineSparseHashMap (final AMapToData ret ) {
110116 final int size = ret .size ();
111- final Map < Integer , Integer > m = new HashMap <>( size );
117+ final HashMapLongInt m = new HashMapLongInt ( 100 );
112118 for (int r = 0 ; r < size ; r ++) {
113119 final int prev = ret .getIndex (r );
114120 final int v = m .size ();
115- final Integer mv = m .putIfAbsent (prev , v );
116- if (mv == null )
121+ final int mv = m .putIfAbsent (prev , v );
122+ if (mv == - 1 )
117123 ret .set (r , v );
118124 else
119125 ret .set (r , mv );
@@ -146,27 +152,47 @@ protected DenseEncoding combineDense(final DenseEncoding other) {
146152 final int nVL = lm .getUnique ();
147153 final int nVR = rm .getUnique ();
148154 final int size = map .size ();
149- final int maxUnique = nVL * nVR ;
150-
155+ int maxUnique = nVL * nVR ;
156+ final DenseEncoding retE ;
151157 final AMapToData ret = MapToFactory .create (size , maxUnique );
152-
153- if (maxUnique > size && maxUnique > 2048 ) {
158+ if (maxUnique < Math .max (nVL , nVR )) {// overflow
159+ final HashMapLongInt m = new HashMapLongInt (Math .max (100 , size / 100 ));
160+ retE = combineDenseWithHashMapLong (lm , rm , size , nVL , ret , m );
161+ }
162+ else if (maxUnique > size && maxUnique > 2048 ) {
154163 // aka there is more maxUnique than rows.
155- final Map < Integer , Integer > m = new HashMap <>( size );
156- return combineDenseWithHashMap (lm , rm , size , nVL , ret , m );
164+ final HashMapLongInt m = new HashMapLongInt ( Math . max ( 100 , maxUnique / 100 ) );
165+ retE = combineDenseWithHashMap (lm , rm , size , nVL , ret , m );
157166 }
158167 else {
159168 final AMapToData m = MapToFactory .create (maxUnique , maxUnique + 1 );
160- return combineDenseWithMapToData (lm , rm , size , nVL , ret , maxUnique , m );
169+ retE = combineDenseWithMapToData (lm , rm , size , nVL , ret , maxUnique , m );
170+ }
171+
172+ if (retE .getUnique () < 0 ) {
173+ String th = this .toString ();
174+ String ot = other .toString ();
175+ String cm = retE .toString ();
176+
177+ if (th .length () > 1000 )
178+ th = th .substring (0 , 1000 );
179+ if (ot .length () > 1000 )
180+ ot = ot .substring (0 , 1000 );
181+ if (cm .length () > 1000 )
182+ cm = cm .substring (0 , 1000 );
183+ throw new DMLCompressionException (
184+ "Failed to combine dense encodings correctly: Number unique values is lower than max input: \n \n " + th
185+ + "\n \n " + ot + "\n \n " + cm );
161186 }
187+ return retE ;
162188 }
163189
164- private Pair <IEncode , Map < Integer , Integer > > combineDenseNoResize (final DenseEncoding other ) {
165- if (map == other .map ) {
190+ private Pair <IEncode , HashMapLongInt > combineDenseNoResize (final DenseEncoding other ) {
191+ if (map . equals ( other .map ) ) {
166192 LOG .warn ("Constructing perfect mapping, this could be optimized to skip hashmap" );
167- final Map < Integer , Integer > m = new HashMap <>( map .size ());
193+ final HashMapLongInt m = new HashMapLongInt ( Math . max ( 100 , map .size () / 100 ));
168194 for (int i = 0 ; i < map .getUnique (); i ++)
169- m .put (i * i , i );
195+ m .putIfAbsent (i * ( map . getUnique () + 1 ) , i );
170196 return new ImmutablePair <>(this , m ); // same object
171197 }
172198
@@ -176,40 +202,107 @@ private Pair<IEncode, Map<Integer, Integer>> combineDenseNoResize(final DenseEnc
176202 final int nVL = lm .getUnique ();
177203 final int nVR = rm .getUnique ();
178204 final int size = map .size ();
179- final int maxUnique = nVL * nVR ;
205+ final int maxUnique = ( int ) Math . min (( long ) nVL * nVR , ( long ) size ) ;
180206
181207 final AMapToData ret = MapToFactory .create (size , maxUnique );
182208
183- final Map < Integer , Integer > m = new HashMap <> (Math .min ( size , maxUnique ));
209+ final HashMapLongInt m = new HashMapLongInt (Math .max ( 100 , maxUnique / 1000 ));
184210 return new ImmutablePair <>(combineDenseWithHashMap (lm , rm , size , nVL , ret , m ), m );
185-
186- // there can be less unique.
187-
188- // return new DenseEncoding(ret);
189211 }
190212
191- private Pair <IEncode , Map < Integer , Integer > > combineSparseNoResize (final SparseEncoding other ) {
213+ private Pair <IEncode , HashMapLongInt > combineSparseNoResize (final SparseEncoding other ) {
192214 final AMapToData a = assignSparse (other );
193215 return combineSparseHashMap (a );
194216 }
195217
218+ protected final DenseEncoding combineDenseWithHashMapLong (final AMapToData lm , final AMapToData rm , final int size ,
219+ final long nVL , final AMapToData ret , HashMapLongInt m ) {
220+ if (ret instanceof MapToChar )
221+ for (int r = 0 ; r < size ; r ++)
222+ addValHashMapChar ((long ) lm .getIndex (r ) + rm .getIndex (r ) * nVL , r , m , (MapToChar ) ret );
223+ else
224+ for (int r = 0 ; r < size ; r ++)
225+ addValHashMap ((long ) lm .getIndex (r ) + rm .getIndex (r ) * nVL , r , m , ret );
226+ return new DenseEncoding (ret .resize (m .size ()));
227+ }
228+
196229 protected final DenseEncoding combineDenseWithHashMap (final AMapToData lm , final AMapToData rm , final int size ,
197- final int nVL , final AMapToData ret , Map <Integer , Integer > m ) {
230+ final int nVL , final AMapToData ret , HashMapLongInt m ) {
231+ // JIT compile instance checks.
232+ if (ret instanceof MapToChar )
233+ combineDenseWIthHashMapCharOut (lm , rm , size , nVL , (MapToChar ) ret , m );
234+ else if (ret instanceof MapToCharPByte )
235+ combineDenseWIthHashMapPByteOut (lm , rm , size , nVL , (MapToCharPByte ) ret , m );
236+ else
237+ combineDenseWithHashMapGeneric (lm , rm , size , nVL , ret , m );
238+ ret .setUnique (m .size ());
239+ return new DenseEncoding (ret );
198240
241+ }
242+
243+ private final void combineDenseWIthHashMapPByteOut (final AMapToData lm , final AMapToData rm , final int size ,
244+ final int nVL , final MapToCharPByte ret , HashMapLongInt m ) {
245+ for (int r = 0 ; r < size ; r ++)
246+ addValHashMapCharByte (lm .getIndex (r ) + rm .getIndex (r ) * nVL , r , m , ret );
247+ }
248+
249+ private final void combineDenseWIthHashMapCharOut (final AMapToData lm , final AMapToData rm , final int size ,
250+ final int nVL , final MapToChar ret , HashMapLongInt m ) {
251+ if (lm instanceof MapToChar && rm instanceof MapToChar )
252+ combineDenseWIthHashMapAllChar (lm , rm , size , nVL , ret , m );
253+ else // some other combination
254+ combineDenseWIthHashMapCharOutGeneric (lm , rm , size , nVL , ret , m );
255+ }
256+
257+ private final void combineDenseWIthHashMapCharOutGeneric (final AMapToData lm , final AMapToData rm , final int size ,
258+ final int nVL , final MapToChar ret , HashMapLongInt m ) {
259+ for (int r = 0 ; r < size ; r ++)
260+ addValHashMapChar (lm .getIndex (r ) + rm .getIndex (r ) * nVL , r , m , ret );
261+ }
262+
263+ private final void combineDenseWIthHashMapAllChar (final AMapToData lm , final AMapToData rm , final int size ,
264+ final int nVL , final MapToChar ret , HashMapLongInt m ) {
265+ final MapToChar lmC = (MapToChar ) lm ;
266+ final MapToChar rmC = (MapToChar ) rm ;
267+ for (int r = 0 ; r < size ; r ++)
268+ addValHashMapChar (lmC .getIndex (r ) + rmC .getIndex (r ) * nVL , r , m , ret );
269+
270+ }
271+
272+ protected final void combineDenseWithHashMapGeneric (final AMapToData lm , final AMapToData rm , final int size ,
273+ final int nVL , final AMapToData ret , HashMapLongInt m ) {
199274 for (int r = 0 ; r < size ; r ++)
200275 addValHashMap (lm .getIndex (r ) + rm .getIndex (r ) * nVL , r , m , ret );
201- return new DenseEncoding (ret .resize (m .size ()));
202276 }
203277
204278 protected final DenseEncoding combineDenseWithMapToData (final AMapToData lm , final AMapToData rm , final int size ,
205279 final int nVL , final AMapToData ret , final int maxUnique , final AMapToData m ) {
280+ if (m instanceof MapToChar )
281+ return combineDenseWithMapToDataToChar (lm , rm , size , nVL , ret , maxUnique , (MapToChar ) m );
282+ else
283+ return combineDenseWithMapToDataGeneric (lm , rm , size , nVL , ret , maxUnique , m );
284+
285+ }
286+
287+ protected final DenseEncoding combineDenseWithMapToDataToChar (final AMapToData lm , final AMapToData rm ,
288+ final int size , final int nVL , final AMapToData ret , final int maxUnique , final MapToChar m ) {
289+ int newUID = 1 ;
290+ for (int r = 0 ; r < size ; r ++)
291+ newUID = addValMapToDataChar (lm .getIndex (r ) + rm .getIndex (r ) * nVL , r , m , newUID , ret );
292+ ret .setUnique (newUID - 1 );
293+ return new DenseEncoding (ret );
294+ }
295+
296+ protected final DenseEncoding combineDenseWithMapToDataGeneric (final AMapToData lm , final AMapToData rm ,
297+ final int size , final int nVL , final AMapToData ret , final int maxUnique , final AMapToData m ) {
206298 int newUID = 1 ;
207299 for (int r = 0 ; r < size ; r ++)
208300 newUID = addValMapToData (lm .getIndex (r ) + rm .getIndex (r ) * nVL , r , m , newUID , ret );
209- return new DenseEncoding (ret .resize (newUID - 1 ));
301+ ret .setUnique (newUID - 1 );
302+ return new DenseEncoding (ret );
210303 }
211304
212- protected static int addValMapToData (final int nv , final int r , final AMapToData map , int newId ,
305+ protected static int addValMapToDataChar (final int nv , final int r , final MapToChar map , int newId ,
213306 final AMapToData d ) {
214307 int mv = map .getIndex (nv );
215308 if (mv == 0 )
@@ -218,11 +311,56 @@ protected static int addValMapToData(final int nv, final int r, final AMapToData
218311 return newId ;
219312 }
220313
221- protected static void addValHashMap (final int nv , final int r , final Map < Integer , Integer > map ,
314+ protected static int addValMapToData (final int nv , final int r , final AMapToData map , int newId ,
222315 final AMapToData d ) {
316+ int mv = map .getIndex (nv );
317+ if (mv == 0 )
318+ mv = map .setAndGet (nv , newId ++);
319+ d .set (r , mv - 1 );
320+ return newId ;
321+ }
322+
323+ protected static void addValHashMap (final int nv , final int r , final HashMapLongInt map , final AMapToData d ) {
223324 final int v = map .size ();
224- final Integer mv = map .putIfAbsent (nv , v );
225- if (mv == null )
325+ final int mv = map .putIfAbsent (nv , v );
326+ if (mv == -1 )
327+ d .set (r , v );
328+ else
329+ d .set (r , mv );
330+ }
331+
332+ protected static void addValHashMapChar (final int nv , final int r , final HashMapLongInt map , final MapToChar d ) {
333+ final int v = map .size ();
334+ final int mv = map .putIfAbsent (nv , v );
335+ if (mv == -1 )
336+ d .set (r , v );
337+ else
338+ d .set (r , mv );
339+ }
340+
341+ protected static void addValHashMapCharByte (final int nv , final int r , final HashMapLongInt map ,
342+ final MapToCharPByte d ) {
343+ final int v = map .size ();
344+ final int mv = map .putIfAbsent (nv , v );
345+ if (mv == -1 )
346+ d .set (r , v );
347+ else
348+ d .set (r , mv );
349+ }
350+
351+ protected static void addValHashMapChar (final long nv , final int r , final HashMapLongInt map , final MapToChar d ) {
352+ final int v = map .size ();
353+ final int mv = map .putIfAbsent (nv , v );
354+ if (mv == -1 )
355+ d .set (r , v );
356+ else
357+ d .set (r , mv );
358+ }
359+
360+ protected static void addValHashMap (final long nv , final int r , final HashMapLongInt map , final AMapToData d ) {
361+ final int v = map .size ();
362+ final int mv = map .putIfAbsent (nv , v );
363+ if (mv == -1 )
226364 d .set (r , v );
227365 else
228366 d .set (r , mv );
@@ -237,13 +375,18 @@ public int getUnique() {
237375 public EstimationFactors extractFacts (int nRows , double tupleSparsity , double matrixSparsity ,
238376 CompressionSettings cs ) {
239377 int largestOffs = 0 ;
240-
241378 int [] counts = map .getCounts ();
242379 for (int i = 0 ; i < counts .length ; i ++)
243380 if (counts [i ] > largestOffs )
244381 largestOffs = counts [i ];
245- else if (counts [i ] == 0 )
246- throw new DMLCompressionException ("Invalid count of 0 all values should have at least one instance" );
382+ else if (counts [i ] == 0 ) {
383+ if (!zeroWarn ) {
384+ LOG .warn ("Invalid count of 0 all values should have at least one instance index: " + i + " of "
385+ + counts .length );
386+ zeroWarn = true ;
387+ }
388+ counts [i ] = 1 ;
389+ }
247390
248391 if (cs .isRLEAllowed ())
249392 return new EstimationFactors (map .getUnique (), nRows , largestOffs , counts , 0 , nRows , map .countRuns (), false ,
0 commit comments