2020import java .util .function .Predicate ;
2121
2222/**
23- * Removes duplicate values from multivalued positions, and keeps only the top N.
23+ * Removes duplicate values from multivalued positions, and keeps only the ones that pass the filters.
24+ * <p>
25+ * Clone of {@link MultivalueDedupeLong}, except that it accepts a predicate and a nulls flag to filter the values.
26+ * </p>
2427 */
2528public class TopNMultivalueDedupeLong {
2629 /**
@@ -37,7 +40,7 @@ public class TopNMultivalueDedupeLong {
3740 /**
3841 * Whether the hash expects nulls or not.
3942 */
40- final boolean hasNull ;
43+ final boolean acceptNulls ;
4144 /**
4245 * A predicate to test if a value is part of the top N or not.
4346 */
@@ -54,139 +57,12 @@ public class TopNMultivalueDedupeLong {
5457 */
5558 int w ;
5659
57- public TopNMultivalueDedupeLong (LongBlock block , boolean hasNull , Predicate <Long > isAcceptable ) {
60+ public TopNMultivalueDedupeLong (LongBlock block , boolean acceptNulls , Predicate <Long > isAcceptable ) {
5861 this .block = block ;
59- this .hasNull = hasNull ;
62+ this .acceptNulls = acceptNulls ;
6063 this .isAcceptable = isAcceptable ;
6164 }
6265
63- /**
64- * Remove duplicate values from each position and write the results to a
65- * {@link Block} using an adaptive algorithm based on the size of the input list.
66- */
67- public LongBlock dedupeToBlockAdaptive (BlockFactory blockFactory ) {
68- if (block .mvDeduplicated ()) {
69- block .incRef ();
70- return block ;
71- }
72- try (LongBlock .Builder builder = blockFactory .newLongBlockBuilder (block .getPositionCount ())) {
73- for (int p = 0 ; p < block .getPositionCount (); p ++) {
74- int count = block .getValueCount (p );
75- int first = block .getFirstValueIndex (p );
76- switch (count ) {
77- case 0 -> builder .appendNull ();
78- case 1 -> builder .appendLong (block .getLong (first ));
79- default -> {
80- /*
81- * It's better to copyMissing when there are few unique values
82- * and better to copy and sort when there are many unique values.
83- * The more duplicate values there are the more comparatively worse
84- * copyAndSort is. But we don't know how many unique values there
85- * because our job is to find them. So we use the count of values
86- * as a proxy that is fast to test. It's not always going to be
87- * optimal but it has the nice property of being quite quick on
88- * short lists and not n^2 levels of terrible on long ones.
89- *
90- * It'd also be possible to make a truly hybrid mechanism that
91- * switches from copyMissing to copyUnique once it collects enough
92- * unique values. The trouble is that the switch is expensive and
93- * makes kind of a "hole" in the performance of that mechanism where
94- * you may as well have just gone with either of the two other
95- * strategies. So we just don't try it for now.
96- */
97- if (count < ALWAYS_COPY_MISSING ) {
98- copyMissing (first , count );
99- writeUniquedWork (builder );
100- } else {
101- copyAndSort (first , count );
102- deduplicatedSortedWork (builder );
103- }
104- }
105- }
106- }
107- return builder .build ();
108- }
109- }
110-
111- /**
112- * Remove duplicate values from each position and write the results to a
113- * {@link Block} using an algorithm with very low overhead but {@code n^2}
114- * case complexity for larger. Prefer {@link #dedupeToBlockAdaptive}
115- * which picks based on the number of elements at each position.
116- */
117- public LongBlock dedupeToBlockUsingCopyAndSort (BlockFactory blockFactory ) {
118- if (block .mvDeduplicated ()) {
119- block .incRef ();
120- return block ;
121- }
122- try (LongBlock .Builder builder = blockFactory .newLongBlockBuilder (block .getPositionCount ())) {
123- for (int p = 0 ; p < block .getPositionCount (); p ++) {
124- int count = block .getValueCount (p );
125- int first = block .getFirstValueIndex (p );
126- switch (count ) {
127- case 0 -> builder .appendNull ();
128- case 1 -> builder .appendLong (block .getLong (first ));
129- default -> {
130- copyAndSort (first , count );
131- deduplicatedSortedWork (builder );
132- }
133- }
134- }
135- return builder .build ();
136- }
137- }
138-
139- /**
140- * Remove duplicate values from each position and write the results to a
141- * {@link Block} using an algorithm that sorts all values. It has a higher
142- * overhead for small numbers of values at each position than
143- * {@link #dedupeToBlockUsingCopyMissing} for large numbers of values the
144- * performance is dominated by the {@code n*log n} sort. Prefer
145- * {@link #dedupeToBlockAdaptive} unless you need the results sorted.
146- */
147- public LongBlock dedupeToBlockUsingCopyMissing (BlockFactory blockFactory ) {
148- if (block .mvDeduplicated ()) {
149- block .incRef ();
150- return block ;
151- }
152- try (LongBlock .Builder builder = blockFactory .newLongBlockBuilder (block .getPositionCount ())) {
153- for (int p = 0 ; p < block .getPositionCount (); p ++) {
154- int count = block .getValueCount (p );
155- int first = block .getFirstValueIndex (p );
156- switch (count ) {
157- case 0 -> builder .appendNull ();
158- case 1 -> builder .appendLong (block .getLong (first ));
159- default -> {
160- copyMissing (first , count );
161- writeUniquedWork (builder );
162- }
163- }
164- }
165- return builder .build ();
166- }
167- }
168-
169- /**
170- * Sort values from each position and write the results to a {@link Block}.
171- */
172- public LongBlock sortToBlock (BlockFactory blockFactory , boolean ascending ) {
173- try (LongBlock .Builder builder = blockFactory .newLongBlockBuilder (block .getPositionCount ())) {
174- for (int p = 0 ; p < block .getPositionCount (); p ++) {
175- int count = block .getValueCount (p );
176- int first = block .getFirstValueIndex (p );
177- switch (count ) {
178- case 0 -> builder .appendNull ();
179- case 1 -> builder .appendLong (block .getLong (first ));
180- default -> {
181- copyAndSort (first , count );
182- writeSortedWork (builder , ascending );
183- }
184- }
185- }
186- return builder .build ();
187- }
188- }
189-
19066 /**
19167 * Dedupe values, add them to the hash, and build an {@link IntBlock} of
19268 * their hashes. This block is suitable for passing as the grouping block
@@ -200,7 +76,7 @@ public MultivalueDedupe.HashResult hashAdd(BlockFactory blockFactory, LongHash h
20076 int first = block .getFirstValueIndex (p );
20177 switch (count ) {
20278 case 0 -> {
203- if (hasNull ) {
79+ if (acceptNulls ) {
20480 sawNull = true ;
20581 builder .appendInt (0 );
20682 } else {
@@ -237,7 +113,7 @@ public IntBlock hashLookup(BlockFactory blockFactory, LongHash hash) {
237113 int first = block .getFirstValueIndex (p );
238114 switch (count ) {
239115 case 0 -> {
240- if (hasNull ) {
116+ if (acceptNulls ) {
241117 builder .appendInt (0 );
242118 } else {
243119 builder .appendNull ();
@@ -262,68 +138,6 @@ public IntBlock hashLookup(BlockFactory blockFactory, LongHash hash) {
262138 }
263139 }
264140
265- /**
266- * Build a {@link BatchEncoder} which deduplicates values at each position
267- * and then encodes the results into a {@link byte[]} which can be used for
268- * things like hashing many fields together.
269- */
270- public BatchEncoder batchEncoder (int batchSize ) {
271- block .incRef ();
272- return new BatchEncoder .Longs (batchSize ) {
273- @ Override
274- protected void readNextBatch () {
275- int position = firstPosition ();
276- if (w > 0 ) {
277- // The last block didn't fit so we have to *make* it fit
278- ensureCapacity (w );
279- startPosition ();
280- encodeUniquedWork (this );
281- endPosition ();
282- position ++;
283- }
284- for (; position < block .getPositionCount (); position ++) {
285- int count = block .getValueCount (position );
286- int first = block .getFirstValueIndex (position );
287- switch (count ) {
288- case 0 -> encodeNull ();
289- case 1 -> {
290- long v = block .getLong (first );
291- if (hasCapacity (1 )) {
292- startPosition ();
293- encode (v );
294- endPosition ();
295- } else {
296- work [0 ] = v ;
297- w = 1 ;
298- return ;
299- }
300- }
301- default -> {
302- if (count < ALWAYS_COPY_MISSING ) {
303- copyMissing (first , count );
304- } else {
305- copyAndSort (first , count );
306- convertSortedWorkToUnique ();
307- }
308- if (hasCapacity (w )) {
309- startPosition ();
310- encodeUniquedWork (this );
311- endPosition ();
312- } else {
313- return ;
314- }
315- }
316- }
317- }
318- }
319-
320- @ Override
321- public void close () {
322- block .decRef ();
323- }
324- };
325- }
326-
327141 /**
328142 * Copies all values from the position into {@link #work} and then
329143 * sorts them in {@code n * log(n)} time.
@@ -378,52 +192,6 @@ void copyMissing(int first, int count) {
378192 }
379193 }
380194
381- /**
382- * Writes an already deduplicated {@link #work} to a {@link LongBlock.Builder}.
383- */
384- private void writeUniquedWork (LongBlock .Builder builder ) {
385- if (w == 1 ) {
386- builder .appendLong (work [0 ]);
387- return ;
388- }
389- builder .beginPositionEntry ();
390- for (int i = 0 ; i < w ; i ++) {
391- builder .appendLong (work [i ]);
392- }
393- builder .endPositionEntry ();
394- }
395-
396- /**
397- * Writes a sorted {@link #work} to a {@link LongBlock.Builder}, skipping duplicates.
398- */
399- private void deduplicatedSortedWork (LongBlock .Builder builder ) {
400- builder .beginPositionEntry ();
401- long prev = work [0 ];
402- builder .appendLong (prev );
403- for (int i = 1 ; i < w ; i ++) {
404- if (prev != work [i ]) {
405- prev = work [i ];
406- builder .appendLong (prev );
407- }
408- }
409- builder .endPositionEntry ();
410- }
411-
412- /**
413- * Writes a {@link #work} to a {@link LongBlock.Builder}.
414- */
415- private void writeSortedWork (LongBlock .Builder builder , boolean ascending ) {
416- builder .beginPositionEntry ();
417- for (int i = 0 ; i < w ; i ++) {
418- if (ascending ) {
419- builder .appendLong (work [i ]);
420- } else {
421- builder .appendLong (work [w - i - 1 ]);
422- }
423- }
424- builder .endPositionEntry ();
425- }
426-
427195 /**
428196 * Writes an already deduplicated {@link #work} to a hash.
429197 */
@@ -607,30 +375,6 @@ private void hashLookupSortedWork(LongHash hash, IntBlock.Builder builder) {
607375 builder .endPositionEntry ();
608376 }
609377
610- /**
611- * Writes a deduplicated {@link #work} to a {@link BatchEncoder.Longs}.
612- */
613- private void encodeUniquedWork (BatchEncoder .Longs encoder ) {
614- for (int i = 0 ; i < w ; i ++) {
615- encoder .encode (work [i ]);
616- }
617- }
618-
619- /**
620- * Converts {@link #work} from sorted array to a deduplicated array.
621- */
622- private void convertSortedWorkToUnique () {
623- long prev = work [0 ];
624- int end = w ;
625- w = 1 ;
626- for (int i = 1 ; i < end ; i ++) {
627- if (false == valuesEqual (prev , work [i ])) {
628- prev = work [i ];
629- work [w ++] = prev ;
630- }
631- }
632- }
633-
634378 private void grow (int size ) {
635379 work = ArrayUtil .grow (work , size );
636380 }
0 commit comments