Skip to content

Commit 0c72eb2

Browse files
committed
Allow indexed data to be a correctly-sized array-list.
1 parent 60a499c commit 0c72eb2

File tree

3 files changed

+61
-21
lines changed

3 files changed

+61
-21
lines changed

java/tech/v3/dataset/ByteValidity.java

Lines changed: 26 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -4,6 +4,7 @@
44
import clojure.lang.IDeref;
55
import java.util.function.LongConsumer;
66
import ham_fisted.ArrayLists;
7+
import ham_fisted.IMutList;
78
import org.roaringbitmap.RoaringBitmap;
89

910

@@ -29,27 +30,40 @@ public ValidityBase(long nElems, long maxIndexes) {
2930
idx = 0;
3031
}
3132
}
32-
public static class ValidityIndexReducer extends ValidityBase {
33-
public ValidityIndexReducer(long nElems, long maxIndexes) {
34-
super(nElems, maxIndexes);
33+
public static class ValidityIndexReducer implements LongConsumer, IDeref {
34+
IMutList indexes;
35+
public final long maxIdx;
36+
int idx;
37+
public ValidityIndexReducer(IMutList indexes, long maxIdx) {
38+
this.indexes = indexes;
39+
this.maxIdx = maxIdx;
40+
this.idx = 0;
41+
}
42+
public int trimIndexes() {
43+
int nIndexes = indexes.size();
44+
//empty loop intentional
45+
for(;nIndexes > 0 && indexes.getLong(nIndexes-1) >= maxIdx; --nIndexes);
46+
47+
return nIndexes;
3548
}
3649
public void accept(long value) {
3750
if(value != 0) {
3851
int intVal = (int)value;
3952
int offset = idx * 8;
40-
if( (intVal & 1) == 1) indexes[nIndexes++] = offset;
41-
if( (intVal & 2) == 2) indexes[nIndexes++] = offset+1;
42-
if( (intVal & 4) == 4) indexes[nIndexes++] = offset+2;
43-
if( (intVal & 8) == 8) indexes[nIndexes++] = offset+3;
44-
if( (intVal & 16) == 16) indexes[nIndexes++] = offset+4;
45-
if( (intVal & 32) == 32) indexes[nIndexes++] = offset+5;
46-
if( (intVal & 64) == 64) indexes[nIndexes++] = offset+6;
47-
if( (intVal & 128) == 128) indexes[nIndexes++] = offset+7;
53+
if( (intVal & 1) == 1) indexes.addLong(offset);
54+
if( (intVal & 2) == 2) indexes.addLong(offset+1);
55+
if( (intVal & 4) == 4) indexes.addLong(offset+2);
56+
if( (intVal & 8) == 8) indexes.addLong(offset+3);
57+
if( (intVal & 16) == 16) indexes.addLong(offset+4);
58+
if( (intVal & 32) == 32) indexes.addLong(offset+5);
59+
if( (intVal & 64) == 64) indexes.addLong(offset+6);
60+
if( (intVal & 128) == 128) indexes.addLong(offset+7);
4861
}
4962
++idx;
5063
}
5164
public Object deref() {
52-
return ArrayLists.toList(indexes).subList(0, trimIndexes(indexes, nIndexes, nElems)); }
65+
return indexes.subList(0, trimIndexes());
66+
}
5367
}
5468
public static class MissingIndexReducer extends ValidityBase {
5569
public MissingIndexReducer(long nElems, long maxIndexes) {

src/tech/v3/dataset/impl/sparse_column.clj

Lines changed: 25 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -8,14 +8,16 @@
88
[tech.v3.dataset.protocols :as ds-proto]
99
[tech.v3.dataset.impl.column :as col-impl]
1010
[tech.v3.dataset.string-table :as str-t]
11+
[tech.v3.dataset.dynamic-int-list :as int-list]
1112
[ham-fisted.set :as set]
1213
[ham-fisted.api :as hamf]
1314
[ham-fisted.reduce :as hamf-rf]
1415
[ham-fisted.function :as hamf-fn]
1516
[ham-fisted.iterator :as hamf-iter])
1617
(:import [ham_fisted IMutList ArrayLists ITypedReduce ChunkedList]
1718
[tech.v3.datatype Buffer ObjectReader ElemwiseDatatype LongReader DoubleReader]
18-
[java.util Arrays Iterator Map]))
19+
[java.util Arrays Iterator Map List]
20+
[tech.v3.dataset.string_table StringTable]))
1921

2022
(set! *warn-on-reflection* true)
2123
(set! *unchecked-math* :warn-on-boxed)
@@ -30,8 +32,8 @@
3032
idx (Arrays/binarySearch ^bytes idx-ary (unchecked-byte idx)))
3133
:int16 (hamf-fn/long-unary-operator
3234
idx (Arrays/binarySearch ^shorts idx-ary (unchecked-short idx)))
33-
:int32 (hamf-fn/long-unary-operator
34-
idx (Arrays/binarySearch ^shorts idx-ary (unchecked-short idx)))
35+
:int32 (hamf-fn/long-unary-operator
36+
idx (Arrays/binarySearch ^ints idx-ary (unchecked-int idx)))
3537
(hamf-fn/long-unary-operator
3638
idx (Arrays/binarySearch ^longs idx-ary (unchecked-long idx))))))
3739

@@ -258,6 +260,11 @@
258260
^SparseCol [indexes data ^long rc metadata]
259261
(SparseCol. indexes data rc metadata nil nil))
260262

263+
(defn- as-string-table
264+
^StringTable [d] (when (and (instance? StringTable d)
265+
(.-str->int ^StringTable d))
266+
d))
267+
261268
(defn ->scol
262269
^SparseCol [col]
263270
(cond
@@ -277,11 +284,21 @@
277284
:else
278285
(hamf/long-array valid-indexes)))
279286
col-dt (dt/elemwise-datatype col)
280-
buf-rdr (dt/->reader col)
281-
data (dt/make-container col-dt (.size valid-indexes))
282-
dst (dt/->buffer data)]
283-
(reduce (hamf-rf/indexed-long-accum _acc dst-idx src-idx
284-
(.writeObject dst dst-idx (.readObject buf-rdr src-idx)))
287+
buf-rdr (dt/->reader (ds-proto/column-data col))
288+
^IMutList data (if (identical? :string col-dt)
289+
(if-let [^StringTable strt (as-string-table (ds-proto/column-data col))]
290+
(StringTable. (.-int->str strt)
291+
(.-str->int strt)
292+
(int-list/dynamic-int-list (dt/ecount valid-indexes)))
293+
(str-t/make-string-table))
294+
(dt/make-list col-dt (.size valid-indexes)))]
295+
(reduce (case (tech.v3.datatype.casting/simple-operation-space col-dt)
296+
:int64 (fn [_acc ^long src-idx]
297+
(.addLong data (.readLong buf-rdr src-idx)))
298+
:float64 (fn [_acc ^long src-idx]
299+
(.addDouble data (.readDouble buf-rdr src-idx)))
300+
(fn [_acc ^long src-idx]
301+
(.add data (.readObject buf-rdr src-idx))))
285302
nil valid-indexes)
286303
(SparseCol. valid-indexes data rc (meta col) nil nil))))
287304

src/tech/v3/libs/arrow.clj

Lines changed: 10 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1513,7 +1513,16 @@ Dependent block frames are not supported!!")
15131513
(->> validity
15141514
(hamf-rf/reduce-reducer
15151515
(hamf-rf/long-consumer-reducer
1516-
#(tech.v3.dataset.ByteValidity$ValidityIndexReducer. n-elems (* 8 (dtype/ecount validity)))))))
1516+
#(tech.v3.dataset.ByteValidity$ValidityIndexReducer. (cond
1517+
(< n-elems Byte/MAX_VALUE)
1518+
(hamf/byte-array-list)
1519+
(< n-elems Short/MAX_VALUE)
1520+
(hamf/short-array-list)
1521+
(< n-elems Integer/MAX_VALUE)
1522+
(hamf/int-array-list)
1523+
:else
1524+
(hamf/long-array-list))
1525+
n-elems)))))
15171526

15181527
(defn ^:no-doc byte-buffer->bitwise-boolean-buffer
15191528
^Buffer[bitbuffer ^long n-elems]

0 commit comments

Comments
 (0)