Skip to content

Commit 2e14d77

Browse files
committed
Sparse support in arrow v1
1 parent 332dff7 commit 2e14d77

File tree

10 files changed

+388
-263
lines changed

10 files changed

+388
-263
lines changed

deps.edn

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -244,6 +244,13 @@
244244
"--illegal-access=permit" "--add-opens=java.base/java.lang=ALL-UNNAMED"
245245
"--add-opens=java.base/java.util.concurrent.atomic=ALL-UNNAMED"
246246
"--add-opens=java.base/java.util.concurrent=ALL-UNNAMED"
247+
"--add-opens=java.base/java.lang=ALL-UNNAMED"
248+
"--add-opens=java.base/java.math=ALL-UNNAMED"
249+
"--add-opens=java.base/jdk.internal.foreign=ALL-UNNAMED"
250+
"--add-opens=java.base/sun.nio.ch=ALL-UNNAMED"
251+
"--add-opens=java.base/java.io=ALL-UNNAMED"
252+
"--add-opens=java.base/java.util=ALL-UNNAMED"
253+
"--add-opens=java.base/java.lang.ref=ALL-UNNAMED"
247254
]
248255
}
249256

Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,79 @@
1+
package tech.v3.dataset;
2+
3+
import clojure.lang.RT;
4+
import clojure.lang.IDeref;
5+
import java.util.function.LongConsumer;
6+
import ham_fisted.ArrayLists;
7+
import org.roaringbitmap.RoaringBitmap;
8+
9+
10+
public class ByteValidity {
11+
public static int trimIndexes(int[] indexes, int nIndexes, long maxIdx) {
12+
while(nIndexes > 0) {
13+
if(Integer.toUnsignedLong(indexes[nIndexes-1]) >= maxIdx)
14+
--nIndexes;
15+
else
16+
break;
17+
}
18+
return nIndexes;
19+
}
20+
public static abstract class ValidityBase implements LongConsumer, IDeref {
21+
long nElems;
22+
int idx;
23+
int nIndexes;
24+
int[] indexes;
25+
public ValidityBase(long nElems, long maxIndexes) {
26+
this.nElems = nElems;
27+
indexes = new int[(int)maxIndexes];
28+
nIndexes = 0;
29+
idx = 0;
30+
}
31+
}
32+
public static class ValidityIndexReducer extends ValidityBase {
33+
public ValidityIndexReducer(long nElems, long maxIndexes) {
34+
super(nElems, maxIndexes);
35+
}
36+
public void accept(long value) {
37+
if(value != 0) {
38+
int intVal = (int)value;
39+
int offset = idx * 8;
40+
if( (intVal & 1) == 1) indexes[nIndexes++] = offset;
41+
if( (intVal & 2) == 2) indexes[nIndexes++] = offset+1;
42+
if( (intVal & 4) == 4) indexes[nIndexes++] = offset+2;
43+
if( (intVal & 8) == 8) indexes[nIndexes++] = offset+3;
44+
if( (intVal & 16) == 16) indexes[nIndexes++] = offset+4;
45+
if( (intVal & 32) == 32) indexes[nIndexes++] = offset+5;
46+
if( (intVal & 64) == 64) indexes[nIndexes++] = offset+6;
47+
if( (intVal & 128) == 128) indexes[nIndexes++] = offset+7;
48+
}
49+
++idx;
50+
}
51+
public Object deref() {
52+
return ArrayLists.toList(indexes).subList(0, trimIndexes(indexes, nIndexes, nElems)); }
53+
}
54+
public static class MissingIndexReducer extends ValidityBase {
55+
public MissingIndexReducer(long nElems, long maxIndexes) {
56+
super(nElems, maxIndexes);
57+
}
58+
public void accept(long value) {
59+
if(value != -1) {
60+
int intVal = (int)value;
61+
int offset = idx * 8;
62+
if( (intVal & 1) != 1) indexes[nIndexes++] = offset;
63+
if( (intVal & 2) != 2) indexes[nIndexes++] = offset+1;
64+
if( (intVal & 4) != 4) indexes[nIndexes++] = offset+2;
65+
if( (intVal & 8) != 8) indexes[nIndexes++] = offset+3;
66+
if( (intVal & 16) != 16) indexes[nIndexes++] = offset+4;
67+
if( (intVal & 32) != 32) indexes[nIndexes++] = offset+5;
68+
if( (intVal & 64) != 64) indexes[nIndexes++] = offset+6;
69+
if( (intVal & 128) != 128) indexes[nIndexes++] = offset+7;
70+
}
71+
++idx;
72+
}
73+
public Object deref() {
74+
RoaringBitmap rb = new RoaringBitmap();
75+
rb.addN(indexes, 0, trimIndexes(indexes, nIndexes, nElems));
76+
return rb;
77+
}
78+
}
79+
}

src/tech/v3/dataset/base.clj

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -858,9 +858,7 @@
858858
(throw (Exception.
859859
(format "Column %s does not have :string datatype"
860860
(ds-col/column-name col)))))
861-
(if (not (instance? StringTable (.data ^Column col)))
862-
(str-table/string-table-from-strings col)
863-
(.data ^Column col)))
861+
(str-table/->string-table (ds-proto/column-data col)))
864862

865863

866864
(defn ensure-dataset-string-tables

src/tech/v3/dataset/impl/column.clj

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@
3030
(set! *unchecked-math* :warn-on-boxed)
3131

3232

33-
(declare new-column)
33+
(declare new-column construct-column)
3434

3535

3636
(defn ->persistent-map
@@ -381,6 +381,8 @@
381381
(row-count [this] (dtype/ecount data))
382382
ds-proto/PMissing
383383
(missing [this] missing)
384+
(num-missing [this] (dtype/ecount missing))
385+
(any-missing? [this] (boolean (not (.isEmpty missing))))
384386
ds-proto/PValidRows
385387
(valid-rows [this] (.xor (->bitmap (hamf/range (dtype/ecount this))) missing))
386388
ds-proto/PSelectRows
@@ -400,6 +402,8 @@
400402
(is-column? [_this] true)
401403
(column-buffer [_this] data)
402404
(empty-column? [this] (== (dtype/ecount this) (dtype/ecount missing)))
405+
(column-data [_this] data)
406+
(with-column-data [this new-data] (construct-column missing new-data metadata))
403407
ds-proto/PColumnName
404408
(column-name [this] (get metadata :name))
405409
IMutList

src/tech/v3/dataset/impl/dataset.clj

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -254,6 +254,10 @@
254254
ds-proto/PMissing
255255
(missing [this]
256256
(RoaringBitmap/or (.iterator ^Iterable (lznc/map ds-proto/missing (.values this)))))
257+
(num-missing [this]
258+
(dtype/ecount (ds-proto/missing this)))
259+
(any-missing? [this]
260+
(boolean (some ds-proto/any-missing? (.values this))))
257261

258262
ds-proto/PValidRows
259263
(valid-rows [this]

src/tech/v3/dataset/impl/sparse_column.clj

Lines changed: 18 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
[tech.v3.datatype.pprint :as dtype-pp]
88
[tech.v3.dataset.protocols :as ds-proto]
99
[tech.v3.dataset.impl.column :as col-impl]
10+
[tech.v3.dataset.string-table :as str-t]
1011
[ham-fisted.set :as set]
1112
[ham-fisted.api :as hamf]
1213
[ham-fisted.reduce :as hamf-rf]
@@ -18,7 +19,7 @@
1819

1920
(set! *warn-on-reflection* true)
2021
(set! *unchecked-math* :warn-on-boxed)
21-
(declare scol->reader reduce-scol kv-reduce-scol)
22+
(declare scol->reader reduce-scol kv-reduce-scol construct-sparse-col)
2223

2324
(defn- index-bin-search
2425
^clojure.lang.IFn$LL [^IMutList indexes]
@@ -87,12 +88,16 @@
8788
(let [miss-data (set/difference (bitmap/->bitmap (hamf/range rc)) (bitmap/->bitmap indexes))]
8889
(set! (.-msng this) miss-data)
8990
miss-data))))
91+
(num-missing [this] (- rc (.size indexes)))
92+
(any-missing? [this] (not (== rc (.size indexes))))
9093
ds-proto/PValidRows
9194
(valid-rows [this] indexes)
9295
ds-proto/PColumn
9396
(is-column? [this] true)
9497
(column-buffer [this] [indexes data])
9598
(empty-column? [this] (.isEmpty indexes))
99+
(column-data [this] data)
100+
(with-column-data [this new-data] (construct-sparse-col indexes new-data rc metadata))
96101
ds-proto/PColumnName
97102
(column-name [this] (get metadata :name))
98103
ds-proto/PRowCount
@@ -280,9 +285,6 @@
280285
nil valid-indexes)
281286
(SparseCol. valid-indexes data rc (meta col) nil nil))))
282287

283-
(defn is-scol? [sc] (instance? SparseCol sc))
284-
(defn as-scol ^SparseCol [sc] (when (instance? SparseCol sc) sc))
285-
286288
(defn scol->reader
287289
(^Buffer [^SparseCol scol] (scol->reader scol 0 (.-rc scol)))
288290
(^Buffer [^SparseCol scol ^long sidx ^long eidx]
@@ -356,14 +358,18 @@
356358
(ChunkedList/sublistCheck ssidx seidx rc)
357359
(sparse-rows ds (+ sidx ssidx) (+ sidx seidx)))))))
358360

359-
(defn ds->sparse-ds
360-
[ds]
361-
(let [rc (long (ds-proto/row-count ds))]
362-
(reduce (fn [ds col]
363-
(if (> (dt/ecount (ds/missing col)) (quot rc 2))
364-
(assoc ds (ds-proto/column-name col) (->scol col))
365-
ds))
366-
ds (.values ^java.util.Map ds))))
361+
(defn ->sparse-ds
362+
([ds] (->sparse-ds ds 0.5))
363+
([ds missing-frac]
364+
(let [rc (long (ds-proto/row-count ds))
365+
missing-cutoff (long (* rc (double missing-frac)))]
366+
(reduce (fn [ds col]
367+
(if (>= (dt/ecount (ds-proto/missing col)) missing-cutoff)
368+
(assoc ds (ds-proto/column-name col) (->scol col))
369+
ds))
370+
ds (.values ^java.util.Map ds)))))
371+
372+
(defn is-sparse? [col] (instance? SparseCol col))
367373

368374
(comment
369375
(require '[tech.v3.dataset :as ds])

src/tech/v3/dataset/protocols.clj

Lines changed: 13 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,12 @@
1414
(^long column-count [this]))
1515

1616
(defprotocol PMissing
17-
(missing [this]))
17+
(missing [this])
18+
(num-missing [this])
19+
(any-missing? [this]))
20+
21+
(defn missing-count
22+
^long [this] (num-missing this))
1823

1924
(defprotocol PValidRows
2025
(valid-rows [this]))
@@ -30,9 +35,13 @@
3035

3136
(defprotocol PColumn
3237
(is-column? [col])
33-
(column-buffer [col])
34-
(empty-column? [col]))
35-
38+
(column-buffer [col]
39+
"Return either the backing dataset for dense columns or tuple of [indexes,data] for sparse columns")
40+
(empty-column? [col])
41+
(column-data [col]
42+
"Return the backing data store. Note for sparse columns this may be much shorter than the column")
43+
(with-column-data [col new-data]
44+
"new data must be same length as old data"))
3645

3746
(defprotocol PDataset
3847
(is-dataset? [item])

src/tech/v3/dataset/string_table.clj

Lines changed: 16 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -189,24 +189,28 @@
189189
(StringTable. (hamf/vec int->str) (.clone str->int)
190190
(compress-indexes indexes (long (.size str->int))))))
191191

192-
(defn fast-string-container []
193-
(let [str->int (hamf/java-hashmap)
194-
int->str (ArrayList.)
195-
_ (do (.put str->int "" 0)
196-
(.add int->str ""))]
197-
(FastStringContainer. (hamf/long-array-list)
198-
int->str str->int)))
192+
(defn fast-string-container
193+
([str->int int->str]
194+
(FastStringContainer. (hamf/long-array-list)
195+
int->str str->int))
196+
([]
197+
(let [str->int (hamf/java-hashmap)
198+
int->str (ArrayList.)]
199+
(.put str->int "" 0)
200+
(.add int->str "")
201+
(fast-string-container str->int int->str))))
199202

200203
(defn string-table-from-strings
201-
[str-data]
202-
(hamf-rf/reduce-reducer (hamf-rf/consumer-reducer fast-string-container) str-data))
204+
([str-data] (string-table-from-strings (fast-string-container) str-data))
205+
([fast-string-container str-data]
206+
(hamf-rf/reduce-reducer (hamf-rf/consumer-reducer (constantly fast-string-container)) str-data)))
203207

204208

205209
(defn ->string-table
206210
^StringTable [str-t]
207-
(errors/when-not-errorf (instance? StringTable str-t)
208-
"string table is wrong type: %s" str-t)
209-
str-t)
211+
(if (instance? StringTable str-t)
212+
str-t
213+
(string-table-from-strings str-t)))
210214

211215

212216
(defn indices

0 commit comments

Comments
 (0)