File tree Expand file tree Collapse file tree 3 files changed +28
-18
lines changed
Expand file tree Collapse file tree 3 files changed +28
-18
lines changed Original file line number Diff line number Diff line change 11# Changelog
2+ # 7.037
3+ * Nippy loading is about 2x faster in the case of large string tables.
4+ * Arrow read pathways support `:text-as-strings?` to mirror `:strings-as-text?` on the write side, so you can save out uncompressed data in the fastest-to-read format.
5+
26# 7.036
37 * Major optimization (>9x!) of arrow file loading when large string tables/dictionaries are used.
48
Original file line number Diff line number Diff line change 918918 string-data ^bytes string-data
919919 offsets (dtype/->buffer offsets)
920920 n-elems (dec (.lsize offsets))
921- ^IMutList int->str
922- (->> (dtype/make-reader
923- :string n-elems
924- (let [start-off (.readLong offsets idx)
925- end-off (.readLong offsets (inc idx))]
926- (String. string-data start-off (- end-off start-off))))
927- (dtype/make-container :list :string ))
928- str->int (HashMap. (dtype/ecount int->str))]
929- (dotimes [idx n-elems]
930- (.put str->int (.get int->str idx) idx))
931- (StringTable. int->str str->int int-data))
921+ str-rdr (dtype/make-reader
922+ :string n-elems
923+ (let [start-off (.readLong offsets idx)
924+ end-off (.readLong offsets (inc idx))]
925+ (String. string-data start-off (- end-off start-off))))
926+ str-ary (hamf/object-array n-elems)]
927+ (hamf/pgroups n-elems (fn [^long sidx ^long eidx]
928+ (loop [idx sidx]
929+ (when (< idx eidx)
930+ (let [start-off (.readLong offsets idx)
931+ end-off (.readLong offsets (inc idx))]
932+ (aset str-ary idx (String. string-data start-off (- end-off start-off))))
933+ (recur (inc idx))))))
934+ (StringTable. (ham_fisted.ArrayLists/toList str-ary) nil int-data))
932935 (= version 2 )
933936 (let [^List int->str (dtype-list/wrap-container string-table)
934937 str->int (HashMap. (dtype/ecount int->str))
Original file line number Diff line number Diff line change @@ -1313,7 +1313,7 @@ Dependent block frames are not supported!!")
13131313
13141314
13151315(defn- string-data->column-data
1316- [dict-map encoding offset-buf-dtype buffers n-elems]
1316+ [dict-map encoding offset-buf-dtype buffers n-elems options ]
13171317 (if encoding
13181318 (StringTable. (get-in dict-map [(:id encoding) :strings ])
13191319 nil
@@ -1322,11 +1322,13 @@ Dependent block frames are not supported!!")
13221322 (get-in encoding [:index-type :datatype ]))
13231323 (native-buffer/->jvm-array 0 n-elems)
13241324 (dyn-int-list/make-from-container )))
1325- (let [[offsets varchar-data] buffers]
1326- (-> (offsets-data->string-reader (native-buffer/set-native-datatype
1327- offsets offset-buf-dtype)
1328- varchar-data n-elems)
1329- (string-reader->text-reader )))))
1325+ (let [[offsets varchar-data] buffers
1326+ str-rdr (offsets-data->string-reader (native-buffer/set-native-datatype
1327+ offsets offset-buf-dtype)
1328+ varchar-data n-elems)]
;; Unless :text-as-strings? is set in options, pass the string reader through
;; string-reader->text-reader (the pre-change behavior); otherwise return the
;; string reader itself. The reader must be supplied explicitly here because
;; the `->` threading form that used to provide it was removed.
1329+ (if-not (:text-as-strings? options)
1330+ (string-reader->text-reader str-rdr)
1331+ str-rdr))))
13301332
13311333
13321334(defn- int8-buf->missing
@@ -1415,7 +1417,8 @@ Dependent block frames are not supported!!")
14151417 dict-map encoding
14161418 (get-in field [:field-type :offset-buffer-datatype ])
14171419 data-buffers
1418- (:n-elems node))
1420+ (:n-elems node)
1421+ options)
14191422 (field-metadata field)
14201423 (node-buf->missing node validity-buf))))
14211424
You can’t perform that action at this time.
0 commit comments