119119 [org.apache.arrow.vector.types.pojo Field Schema ArrowType$Int
120120 ArrowType$Utf8 ArrowType$Timestamp ArrowType$Time DictionaryEncoding FieldType
121121 ArrowType$FloatingPoint ArrowType$Bool ArrowType$Date ArrowType$Duration
122- ArrowType$LargeUtf8 ArrowType$Null ArrowType$List ArrowType$Binary]
122+ ArrowType$LargeUtf8 ArrowType$Null ArrowType$List ArrowType$Binary ArrowType$FixedSizeBinary ]
123123 [org.apache.arrow.flatbuf CompressionType]
124124 [org.apache.arrow.vector.types MetadataVersion]
125125 [org.apache.arrow.vector.ipc WriteChannel]
132132 [java.io OutputStream InputStream ByteArrayOutputStream ByteArrayInputStream]
133133 [java.nio ByteBuffer ByteOrder ShortBuffer IntBuffer LongBuffer DoubleBuffer
134134 FloatBuffer]
135- [java.util List ArrayList Map HashMap Map$Entry Iterator Set]
135+ [java.util List ArrayList Map HashMap Map$Entry Iterator Set UUID ]
136136 [java.util.concurrent ForkJoinTask]
137137 [java.time ZoneId]
138138 [java.nio.channels WritableByteChannel]
@@ -435,6 +435,9 @@ Dependent block frames are not supported!!")
435435 (datafy [this] {:id (.getId this)
436436 :ordered? (.isOrdered this)
437437 :index-type (datafy (.getIndexType this))})
438+ ArrowType$FixedSizeBinary
439+ (datafy [this] {:datatype :fixed-size-binary
440+ :byte-width (.getByteWidth this)})
438441 ArrowType$List
439442 (datafy [this]
440443 {:datatype :list }))
@@ -710,10 +713,7 @@ Dependent block frames are not supported!!")
710713 (ft-fn (ArrowType$Utf8. ) encoding)
711714 ; ;If no encoding is provided then just save the string as text
712715 (ft-fn (ArrowType$Utf8. )))
713- :uuid (do
714- (when (== 1 (long (swap! uuid-warn-counter inc)))
715- (log/warn " Columns of type UUID are converted to type text when serializing to Arrow" ))
716- (ft-fn (ArrowType$Utf8. )))
716+ :uuid (ft-fn (ArrowType$FixedSizeBinary. 16 ))
717717 :text (ft-fn (ArrowType$Utf8. ))
718718 :encoded-text (ft-fn (ArrowType$Utf8. ))))))
719719
@@ -722,6 +722,9 @@ Dependent block frames are not supported!!")
722722 ^Field [dictionaries {strings-as-text? :strings-as-text? }
723723 col]
724724 (let [colmeta (meta col)
725+ colmeta (if (identical? :uuid (get colmeta :datatype ))
726+ (assoc colmeta ARROW_EXTENSION_NAME ARROW_UUID_NAME)
727+ colmeta)
725728 nullable? (boolean
726729 (or (:nullable? colmeta)
727730 (not (empty? (ds-proto/missing col)))))
@@ -1225,6 +1228,18 @@ Dependent block frames are not supported!!")
12251228 (throw (Exception. " Numeric buffer missing concrete representation" ))))])
12261229 (case col-dt
12271230 :boolean [(boolean-bytes cbuf)]
1231+ :uuid (let [data (byte-array (* 16 (dtype/ecount cbuf)))
1232+ wbuf (-> (java.nio.ByteBuffer/wrap data)
1233+ (.order java.nio.ByteOrder/BIG_ENDIAN))]
1234+ (reduce (fn [_ ^UUID v]
1235+ (if v
1236+ (do
1237+ (.putLong wbuf (.getMostSignificantBits v))
1238+ (.putLong wbuf (.getLeastSignificantBits v)))
1239+ (do
1240+ (.putLong wbuf 0 ) (.putLong wbuf 0 ))))
1241+ nil cbuf)
1242+ [(java.nio.ByteBuffer/wrap data)])
12281243 :string (let [str-t (ds-base/ensure-column-string-table col)
12291244 indices (dtype-proto/->array-buffer (str-table/indices str-t))]
12301245 [(nio-buffer/as-nio-buffer indices)])
@@ -1633,8 +1648,45 @@ Dependent block frames are not supported!!")
16331648 (field-metadata field)
16341649 (node-buf->missing node validity-buf))))))
16351650
1636- (defmethod ^:private preparse-field :default
;; Field-metadata key under which an extension type's name is stored.
;; The key must match exactly (no surrounding whitespace), otherwise the
;; lookup performed when parsing fixed-size-binary fields never succeeds.
(def ^{:private true
       :tag String} ARROW_EXTENSION_NAME "ARROW:extension:name")
;; Extension-type name used to tag UUID columns; compared with `=` against
;; the value stored under ARROW_EXTENSION_NAME.
(def ^{:private true
       :tag String} ARROW_UUID_NAME "arrow.uuid")
1655+
;; Parser for fields whose datatype is :fixed-size-binary.
;;
;; Eagerly consumes one node from `node-iter` and two buffers from `buf-iter`,
;; then returns a function of `decompressor`.  Calling that function
;; decompresses the two buffers (destructured as validity, then data), copies
;; the data into a JVM byte array and builds a column:
;;   * when the field metadata maps ARROW_EXTENSION_NAME to ARROW_UUID_NAME the
;;     column is a :uuid reader that decodes each 16-byte slot as two
;;     big-endian longs (most-significant bits first);
;;   * otherwise the column is an :object reader whose elements are
;;     `field-width`-sized sub-list views over the data.
;; `dict-map` and `options` are part of the multimethod signature and are not
;; used here.
(defmethod ^:private preparse-field :fixed-size-binary
  [field ^Iterator node-iter ^Iterator buf-iter dict-map options]
  (let [node (.next node-iter)
        ;; Both buffers are pulled now so the iterators advance during the
        ;; preparse pass, not when the returned fn is eventually invoked.
        buffers [(.next buf-iter) (.next buf-iter)]
        n-elems (long (:n-elems node))
        field-width (long (get-in field [:field-type :byte-width]))]
    (fn parse-fixed-binary-field
      [decompressor]
      (let [[validity-buf data-buf] (decompressor buffers)
            ^bytes data-ary (if (instance? NativeBuffer data-buf)
                              (native-buffer/->jvm-array data-buf 0 (dtype/ecount data-buf))
                              (dtype/->array data-buf))
            fm (field-metadata field)]
        (col-impl/new-column
         (:name field)
         (if (= ARROW_UUID_NAME (get fm ARROW_EXTENSION_NAME))
           ;; UUID path: the stride is fixed at 16 bytes regardless of
           ;; `field-width`.
           ;; NOTE(review): assumes a uuid-tagged field always has
           ;; byte-width 16 -- confirm or assert upstream.
           (let [longsdata (-> (java.nio.ByteBuffer/wrap data-ary)
                               (.order java.nio.ByteOrder/BIG_ENDIAN))]
             (dtype/make-reader :uuid n-elems
                                (let [lidx (* idx 16)]
                                  (java.util.UUID. (.getLong longsdata lidx)
                                                   (.getLong longsdata (+ lidx 8))))))
           ;; Generic path: each element is a view (not a copy) of
           ;; `field-width` consecutive bytes.
           (let [ll (ArrayLists/toList data-ary)]
             (dtype/make-reader :object n-elems
                                (let [lidx (* idx field-width)]
                                  (.subList ll lidx (+ lidx field-width))))))
         fm
         (node-buf->missing node validity-buf))))))
1686+
1687+
1688+ (defmethod ^:private preparse-field :default
1689+ [field ^Iterator node-iter ^Iterator buf-iter dict-map options]
16381690 (assert (= 0 (count (:children field)))
16391691 (format " Field %s cannot be parsed with default parser" field))
16401692 (let [field-dtype (get-in field [:field-type :datatype ])
@@ -2094,16 +2146,7 @@ Please use stream->dataset-seq.")))
20942146 ; ;datatypes
20952147 (reduce
20962148 (fn [ds col]
2097- (cond
2098- (= :uuid (dtype/elemwise-datatype col))
2099- (let [missing (ds-proto/missing col)
2100- metadata (meta col)]
2101- (assoc ds (metadata :name )
2102- #:tech.v3.dataset{:data (mapv (comp #(Text. %) str) col)
2103- :missing missing
2104- :metadata metadata
2105- :name (metadata :name )}))
2106- (and (= :string (dtype/elemwise-datatype col))
2149+ (if (and (= :string (dtype/elemwise-datatype col))
21072150 (not (:strings-as-text? options)))
21082151 (if (and (nil? prev-ds)
21092152 (instance? StringTable (.data ^Column col)))
@@ -2131,7 +2174,6 @@ Please use stream->dataset-seq.")))
21312174 :metadata (assoc metadata
21322175 ::previous-string-table prev-str-t)
21332176 :name (metadata :name )})))))
2134- :else
21352177 ds))
21362178 ds
21372179 (ds-base/columns ds))))
0 commit comments