|
24 | 24 | [org.apache.iceberg.rest RESTCatalog] |
25 | 25 | [org.apache.iceberg.aws.s3 S3FileIOProperties] |
26 | 26 | [org.apache.hadoop.conf Configuration] |
27 | | - [org.apache.spark.sql SparkSession]) |
| 27 | + [org.apache.spark.sql SparkSession] |
| 28 | + (org.apache.parquet.avro AvroParquetReader) |
| 29 | + (org.apache.parquet.conf PlainParquetConfiguration) |
| 30 | + (org.apache.parquet.io SeekableInputStream InputFile)) |
28 | 31 | (:gen-class)) |
29 | 32 |
|
;; TCP port the HTTP server listens on — NOTE(review): the server startup
;; code is outside this view; confirm it reads this var.
(def PORT 8090)
|
159 | 162 | json/write-str)] |
160 | 163 | {:body response-body})) |
161 | 164 |
|
(defn avro->json
  "Convert an Avro record to Clojure data by round-tripping it through
  its JSON string representation (GenericRecord.toString emits JSON)."
  [avro]
  (json/read-str (.toString avro)))
| 170 | + |
(defn new-seekable-input-stream
  "Wrap the byte array `in-ba` in a parquet SeekableInputStream backed by
  an in-memory position, supporting read (single byte and ByteBuffer),
  getPos, seek, and both readFully arities.

  Fixes over the previous version:
  - 0-arity read returns -1 at EOF (InputStream contract) instead of
    throwing ArrayIndexOutOfBoundsException.
  - read(ByteBuffer) is implemented (was a :todo stub) and returns the
    number of bytes read, or -1 at EOF.
  - readFully(ByteBuffer) calls ByteBuffer.put(byte[] src, offset, length)
    with an offset into the SOURCE array; the old code passed
    (position + arrayOffset), an offset into the buffer, which throws
    IndexOutOfBoundsException for non-zero positions.
  - Bulk copies use System/arraycopy instead of per-byte loops."
  [in-ba]
  (let [len (alength in-ba)
        pos (atom 0)]
    (letfn [(remaining []
              (- len @pos))
            (copy-bytes! [out-array start n]
              ;; Copy n bytes from the backing array at the current position.
              ;; readFully must not return short, so fail loudly on underrun.
              (when (< (remaining) n)
                (throw (java.io.EOFException.
                        (str "readFully: needed " n " bytes, only "
                             (remaining) " remain"))))
              (System/arraycopy in-ba @pos out-array start n)
              (swap! pos + n))]
      (proxy [SeekableInputStream] []
        (read
          ;; Single byte: unsigned value 0-255, or -1 at end of stream.
          ([]
           (if (pos? (remaining))
             (let [x (aget in-ba @pos)]
               (swap! pos inc)
               (bit-and (int x) 0xff))
             -1))
          ;; ByteBuffer: fill as much as is available; return bytes read,
          ;; 0 for a full buffer, -1 when the stream is exhausted.
          ([byte-buffer]
           (cond
             (zero? (.remaining byte-buffer)) 0
             (zero? (remaining)) -1
             :else
             (let [n (min (.remaining byte-buffer) (remaining))
                   tmp (byte-array n)]
               (copy-bytes! tmp 0 n)
               (.put byte-buffer tmp 0 n)
               n))))
        (getPos []
          (long @pos))
        (seek [new-pos]
          (reset! pos (int new-pos)))
        (readFully
          ([out]
           (if (bytes? out)
             (copy-bytes! out 0 (alength out))
             ;; java.nio.ByteBuffer: fill all of the buffer's remaining space.
             (let [n (.remaining out)
                   tmp (byte-array n)]
               (copy-bytes! tmp 0 n)
               ;; offset/length here index into tmp, not the buffer
               (.put out tmp 0 n))))
          ([out-array start n]
           (copy-bytes! out-array start n)))))))
| 213 | + |
(defn new-mem-input-file
  "Expose the in-memory byte array `backing-bytes` as a parquet InputFile,
  so parquet data can be read without touching the filesystem. Each call
  to newStream yields a fresh, independently-positioned stream."
  [backing-bytes]
  (proxy [InputFile] []
    (getLength []
      (alength backing-bytes))
    (newStream []
      (new-seekable-input-stream backing-bytes))))
| 221 | + |
(defn read-parquet-avro
  "Read all records from the parquet `input-file` via the Avro generic
  record reader and return them as a vector converted with avro->json.
  INT96 values are read as fixed byte arrays per the reader option."
  [input-file]
  (with-open [reader (AvroParquetReader/genericRecordReader
                      input-file
                      (PlainParquetConfiguration.
                       {"parquet.avro.readInt96AsFixed" "true"}))]
    ;; .read returns nil at end of file; mapv realizes everything
    ;; eagerly while the reader is still open.
    (->> (repeatedly #(.read reader))
         (take-while some?)
         (mapv avro->json))))
| 236 | + |
(defn read-parquet
  "Buffer the stream/bytes `data-raw` fully into memory, then decode it
  as a parquet file, returning a vector of records as Clojure data."
  [data-raw]
  (with-open [in (io/input-stream data-raw)
              out (java.io.ByteArrayOutputStream.)]
    (io/copy in out)
    (read-parquet-avro (new-mem-input-file (.toByteArray out)))))
| 246 | + |
(defn handle-read-parquet
  "Ring handler: decode the request body as a parquet file and respond
  with its records serialized to a JSON string."
  [request]
  {:body (-> request
             :body
             read-parquet
             json/write-str)})
| 251 | + |
;; HTTP route table. GET endpoints take the catalog namespace and table
;; name as path params; POST endpoints receive their payload in the
;; request body ("/read-parquet" expects raw parquet bytes).
(defroutes app-routes
  (GET "/scan/:ns/:table" [ns table] (handle-scan-table ns table))
  (GET "/partitions/:ns/:table" [ns table] (handle-table-partitions ns table))
  (POST "/sql" request (handle-spark-sql request))
  (POST "/read-parquet" request (handle-read-parquet request)))
166 | 257 |
|
167 | 258 | (defn- block-forever |
168 | 259 | [] |
|
0 commit comments