Skip to content

Commit 65af4e8

Browse files
authored
Merge pull request #5 from thalesmg/spark-clj
feat(spark): add dedicated query container
2 parents 5e4c2e4 + cb2ac62 commit 65af4e8

File tree

8 files changed

+187
-1
lines changed

8 files changed

+187
-1
lines changed

spark-query/.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
/.cpcache/
2+
/target/

spark-query/Dockerfile

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
# Multi-stage build: compile the uberjar with the full Clojure toolchain,
# then copy only the jar into a minimal runtime image.
# NOTE(review): builder uses Temurin 17 while the runtime is Temurin 11 —
# confirm the produced bytecode and all bundled deps run on Java 11.
ARG BUILD_FROM=clojure:temurin-17-tools-deps-1.12.0.1530-bookworm-slim
ARG RUN_FROM=eclipse-temurin:11-jdk-ubi9-minimal

FROM ${BUILD_FROM} AS builder

RUN mkdir -p /usr/src/app
WORKDIR /usr/src/app

# Cache layer with dependencies: copy only deps.edn first so this layer
# is reused while sources change.
COPY ./deps.edn /usr/src/app/deps.edn
RUN clojure -X:deps prep && \
    clojure -P -T:build

# Copy build script and sources, then build the standalone jar
# (see build.clj; the jar name embeds the version, hence the glob).
COPY ./build.clj /usr/src/app/build.clj
COPY ./src /usr/src/app/src
RUN clojure -T:build uber && \
    mv target/spark-query-*-standalone.jar app-standalone.jar

# Runtime stage: only the JRE image plus the uberjar.
FROM ${RUN_FROM} AS runner

COPY --from=builder /usr/src/app/app-standalone.jar /opt/proxy/app-standalone.jar

WORKDIR /opt/proxy

CMD ["java", "-jar", "app-standalone.jar"]

spark-query/build.clj

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
(ns build
  (:require [clojure.tools.build.api :as b]))

;; Library coordinates and build paths for the uberjar.
(def lib 'spark-query)
(def version "0.0.0")
(def class-dir "target/classes")
;; Dependency basis resolved from deps.edn when this namespace loads.
(def basis (b/create-basis {:project "deps.edn"}))
;; Output jar path, e.g. target/spark-query-0.0.0-standalone.jar.
(def uber-file (format "target/%s-%s-standalone.jar" (name lib) version))
9+
10+
(defn clean
  "Delete the target directory, removing all previous build output."
  [_]
  (b/delete {:path "target"}))
12+
13+
(defn uber
  "Build the standalone AOT-compiled uberjar at `uber-file`.

  Steps: wipe previous output, stage sources and resources into
  `class-dir`, AOT-compile the Clojure sources, then package
  everything with `spark-query.core` as the main class."
  [_]
  (clean nil)
  (b/copy-dir {:src-dirs ["src" "resources"] :target-dir class-dir})
  (b/compile-clj {:basis basis :src-dirs ["src"] :class-dir class-dir})
  (b/uber {:class-dir class-dir
           :basis basis
           :uber-file uber-file
           :main 'spark-query.core}))

spark-query/deps.edn

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
;; Runtime deps: HTTP server (http-kit) + routing (compojure) + JSON,
;; plus the Iceberg/Hadoop/AWS stack needed to read Iceberg tables.
{:deps {http-kit/http-kit {:mvn/version "2.8.0"}
        compojure/compojure {:mvn/version "1.7.1"}
        org.clojure/data.json {:mvn/version "2.5.1"}

        org.apache.iceberg/iceberg-aws-bundle {:mvn/version "1.8.1"}
        ;; software.amazon.awssdk/protocol-core {:mvn/version "2.31.14"}
        org.apache.iceberg/iceberg-spark-runtime-3.5_2.13 {:mvn/version "1.8.1"}
        org.apache.hadoop/hadoop-common {:mvn/version "3.4.1"}
        ;; org.apache.iceberg/iceberg-aws {:mvn/version "1.8.1"}
        ;; software.amazon.s3tables/s3-tables-catalog-for-iceberg-runtime {:mvn/version "0.1.5"}
        software.amazon.awssdk/s3tables {:mvn/version "2.31.12"}
        org.apache.hadoop/hadoop-aws {:mvn/version "3.4.1"}
        org.apache.hadoop/hadoop-mapreduce-client-core {:mvn/version "3.4.1"}
        com.github.luben/zstd-jni {:mvn/version "1.5.7-2"}}

 :aliases
 {;; Uberjar build via tools.build; entry points live in build.clj.
  :build {:deps {io.github.clojure/tools.build {:mvn/version "0.10.8"}}
          :ns-default build}
  ;; Formatting: `clojure -M:fmt/check` to verify, `:fmt/fix` to rewrite.
  :fmt/check {:extra-deps {cljfmt/cljfmt {:mvn/version "0.9.2"}}
              :main-opts ["--main" "cljfmt.main" "check"]}
  :fmt/fix {:extra-deps {cljfmt/cljfmt {:mvn/version "0.9.2"}}
            :main-opts ["--main" "cljfmt.main" "fix"]}
  ;; Tests under ./test, run with the cognitect test-runner.
  :test {:extra-paths ["test"]
         :extra-deps { ;; https://github.com/cognitect-labs/test-runner
                      io.github.cognitect-labs/test-runner {:git/tag "v0.5.1" :git/sha "dfb30dd"}
                      ;; https://clojars.org/nubank/matcher-combinators
                      nubank/matcher-combinators {:mvn/version "3.8.5"}}
         :main-opts ["-m" "cognitect.test-runner"]
         :exec-fn cognitect.test-runner.api/test}}}
Lines changed: 98 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,98 @@
1+
(ns spark-query.core
2+
(:require
3+
[clojure.data.json :as json]
4+
[clojure.java.io :as io]
5+
[clojure.pprint :refer [pprint]]
6+
[org.httpkit.server :as server]
7+
[compojure.core :refer [defroutes GET POST]])
8+
(:import
9+
[java.util Properties]
10+
[org.apache.iceberg
11+
CatalogProperties]
12+
[org.apache.iceberg.catalog
13+
Catalog
14+
Namespace
15+
TableIdentifier]
16+
[org.apache.iceberg.data
17+
IcebergGenerics
18+
Record]
19+
[org.apache.iceberg.rest RESTCatalog]
20+
[org.apache.iceberg.aws.s3 S3FileIOProperties]
21+
[org.apache.hadoop.conf Configuration])
22+
(:gen-class))
23+
24+
;; TCP port the HTTP server listens on (see -main).
(def PORT 8090)
25+
26+
(defn- record->vec
  "Convert an Iceberg generic Record into a map of column name -> value.
  Column names come from the record's struct schema; values are fetched
  positionally."
  [record]
  (let [field-names (mapv (fn [field] (.name field))
                          (-> record .struct .fields))
        field-values (mapv (fn [idx] (.get record idx))
                           (range (.size record)))]
    (zipmap field-names field-values)))
35+
36+
(defn open-catalog
  "Create and initialize a REST Iceberg catalog named \"demo\".
  Catalog URI, warehouse location and S3 endpoint are hard-coded for the
  local docker-compose environment."
  []
  (let [props {CatalogProperties/CATALOG_IMPL "org.apache.iceberg.rest.RESTCatalog"
               CatalogProperties/URI "http://iceberg-rest:8181"
               CatalogProperties/WAREHOUSE_LOCATION "s3a://warehouse/wh"
               CatalogProperties/FILE_IO_IMPL "org.apache.iceberg.aws.s3.S3FileIO"
               S3FileIOProperties/ENDPOINT "http://minio.net:9000"}]
    (doto (RESTCatalog.)
      (.setConf (Configuration.))
      (.initialize "demo" props))))
49+
50+
;; Lazily-initialized shared catalog instance; populated by get-catalog.
(def CATALOG (atom nil))
51+
52+
(defn get-catalog
  "Return the shared catalog, opening it on first use.

  The original check-then-`reset!` was racy: two threads calling this
  concurrently on first use could each run `open-catalog`, and one of
  the opened catalogs would leak. Double-checked locking ensures
  `open-catalog` runs at most once; the fast path is a lock-free deref."
  []
  (or @CATALOG
      (locking CATALOG
        (or @CATALOG
            (let [catalog (open-catalog)]
              (reset! CATALOG catalog)
              catalog)))))
59+
60+
(defn load-table
  "Load the Iceberg table named `table` in (single-segment) namespace
  `ns` from `catalog`."
  [catalog ns table]
  (.loadTable catalog
              (TableIdentifier/of (Namespace/of (into-array String [ns]))
                                  table)))
66+
67+
(defn scan-table
  "Full-scan table `table-in` in namespace `ns-in` and return an HTTP
  response map whose :body is a JSON array of row objects
  (column name -> value).
  NOTE(review): materializes the entire table in memory."
  [ns-in table-in]
  (let [table (load-table (get-catalog) ns-in table-in)
        ;; The original `(-> ... iterator-seq (into []))` expanded to
        ;; `(into rows [])` — a no-op that returned the lazy seq
        ;; unchanged instead of a vector. `(into [] ...)` realizes the
        ;; scan eagerly as intended.
        rows (into [] (-> table
                          IcebergGenerics/read
                          .build
                          .iterator
                          iterator-seq))]
    {:body (->> rows
                (mapv record->vec)
                json/write-str)}))
79+
80+
;; HTTP routes: GET /scan/:ns/:table returns every row of the table as JSON.
(defroutes app-routes
  (GET "/scan/:ns/:table" [ns table] (scan-table ns table)))
82+
83+
(defn- block-forever
  "Never return: park the calling thread, waking once a minute."
  []
  (loop []
    (Thread/sleep 60000)
    (recur)))
87+
88+
(defn -main
  "Entry point: start the http-kit server on PORT, then block the main
  thread forever so the process stays alive after `run-server` returns.
  Any startup exception is printed and the process exits with status 1.
  `args` are ignored."
  [& args]
  (try
    (println "starting server on port" PORT)
    (server/run-server app-routes {:port PORT})
    (println "started server on port" PORT)
    (block-forever)
    (catch Exception e
      (println (.getMessage e))
      (.printStackTrace e)
      (System/exit 1))))

spark-query/vsn

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
1.0.0

spark/Dockerfile

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ RUN apt-get update && \
2525
openjdk-17-jdk \
2626
build-essential \
2727
software-properties-common \
28+
rlwrap \
2829
ssh && \
2930
apt-get clean && \
3031
rm -rf /var/lib/apt/lists/*
@@ -42,6 +43,13 @@ RUN curl https://github.com/SpencerPark/IJava/releases/download/v1.3.0/ijava-1.3
4243
&& python3 install.py --sys-prefix \
4344
&& rm ijava-1.3.0.zip
4445

46+
# Install Clojure
47+
RUN curl -L -O https://github.com/clojure/brew-install/releases/latest/download/linux-install.sh && \
48+
chmod +x linux-install.sh && \
49+
./linux-install.sh && \
50+
rm ./linux-install.sh && \
51+
clojure --version
52+
4553
# Optional env variables
4654
ENV SPARK_HOME=${SPARK_HOME:-"/opt/spark"}
4755
ENV PYTHONPATH=$SPARK_HOME/python:$SPARK_HOME/python/lib/py4j-0.10.9.7-src.zip:$PYTHONPATH

spark/vsn

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
1.0.0
1+
1.0.1

0 commit comments

Comments
 (0)