Commit bd6a38a

Merge pull request #12 from thalesmg/spark-sql
feat(spark-query): add endpoint to run arbitrary spark sql, make image more malleable
2 parents: 3262ad6 + d5f2467

5 files changed (+81, −23 lines)


spark-query/Dockerfile

Lines changed: 5 additions & 15 deletions
@@ -1,25 +1,15 @@
-ARG BUILD_FROM=clojure:temurin-17-tools-deps-1.12.0.1530-bookworm-slim
-ARG RUN_FROM=eclipse-temurin:11-jdk-ubi9-minimal
+## Using an image with clojure so we may use `add-lib` in nREPL
+ARG RUN_FROM=clojure:temurin-17-tools-deps-1.12.0.1530-bookworm-slim
 
-FROM ${BUILD_FROM} AS builder
+FROM ${RUN_FROM}
 
 RUN mkdir -p /usr/src/app
 WORKDIR /usr/src/app
 
 # Cache layer with dependencies
 COPY ./deps.edn /usr/src/app/deps.edn
-RUN clojure -X:deps prep && \
-    clojure -P -T:build
+RUN clojure -X:deps prep
 
-COPY ./build.clj /usr/src/app/build.clj
 COPY ./src /usr/src/app/src
-RUN clojure -T:build uber && \
-    mv target/spark-query-*-standalone.jar app-standalone.jar
 
-FROM ${RUN_FROM} AS runner
-
-COPY --from=builder /usr/src/app/app-standalone.jar /opt/proxy/app-standalone.jar
-
-WORKDIR /opt/proxy
-
-CMD ["java", "-jar", "app-standalone.jar"]
+CMD ["clojure", "-M:run", "-m", "spark-query.core"]

spark-query/Dockerfile-uberjar

Lines changed: 25 additions & 0 deletions
@@ -0,0 +1,25 @@
+ARG BUILD_FROM=clojure:temurin-17-tools-deps-1.12.0.1530-bookworm-slim
+ARG RUN_FROM=eclipse-temurin:11-jdk-ubi9-minimal
+
+FROM ${BUILD_FROM} AS builder
+
+RUN mkdir -p /usr/src/app
+WORKDIR /usr/src/app
+
+# Cache layer with dependencies
+COPY ./deps.edn /usr/src/app/deps.edn
+RUN clojure -X:deps prep && \
+    clojure -P -T:build
+
+COPY ./build.clj /usr/src/app/build.clj
+COPY ./src /usr/src/app/src
+RUN clojure -T:build uber && \
+    mv target/spark-query-*-standalone.jar app-standalone.jar
+
+FROM ${RUN_FROM} AS runner
+
+COPY --from=builder /usr/src/app/app-standalone.jar /opt/proxy/app-standalone.jar
+
+WORKDIR /opt/proxy
+
+CMD ["java", "-jar", "app-standalone.jar"]

spark-query/deps.edn

Lines changed: 7 additions & 0 deletions
@@ -1,9 +1,13 @@
 {:deps {http-kit/http-kit {:mvn/version "2.8.0"}
         compojure/compojure {:mvn/version "1.7.1"}
+        ring/ring-devel {:mvn/version "1.14.1"}
         org.clojure/data.json {:mvn/version "2.5.1"}
 
         nrepl/nrepl {:mvn/version "1.3.1"}
         cider/cider-nrepl {:mvn/version "0.55.1"}
+        com.bhauman/rebel-readline-cljs {:mvn/version "0.1.5"}
+
+        org.apache.spark/spark-sql_2.13 {:mvn/version "3.5.5"}
 
         org.apache.iceberg/iceberg-aws-bundle {:mvn/version "1.8.1"}
         ;; software.amazon.awssdk/protocol-core {:mvn/version "2.31.14"}
@@ -19,6 +23,9 @@
  :aliases
  {:build {:deps {io.github.clojure/tools.build {:mvn/version "0.10.8"}}
           :ns-default build}
+  :run {:jvm-opts ["--add-exports" "java.base/sun.nio.ch=ALL-UNNAMED"
+                   "--add-opens=java.base/java.nio=ALL-UNNAMED"
+                   ]}
   :fmt/check {:extra-deps {cljfmt/cljfmt {:mvn/version "0.9.2"}}
               :main-opts ["--main" "cljfmt.main" "check"]}
   :fmt/fix {:extra-deps {cljfmt/cljfmt {:mvn/version "0.9.2"}}
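
For context on the new :run alias: Spark reaches into JDK internals (notably sun.nio.ch.DirectBuffer) that are sealed by default on Java 17, so building a SparkSession without these flags typically fails with an IllegalAccessError. A sketch, not part of the diff, for confirming from a REPL that the process was launched with them:

;; In a JVM started via `clojure -M:run -m spark-query.core` (or the
;; Dockerfile CMD), the module flags show up among the input arguments.
(->> (java.lang.management.ManagementFactory/getRuntimeMXBean)
     .getInputArguments
     (filter #(re-find #"--add-(exports|opens)" %)))
;; => ("--add-exports" "--add-opens=java.base/java.nio=ALL-UNNAMED")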

spark-query/src/spark_query/core.clj

Lines changed: 43 additions & 7 deletions
@@ -6,6 +6,8 @@
    [cider.nrepl :refer [cider-nrepl-handler]]
    [nrepl.server :as nrepl-server]
    [org.httpkit.server :as server]
+   [ring.middleware.reload :refer [wrap-reload]]
+   [ring.util.request :as ring-req]
    [compojure.core :refer [defroutes GET POST]])
   (:import
    [java.util Properties]
@@ -20,10 +22,12 @@
     Record]
    [org.apache.iceberg.rest RESTCatalog]
    [org.apache.iceberg.aws.s3 S3FileIOProperties]
-   [org.apache.hadoop.conf Configuration])
+   [org.apache.hadoop.conf Configuration]
+   [org.apache.spark.sql SparkSession])
   (:gen-class))
 
 (def PORT 8090)
+(def NREPL-PORT 7890)
 
 (defn- record->vec
   [record]
@@ -49,6 +53,19 @@
         (.initialize "demo" catalog-props))]
     catalog))
 
+(defn spark-session
+  []
+  (-> (SparkSession/builder)
+      (.config "spark.sql.defaultCatalog" "demo")
+      (.config "spark.sql.catalog.demo" "org.apache.iceberg.spark.SparkCatalog")
+      (.config "spark.sql.catalog.demo.type" "rest")
+      (.config "spark.sql.catalog.demo.uri" "http://iceberg-rest:8181")
+      (.config "spark.sql.catalog.demo.io-impl" "org.apache.iceberg.aws.s3.S3FileIO")
+      (.config "spark.sql.catalog.demo.warehouse" "s3://warehouse/wh/")
+      (.config "spark.sql.catalog.demo.s3.endpoint" "http://minio.net:9000")
+      (.master "local")
+      .getOrCreate))
+
 (def CATALOG (atom nil))
 
 (defn get-catalog
@@ -108,15 +125,17 @@
         .partition
         partition-data->vec)))))
 
-(defn handle-scan-table [ns-in table-in]
+(defn handle-scan-table
+  [ns-in table-in]
   (let [table (load-table (get-catalog) ns-in table-in)
         rows (scan-table table)
         response-body (->> rows
                            (mapv record->vec)
                            json/write-str)]
     {:body response-body}))
 
-(defn handle-table-partitions [ns-in table-in]
+(defn handle-table-partitions
+  [ns-in table-in]
   (let [table (load-table (get-catalog) ns-in table-in)
         partitions-from-meta (table-partitions-from-meta table)
         partitions-from-data (table-partitions-from-data table)
@@ -125,21 +144,38 @@
         response-body (json/write-str response)]
     {:body response-body}))
 
+(defn handle-spark-sql
+  [request]
+  (let [sql (ring-req/body-string request)
+        session (spark-session)
+        dataset (.sql session sql)
+        _ (.show dataset)
+        response-body (-> dataset
+                          .toJSON
+                          .toLocalIterator
+                          iterator-seq
+                          (->> (map json/read-str))
+                          json/write-str)]
+    {:body response-body}))
+
 (defroutes app-routes
   (GET "/scan/:ns/:table" [ns table] (handle-scan-table ns table))
-  (GET "/partitions/:ns/:table" [ns table] (handle-table-partitions ns table)))
+  (GET "/partitions/:ns/:table" [ns table] (handle-table-partitions ns table))
+  (POST "/sql" request (handle-spark-sql request)))
 
 (defn- block-forever
   []
   (while true
     (Thread/sleep 60000)))
 
 (defn -main
-  [& args]
+  [& _args]
   (try
+    (println "starting nREPL server on port" NREPL-PORT)
+    (nrepl-server/start-server :port NREPL-PORT :bind "0.0.0.0" :handler cider-nrepl-handler)
+    (println "started nREPL server on port" NREPL-PORT)
     (println "starting server on port" PORT)
-    (nrepl-server/start-server :port 7890 :bind "0.0.0.0" :handler cider-nrepl-handler)
-    (server/run-server app-routes {:port PORT})
+    (server/run-server (wrap-reload #'app-routes) {:port PORT})
     (println "started server on port" PORT)
     (block-forever)
     (catch Exception e
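
The new POST /sql handler reads the raw request body as a SQL string, runs it on the local SparkSession wired to the same REST catalog, and returns the rows re-encoded as a JSON array. A usage sketch with http-kit's client (already a project dependency); the table name is a placeholder:

(require '[org.httpkit.client :as http]
         '[clojure.data.json :as json])

;; `my_ns.my_table` is hypothetical; substitute a table that exists
;; in the Iceberg catalog (resolved against the "demo" default catalog).
(-> @(http/post "http://localhost:8090/sql"
                {:body "SELECT count(*) AS n FROM my_ns.my_table"})
    :body
    json/read-str)
;; => a vector of row maps decoded from the response JSON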

spark-query/vsn

Lines changed: 1 addition & 1 deletion
@@ -1 +1 @@
-1.0.3
+1.0.4
