# We can't simply create the SparkApplication object here, as we have to wait for the Trino tables to exist, because
# * We currently don't restart failed Spark applications (see https://github.com/stackabletech/spark-k8s-operator/issues/157)
# * The report reads from tables that only exist once the create-tables-in-trino Job has completed
---
apiVersion: batch/v1
kind: Job
metadata:
  name: create-spark-report
spec:
  template:
    spec:
      serviceAccountName: demo-serviceaccount
      initContainers:
        - name: wait-for-trino-tables
          image: docker.stackable.tech/stackable/tools:1.0.0-stackable24.3.0
          command:
            - bash
            - -euo
            - pipefail
            - -c
            - |
              echo "Waiting for Job create-tables-in-trino to complete"
              kubectl wait --timeout=30m --for=condition=complete job/create-tables-in-trino
      containers:
        - name: create-spark-report
          image: docker.stackable.tech/stackable/tools:1.0.0-stackable24.3.0
          command:
            - bash
            - -euo
            - pipefail
            - -c
            - |
              echo "Submitting Spark report"
              kubectl apply -f /tmp/manifest/spark-report.yaml
          volumeMounts:
            - name: manifest
              mountPath: /tmp/manifest
      volumes:
        - name: manifest
          configMap:
            name: spark-report-manifest
      restartPolicy: OnFailure
  backoffLimit: 50
---
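# This ConfigMap holds the SparkApplication manifest. It is mounted into the Job above,
# which applies it with kubectl once the Trino tables are ready.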
apiVersion: v1
kind: ConfigMap
metadata:
  name: spark-report-manifest
data:
  spark-report.yaml: |
    ---
    apiVersion: spark.stackable.tech/v1alpha1
    kind: SparkApplication
    metadata:
      name: spark-report
    spec:
      sparkImage:
        productVersion: 3.5.1
      mode: cluster
      mainApplicationFile: local:///stackable/spark/jobs/spark-report.py
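      # The Iceberg runtime must match the Spark (3.5) and Scala (2.12) versions of the sparkImage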
      deps:
        packages:
          - org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:1.5.0
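      # Register the Hive metastore as the Iceberg catalog "lakehouse" and make it the
      # default, so unqualified table names resolve against it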
      sparkConf:
        spark.driver.extraClassPath: /stackable/config/hdfs
        spark.executor.extraClassPath: /stackable/config/hdfs
        spark.hadoop.hive.metastore.kerberos.principal: hive/[email protected]
        spark.hadoop.hive.metastore.sasl.enabled: "true"
        spark.kerberos.keytab: /stackable/kerberos/keytab
        spark.kerberos.principal: spark/[email protected]
        spark.sql.catalog.lakehouse: org.apache.iceberg.spark.SparkCatalog
        spark.sql.catalog.lakehouse.type: hive
        spark.sql.catalog.lakehouse.uri: thrift://hive-iceberg:9083
        spark.sql.defaultCatalog: lakehouse
        spark.sql.extensions: org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions
      job:
        config:
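          # These volume mounts are shared with the driver and executor Pods below
          # via the &volumeMounts anchor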
          volumeMounts: &volumeMounts
            - name: script
              mountPath: /stackable/spark/jobs
            - name: hdfs-config
              mountPath: /stackable/config/hdfs
            - name: kerberos
              mountPath: /stackable/kerberos
            # Mounting krb5.conf at its default location saves us from having to pass
            # -Djava.security.krb5.conf=/example/path/krb5.conf as a JVM argument
            - name: kerberos
              mountPath: /etc/krb5.conf
              subPath: krb5.conf
        envOverrides: &envOverrides
          KERBEROS_REALM: CLUSTER.LOCAL
        # envOverrides are currently not applied to the spark-submit container,
        # so the variable is also set via podOverrides
        podOverrides:
          spec:
            containers:
              - name: spark-submit
                env:
                  - name: KERBEROS_REALM
                    value: CLUSTER.LOCAL
      driver:
        config:
          volumeMounts: *volumeMounts
          resources: # Keep the requests low, so that this stack also runs on a laptop
            cpu:
              min: 100m
        envOverrides: *envOverrides
        podOverrides: &podOverrides
          spec:
            containers:
              - name: spark
                # envOverrides are currently not applied, so the variable is set here as well
                env:
                  - name: KERBEROS_REALM
                    value: CLUSTER.LOCAL
      executor:
        replicas: 1
        config:
          volumeMounts: *volumeMounts
          resources: # Keep the requests low, so that this stack also runs on a laptop
            cpu:
              min: 250m
        envOverrides: *envOverrides
        podOverrides: *podOverrides
      volumes:
        - name: script
          configMap:
            name: spark-report-script
        - name: hdfs-config
          configMap:
            name: hdfs
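        # The Stackable secret-operator provisions this volume with a keytab and krb5.conf
        # for the "spark" service, based on the annotations below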
        - name: kerberos
          ephemeral:
            volumeClaimTemplate:
              metadata:
                annotations:
                  secrets.stackable.tech/class: kerberos
                  secrets.stackable.tech/kerberos.service.names: spark
                  secrets.stackable.tech/scope: service=spark
              spec:
                accessModes:
                  - ReadWriteOnce
                resources:
                  requests:
                    storage: "1"
                storageClassName: secrets.stackable.tech
---
apiVersion: v1
kind: ConfigMap
metadata:
  name: spark-report-script
data:
  spark-report.py: |
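    # Waits until Trino has created the customer table, then materializes a per-country
    # customer count as a new Iceberg table in the lakehouse catalog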
    from pyspark.sql import SparkSession

    import time

    spark = SparkSession.builder.appName("spark-report").getOrCreate()

    spark.sql("show catalogs").show()
    spark.sql("show tables in lakehouse.default").show()

    customer_table = "lakehouse.customer_analytics.customer"
| 164 | + while not spark.catalog.tableExists(customer_table): |
| 165 | + print(f"Table {customer_table} not found, waiting for Trino to create it...") |
| 166 | + time.sleep(5) |
| 167 | +
|
| 168 | + print(f"Table {customer_table} found, starting report") |
| 169 | +
|
| 170 | + spark.sql(f"SELECT * FROM lakehouse.customer_analytics.customer").show() |
| 171 | + spark.sql(f"CREATE TABLE IF NOT EXISTS lakehouse.customer_analytics.spark_report AS SELECT c_birth_country, count(*) FROM {customer_table} group by c_birth_country order by c_birth_country").show() |
| 172 | +
|
| 173 | + print("Report written") |