Commit 67b1133

committed: week 3
1 parent b06d53e, commit 67b1133

14 files changed: 364 additions, 265 deletions

bootcamp/materials/3-spark-fundamentals/notebooks/Caching.ipynb

Lines changed: 49 additions & 46 deletions
@@ -2,7 +2,7 @@
 "cells": [
 {
 "cell_type": "code",
-"execution_count": 5,
+"execution_count": 3,
 "id": "e9ae4c8b-4599-4fbb-a545-76b6e3bcb84d",
 "metadata": {},
 "outputs": [
@@ -12,52 +12,54 @@
 "text": [
 "== Physical Plan ==\n",
 "AdaptiveSparkPlan isFinalPlan=false\n",
-"+- ObjectHashAggregate(keys=[device_id#937, device_type#940], functions=[collect_list(user_id#907, 0, 0)])\n",
-" +- ObjectHashAggregate(keys=[device_id#937, device_type#940], functions=[partial_collect_list(user_id#907, 0, 0)])\n",
-" +- Project [device_id#937, device_type#940, user_id#907]\n",
-" +- SortMergeJoin [device_id#937], [device_id#908], Inner\n",
-" :- Sort [device_id#937 ASC NULLS FIRST], false, 0\n",
-" : +- Exchange hashpartitioning(device_id#937, 4), ENSURE_REQUIREMENTS, [plan_id=1320]\n",
-" : +- Filter isnotnull(device_id#937)\n",
-" : +- FileScan csv [device_id#937,device_type#940] Batched: false, DataFilters: [isnotnull(device_id#937)], Format: CSV, Location: InMemoryFileIndex(1 paths)[file:/home/iceberg/data/devices.csv], PartitionFilters: [], PushedFilters: [IsNotNull(device_id)], ReadSchema: struct<device_id:int,device_type:string>\n",
-" +- Sort [device_id#908 ASC NULLS FIRST], false, 0\n",
-" +- Exchange hashpartitioning(device_id#908, 4), ENSURE_REQUIREMENTS, [plan_id=1321]\n",
-" +- Filter isnotnull(device_id#908)\n",
-" +- InMemoryTableScan [user_id#907, device_id#908], [isnotnull(device_id#908)]\n",
-" +- InMemoryRelation [user_id#907, device_id#908, event_counts#945L, host_array#946], StorageLevel(disk, memory, deserialized, 1 replicas)\n",
-" +- ObjectHashAggregate(keys=[user_id#198, device_id#199], functions=[count(1), collect_list(distinct host#201, 0, 0)])\n",
-" +- Exchange hashpartitioning(user_id#198, device_id#199, 4), ENSURE_REQUIREMENTS, [plan_id=1338]\n",
-" +- ObjectHashAggregate(keys=[user_id#198, device_id#199], functions=[merge_count(1), partial_collect_list(distinct host#201, 0, 0)])\n",
-" +- *(2) HashAggregate(keys=[user_id#198, device_id#199, host#201], functions=[merge_count(1)])\n",
-" +- Exchange hashpartitioning(user_id#198, device_id#199, host#201, 4), ENSURE_REQUIREMENTS, [plan_id=1333]\n",
-" +- *(1) HashAggregate(keys=[user_id#198, device_id#199, host#201], functions=[partial_count(1)])\n",
-" +- *(1) Filter isnotnull(user_id#198)\n",
-" +- FileScan csv [user_id#198,device_id#199,host#201] Batched: false, DataFilters: [isnotnull(user_id#198)], Format: CSV, Location: InMemoryFileIndex(1 paths)[file:/home/iceberg/data/events.csv], PartitionFilters: [], PushedFilters: [IsNotNull(user_id)], ReadSchema: struct<user_id:int,device_id:int,host:string>\n",
+"+- ObjectHashAggregate(keys=[device_id#598, device_type#601], functions=[collect_list(user_id#568, 0, 0)])\n",
+" +- ObjectHashAggregate(keys=[device_id#598, device_type#601], functions=[partial_collect_list(user_id#568, 0, 0)])\n",
+" +- Project [device_id#598, device_type#601, user_id#568]\n",
+" +- SortMergeJoin [device_id#598], [device_id#569], Inner\n",
+" :- Sort [device_id#598 ASC NULLS FIRST], false, 0\n",
+" : +- Exchange hashpartitioning(device_id#598, 4), ENSURE_REQUIREMENTS, [plan_id=735]\n",
+" : +- Filter isnotnull(device_id#598)\n",
+" : +- FileScan csv [device_id#598,device_type#601] Batched: false, DataFilters: [isnotnull(device_id#598)], Format: CSV, Location: InMemoryFileIndex(1 paths)[file:/home/iceberg/data/devices.csv], PartitionFilters: [], PushedFilters: [IsNotNull(device_id)], ReadSchema: struct<device_id:int,device_type:string>\n",
+" +- Sort [device_id#569 ASC NULLS FIRST], false, 0\n",
+" +- Exchange hashpartitioning(device_id#569, 4), ENSURE_REQUIREMENTS, [plan_id=736]\n",
+" +- Filter isnotnull(device_id#569)\n",
+" +- InMemoryTableScan [user_id#568, device_id#569], [isnotnull(device_id#569)]\n",
+" +- InMemoryRelation [user_id#568, device_id#569, event_counts#606L, host_array#607], StorageLevel(disk, memory, deserialized, 1 replicas)\n",
+" +- AdaptiveSparkPlan isFinalPlan=false\n",
+" +- ObjectHashAggregate(keys=[user_id#17, device_id#18], functions=[count(1), collect_list(distinct host#20, 0, 0)])\n",
+" +- Exchange hashpartitioning(user_id#17, device_id#18, 4), ENSURE_REQUIREMENTS, [plan_id=752]\n",
+" +- ObjectHashAggregate(keys=[user_id#17, device_id#18], functions=[merge_count(1), partial_collect_list(distinct host#20, 0, 0)])\n",
+" +- HashAggregate(keys=[user_id#17, device_id#18, host#20], functions=[merge_count(1)])\n",
+" +- Exchange hashpartitioning(user_id#17, device_id#18, host#20, 4), ENSURE_REQUIREMENTS, [plan_id=748]\n",
+" +- HashAggregate(keys=[user_id#17, device_id#18, host#20], functions=[partial_count(1)])\n",
+" +- Filter isnotnull(user_id#17)\n",
+" +- FileScan csv [user_id#17,device_id#18,host#20] Batched: false, DataFilters: [isnotnull(user_id#17)], Format: CSV, Location: InMemoryFileIndex(1 paths)[file:/home/iceberg/data/events.csv], PartitionFilters: [], PushedFilters: [IsNotNull(user_id)], ReadSchema: struct<user_id:int,device_id:int,host:string>\n",
 "\n",
 "\n",
 "== Physical Plan ==\n",
 "AdaptiveSparkPlan isFinalPlan=false\n",
-"+- ObjectHashAggregate(keys=[user_id#907], functions=[max(event_counts#945L), collect_list(device_id#908, 0, 0)])\n",
-" +- ObjectHashAggregate(keys=[user_id#907], functions=[partial_max(event_counts#945L), partial_collect_list(device_id#908, 0, 0)])\n",
-" +- Project [user_id#907, device_id#908, event_counts#945L]\n",
-" +- SortMergeJoin [user_id#907], [user_id#953], Inner\n",
-" :- Sort [user_id#907 ASC NULLS FIRST], false, 0\n",
-" : +- Exchange hashpartitioning(user_id#907, 4), ENSURE_REQUIREMENTS, [plan_id=1374]\n",
-" : +- Filter isnotnull(user_id#907)\n",
-" : +- FileScan csv [user_id#907,device_id#908] Batched: false, DataFilters: [isnotnull(user_id#907)], Format: CSV, Location: InMemoryFileIndex(1 paths)[file:/home/iceberg/data/events.csv], PartitionFilters: [], PushedFilters: [IsNotNull(user_id)], ReadSchema: struct<user_id:int,device_id:int>\n",
-" +- Sort [user_id#953 ASC NULLS FIRST], false, 0\n",
-" +- Exchange hashpartitioning(user_id#953, 4), ENSURE_REQUIREMENTS, [plan_id=1375]\n",
-" +- Filter isnotnull(user_id#953)\n",
-" +- InMemoryTableScan [user_id#953, event_counts#945L], [isnotnull(user_id#953)]\n",
-" +- InMemoryRelation [user_id#953, device_id#954, event_counts#945L, host_array#946], StorageLevel(disk, memory, deserialized, 1 replicas)\n",
-" +- ObjectHashAggregate(keys=[user_id#198, device_id#199], functions=[count(1), collect_list(distinct host#201, 0, 0)])\n",
-" +- Exchange hashpartitioning(user_id#198, device_id#199, 4), ENSURE_REQUIREMENTS, [plan_id=1392]\n",
-" +- ObjectHashAggregate(keys=[user_id#198, device_id#199], functions=[merge_count(1), partial_collect_list(distinct host#201, 0, 0)])\n",
-" +- *(2) HashAggregate(keys=[user_id#198, device_id#199, host#201], functions=[merge_count(1)])\n",
-" +- Exchange hashpartitioning(user_id#198, device_id#199, host#201, 4), ENSURE_REQUIREMENTS, [plan_id=1387]\n",
-" +- *(1) HashAggregate(keys=[user_id#198, device_id#199, host#201], functions=[partial_count(1)])\n",
-" +- *(1) Filter isnotnull(user_id#198)\n",
-" +- FileScan csv [user_id#198,device_id#199,host#201] Batched: false, DataFilters: [isnotnull(user_id#198)], Format: CSV, Location: InMemoryFileIndex(1 paths)[file:/home/iceberg/data/events.csv], PartitionFilters: [], PushedFilters: [IsNotNull(user_id)], ReadSchema: struct<user_id:int,device_id:int,host:string>\n",
+"+- ObjectHashAggregate(keys=[user_id#568], functions=[max(event_counts#606L), collect_list(device_id#569, 0, 0)])\n",
+" +- ObjectHashAggregate(keys=[user_id#568], functions=[partial_max(event_counts#606L), partial_collect_list(device_id#569, 0, 0)])\n",
+" +- Project [user_id#568, device_id#569, event_counts#606L]\n",
+" +- SortMergeJoin [user_id#568], [user_id#614], Inner\n",
+" :- Sort [user_id#568 ASC NULLS FIRST], false, 0\n",
+" : +- Exchange hashpartitioning(user_id#568, 4), ENSURE_REQUIREMENTS, [plan_id=788]\n",
+" : +- Filter isnotnull(user_id#568)\n",
+" : +- FileScan csv [user_id#568,device_id#569] Batched: false, DataFilters: [isnotnull(user_id#568)], Format: CSV, Location: InMemoryFileIndex(1 paths)[file:/home/iceberg/data/events.csv], PartitionFilters: [], PushedFilters: [IsNotNull(user_id)], ReadSchema: struct<user_id:int,device_id:int>\n",
+" +- Sort [user_id#614 ASC NULLS FIRST], false, 0\n",
+" +- Exchange hashpartitioning(user_id#614, 4), ENSURE_REQUIREMENTS, [plan_id=789]\n",
+" +- Filter isnotnull(user_id#614)\n",
+" +- InMemoryTableScan [user_id#614, event_counts#606L], [isnotnull(user_id#614)]\n",
+" +- InMemoryRelation [user_id#614, device_id#615, event_counts#606L, host_array#607], StorageLevel(disk, memory, deserialized, 1 replicas)\n",
+" +- AdaptiveSparkPlan isFinalPlan=false\n",
+" +- ObjectHashAggregate(keys=[user_id#17, device_id#18], functions=[count(1), collect_list(distinct host#20, 0, 0)])\n",
+" +- Exchange hashpartitioning(user_id#17, device_id#18, 4), ENSURE_REQUIREMENTS, [plan_id=805]\n",
+" +- ObjectHashAggregate(keys=[user_id#17, device_id#18], functions=[merge_count(1), partial_collect_list(distinct host#20, 0, 0)])\n",
+" +- HashAggregate(keys=[user_id#17, device_id#18, host#20], functions=[merge_count(1)])\n",
+" +- Exchange hashpartitioning(user_id#17, device_id#18, host#20, 4), ENSURE_REQUIREMENTS, [plan_id=801]\n",
+" +- HashAggregate(keys=[user_id#17, device_id#18, host#20], functions=[partial_count(1)])\n",
+" +- Filter isnotnull(user_id#17)\n",
+" +- FileScan csv [user_id#17,device_id#18,host#20] Batched: false, DataFilters: [isnotnull(user_id#17)], Format: CSV, Location: InMemoryFileIndex(1 paths)[file:/home/iceberg/data/events.csv], PartitionFilters: [], PushedFilters: [IsNotNull(user_id)], ReadSchema: struct<user_id:int,device_id:int,host:string>\n",
 "\n",
 "\n"
 ]
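
Reading these plans: the InMemoryTableScan / InMemoryRelation operators mean the joins pull the aggregated events out of Spark's cache instead of re-running the aggregation, while the side that is not cached still shows a FileScan against the raw CSV; in the new output the cached aggregation is additionally wrapped in its own AdaptiveSparkPlan. A minimal sketch of how output like this can be produced (the reader options, the count() aggregation and the local[*] master are assumptions for illustration, not copied from the notebook):

import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().master("local[*]").appName("caching-demo").getOrCreate()

val events = spark.read.option("header", "true").option("inferSchema", "true")
  .csv("/home/iceberg/data/events.csv")
val devices = spark.read.option("header", "true").option("inferSchema", "true")
  .csv("/home/iceberg/data/devices.csv")

// Aggregate once, cache the result, then reuse it downstream.
val eventsAggregated = events.groupBy("user_id", "device_id").count().cache()

// The join side backed by the cache shows up as InMemoryTableScan / InMemoryRelation,
// while the devices side is still read with a FileScan.
devices.join(eventsAggregated, Seq("device_id")).explain()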
@@ -73,10 +75,10 @@
 "eventsAggregated: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [user_id: int, device_id: int ... 2 more fields]\n",
 "usersAndDevices: org.apache.spark.sql.DataFrame = [user_id: int, user_id: int ... 2 more fields]\n",
 "devicesOnEvents: org.apache.spark.sql.DataFrame = [device_id: int, device_type: string ... 3 more fields]\n",
-"res4: Array[org.apache.spark.sql.Row] = Array([-2147470439,-2147470439,3,WrappedArray(378988111, 378988111, 378988111)])\n"
+"res1: Array[org.apache.spark.sql.Row] = Array([-2147470439,-2147470439,3,WrappedArray(378988111, 378988111, 378988111)])\n"
 ]
 },
-"execution_count": 5,
+"execution_count": 3,
 "metadata": {},
 "output_type": "execute_result"
 }
@@ -107,6 +109,7 @@
 "//Caching here should be < 5 GBs or used for broadcast join\n",
 "//You need to tune executor memory otherwise it'll spill to disk and be slow\n",
 "//Don't really try using any of the other StorageLevel besides MEMORY_ONLY\n",
+"\n",
 "val eventsAggregated = spark.sql(f\"\"\"\n",
 "    SELECT user_id, \n",
 "        device_id, \n",
@@ -207,4 +210,4 @@
 },
 "nbformat": 4,
 "nbformat_minor": 5
-}
+}
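
The comments in this notebook carry the main lesson: only cache what comfortably fits in executor memory (well under 5 GB here), prefer a broadcast join for small tables, and stick to MEMORY_ONLY. A hedged sketch of what that looks like in code, where eventsAggregated and devices stand in for the notebook's DataFrames:

import org.apache.spark.sql.functions.broadcast
import org.apache.spark.storage.StorageLevel

// MEMORY_ONLY: partitions that don't fit are recomputed instead of being spilled to disk,
// so executor memory has to be sized for the cached data.
val cached = eventsAggregated.persist(StorageLevel.MEMORY_ONLY)

// For a small dimension table, broadcasting usually beats caching:
// every executor gets a full copy and the shuffle on the join key disappears.
val joined = cached.join(broadcast(devices), Seq("device_id"))

// Drop the cache once the queries that reuse it have run.
cached.unpersist()

Note that plain .cache() defaults to MEMORY_AND_DISK, which is what the StorageLevel(disk, memory, deserialized, 1 replicas) line in the plans above is showing, so an explicit persist call is needed to get MEMORY_ONLY.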

bootcamp/materials/3-spark-fundamentals/notebooks/DatasetApi.ipynb

Lines changed: 48 additions & 34 deletions
@@ -2,17 +2,37 @@
 "cells": [
 {
 "cell_type": "code",
-"execution_count": 14,
+"execution_count": 1,
 "id": "22b842be-6a82-4127-b937-ead4103a92e8",
 "metadata": {},
 "outputs": [
 {
 "data": {
 "text/plain": [
-"res13: org.apache.spark.sql.SparkSession = org.apache.spark.sql.SparkSession@7f9ff709\n"
+"Intitializing Scala interpreter ..."
 ]
 },
-"execution_count": 14,
+"metadata": {},
+"output_type": "display_data"
+},
+{
+"data": {
+"text/plain": [
+"Spark Web UI available at http://0a64d2ba5c88:4042\n",
+"SparkContext available as 'sc' (version = 3.5.1, master = local[*], app id = local-1733519375641)\n",
+"SparkSession available as 'spark'\n"
+]
+},
+"metadata": {},
+"output_type": "display_data"
+},
+{
+"data": {
+"text/plain": [
+"res0: org.apache.spark.sql.SparkSession = org.apache.spark.sql.SparkSession@591302c9\n"
+]
+},
+"execution_count": 1,
 "metadata": {},
 "output_type": "execute_result"
 }
@@ -23,42 +43,28 @@
 },
 {
 "cell_type": "code",
-"execution_count": 32,
+"execution_count": null,
+"id": "d8d4270e-b96d-4437-808a-994b0bb996b5",
+"metadata": {},
+"outputs": [],
+"source": [
+"# If something is nullable, you need to wrap the value type in Option[] - this helps enforce assumptions about the pipeline"
+]
+},
+{
+"cell_type": "code",
+"execution_count": 2,
 "id": "73b5384f-be28-49e3-9bcf-4b9783ba7d91",
 "metadata": {},
 "outputs": [
 {
-"ename": "org.apache.spark.SparkRuntimeException",
-"evalue": " Error while decoding: java.lang.NullPointerException",
+"ename": "<console>",
+"evalue": "80: error: illegal start of simple expression",
 "output_type": "error",
 "traceback": [
-"org.apache.spark.SparkRuntimeException: Error while decoding: java.lang.NullPointerException",
-"newInstance(class Event).",
-" at org.apache.spark.sql.errors.QueryExecutionErrors$.expressionDecodingError(QueryExecutionErrors.scala:1543)",
-" at org.apache.spark.sql.catalyst.encoders.ExpressionEncoder$Deserializer.apply(ExpressionEncoder.scala:178)",
-" at org.apache.spark.sql.catalyst.encoders.ExpressionEncoder$Deserializer.apply(ExpressionEncoder.scala:166)",
-" at scala.collection.TraversableLike.$anonfun$map$1(TraversableLike.scala:286)",
-" at scala.collection.IndexedSeqOptimized.foreach(IndexedSeqOptimized.scala:36)",
-" at scala.collection.IndexedSeqOptimized.foreach$(IndexedSeqOptimized.scala:33)",
-" at scala.collection.mutable.ArrayOps$ofRef.foreach(ArrayOps.scala:198)",
-" at scala.collection.TraversableLike.map(TraversableLike.scala:286)",
-" at scala.collection.TraversableLike.map$(TraversableLike.scala:279)",
-" at scala.collection.mutable.ArrayOps$ofRef.map(ArrayOps.scala:198)",
-" at org.apache.spark.sql.Dataset.collectFromPlan(Dataset.scala:4177)",
-" at org.apache.spark.sql.Dataset.$anonfun$head$1(Dataset.scala:3161)",
-" at org.apache.spark.sql.Dataset.$anonfun$withAction$2(Dataset.scala:4167)",
-" at org.apache.spark.sql.execution.QueryExecution$.withInternalError(QueryExecution.scala:526)",
-" at org.apache.spark.sql.Dataset.$anonfun$withAction$1(Dataset.scala:4165)",
-" at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$6(SQLExecution.scala:118)",
-" at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:195)",
-" at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:103)",
-" at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:827)",
-" at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:65)",
-" at org.apache.spark.sql.Dataset.withAction(Dataset.scala:4165)",
-" at org.apache.spark.sql.Dataset.head(Dataset.scala:3161)",
-" at org.apache.spark.sql.Dataset.take(Dataset.scala:3382)",
-" ... 49 elided",
-"Caused by: java.lang.NullPointerException",
+"<console>:80: error: illegal start of simple expression",
+" .map( case (row: EventWithDeviceInfo) => {",
+" ^",
 ""
 ]
 }
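
The new error in this cell is a plain Scala syntax issue: a case pattern can only start a pattern-matching anonymous function inside braces, not inside parentheses, which is why .map( case (row: EventWithDeviceInfo) => { ... fails with "illegal start of simple expression". A minimal sketch of two forms that do compile; combined and the device_type field are illustrative stand-ins, not names taken from this commit:

// Assuming a Dataset[EventWithDeviceInfo] called `combined` and spark.implicits._ in scope.

// Option 1: drop the case keyword and use an ordinary lambda.
val upper1 = combined.map((row: EventWithDeviceInfo) =>
  row.copy(device_type = row.device_type.toUpperCase))

// Option 2: keep case, but switch to a brace-delimited pattern-matching function.
val upper2 = combined.map { case row: EventWithDeviceInfo =>
  row.copy(device_type = row.device_type.toUpperCase)
}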
@@ -186,6 +192,14 @@
 "combinedViaDatasets.take(5)\n"
 ]
 },
+{
+"cell_type": "code",
+"execution_count": null,
+"id": "3ce150e3-e5b7-4ece-8803-8189762625ea",
+"metadata": {},
+"outputs": [],
+"source": []
+},
 {
 "cell_type": "code",
 "execution_count": null,
@@ -226,4 +240,4 @@
 },
 "nbformat": 4,
 "nbformat_minor": 5
-}
+}
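
The added comment about wrapping nullable values in Option[] is exactly what the NullPointerException that previously appeared in this notebook was pointing at: a case class field of a primitive type (Int, Long) cannot represent a null column, so the Dataset encoder fails while decoding. A hedged sketch, with field names guessed from the events.csv schema shown in the Caching plans rather than taken from this commit:

import org.apache.spark.sql.{Dataset, SparkSession}

val spark = SparkSession.builder().master("local[*]").getOrCreate()
import spark.implicits._

// A null user_id decoded into an Int field throws NullPointerException;
// Option[Int] turns the null into None instead.
case class Event(user_id: Option[Int], device_id: Option[Int], host: String)

val events: Dataset[Event] = spark.read
  .option("header", "true")
  .option("inferSchema", "true")
  .csv("/home/iceberg/data/events.csv")
  .as[Event]

// Downstream code is now forced to decide what to do with missing user_ids.
val withKnownUsers = events.filter(_.user_id.isDefined)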
