Commit 3bf88a0

added source files for py testing
1 parent 67b1133 commit 3bf88a0

File tree

7 files changed: +1885 -1 lines changed


bootcamp/materials/3-spark-fundamentals/notebooks/Homework3_Draft.ipynb

Lines changed: 1797 additions & 0 deletions
Large diffs are not rendered by default.

bootcamp/materials/3-spark-fundamentals/notebooks/event_data_pyspark.ipynb

Lines changed: 1 addition & 1 deletion
@@ -10,7 +10,7 @@
     "name": "stderr",
     "output_type": "stream",
     "text": [
-     "24/12/06 20:42:52 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.\n"
+     "24/12/11 15:50:40 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.\n"
     ]
    },
    {
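For context on the changed output line: that warning is what Spark prints when SparkSession.builder.getOrCreate() finds a session already running, in which case only runtime SQL configurations from the new builder are applied. A minimal sketch of the behaviour (the app names and conf value are made up for illustration):

from pyspark.sql import SparkSession

# The first call creates the session; the second reuses it and logs
# "Using an existing Spark session; only runtime SQL configurations will take effect."
first = SparkSession.builder.appName("first").getOrCreate()
second = (
    SparkSession.builder
    .appName("second")                             # ignored: not a runtime SQL conf
    .config("spark.sql.shuffle.partitions", "4")   # applied: runtime SQL conf
    .getOrCreate()
)
assert first is second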
Lines changed: 60 additions & 0 deletions
@@ -0,0 +1,60 @@
from pyspark.sql import SparkSession

# SCD query: collapse the per-year actor rows into streaks of unchanged
# quality_class / is_active, keyed by actorid, with start_year and end_year.
query = """
WITH with_previous AS (
    SELECT actor
         , actorid
         , current_year
         , quality_class
         , is_active
         , LAG(quality_class, 1) OVER (PARTITION BY actorid ORDER BY current_year) AS previous_quality_class
         , LAG(is_active, 1) OVER (PARTITION BY actorid ORDER BY current_year) AS previous_is_active
    FROM actors
    WHERE current_year < 2021
),
with_indicators AS (
    SELECT *
         , CASE
               WHEN quality_class <> previous_quality_class THEN 1
               WHEN is_active <> previous_is_active THEN 1
               ELSE 0
           END AS change_indicator
    FROM with_previous
),
with_streaks AS (
    SELECT *
         , SUM(change_indicator) OVER (PARTITION BY actorid ORDER BY current_year) AS streak_identifier
    FROM with_indicators
)
SELECT actor
     , actorid
     , quality_class
     , is_active
     , MIN(current_year) AS start_year
     , MAX(current_year) AS end_year
     , 2020 AS current_year
FROM with_streaks
GROUP BY actor
       , actorid
       , quality_class
       , is_active
       , streak_identifier
ORDER BY actor
       , streak_identifier
"""


def do_actor_scd_transformation(spark, dataframe):
    # Expose the input DataFrame to the SQL above under the name "actors".
    dataframe.createOrReplaceTempView("actors")
    return spark.sql(query)


def main():
    spark = SparkSession.builder \
        .master("local") \
        .appName("actors_scd") \
        .getOrCreate()
    output_df = do_actor_scd_transformation(spark, spark.table("actors"))
    output_df.write.mode("overwrite").insertInto("actors_scd")
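One note on the job module: main() is defined but nothing invokes it, so importing the file (as the test does) works, while running it directly is a no-op. A minimal entry-point sketch, assuming the file is meant to be executed as a script (e.g. via spark-submit):

# Assumed addition, not part of the commit: call main() when the module
# is executed directly rather than imported by the tests.
if __name__ == "__main__":
    main()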
Lines changed: 27 additions & 0 deletions
@@ -0,0 +1,27 @@
from collections import namedtuple

from chispa.dataframe_comparer import assert_df_equality
from pyspark.sql import functions as F

from ..jobs.actors_scd_job import do_actor_scd_transformation

# Input rows need every column the SCD query reads, so actorid and is_active
# are included alongside actor, current_year and quality_class; the actorid
# values below are illustrative placeholders.
ActorYear = namedtuple("ActorYear", "actor actorid current_year quality_class is_active")
# Output rows mirror the query's SELECT list and column order.
ActorScd = namedtuple(
    "ActorScd",
    "actor actorid quality_class is_active start_year end_year current_year"
)


def test_scd_generation(spark):
    source_data = [
        ActorYear("Meat Loaf", "nm0001", 2018, 'Good', True),
        ActorYear("Meat Loaf", "nm0001", 2019, 'Good', True),
        ActorYear("Meat Loaf", "nm0001", 2020, 'Bad', True),
        ActorYear("Meat Loaf", "nm0001", 2021, 'Bad', True),
        ActorYear("Skid Markel", "nm0002", 2020, 'Bad', True),
        ActorYear("Skid Markel", "nm0002", 2021, 'Bad', True)
    ]
    source_df = spark.createDataFrame(source_data)

    actual_df = do_actor_scd_transformation(spark, source_df)
    # The query filters current_year < 2021, so the 2021 rows never reach the
    # output and every streak that runs into 2021 ends at 2020 instead.
    expected_data = [
        ActorScd("Meat Loaf", "nm0001", 'Good', True, 2018, 2019, 2020),
        ActorScd("Meat Loaf", "nm0001", 'Bad', True, 2020, 2020, 2020),
        ActorScd("Skid Markel", "nm0002", 'Bad', True, 2020, 2020, 2020)
    ]
    # The SQL literal `2020 AS current_year` is an INT, while Python ints map
    # to LONG, so cast the expected column to keep the schemas comparable.
    expected_df = spark.createDataFrame(expected_data) \
        .withColumn("current_year", F.col("current_year").cast("int"))
    assert_df_equality(actual_df, expected_df, ignore_nullable=True)
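The test asks pytest for a `spark` fixture, which is not shown in the files above. A minimal sketch of a conftest.py that would provide it (file name and fixture scope are assumptions):

# conftest.py (assumed): session-scoped SparkSession fixture for the tests above.
import pytest
from pyspark.sql import SparkSession


@pytest.fixture(scope="session")
def spark():
    session = (
        SparkSession.builder
        .master("local[1]")
        .appName("actors_scd_tests")
        .getOrCreate()
    )
    yield session
    session.stop()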
