
Commit d06d054

committed 2025-05-21-04-59-55
1 parent eab7f00 commit d06d054

File tree

10 files changed (+84 / -39 lines)

.python-version

Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
+3.13

capstone/rainforest/etl/silver/dim_buyer.py

Lines changed: 10 additions & 6 deletions
@@ -67,19 +67,23 @@ def transform_upstream(
         # Rename common columns in appuser_data to avoid conflicts
         appuser_data = appuser_data.selectExpr(
             *[
-                f"`{col}` as appuser_{col}"
-                if col in common_columns and col != "user_id"
-                else col
+                (
+                    f"`{col}` as appuser_{col}"
+                    if col in common_columns and col != "user_id"
+                    else col
+                )
                 for col in appuser_data.columns
             ]
         )

         # Rename common columns in buyer_data to avoid conflicts
         buyer_data = buyer_data.selectExpr(
             *[
-                f"`{col}` as buyer_{col}"
-                if col in common_columns and col != "user_id"
-                else col
+                (
+                    f"`{col}` as buyer_{col}"
+                    if col in common_columns and col != "user_id"
+                    else col
+                )
                 for col in buyer_data.columns
             ]
         )
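
For context, the pattern being reformatted here (only parentheses are added; behavior is unchanged) prefixes every column that both inputs share, except the join key, so that a later join on user_id does not produce ambiguous column names. A minimal sketch of the idea, using hypothetical toy data rather than the repo's actual schemas:

from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("rename-sketch").getOrCreate()

# Toy inputs; the column names here are made up for illustration
appuser_data = spark.createDataFrame(
    [(1, "alice", "2024-01-01")], ["user_id", "name", "created_ts"]
)
buyer_data = spark.createDataFrame(
    [(1, "gold", "2024-02-01")], ["user_id", "tier", "created_ts"]
)

# Columns appearing in both inputs would collide after a join
common_columns = set(appuser_data.columns) & set(buyer_data.columns)

appuser_data = appuser_data.selectExpr(
    *[
        (
            f"`{col}` as appuser_{col}"
            if col in common_columns and col != "user_id"
            else col
        )
        for col in appuser_data.columns
    ]
)
print(appuser_data.columns)  # ['user_id', 'name', 'appuser_created_ts']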

capstone/rainforest/etl/silver/dim_product.py

Lines changed: 21 additions & 13 deletions
@@ -70,19 +70,23 @@ def transform_upstream(
         # Rename common columns in product_data to avoid conflicts
         product_data = product_data.selectExpr(
             *[
-                f"`{col}` as product_{col}"
-                if col in common_columns and col != "brand_id"
-                else col
+                (
+                    f"`{col}` as product_{col}"
+                    if col in common_columns and col != "brand_id"
+                    else col
+                )
                 for col in product_data.columns
             ]
         )

         # Rename common columns in brand_data to avoid conflicts
         brand_data = brand_data.selectExpr(
             *[
-                f"`{col}` as brand_{col}"
-                if col in common_columns and col != "brand_id"
-                else col
+                (
+                    f"`{col}` as brand_{col}"
+                    if col in common_columns and col != "brand_id"
+                    else col
+                )
                 for col in brand_data.columns
             ]
         )
@@ -102,20 +106,24 @@ def transform_upstream(
         # Rename common columns in dim_product_data to avoid conflicts
         dim_product_data = dim_product_data.selectExpr(
             *[
-                f"`{col}` as product_{col}"
-                if col in common_columns
-                and col not in ["brand_id", "manufacturer_id"]
-                else col
+                (
+                    f"`{col}` as product_{col}"
+                    if col in common_columns
+                    and col not in ["brand_id", "manufacturer_id"]
+                    else col
+                )
                 for col in dim_product_data.columns
             ]
         )

         # Rename common columns in manufacturer_data to avoid conflicts
         manufacturer_data = manufacturer_data.selectExpr(
             *[
-                f"`{col}` as manufacturer_{col}"
-                if col in common_columns and col != "manufacturer_id"
-                else col
+                (
+                    f"`{col}` as manufacturer_{col}"
+                    if col in common_columns and col != "manufacturer_id"
+                    else col
+                )
                 for col in manufacturer_data.columns
             ]
         )

capstone/rainforest/etl/silver/dim_seller.py

Lines changed: 10 additions & 6 deletions
@@ -67,19 +67,23 @@ def transform_upstream(
         # Rename common columns in appuser_data to avoid conflicts
         appuser_data = appuser_data.selectExpr(
             *[
-                f"`{col}` as appuser_{col}"
-                if col in common_columns and col != "user_id"
-                else col
+                (
+                    f"`{col}` as appuser_{col}"
+                    if col in common_columns and col != "user_id"
+                    else col
+                )
                 for col in appuser_data.columns
             ]
         )

         # Rename common columns in seller_data to avoid conflicts
         seller_data = seller_data.selectExpr(
             *[
-                f"`{col}` as seller_{col}"
-                if col in common_columns and col != "user_id"
-                else col
+                (
+                    f"`{col}` as seller_{col}"
+                    if col in common_columns and col != "user_id"
+                    else col
+                )
                 for col in seller_data.columns
             ]
         )

capstone/rainforest/tests/conftest.py

Lines changed: 1 addition & 1 deletion
@@ -19,7 +19,7 @@ def spark():
         .config("spark.dynamicAllocation.enabled", "false")
         .config("spark.ui.enabled", "false")
         .config("spark.ui.showConsoleProgress", "false")
-        .config("spark.default.parallelism", 6) # my laptop has 6 cores
+        .config("spark.default.parallelism", 6)  # my laptop has 6 cores
         .config("spark.executor.cores", "1")
         .config("spark.executor.instances", "1")
         .config("spark.sql.shuffle.partitions", "1")
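
These .config(...) calls sit inside a pytest fixture that builds a small local SparkSession for the test suite; the only change here is the comment spacing. A minimal sketch of such a fixture, assuming a local master and session scope (both assumptions, since the hunk only shows the config lines):

import pytest
from pyspark.sql import SparkSession


@pytest.fixture(scope="session")
def spark():
    # One executor and one shuffle partition keep local test runs fast
    session = (
        SparkSession.builder.master("local[*]")
        .appName("rainforest-tests")  # hypothetical app name
        .config("spark.ui.enabled", "false")
        .config("spark.default.parallelism", 6)  # my laptop has 6 cores
        .config("spark.sql.shuffle.partitions", "1")
        .getOrCreate()
    )
    yield session
    session.stop()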

data-processing-spark/1-lab-setup/containers/spark/create_buckets.py

Lines changed: 7 additions & 5 deletions
@@ -1,6 +1,7 @@
 import boto3
-from botocore.exceptions import ClientError
 from botocore.client import Config
+from botocore.exceptions import ClientError
+

 def create_s3_client(access_key, secret_key, endpoint, region):
     """
@@ -18,9 +19,10 @@ def create_s3_client(access_key, secret_key, endpoint, region):
         endpoint_url=endpoint,
         aws_access_key_id=access_key,
         aws_secret_access_key=secret_key,
-        config=Config(signature_version='s3v4')
+        config=Config(signature_version='s3v4'),
     )

+
 def create_bucket_if_not_exists(s3_client, bucket_name):
     """
     Check if an S3 bucket exists, and if not, create it.
@@ -44,6 +46,7 @@ def create_bucket_if_not_exists(s3_client, bucket_name):
     else:
         print(f"Error: {e}")

+
 # Credentials and Connection Info
 access_key = 'minio'
 secret_key = 'minio123'
@@ -53,10 +56,9 @@ def create_bucket_if_not_exists(s3_client, bucket_name):
 # Client creation and usage
 try:
     s3_client = create_s3_client(access_key, secret_key, endpoint, region)
-    bucket_name = 'tpch'# Replace with your bucket name
+    bucket_name = 'tpch'  # Replace with your bucket name
     create_bucket_if_not_exists(s3_client, bucket_name)
-    bucket_name = 'rainforest'# Replace with your bucket name
+    bucket_name = 'rainforest'  # Replace with your bucket name
     create_bucket_if_not_exists(s3_client, bucket_name)
 except:
     print("Full catch, check bucket creation script at create_buckets.py")
-
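
The existence check in create_bucket_if_not_exists relies on the common boto3 idiom: head_bucket raises a ClientError carrying an HTTP error code, and a 404 means the bucket is missing. A sketch of that idiom (the function's exact body is not shown in these hunks, so treat this as the standard pattern rather than a verbatim copy):

from botocore.exceptions import ClientError


def create_bucket_if_not_exists(s3_client, bucket_name):
    try:
        # head_bucket succeeds only if the bucket exists and is accessible
        s3_client.head_bucket(Bucket=bucket_name)
        print(f"Bucket {bucket_name} already exists")
    except ClientError as e:
        if e.response["Error"]["Code"] == "404":
            # Bucket not found, so create it
            s3_client.create_bucket(Bucket=bucket_name)
            print(f"Created bucket {bucket_name}")
        else:
            print(f"Error: {e}")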

data-processing-spark/2-apache-spark-basics/architecture/resource_config.py

Lines changed: 9 additions & 2 deletions
@@ -1,8 +1,13 @@
 import pprint
+from typing import Dict

 from pyspark.sql import SparkSession


+def some_function(n_name: str) -> Dict[str, str]:
+    return None
+
+
 def run_code(spark):
     print("============================================")
     print("PRINT SPARKSESSION RESOURCE CONFIGS")
@@ -12,11 +17,13 @@ def run_code(spark):

     # Print the resource configurations
     print("Resource Configurations:")
-    pp = pprint.PrettyPrinter(indent=4)
+    pp = pprint.PrettyPrinter(
+        indent=4,
+    )
     pp.pprint(dict(conf.getAll()))


-if __name__ == '__main__':
+if __name__ == "__main__":
     spark = (
         SparkSession.builder.appName("efficient-data-processing-spark")
         .enableHiveSupport()
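
For reference, the conf object being pretty-printed is the session's SparkConf: spark.sparkContext.getConf().getAll() returns (key, value) pairs, which the script renders as a dict. A minimal standalone sketch of the same idea:

import pprint

from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("config-sketch").getOrCreate()
conf = spark.sparkContext.getConf()

# Dump every resource config the session is running with
pprint.PrettyPrinter(indent=4).pprint(dict(conf.getAll()))

# Individual keys can also be read directly
print(conf.get("spark.app.name"))  # -> "config-sketch"
spark.stop()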

data-processing-spark/4-data-processing/2-app-job-stage-task/spark_app_config.py

Lines changed: 12 additions & 6 deletions
@@ -3,7 +3,9 @@

 def run_code(spark):
     print("==========================================")
-    print(f"Running a simple query with {spark.sparkContext.getConf().get('spark.app.name')}")
+    print(
+        f"Running a simple query with {spark.sparkContext.getConf().get('spark.app.name')}"
+    )
     print("==========================================")

     spark.sql(
@@ -15,19 +17,23 @@ def run_code(spark):
     """
     ).show(10)

+
 if __name__ == '__main__':

     spark = (
         SparkSession.builder.appName("Custom config")
-        .config("spark.executor.memory", "2g")
-        .config("spark.executor.cores", "3") # total cores across all executors
-        .config("spark.cores.max", "3")
-        .config("spark.memory.fraction", "0.9") # set aside 10% for user memory, rest for Spark data processing
+        .config("spark.executor.memory", "2g")
+        .config(
+            "spark.executor.cores", "3"
+        )  # total cores across all executors
+        .config("spark.cores.max", "3")
+        .config(
+            "spark.memory.fraction", "0.9"
+        )  # set aside 10% for user memory, rest for Spark data processing
         .enableHiveSupport()
         .getOrCreate()
     )
     # Set the log level
     spark.sparkContext.setLogLevel("ERROR")
     run_code(spark=spark)
     spark.stop()
-
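
As the inline comment notes, spark.memory.fraction splits the executor heap between Spark's unified (execution plus storage) memory and user memory. Spark first sets aside a fixed 300 MB reservation, then applies the fraction to the remainder, so for the 2g executor configured above the split works out roughly as follows:

heap_mb = 2048        # spark.executor.memory = "2g"
reserved_mb = 300     # fixed reservation Spark keeps for itself
fraction = 0.9        # spark.memory.fraction

unified_mb = (heap_mb - reserved_mb) * fraction       # ~1573 MB for Spark
user_mb = (heap_mb - reserved_mb) * (1 - fraction)    # ~175 MB for user code
print(f"unified: {unified_mb:.0f} MB, user: {user_mb:.0f} MB")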

main.py

Lines changed: 6 additions & 0 deletions
@@ -0,0 +1,6 @@
+def main():
+    print("Hello from efficient-data-processing-spark!")
+
+
+if __name__ == "__main__":
+    main()

pyproject.toml

Lines changed: 7 additions & 0 deletions
@@ -0,0 +1,7 @@
+[project]
+name = "efficient-data-processing-spark"
+version = "0.1.0"
+description = "Add your description here"
+readme = "README.md"
+requires-python = ">=3.13"
+dependencies = []
