Add stacking/sync script for long wide table#15
Add stacking/sync script for long wide table #15 — yukinko-iwasaki wants to merge 22 commits into main from stacking-scripts
Conversation
| import pyspark.sql.functions as f | ||
|
|
There was a problem hiding this comment.
The PySpark functions module (`pyspark.sql.functions`) is imported twice, on lines 3 and 7. Remove the duplicate import to follow Python best practices and improve code maintainability.
| import pyspark.sql.functions as f |
| spark.sql(f""" | ||
| UPDATE `prd_csc_mega`.`sgld48`.`test_ingestion_metadata` | ||
| SET stacked_ouo_table_version = stacked_ouo_table_version + 1, | ||
| stacked_all_table_version = stacked_all_table_version + 1 | ||
| WHERE table_name IN {names_tuple} | ||
| """) |
There was a problem hiding this comment.
This code uses string formatting to construct an SQL UPDATE statement, which is vulnerable to SQL injection. If the table_name field contains malicious SQL code, it could be executed. Use parameterized queries or proper SQL escaping to prevent SQL injection vulnerabilities.
# 1. Read the source table, randomize the row order with f.rand(),
#    and limit the sample to 20 rows.
test_df = spark.table("`prd_csc_mega`.`sgld48`.`_ingestion_metadata`") \
    .orderBy(f.rand()) \
    .limit(20)

# 2. Save as the new test table.
#    'overwrite' mode mimics 'CREATE OR REPLACE'.
test_df.write \
    .mode("overwrite") \
    .format("delta") \
    .saveAsTable("`prd_csc_mega`.`sgld48`.`test_ingestion_metadata`")

# 3. Grab 5 random table names and "freeze" them into a Python list.
#    .collect() pulls the data out of the Spark plan and into local memory.
random_rows = spark.table("`prd_csc_mega`.`sgld48`.`test_ingestion_metadata`") \
    .select("table_name") \
    .orderBy(f.rand()) \
    .limit(5) \
    .collect()

# Create a list of strings: ['table_a', 'table_b', ...]
target_tables = [row.table_name for row in random_rows]

# 4. Update the Delta table using the frozen list.
if target_tables:
    # Build the SQL IN-list by quoting each name and doubling any embedded
    # single quotes, instead of interpolating str(tuple(...)). This prevents
    # SQL injection through table_name values and also handles the
    # single-item case without the trailing-comma .replace() hack.
    in_list = ", ".join("'" + name.replace("'", "''") + "'" for name in target_tables)

    spark.sql(f"""
        UPDATE `prd_csc_mega`.`sgld48`.`test_ingestion_metadata`
        SET stacked_ouo_table_version = stacked_ouo_table_version + 1,
            stacked_all_table_version = stacked_all_table_version + 1
        WHERE table_name IN ({in_list})
    """)

    print(f"Successfully updated: {target_tables}")
else:
    print("No rows found to update.")
There was a problem hiding this comment.
This test file doesn't define any actual test functions. All pytest test functions must start with "test_" prefix. The code appears to be a script that directly executes operations rather than a test. Consider restructuring this as proper test functions with assertions to validate expected behavior.
# 1. Read the source table, shuffle it with f.rand(), and keep 20 rows.
test_df = spark.table("`prd_csc_mega`.`sgld48`.`_ingestion_metadata`") \
    .orderBy(f.rand()) \
    .limit(20)

# 2. Save as the new test table ('overwrite' mode mimics 'CREATE OR REPLACE').
test_df.write \
    .mode("overwrite") \
    .format("delta") \
    .saveAsTable("`prd_csc_mega`.`sgld48`.`test_ingestion_metadata`")

# 3. Grab 5 random table names and "freeze" them into a Python list.
#    .collect() pulls the data out of the Spark plan and into local memory.
random_rows = spark.table("`prd_csc_mega`.`sgld48`.`test_ingestion_metadata`") \
    .select("table_name") \
    .orderBy(f.rand()) \
    .limit(5) \
    .collect()

# Create a list of strings: ['table_a', 'table_b', ...]
target_tables = [row.table_name for row in random_rows]

# 4. Update the Delta table using the frozen list.
if target_tables:
    # Quote and escape each name individually rather than interpolating
    # str(tuple(...)): this closes the SQL-injection hole flagged in review
    # (a table_name containing a quote could break out of the literal) and
    # makes the one-element case correct without the ",)" replace hack.
    escaped = ["'" + t.replace("'", "''") + "'" for t in target_tables]

    spark.sql(f"""
        UPDATE `prd_csc_mega`.`sgld48`.`test_ingestion_metadata`
        SET stacked_ouo_table_version = stacked_ouo_table_version + 1,
            stacked_all_table_version = stacked_all_table_version + 1
        WHERE table_name IN ({", ".join(escaped)})
    """)

    print(f"Successfully updated: {target_tables}")
else:
    print("No rows found to update.")
| def _prepare_and_update_test_ingestion_metadata(): | |
| # 1. Read the source table | |
| # 2. Randomize using f.rand() | |
| # 3. Limit to 20 rows | |
| test_df = spark.table("`prd_csc_mega`.`sgld48`.`_ingestion_metadata`") \ | |
| .orderBy(f.rand()) \ | |
| .limit(20) | |
| # 4. Save as the new test table | |
| # 'overwrite' mode mimics 'CREATE OR REPLACE' | |
| test_df.write \ | |
| .mode("overwrite") \ | |
| .format("delta") \ | |
| .saveAsTable("`prd_csc_mega`.`sgld48`.`test_ingestion_metadata`") | |
| # 1. Grab 5 random table names and "freeze" them into a Python list | |
| # .collect() pulls the data out of the Spark plan and into local memory | |
| random_rows = spark.table("`prd_csc_mega`.`sgld48`.`test_ingestion_metadata`") \ | |
| .select("table_name") \ | |
| .orderBy(f.rand()) \ | |
| .limit(5) \ | |
| .collect() | |
| # Create a list of strings: ['table_a', 'table_b', ...] | |
| target_tables = [row.table_name for row in random_rows] | |
| # 2. Update the Delta table using the fixed list | |
| # This uses the standard SQL 'IN' syntax but fills it with our Python list | |
| if target_tables: | |
| names_tuple = str(tuple(target_tables)).replace(",)", ")") # Handle single-item edge case | |
| spark.sql(f""" | |
| UPDATE `prd_csc_mega`.`sgld48`.`test_ingestion_metadata` | |
| SET stacked_ouo_table_version = stacked_ouo_table_version + 1, | |
| stacked_all_table_version = stacked_all_table_version + 1 | |
| WHERE table_name IN {names_tuple} | |
| """) | |
| print(f"Successfully updated: {target_tables}") | |
| else: | |
| print("No rows found to update.") | |
def test_prepare_and_update_test_ingestion_metadata():
    """
    Basic pytest-compatible test: run the stacking pipeline, then confirm the
    refreshed test table is readable and holds no more than 20 sampled rows.
    """
    _prepare_and_update_test_ingestion_metadata()
    # Reading the table proves it exists; the count checks the sampling limit.
    row_count = spark.table(
        "`prd_csc_mega`.`sgld48`.`test_ingestion_metadata`"
    ).count()
    assert 0 <= row_count <= 20
| spark_write_table(ouo_df, HARMONIZED_OFFICIAL, mode = "overwrite", options = list("overwriteSchema" = "true")) | ||
| } | ||
|
|
||
| # |
There was a problem hiding this comment.
The comment on line 183 is incomplete or unnecessary. It consists only of a "#" with no content, which should either be removed or completed with meaningful information.
| # |
| # Databricks notebook source | ||
| #TESTING |
There was a problem hiding this comment.
This file has a Databricks notebook source comment, but it appears to be a test file in the pytest directory. Test files should not be Databricks notebooks and should follow the proper pytest structure. The comment "#TESTING" on line 2 also appears to be a placeholder or debugging comment that should be removed.
| # Databricks notebook source | |
| #TESTING |
gld_incremental_sync.r
Outdated
| if (SparkR::tableExists(HARMONIZED_CONFIDENTIAL)) { | ||
| harmonized_all <- tbl(sc, HARMONIZED_CONFIDENTIAL) | ||
| } else { | ||
| create_query <- paste0( | ||
| "CREATE TABLE ", HARMONIZED_CONFIDENTIAL, | ||
| " (", columns_sql, ") USING DELTA" | ||
| ) | ||
| DBI::dbExecute(sc, create_query) | ||
| harmonized_all <- tbl(sc, HARMONIZED_CONFIDENTIAL) | ||
| } | ||
|
|
||
| # Check and create HARMONIZED_OFFICIAL if needed | ||
| if (SparkR::tableExists(HARMONIZED_OFFICIAL)) { |
There was a problem hiding this comment.
Using SparkR::tableExists while the rest of the script uses sparklyr is inconsistent and may not work correctly. The sparklyr package provides its own table existence checking through DBI::dbExistsTable or by catching errors when trying to access the table. This mixing of Spark interfaces could lead to compatibility issues.
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
…mega-gld into stacking-scripts
No description provided.