Commit 51591b9

Author: Ilyas Gasanov (committed)

[DOP-19901] Add integration MSSQL tests & CI

1 parent: 794f956

File tree

4 files changed: +38, -14 lines


docker-compose.test.yml

Lines changed: 1 addition & 1 deletion
@@ -95,7 +95,7 @@ services:
         condition: service_healthy
       rabbitmq:
         condition: service_healthy
-    profiles: [worker, scheduler, s3, oracle, hdfs, hive, all, clickhouse, mssql]
+    profiles: [worker, scheduler, s3, oracle, hdfs, hive, clickhouse, mssql, all]
 
   test-postgres:
     image: postgres
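Note (not part of the diff): Compose profiles gate which services start, so an invocation such as `docker compose -f docker-compose.test.yml --profile mssql up -d` picks this service up through the new `mssql` entry, and `--profile all` still covers it. Moving `all` to the end of the list is cosmetic, since profile matching ignores list order.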

syncmaster/worker/handlers/db/mssql.py

Lines changed: 1 addition & 1 deletion
@@ -34,5 +34,5 @@ def connect(self, spark: SparkSession):
 
     def normalize_column_names(self, df: DataFrame) -> DataFrame:
         for column_name in df.columns:
-            df = df.withColumnRenamed(column_name, column_name.upper())
+            df = df.withColumnRenamed(column_name, column_name.lower())
         return df
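For illustration only (not part of the commit): a minimal standalone sketch of the handler's new lower-casing rename loop, assuming a local SparkSession; the sample DataFrame and its column names are hypothetical.

from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[1]").appName("normalize-sketch").getOrCreate()

# Hypothetical frame with mixed-case column names, standing in for data read via the MSSQL handler.
df = spark.createDataFrame([(1, "Alice")], ["Id", "User Name"])

# Same loop as normalize_column_names: rename every column to its lower-case form.
for column_name in df.columns:
    df = df.withColumnRenamed(column_name, column_name.lower())

print(df.columns)  # ['id', 'user name']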

tests/test_integration/test_run_transfer/conftest.py

Lines changed: 7 additions & 7 deletions
@@ -637,31 +637,31 @@ def prepare_mssql(
         spark=spark,
     ).check()
     try:
-        onetl_conn.execute(f"DROP TABLE {mssql.user}.source_table")
+        onetl_conn.execute(f"DROP TABLE dbo.source_table")
     except Exception:
         pass
     try:
-        onetl_conn.execute(f"DROP TABLE {mssql.user}.target_table")
+        onetl_conn.execute(f"DROP TABLE dbo.target_table")
     except Exception:
         pass
 
     def fill_with_data(df: DataFrame):
-        logger.info("START PREPARE ORACLE")
+        logger.info("START PREPARE MSSQL")
         db_writer = DBWriter(
             connection=onetl_conn,
-            target=f"{mssql.user}.source_table",
+            target="dbo.source_table",
         )
         db_writer.run(df)
-        logger.info("END PREPARE ORACLE")
+        logger.info("END PREPARE MSSQL")
 
     yield onetl_conn, fill_with_data
 
     try:
-        onetl_conn.execute(f"DROP TABLE {mssql.user}.source_table")
+        onetl_conn.execute(f"DROP TABLE dbo.source_table")
     except Exception:
         pass
     try:
-        onetl_conn.execute(f"DROP TABLE {mssql.user}.target_table")
+        onetl_conn.execute(f"DROP TABLE dbo.target_table")
     except Exception:
         pass
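Design note (an assumption, not what the commit does): the try/except blocks above swallow every exception just to tolerate a missing table. On SQL Server 2016+ the same cleanup could stay idempotent without the blanket except by using DROP TABLE IF EXISTS, e.g. with the fixture's onetl_conn from above:

onetl_conn.execute("DROP TABLE IF EXISTS dbo.source_table")  # no-op when the table is absent (SQL Server 2016+)
onetl_conn.execute("DROP TABLE IF EXISTS dbo.target_table")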
667667

tests/test_integration/test_run_transfer/test_mssql.py

Lines changed: 29 additions & 5 deletions
@@ -6,6 +6,7 @@
 from onetl.connection import MSSQL
 from onetl.db import DBReader
 from pyspark.sql import DataFrame
+from pyspark.sql.functions import col, date_trunc
 from sqlalchemy.ext.asyncio import AsyncSession
 
 from syncmaster.db.models import Connection, Group, Queue, Status, Transfer
@@ -37,7 +38,7 @@ async def postgres_to_mssql(
         },
         target_params={
             "type": "mssql",
-            "table_name": f"{mssql_for_conftest.user}.target_table",
+            "table_name": "dbo.target_table",
         },
         queue_id=queue.id,
     )
@@ -63,7 +64,7 @@ async def mssql_to_postgres(
         target_connection_id=postgres_connection.id,
         source_params={
             "type": "mssql",
-            "table_name": f"{mssql_for_conftest.user}.source_table",
+            "table_name": "dbo.source_table",
         },
         target_params={
             "type": "postgres",
@@ -113,10 +114,14 @@ async def test_run_transfer_postgres_to_mssql(
     assert "password" not in target_auth_data
     reader = DBReader(
         connection=mssql,
-        table=f"{mssql.user}.target_table",
+        table="dbo.target_table",
     )
     df = reader.run()
 
+    # as spark rounds datetime up to milliseconds while writing to mssql: https://onetl.readthedocs.io/en/latest/connection/db_connection/mssql/types.html#id5
+    df = df.withColumn("REGISTERED_AT", date_trunc("second", col("REGISTERED_AT")))
+    init_df = init_df.withColumn("REGISTERED_AT", date_trunc("second", col("REGISTERED_AT")))
+
     for field in init_df.schema:
         df = df.withColumn(field.name, df[field.name].cast(field.dataType))
 
@@ -161,11 +166,19 @@ async def test_run_transfer_postgres_to_mssql_mixed_naming(
 
     reader = DBReader(
         connection=mssql,
-        table=f"{mssql.user}.target_table",
+        table=f"dbo.target_table",
     )
     df = reader.run()
+
     assert df.columns != init_df_with_mixed_column_naming.columns
-    assert df.columns == [column.lower() for column in init_df_with_mixed_column_naming.columns]
+    assert df.columns == [column.upper() for column in init_df_with_mixed_column_naming.columns]
+
+    # as spark rounds datetime up to milliseconds while writing to mssql: https://onetl.readthedocs.io/en/latest/connection/db_connection/mssql/types.html#id5
+    df = df.withColumn("Registered At", date_trunc("second", col("Registered At")))
+    init_df_with_mixed_column_naming = init_df_with_mixed_column_naming.withColumn(
+        "Registered At",
+        date_trunc("second", col("Registered At")),
+    )
 
     for field in init_df_with_mixed_column_naming.schema:
         df = df.withColumn(field.name, df[field.name].cast(field.dataType))
@@ -215,6 +228,10 @@ async def test_run_transfer_mssql_to_postgres(
     )
     df = reader.run()
 
+    # as spark rounds datetime up to milliseconds while writing to mssql: https://onetl.readthedocs.io/en/latest/connection/db_connection/mssql/types.html#id5
+    df = df.withColumn("REGISTERED_AT", date_trunc("second", col("REGISTERED_AT")))
+    init_df = init_df.withColumn("REGISTERED_AT", date_trunc("second", col("REGISTERED_AT")))
+
     for field in init_df.schema:
         df = df.withColumn(field.name, df[field.name].cast(field.dataType))
 
@@ -266,6 +283,13 @@ async def test_run_transfer_mssql_to_postgres_mixed_naming(
     assert df.columns != init_df_with_mixed_column_naming.columns
     assert df.columns == [column.lower() for column in init_df_with_mixed_column_naming.columns]
 
+    # as spark rounds datetime up to milliseconds while writing to mssql: https://onetl.readthedocs.io/en/latest/connection/db_connection/mssql/types.html#id5
+    df = df.withColumn("Registered At", date_trunc("second", col("Registered At")))
+    init_df_with_mixed_column_naming = init_df_with_mixed_column_naming.withColumn(
+        "Registered At",
+        date_trunc("second", col("Registered At")),
+    )
+
     for field in init_df_with_mixed_column_naming.schema:
         df = df.withColumn(field.name, df[field.name].cast(field.dataType))
 
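For illustration only (not part of the commit): the repeated date_trunc lines implement the rounding workaround described in the diff comments. Spark's MSSQL write path keeps only millisecond precision for datetimes, so the tests truncate both frames to whole seconds before comparing them. A minimal self-contained sketch of the same idea, assuming a local SparkSession and hypothetical data:

from datetime import datetime

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, date_trunc

spark = SparkSession.builder.master("local[1]").appName("trunc-sketch").getOrCreate()

# Two hypothetical frames whose timestamps differ only below one second,
# mimicking an exact source row versus its rounded copy read back from MSSQL.
expected = spark.createDataFrame([(1, datetime(2024, 1, 1, 12, 0, 0, 123000))], ["id", "registered_at"])
actual = spark.createDataFrame([(1, datetime(2024, 1, 1, 12, 0, 0, 124000))], ["id", "registered_at"])

# Truncate both sides to whole seconds, as the tests do, so sub-second noise cannot fail the comparison.
expected = expected.withColumn("registered_at", date_trunc("second", col("registered_at")))
actual = actual.withColumn("registered_at", date_trunc("second", col("registered_at")))

assert expected.collect() == actual.collect()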