
Commit 67786c9

Update dependencies, tools, and tests

Parent: 9095018

10 files changed: 80 additions, 61 deletions


.github/workflows/push.yml

Lines changed: 1 addition & 1 deletion
@@ -59,7 +59,7 @@ jobs:
        run: pip install hatch

      - name: Run unit tests
-       run: make test
+       run: make dev test

      - name: Publish test coverage to coverage site
        uses: codecov/codecov-action@v4

CHANGELOG.md

Lines changed: 9 additions & 0 deletions
@@ -7,13 +7,22 @@ All notable changes to the Databricks Labs Data Generator will be documented in

#### Fixed
* Updated build scripts to use Ubuntu 22.04 to correspond to environment in Databricks runtime
+* Refactored `DataAnalyzer` and `BasicStockTickerProvider` to comply with ANSI SQL standards
+* Removed internal modification of `SparkSession`

#### Changed
* Changed base Databricks runtime version to DBR 13.3 LTS (based on Apache Spark 3.4.1) - minimum supported version
  of Python is now 3.10.12
+* Updated build tooling to use [hatch](https://hatch.pypa.io/latest/)
+* Moved dependencies and tool configuration to [pyproject.toml](pyproject.toml)
+* Removed dependencies provided by the Databricks Runtime
+* Updated Git actions
+* Updated [makefile](makefile)
+* Updated [CONTRIBUTING.md](CONTRIBUTING.md)

#### Added
* Added support for serialization to/from JSON format
+* Added Ruff and mypy tooling


### Version 0.4.0 Hotfix 2

README.md

Lines changed: 1 addition & 2 deletions
@@ -168,8 +168,7 @@ runtimes.
By design, installing `dbldatagen` does not install releases of dependent packages in order
to preserve the curated set of packages pre-installed in any Databricks runtime environment.

-When building on local environments, the build process uses the `Pipfile` and requirements files to determine
-the package versions for releases and unit tests.
+When building on local environments, run `make dev` to install required dependencies.

## Project Support
Please note that all projects released under [`Databricks Labs`](https://www.databricks.com/learn/labs)

dbldatagen/column_generation_spec.py

Lines changed: 1 addition & 1 deletion
@@ -943,7 +943,7 @@ def _getSeedExpression(self, base_column):
            else:
                return col(base_column[0])
        elif self._baseColumnComputeMethod == VALUES_COMPUTE_METHOD:
-           base_values = [f"string(ifnull(`{x}`, 'null'))" for x in base_column]
+           base_values = [f"string(ifnull(`{x}`, cast(null as string)))" for x in base_column]
            return expr(f"array({','.join(base_values)})")
        else:
            return expr(f"hash({','.join(base_column)})")
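Note: the seed expression now substitutes a typed SQL NULL (`cast(null as string)`) instead of the literal string `'null'`. Below is a minimal standalone sketch, not part of the repository, that evaluates the same f-string pattern against a toy DataFrame; the SparkSession settings and column name are illustrative assumptions.

from pyspark.sql import SparkSession
from pyspark.sql.functions import expr

# Illustrative local session; ANSI mode mirrors the changelog's ANSI SQL theme.
spark = (SparkSession.builder.master("local[1]")
         .config("spark.sql.ansi.enabled", "true")
         .getOrCreate())

df = spark.createDataFrame([("a",), (None,)], ["code"])

# Same expression pattern as _getSeedExpression uses for VALUES_COMPUTE_METHOD
base_values = [f"string(ifnull(`{x}`, cast(null as string)))" for x in ["code"]]
df.select(expr(f"array({','.join(base_values)})").alias("seed_values")).show()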

dbldatagen/data_analyzer.py

Lines changed: 2 additions & 2 deletions
@@ -226,13 +226,13 @@ def summarizeToDF(self):
        # string characteristics for strings and string representation of other values
        dfDataSummary = self._addMeasureToSummary(
            'print_len_min',
-           fieldExprs=[f"min(length(string({dtype[0]}))) as {dtype[0]}" for dtype in dtypes],
+           fieldExprs=[f"string(min(length(string({dtype[0]})))) as {dtype[0]}" for dtype in dtypes],
            dfData=self._df,
            dfSummary=dfDataSummary)

        dfDataSummary = self._addMeasureToSummary(
            'print_len_max',
-           fieldExprs=[f"max(length(string({dtype[0]}))) as {dtype[0]}" for dtype in dtypes],
+           fieldExprs=[f"string(max(length(string({dtype[0]})))) as {dtype[0]}" for dtype in dtypes],
            dfData=self._df,
            dfSummary=dfDataSummary)
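Note: wrapping the aggregates in `string(...)` keeps every `print_len_*` measure string-typed, matching the other string-valued rows the summary combines; the previous integer-typed expressions are presumably what tripped ANSI type checking. A rough standalone illustration (assumes an existing SparkSession named `spark`; the DataFrame is made up):

from pyspark.sql.functions import expr

df = spark.createDataFrame([("alpha",), ("bc",)], ["name"])

# min(length(...)) alone yields an integer column; the outer string() cast
# makes the measure row string-typed like the rest of the summary.
summary_row = df.select(expr("string(min(length(string(name)))) as name"))
summary_row.printSchema()   # name: string
summary_row.show()          # value is "2", the shortest printed length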

dbldatagen/datasets/basic_stock_ticker.py

Lines changed: 7 additions & 7 deletions
@@ -60,14 +60,14 @@ def getTableGenerator(self, sparkSession, *, tableName=None, rows=-1, partitions
                        baseColumn="symbol_id", omit=True)
            .withColumn("symbol", "string",
                        expr="""concat_ws('', transform(split(conv(symbol_id, 10, 26), ''),
-                           x -> case when x < 10 then char(ascii(x) - 48 + 65) else char(ascii(x) + 10) end))""")
-           .withColumn("days_from_start_date", "int", expr=f"floor(id / {numSymbols})", omit=True)
+                           x -> case when ascii(x) < 10 then char(ascii(x) - 48 + 65) else char(ascii(x) + 10) end))""")
+           .withColumn("days_from_start_date", "int", expr=f"floor(try_divide(id, {numSymbols}))", omit=True)
            .withColumn("post_date", "date", expr=f"date_add(cast('{startDate}' as date), days_from_start_date)")
            .withColumn("start_value", "decimal(11,2)",
-                       values=[1.0 + 199.0 * random() for _ in range(int(numSymbols / 10))], omit=True)
-           .withColumn("growth_rate", "float", values=[-0.1 + 0.35 * random() for _ in range(int(numSymbols / 10))],
+                       values=[1.0 + 199.0 * random() for _ in range(max(1, int(numSymbols / 10)))], omit=True)
+           .withColumn("growth_rate", "float", values=[-0.1 + 0.35 * random() for _ in range(max(1, int(numSymbols / 10)))],
                        baseColumn="symbol_id")
-           .withColumn("volatility", "float", values=[0.0075 * random() for _ in range(int(numSymbols / 10))],
+           .withColumn("volatility", "float", values=[0.0075 * random() for _ in range(max(1, int(numSymbols / 10)))],
                        baseColumn="symbol_id", omit=True)
            .withColumn("prev_modifier_sign", "float",
                        expr=f"case when sin((id - {numSymbols}) % 17) > 0 then -1.0 else 1.0 end""",

@@ -78,12 +78,12 @@ getTableGenerator(self, sparkSession, *, tableName=None, rows=-1, partitions
            .withColumn("open_base", "decimal(11,2)",
                        expr=f"""start_value
                            + (volatility * prev_modifier_sign * start_value * sin((id - {numSymbols}) % 17))
-                           + (growth_rate * start_value * (days_from_start_date - 1) / 365)""",
+                           + (growth_rate * start_value * try_divide(days_from_start_date - 1, 365))""",
                        omit=True)
            .withColumn("close_base", "decimal(11,2)",
                        expr="""start_value
                            + (volatility * start_value * sin(id % 17))
-                           + (growth_rate * start_value * days_from_start_date / 365)""",
+                           + (growth_rate * start_value * try_divide(days_from_start_date, 365))""",
                        omit=True)
            .withColumn("high_base", "decimal(11,2)",
                        expr="greatest(open_base, close_base) + rand() * volatility * open_base",

dbldatagen/text_generators.py

Lines changed: 5 additions & 5 deletions
@@ -856,8 +856,8 @@ def generateText(self, baseValues, rowCount=1):
        # hardening a mask prevents masked values from being changed
        np.ma.harden_mask(masked_offsets)
        # Cast offsets to the same dtype as the array to avoid casting errors
-       capitals_offset = word_offset_type.type(self._startOfCapitalsOffset)
-       spaced_words_offset = word_offset_type.type(self._startOfSpacedWordsOffset)
+       capitals_offset = self._wordOffsetType.type(self._startOfCapitalsOffset)
+       spaced_words_offset = self._wordOffsetType.type(self._startOfSpacedWordsOffset)
        masked_offsets[:, :, :, 0] = masked_offsets[:, :, :, 0] + capitals_offset
        masked_offsets[:, :, :, 1:] = masked_offsets[:, :, :, 1:] + spaced_words_offset
        np.ma.soften_mask(masked_offsets)

@@ -869,7 +869,7 @@ def generateText(self, baseValues, rowCount=1):
        new_col = new_word_offsets[:, :, :, np.newaxis]
        terminated_word_offsets = np.ma.concatenate((masked_offsets, new_col), axis=3)
        new_column = terminated_word_offsets[:, :, :, -1]
-       sentence_end_offset = word_offset_type.type(self._sentenceEndOffset)
+       sentence_end_offset = self._wordOffsetType.type(self._sentenceEndOffset)
        new_column[~new_column.mask] = sentence_end_offset

        # reshape to paragraphs

@@ -887,7 +887,7 @@ def generateText(self, baseValues, rowCount=1):
            # set the paragraph end marker on all paragraphs except last
            # new_masked_elements = terminated_paragraph_offsets[:,:,-1]
            new_column = terminated_paragraph_offsets[:, :, -1]
-           paragraph_end_offset = word_offset_type.type(self._paragraphEnd)
+           paragraph_end_offset = self._wordOffsetType.type(self._paragraphEnd)
            new_column[~new_column.mask] = paragraph_end_offset
        else:
            terminated_paragraph_offsets = paragraph_offsets

@@ -897,7 +897,7 @@ def generateText(self, baseValues, rowCount=1):
        shape = terminated_paragraph_offsets.shape
        terminated_paragraph_offsets = terminated_paragraph_offsets.reshape((rowCount, shape[1] * shape[2]))

-       empty_string_offset = word_offset_type.type(self._emptyStringOffset)
+       empty_string_offset = self._wordOffsetType.type(self._emptyStringOffset)
        final_data = terminated_paragraph_offsets.filled(fill_value=empty_string_offset)

        # its faster to manipulate text in data frames as numpy strings are fixed length
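Note: this change replaces references to an undefined local name `word_offset_type` with the instance attribute `self._wordOffsetType`; each offset is still cast to the array's dtype before the addition. A toy illustration of that casting pattern outside the class (the dtype and offset value are assumptions for the example):

import numpy as np

word_offset_type = np.dtype(np.uint16)   # stand-in for self._wordOffsetType
masked_offsets = np.ma.masked_array(np.zeros((2, 3), dtype=word_offset_type))

# dtype.type(value) yields a NumPy scalar of matching width, so the addition
# below stays within the array's dtype instead of promoting to a wider type.
sentence_end_offset = word_offset_type.type(300)
masked_offsets[:, -1] = masked_offsets[:, -1] + sentence_end_offset
assert masked_offsets.dtype == np.uint16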

pyproject.toml

Lines changed: 12 additions & 19 deletions
@@ -29,16 +29,7 @@ classifiers = [
]
dependencies = [
    "databricks-sdk~=0.57",
-   "numpy>=1.22.0",
-   "pandas>=1.3.4",
-   "pyarrow>=7.0.0",
-   "pyspark[sql]>=3.3.0",
-   "python-dateutil>=2.8.2",
-   "six>=1.16.0",
-   "pyparsing>=3.0.4",
-   "jmespath>=0.10.0",
-   "py4j>=0.10.9",
-   "pickleshare>=0.7.5",
+   "py4j>=0.10.9"
]

[project.urls]

@@ -49,14 +40,6 @@ Homepage = "https://github.com/databrickslabs/dbldatagen"
Repository = "https://github.com/databrickslabs/dbldatagen.git"

[project.optional-dependencies]
-dev = [
-   "pytest>=6.0.0",
-   "pytest-cov>=3.0.0",
-   "pytest-timeout",
-   "ruff>=0.1.0",
-   "pylint>=2.15.0",
-   "mypy>=1.0.0",
-]
docs = [
    "sphinx>=7.0.0",
    "sphinx-rtd-theme",

@@ -110,7 +93,17 @@ dependencies = [
    "ruff~=0.3.4",
    "types-PyYAML~=6.0.12",
    "types-requests~=2.31.0",
-   "pyspark[sql]~=3.5.0"
+   "databricks-sdk~=0.57",
+   "numpy>=1.21.5",
+   "pandas>=1.4.4",
+   "pyarrow>=8.0.0",
+   "pyspark[sql]>=3.4.1",
+   "python-dateutil>=2.8.2",
+   "six>=1.16.0",
+   "pyparsing>=3.0.9",
+   "jmespath>=0.10.0",
+   "py4j>=0.10.9",
+   "pickleshare>=0.7.5",
]

python="3.10"

tests/test_complex_columns.py

Lines changed: 18 additions & 18 deletions
@@ -523,11 +523,11 @@ def test_inferred_column_structs1(self, setupLogging):
        df = df_spec.build()

        type1 = self.getFieldType(df.schema, "struct1")
-       expectedType = StructType([StructField('a', IntegerType()), StructField('b', IntegerType())])
+       expectedType = StructType([StructField('a', IntegerType(), True), StructField('b', IntegerType(), True)])
        assert type1 == expectedType

        type2 = self.getFieldType(df.schema, "struct2")
-       expectedType2 = StructType([StructField('a', DateType(), False), StructField('b', StringType())])
+       expectedType2 = StructType([StructField('a', DateType(), False), StructField('b', StringType(), False)])
        assert type2 == expectedType2

    def test_inferred_column_structs2(self, setupLogging):

@@ -551,13 +551,13 @@ def test_inferred_column_structs2(self, setupLogging):
        df = df_spec.build()

        type1 = self.getFieldType(df.schema, "struct1")
-       assert type1 == StructType([StructField('a', IntegerType()), StructField('b', IntegerType())])
+       assert type1 == StructType([StructField('a', IntegerType(), True), StructField('b', IntegerType(), True)])
        type2 = self.getFieldType(df.schema, "struct2")
-       assert type2 == StructType([StructField('a', DateType(), False), StructField('b', StringType())])
+       assert type2 == StructType([StructField('a', DateType(), False), StructField('b', StringType(), False)])
        type3 = self.getFieldType(df.schema, "struct3")
        assert type3 == StructType(
-           [StructField('a', StructType([StructField('a', IntegerType()), StructField('b', IntegerType())]), False),
-            StructField('b', StructType([StructField('a', DateType(), False), StructField('b', StringType())]), False)]
+           [StructField('a', StructType([StructField('a', IntegerType(), True), StructField('b', IntegerType(), True)]), False),
+            StructField('b', StructType([StructField('a', DateType(), False), StructField('b', StringType(), False)]), False)]
        )

    def test_with_struct_column1(self, setupLogging):

@@ -580,9 +580,9 @@ def test_with_struct_column1(self, setupLogging):
        df = df_spec.build()

        type1 = self.getFieldType(df.schema, "struct1")
-       assert type1 == StructType([StructField('a', IntegerType()), StructField('b', IntegerType())])
+       assert type1 == StructType([StructField('a', IntegerType(), True), StructField('b', IntegerType(), True)])
        type2 = self.getFieldType(df.schema, "struct2")
-       assert type2 == StructType([StructField('a', DateType(), False), StructField('b', StringType())])
+       assert type2 == StructType([StructField('a', DateType(), False), StructField('b', StringType(), False)])

    def test_with_struct_column2(self, setupLogging):
        column_count = 10

@@ -604,9 +604,9 @@ def test_with_struct_column2(self, setupLogging):
        df = df_spec.build()

        type1 = self.getFieldType(df.schema, "struct1")
-       assert type1 == StructType([StructField('code1', IntegerType()), StructField('code2', IntegerType())])
+       assert type1 == StructType([StructField('code1', IntegerType(), True), StructField('code2', IntegerType(), True)])
        type2 = self.getFieldType(df.schema, "struct2")
-       assert type2 == StructType([StructField('code5', DateType(), False), StructField('code6', StringType())])
+       assert type2 == StructType([StructField('code5', DateType(), False), StructField('code6', StringType(), False)])

    def test_with_json_struct_column(self, setupLogging):
        column_count = 10

@@ -680,13 +680,13 @@ def test_with_struct_column3(self, setupLogging):
        df = df_spec.build()

        type1 = self.getFieldType(df.schema, "struct1")
-       assert type1 == StructType([StructField('a', IntegerType()), StructField('b', IntegerType())])
+       assert type1 == StructType([StructField('a', IntegerType(), True), StructField('b', IntegerType(), True)])
        type2 = self.getFieldType(df.schema, "struct2")
-       assert type2 == StructType([StructField('a', DateType(), False), StructField('b', StringType())])
+       assert type2 == StructType([StructField('a', DateType(), False), StructField('b', StringType(), False)])
        type3 = self.getFieldType(df.schema, "struct3")
        assert type3 == StructType(
-           [StructField('a', StructType([StructField('a', IntegerType()), StructField('b', IntegerType())]), False),
-            StructField('b', StructType([StructField('a', DateType(), False), StructField('b', StringType())]),
+           [StructField('a', StructType([StructField('a', IntegerType(), True), StructField('b', IntegerType(), True)]), False),
+            StructField('b', StructType([StructField('a', DateType(), False), StructField('b', StringType(), False)]),
                         False)])

    def test_with_struct_column4(self, setupLogging):

@@ -711,13 +711,13 @@ def test_with_struct_column4(self, setupLogging):
        df = df_spec.build()

        type1 = self.getFieldType(df.schema, "struct1")
-       assert type1 == StructType([StructField('a', IntegerType()), StructField('b', IntegerType())])
+       assert type1 == StructType([StructField('a', IntegerType(), True), StructField('b', IntegerType(), True)])
        type2 = self.getFieldType(df.schema, "struct2")
-       assert type2 == StructType([StructField('a', DateType(), False), StructField('b', StringType())])
+       assert type2 == StructType([StructField('a', DateType(), False), StructField('b', StringType(), False)])
        type3 = self.getFieldType(df.schema, "struct3")
        assert type3 == StructType(
-           [StructField('a', StructType([StructField('a', IntegerType()), StructField('b', IntegerType())]), False),
-            StructField('b', StructType([StructField('a', DateType(), False), StructField('b', StringType())]),
+           [StructField('a', StructType([StructField('a', IntegerType(), True), StructField('b', IntegerType(), True)]), False),
+            StructField('b', StructType([StructField('a', DateType(), False), StructField('b', StringType(), False)]),
                         False)])

    def test_with_struct_column_err1(self, setupLogging):
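Note: the expected schemas now spell out nullability explicitly. `StructField` defaults to `nullable=True`, so adding `True` is cosmetic, but the string fields are now expected to be non-nullable. A quick standalone check of how `StructField` equality treats nullability:

from pyspark.sql.types import StructField, IntegerType, StringType

# Defaults already match an explicit True...
assert StructField("a", IntegerType()) == StructField("a", IntegerType(), True)
# ...whereas flipping nullability changes equality, which is the substantive change here.
assert StructField("b", StringType(), False) != StructField("b", StringType())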

tests/test_serverless.py

Lines changed: 24 additions & 6 deletions
@@ -24,10 +24,28 @@ def serverlessSpark(self):

        oldSetMethod = sparkSession.conf.set
        oldGetMethod = sparkSession.conf.get
-       sparkSession.conf.set = MagicMock(
-           side_effect=ValueError("Setting value prohibited in simulated serverless env."))
-       sparkSession.conf.get = MagicMock(
-           side_effect=ValueError("Getting value prohibited in simulated serverless env."))
+       def mock_conf_set(*args, **kwargs):
+           raise ValueError("Setting value prohibited in simulated serverless env.")
+
+       def mock_conf_get(config_key, default=None):
+           # Allow internal PySpark configuration calls that are needed for basic operation
+           whitelisted_configs = {
+               'spark.sql.stackTracesInDataFrameContext': '1',
+               'spark.sql.execution.arrow.enabled': 'false',
+               'spark.sql.execution.arrow.pyspark.enabled': 'false',
+               'spark.python.sql.dataFrameDebugging.enabled': 'true',
+               'spark.sql.execution.arrow.maxRecordsPerBatch': '10000'
+           }
+           if config_key in whitelisted_configs:
+               try:
+                   return oldGetMethod(config_key, whitelisted_configs[config_key])
+               except:
+                   return whitelisted_configs[config_key]
+           else:
+               raise ValueError("Getting value prohibited in simulated serverless env.")
+
+       sparkSession.conf.set = MagicMock(side_effect=mock_conf_set)
+       sparkSession.conf.get = MagicMock(side_effect=mock_conf_get)

        yield sparkSession

@@ -59,7 +77,7 @@ def test_basic_data(self, serverlessSpark):
            )
        )

-       dfTestData = testDataSpec.build()
+       testDataSpec.build()

    @pytest.mark.parametrize("providerName, providerOptions", [
        ("basic/user", {"rows": 50, "partitions": 4, "random": False, "dummyValues": 0}),

@@ -72,4 +90,4 @@ def test_basic_user_table_retrieval(self, providerName, providerOptions, serverl
        """
        df = ds.build()

-       assert df.count() >= 0
+       assert df.count() >= 0
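Note: the fixture now routes `conf.get` through a delegating function via `MagicMock(side_effect=...)`, so whitelisted keys fall back to the real getter while everything else still raises. A stripped-down sketch of that delegation pattern, using a dummy config object rather than a real SparkSession:

from unittest.mock import MagicMock

class Config:
    def get(self, key, default=None):
        return {"spark.sql.shuffle.partitions": "200"}.get(key, default)

conf = Config()
old_get = conf.get                                # keep the original bound method
allowed = {"spark.sql.shuffle.partitions": "8"}   # illustrative whitelist

def mock_conf_get(key, default=None):
    if key in allowed:
        return old_get(key, allowed[key])         # fall through to the real getter
    raise ValueError("Getting value prohibited in simulated serverless env.")

conf.get = MagicMock(side_effect=mock_conf_get)
print(conf.get("spark.sql.shuffle.partitions"))   # "200", served by the real getter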
