 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
+
+"""
+This file defines the `ColumnGeneratorBuilder` class and utility functions
+"""
+
 import itertools
+from typing import Any
 
-from pyspark.sql.types import StringType, DateType, TimestampType
+from pyspark.sql.types import DataType, DateType, StringType, TimestampType
 
 
 class ColumnGeneratorBuilder:
-    """ Helper class to build functional column generators of specific forms"""
+    """
+    Helper class to build functional column generators of specific forms
+    """
 
     @classmethod
-    def _mkList(cls, x):
+    def _mkList(cls, x: object) -> list:
+        """
+        Makes a list of the supplied object instance if it is not already a list.
+
+        :param x: Input object to process
+        :returns: List containing the supplied object if it is not already a list; otherwise returns the object
         """
-        Makes a list of the supplied object instance if it is not already a list
-        :param x: object to process
-        :returns: Returns list of supplied object if it is not already a list, otherwise simply returns the object"""
         return [x] if type(x) is not list else x
 
     @classmethod
-    def _lastElement(cls, x):
-        """ Gets the last element, if the object is a list otherwise returns the object itself"""
-        return x[-1] if type(x) is list else x
+    def _lastElement(cls, x: object) -> object:
+        """
+        Gets the last element from the supplied object if it is a list.
+
+        :param x: Input object
+        :returns: Last element of the input object if it is a list; otherwise returns the object
+        """
+        return x[-1] if isinstance(x, list) else x
 
     @classmethod
-    def _mkCdfProbabilities(cls, weights):
-        """ make cumulative distribution function probabilities for each value in values list
+    def _mkCdfProbabilities(cls, weights: list[float]) -> list[float]:
+        """
+        Makes cumulative distribution function probabilities for each value in values list.
 
         a cumulative distribution function for discrete values can use
         a table of cumulative probabilities to evaluate different expressions
@@ -46,6 +62,9 @@ def _mkCdfProbabilities(cls, weights):
         while datasets of size 10,000 x `number of values` give a repeated
         distribution within 5% of expected distribution.
 
+        :param weights: List of weights to compute CDF probabilities for
+        :returns: List of CDF probabilities
+
         Example code to be generated (pseudo code)::
 
            # given values value1 .. valueN, prob1 to probN
@@ -61,13 +80,12 @@ def _mkCdfProbabilities(cls, weights):
 
         """
         total_weights = sum(weights)
-        return list(map(lambda x: x / total_weights, itertools.accumulate(weights)))
+        return [x / total_weights for x in itertools.accumulate(weights)]
 
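    # Worked example (editor's illustration, not part of the module): for weights [1, 2, 1],
    # itertools.accumulate produces the running totals [1, 3, 4]; dividing each by the total
    # weight (4) yields the CDF probabilities returned above, i.e.
    # _mkCdfProbabilities([1, 2, 1]) -> [0.25, 0.75, 1.0].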
     @classmethod
-    def mkExprChoicesFn(cls, values, weights, seed_column, datatype):
-        """ Create SQL expression to compute the weighted values expression
-
-        build an expression of the form::
+    def mkExprChoicesFn(cls, values: list[Any], weights: list[float], seed_column: str, datatype: DataType) -> str:
+        """
+        Creates a SQL expression to compute a weighted values expression. Builds an expression of the form::
 
            case
              when rnd_column <= weight1 then value1
@@ -77,22 +95,22 @@ def mkExprChoicesFn(cls, values, weights, seed_column, datatype):
              else valueN
           end
 
-        based on computed probability distribution for values.
-
-        In Python 3.6 onwards, we could use the choices function but this python version is not
-        guaranteed on all Databricks distributions
+        The output expression is based on the computed probability distribution for the specified values.
 
-        :param values: list of values
-        :param weights: list of weights
-        :param seed_column: base column for expression
-        :param datatype: data type of function return value
+        In Python 3.6 onwards, we could use the choices function but this python version is not guaranteed on all
+        Databricks distributions.
 
+        :param values: List of values
+        :param weights: List of weights
+        :param seed_column: Base column name for expression
+        :param datatype: Spark `DataType` of the output expression
+        :returns: SQL expression representing the weighted values
         """
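        # Worked example (editor's illustration, not part of the module): for values
        # ['a', 'b'], weights [3, 1] and seed_column 'rnd', the CDF probabilities are
        # [0.75, 1.0], so the expression assembled below would take roughly the form
        #   CASE WHEN rnd <= 0.75 THEN 'a' ELSE 'b' END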
         cdf_probs = cls._mkCdfProbabilities(weights)
 
         output = [" CASE "]
 
-        conditions = zip(values, cdf_probs)
+        conditions = zip(values, cdf_probs, strict=False)
 
         for v, cdf in conditions:
             # TODO(alex): single quotes needs to be escaped