Skip to content

Commit 431d865

Browse files
added changes for better build ordering when SQL expressions are pres… (#159)
* added changes for better build ordering when SQL expressions are present without baseColumn specification * updated changelog * fixed use of as_keyword in parsing * fixed typo * fixes based on review feedback
1 parent fefd904 commit 431d865

File tree

9 files changed

+540
-60
lines changed

9 files changed

+540
-60
lines changed

CHANGELOG.md

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,11 +3,22 @@
33
## Change History
44
All notable changes to the Databricks Labs Data Generator will be documented in this file.
55

6-
### Version Unreleased
6+
### Unreleased
77

88
#### Changed
9+
* Adjusted column build phase separation (i.e. which select statement is used to build columns) so that a
10+
column with a SQL expression can refer to previously created columns without use of a `baseColumn` attribute
911
* Changed build labelling to comply with PEP440
1012

13+
#### Fixed
14+
15+
#### Added
16+
* Parsing of SQL expressions to determine column dependencies
17+
18+
#### Notes
19+
* This does not change actual order of column building - but adjusts which phase columns are built in
20+
21+
1122
### Version 0.3.1
1223

1324
#### Changed

dbldatagen/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@
2727
from .datagen_constants import DEFAULT_RANDOM_SEED, RANDOM_SEED_RANDOM, RANDOM_SEED_FIXED, \
2828
RANDOM_SEED_HASH_FIELD_NAME, MIN_PYTHON_VERSION, MIN_SPARK_VERSION
2929
from .utils import ensure, topologicalSort, mkBoundsList, coalesce_values, \
30-
deprecated, parse_time_interval, DataGenError
30+
deprecated, parse_time_interval, DataGenError, split_list_matching_condition
3131
from ._version import __version__
3232
from .column_generation_spec import ColumnGenerationSpec
3333
from .column_spec_options import ColumnSpecOptions

dbldatagen/data_generator.py

Lines changed: 69 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -14,8 +14,9 @@
1414
from .column_generation_spec import ColumnGenerationSpec
1515
from .datagen_constants import DEFAULT_RANDOM_SEED, RANDOM_SEED_FIXED, RANDOM_SEED_HASH_FIELD_NAME, \
1616
DEFAULT_SEED_COLUMN, SPARK_RANGE_COLUMN, MIN_SPARK_VERSION
17-
from .utils import ensure, topologicalSort, DataGenError, deprecated
17+
from .utils import ensure, topologicalSort, DataGenError, deprecated, split_list_matching_condition
1818
from . _version import _get_spark_version
19+
from .schema_parser import SchemaParser
1920

2021
_OLD_MIN_OPTION = 'min'
2122
_OLD_MAX_OPTION = 'max'
@@ -754,7 +755,6 @@ def withColumn(self, colName, colType=StringType(), minValue=None, maxValue=None
754755
new_props = {}
755756
new_props.update(kwargs)
756757

757-
from .schema_parser import SchemaParser
758758
if type(colType) == str:
759759
colType = SchemaParser.columnTypeFromString(colType)
760760

@@ -908,24 +908,85 @@ def _computeColumnBuildOrder(self):
908908
self._seedColumnName, set())
909909
for x in self._allColumnSpecs]
910910

911-
# self.pp_list(dependency_ordering, msg="dependencies")
912-
913911
self.logger.info("dependency list: %s", str(dependency_ordering))
914912

915913
self._buildOrder = list(
916914
topologicalSort(dependency_ordering, flatten=False, initial_columns=[self._seedColumnName]))
917915

918916
self.logger.info("columnBuildOrder: %s", str(self._buildOrder))
919917

920-
# self.pp_list(self._buildOrder, "build order")
918+
self._buildOrder = self._adjustBuildOrderForSqlDependencies(self._buildOrder, self._columnSpecsByName)
919+
921920
return self._buildOrder
922921

922+
def _adjustBuildOrderForSqlDependencies(self, buildOrder, columnSpecsByName):
923+
""" Adjust column build order according to the following heuristics
924+
925+
1: if the column being built in a specific build order phase has a SQL expression and it references
926+
other columns in the same build phase (or potentially references them as the expression parsing is
927+
primitive), separate that phase into multiple phases.
928+
929+
It will also issue a warning if the SQL expression appears to reference a column built later
930+
931+
:param buildOrder: list of lists of ids - each sublist represents phase of build
932+
:param columnSpecsByName: dictionary to map column names to column specs
933+
:returns: revised build order - a list of lists of column ids, each sublist representing a build phase
934+
935+
"""
936+
new_build_order = []
937+
938+
all_columns = set([item for sublist in buildOrder for item in sublist])
939+
built_columns = []
940+
prior_phase_built_columns = []
941+
942+
# for each phase, evaluate it to see if it needs to be split
943+
for current_phase in buildOrder:
944+
separate_phase_columns = []
945+
946+
for columnBeingBuilt in current_phase:
947+
948+
if columnBeingBuilt in columnSpecsByName:
949+
cs = columnSpecsByName[columnBeingBuilt]
950+
951+
if cs.expr is not None:
952+
sql_references = SchemaParser.columnsReferencesFromSQLString(cs.expr, filter=all_columns)
953+
954+
# determine references to columns not yet built
955+
forward_references = set(sql_references) - set(built_columns)
956+
if len(forward_references) > 0:
957+
msg = f"Column '{columnBeingBuilt}' may have forward references to {forward_references}."
958+
self.logger.warning(msg)
959+
self.logger.warning("Use `baseColumn` attribute to correct build ordering if necessary")
960+
961+
references_not_yet_built = set(sql_references) - set(prior_phase_built_columns)
962+
963+
if len(references_not_yet_built.intersection(set(current_phase))) > 0:
964+
separate_phase_columns.append(columnBeingBuilt)
965+
966+
# for each column, get the set of sql references and filter against column names
967+
built_columns.append(columnBeingBuilt)
968+
969+
if len(separate_phase_columns) > 0:
970+
# split phase based on columns in separate_phase_column_list set
971+
revised_phase = split_list_matching_condition(current_phase, lambda el: el in separate_phase_columns)
972+
new_build_order.extend(revised_phase)
973+
else:
974+
# no change to phase
975+
new_build_order.append(current_phase)
976+
977+
prior_phase_built_columns.extend(current_phase)
978+
979+
return new_build_order
980+
923981
@property
924982
def build_order(self):
925983
""" return the build order minus the seed column (which defaults to `id`)
926984
927985
The build order will be a list of lists - each list specifying columns that can be built at the same time
928986
"""
987+
if not self.buildPlanComputed:
988+
self.computeBuildPlan()
989+
929990
return [x for x in self._buildOrder if x != [self._seedColumnName]]
930991

931992
def _getColumnDataTypes(self, columns):
@@ -1033,6 +1094,9 @@ def _buildColumnExpressionsWithSelects(self, df1):
10331094
Build column generation expressions with selects
10341095
:param df1: dataframe for base data generator
10351096
:return: new dataframe
1097+
1098+
The data generator build plan is separated into `rounds` of expressions. Each round consists of
1099+
expressions that are generated using a single `select` operation
10361100
"""
10371101
self.executionHistory.append("Generating data with selects")
10381102
# generation with selects may be more efficient as less intermediate data frames

dbldatagen/schema_parser.py

Lines changed: 62 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -132,7 +132,7 @@ def getTypeDefinitionParser(cls):
132132
pp.delimitedList(pp.Group(ident + pp.Optional(colon) + pp.Group(type_expr)))) + r_angle
133133

134134
# try to capture invalid type name for better error reporting
135-
invalid_type = pp.Word(pp.alphas, pp.alphanums+"_", as_keyword=True)
135+
invalid_type = pp.Word(pp.alphas, pp.alphanums+"_", asKeyword=True)
136136

137137
# use left recursion to handle nesting of types
138138
type_expr <<= pp.MatchFirst([primitive_type_keyword, array_expr, map_expr, struct_expr, invalid_type])
@@ -260,6 +260,67 @@ def columnTypeFromString(cls, type_string):
260260

261261
return type_construct
262262

263+
@classmethod
264+
def _cleanseSQL(cls, sql_string):
265+
""" Cleanse sql string removing string literals so that they are not considered as part of potential column
266+
references
267+
:param sql_string: String representation of SQL expression
268+
:returns: cleansed string
269+
270+
Any strings identified are replaced with `' '`
271+
"""
272+
assert sql_string is not None, "`sql_string` must be specified"
273+
274+
# skip over quoted identifiers even if they contain quotes
275+
quoted_ident = pp.QuotedString(quoteChar="`", escQuote="``")
276+
277+
stringForm1 = pp.Literal('r') + pp.QuotedString(quoteChar="'")
278+
stringForm2 = pp.Literal('r') + pp.QuotedString(quoteChar='"')
279+
stringForm3 = pp.QuotedString(quoteChar="'", escQuote=r"\'")
280+
stringForm4 = pp.QuotedString(quoteChar='"', escQuote=r'\"')
281+
stringForm = stringForm1 ^ stringForm2 ^ stringForm3 ^ stringForm4
282+
stringForm.set_parse_action(lambda s, loc, toks: "' '")
283+
284+
parser = quoted_ident ^ stringForm
285+
286+
transformed_string = parser.transform_string(sql_string)
287+
288+
return transformed_string
289+
290+
@classmethod
291+
def columnsReferencesFromSQLString(cls, sql_string, filter=None):
292+
""" Generate a list of possible column references from a SQL string
293+
294+
This method finds all candidate references to SQL column ids in the string
295+
296+
To avoid the overhead of a full SQL parser, the implementation will simply look for possible field names
297+
298+
Further improvements may eliminate some common syntax but in current form, reserved words will
299+
also be returned as possible column references.
300+
301+
So any uses of this must not assume that all possible references are valid column references
302+
303+
:param sql_string: String representation of SQL expression
304+
:returns: list of possible column references
305+
"""
306+
assert sql_string is not None, "`sql_string` must be specified"
307+
assert filter is None or isinstance(filter, list) or isinstance(filter, set)
308+
309+
cleansed_sql_string = cls._cleanseSQL(sql_string)
310+
311+
ident = pp.Word(pp.alphas, pp.alphanums + "_") | pp.QuotedString(quoteChar="`", escQuote="``")
312+
parser = ident
313+
314+
references = parser.search_string(cleansed_sql_string)
315+
316+
results = set([item for sublist in references for item in sublist])
317+
318+
if filter is not None:
319+
filtered_results = results.intersection(set(filter))
320+
return list(filtered_results)
321+
else:
322+
return list(results)
323+
263324
@classmethod
264325
def parseCreateTable(cls, sparkSession, source_schema):
265326
""" Parse a schema from a schema string

dbldatagen/utils.py

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,7 @@ def ensure(cond, msg="condition does not hold true"):
7575
:raises: `DataGenError` exception if condition does not hold true
7676
:returns: Does not return anything but raises exception if condition does not hold
7777
"""
78+
7879
def strip_margin(text):
7980
return re.sub(r'\n[ \t]*\|', '\n', text)
8081

@@ -214,3 +215,57 @@ def parse_time_interval(spec):
214215
)
215216

216217
return delta
218+
219+
220+
def split_list_matching_condition(lst, cond):
221+
""" Split a list on elements that match a condition
222+
223+
This will find all matches of a specific condition in the list and split the list into sublists around the
224+
element that matches this condition.
225+
226+
It will handle multiple matches performing splits on each match.
227+
228+
For example, the following code will produce the results below:
229+
230+
x = ['id', 'city_name', 'id', 'city_id', 'city_pop', 'id', 'city_id', 'city_pop','city_id', 'city_pop','id']
231+
split_list_matching_condition(x, lambda el: el == 'id')
232+
233+
234+
result:
235+
`[['id'], ['city_name'], ['id'], ['city_id', 'city_pop'],
236+
['id'], ['city_id', 'city_pop', 'city_id', 'city_pop'], ['id']]`
237+
238+
:arg lst: list of items to perform condition matches against
239+
:arg cond: lambda function or function taking single argument and returning True or False
240+
:returns: list of sublists
241+
"""
242+
243+
def match_condition(matchList, matchFn):
244+
"""Return first index of element of list matching condition"""
245+
if matchList is None or len(matchList) == 0:
246+
return -1
247+
248+
for i in range(len(matchList)):
249+
if matchFn(matchList[i]):
250+
return i
251+
252+
return -1
253+
254+
# main code
255+
retval = []
256+
257+
if lst is None:
258+
retval = lst
259+
elif len(lst) == 1:
260+
retval = [lst]
261+
else:
262+
ix = match_condition(lst, cond)
263+
if ix != -1:
264+
retval.extend(split_list_matching_condition(lst[0:ix], cond))
265+
retval.append(lst[ix:ix + 1])
266+
retval.extend(split_list_matching_condition(lst[ix + 1:], cond))
267+
else:
268+
retval = [lst]
269+
270+
# filter out empty lists
271+
return [el for el in retval if el != []]

docs/source/generating_column_data.rst

Lines changed: 18 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -107,13 +107,29 @@ This performs the following actions:
107107
- The final set of output fields will be selected (omitting any columns where the ``omit`` attribute was set to
108108
**True**)
109109

110+
.. note::
111+
112+
Normally the columns will be built in the order specified in the spec.
113+
Use of the `baseColumn` attribute may change the column build ordering.
114+
115+
110116
This has several implications:
111117

112-
- If a column is referred to in an expression, the ``baseColumn`` attribute must be defined with a dependency
118+
- If a column is referred to in an expression, the ``baseColumn`` attribute may need to be defined with a dependency
113119
on that column
114120
- If a column uses a base column with a restricted range of values then it is possible that the column
115121
will not generate the full range of values in the column generation spec
116122
- If the base column is of type ``boolean`` or some other restricted range type, computations on that base value
117123
may not produce the expected range of values
118-
- If base column is not specified, you may see errors reporting that the column in an expression does not exist
124+
- If base column is not specified, you may see errors reporting that the column in an expression does not exist.
125+
This may be fixed by specifying a column dependency using the `baseColumn` attribute
126+
127+
.. note::
128+
129+
The implementation performs primitive scanning of SQL expressions (specified using the `expr` attribute)
130+
to determine if the SQL expression depends on
131+
earlier columns and if so, will put the building of the column in a separate phase.
119132

133+
However it does not reorder the building sequence if there is a reference to a column that will be built later in the
134+
SQL expression.
135+
To enforce the dependency, you must use the `baseColumn` attribute to indicate the dependency.

0 commit comments

Comments
 (0)