Add serialization for DataGenerators and ColumnGenerationSpecs

ghanse · ghanse · commit 3eaf7f1bb70c · 2025-02-19T11:11:10.000-05:00
diff --git a/Pipfile b/Pipfile
@@ -26,6 +26,7 @@ pandas = "==1.2.4"
 setuptools = "==65.6.3"
 pyparsing = "==2.4.7"
 jmespath = "==0.10.0"
+pyyaml = ">=6.0.2"
 
 [requires]
 python_version = "3.8.12"
diff --git a/dbldatagen/__init__.py b/dbldatagen/__init__.py
@@ -34,6 +34,8 @@
 from ._version import __version__
 from .column_generation_spec import ColumnGenerationSpec
 from .column_spec_options import ColumnSpecOptions
+from .constraints import Constraint, ChainedRelation, LiteralRange, LiteralRelation, NegativeValues, PositiveValues, \
+    RangedValues, SqlExpr, UniqueCombinations
 from .data_analyzer import DataAnalyzer
 from .schema_parser import SchemaParser
 from .daterange import DateRange
@@ -49,7 +51,7 @@
 __all__ = ["data_generator", "data_analyzer", "schema_parser", "daterange", "nrange",
            "column_generation_spec", "utils", "function_builder",
            "spark_singleton", "text_generators", "datarange", "datagen_constants",
-           "text_generator_plugins", "html_utils", "datasets_object"
+           "text_generator_plugins", "html_utils", "datasets_object", "constraints"
            ]
 
 
diff --git a/dbldatagen/column_generation_spec.py b/dbldatagen/column_generation_spec.py
@@ -25,6 +25,7 @@
 from .daterange import DateRange
 from .distributions import Normal, DataDistribution
 from .nrange import NRange
+from .serialization import Serializable
 from .text_generators import TemplateGenerator
 from .utils import ensure, coalesce_values
 from .schema_parser import SchemaParser
@@ -40,7 +41,7 @@
                                RAW_VALUES_COMPUTE_METHOD]
 
 
-class ColumnGenerationSpec(object):
+class ColumnGenerationSpec(Serializable):
     """ Column generation spec object - specifies how column is to be generated
 
     Each column to be output will have a corresponding ColumnGenerationSpec object.
@@ -119,7 +120,7 @@ def __init__(self, name, colType=None, minValue=0, maxValue=None, step=1, prefix
             if EXPR_OPTION not in kwargs:
                 raise ValueError("Column generation spec must have `expr` attribute specified if datatype is inferred")
 
-        elif type(colType) == str:
+        elif isinstance(colType, str):
             colType = SchemaParser.columnTypeFromString(colType)
 
         assert isinstance(colType, DataType), f"colType `{colType}` is not instance of DataType"
@@ -299,6 +300,29 @@ def __init__(self, name, colType=None, minValue=0, maxValue=None, step=1, prefix
         # set up the temporary columns needed for data generation
         self._setupTemporaryColumns()
 
+    @classmethod
+    def getMapping(cls):
+        return {
+            "colName": "name",
+            "colType": "typeString",
+            "minValue": "min",
+            "maxValue": "max",
+            "step": "step",
+            "prefix": "prefix",
+            "random": "random",
+            "randomSeed": "_randomSeed",
+            "randomSeedMethod": "_randomSeedMethod",
+            "implicit": "implicit",
+            "omit": "omit",
+            "nullable": "nullable",
+            "values": "values",
+            "weights": "weights",
+            "distribution": "distribution",
+            "baseColumn": "baseColumn",
+            "dataRange": "dataRange"
+            # TODO: ADD ALL COLUMN SPEC OPTIONS?
+        }
+
     def _temporaryRename(self, tmpName):
         """ Create enter / exit object to support temporary renaming of column spec
 
@@ -417,6 +441,11 @@ def inferDatatype(self):
         """
         return self._inferDataType
 
+    @property
+    def typeString(self):
+        """ Get the simple string representing the column type."""
+        return self.datatype.simpleString()
+
     @property
     def baseColumns(self):
         """ Return base columns as list of strings"""
@@ -836,6 +865,10 @@ def numFeatures(self):
         """
         return self['numFeatures']
 
+    @property
+    def dataRange(self):
+        return self._dataRange
+
     def structType(self):
         """get the `structType` attribute used to generate values for this column
 
diff --git a/dbldatagen/data_generator.py b/dbldatagen/data_generator.py
@@ -6,20 +6,25 @@
 This file defines the `DataGenError` and `DataGenerator` classes
 """
 import copy
+import json
 import logging
 import re
 
+import yaml
 from pyspark.sql.types import LongType, IntegerType, StringType, StructType, StructField, DataType
 
 from ._version import _get_spark_version
 from .column_generation_spec import ColumnGenerationSpec
-from .constraints.constraint import Constraint
-from .constraints.sql_expr import SqlExpr
+from .constraints import Constraint, SqlExpr
+from .datarange import DataRange
+from .distributions import DataDistribution
+
 from .datagen_constants import DEFAULT_RANDOM_SEED, RANDOM_SEED_FIXED, RANDOM_SEED_HASH_FIELD_NAME, \
     DEFAULT_SEED_COLUMN, SPARK_RANGE_COLUMN, MIN_SPARK_VERSION, \
     OPTION_RANDOM, OPTION_RANDOM_SEED, OPTION_RANDOM_SEED_METHOD, \
     INFER_DATATYPE, SPARK_DEFAULT_PARALLELISM
 from .html_utils import HtmlUtils
+from .serialization import Serializable
 from .schema_parser import SchemaParser
 from .spark_singleton import SparkSingleton
 from .utils import ensure, topologicalSort, DataGenError, deprecated, split_list_matching_condition
@@ -30,7 +35,7 @@
 _STREAMING_TIMESTAMP_COLUMN = "_source_timestamp"
 
 
-class DataGenerator:
+class DataGenerator(Serializable):
     """ Main Class for test data set generation
 
     This class acts as the entry point to all test data generation activities.
@@ -173,6 +178,50 @@ def __init__(self, sparkSession=None, name=None, randomSeedMethod=None,
         # set up use of pandas udfs
         self._setupPandas(batchSize)
 
+    @classmethod
+    def getMapping(cls):
+        return {
+            "name": "name",
+            "randomSeedMethod": "_seedMethod",
+            "rows": "_rowCount",
+            "startingId": "starting_id",
+            "randomSeed": "_randomSeed",
+            "partitions": "partitions",
+            "verbose": "verbose",
+            "batchSize": "_batchSize",
+            "debug": "debug",
+            "seedColumnName": "_seedColumnName",
+            "random": "_defaultRandom"
+        }
+
+    @classmethod
+    def fromDict(cls, options):
+        """ Creates a data generator instance from a Python dictionary.
+            :param options: Python dictionary of options for the DataGenerator, ColumnGenerationSpecs, and Constraints
+            :return: Instance of the class this method is called on (DataGenerator or a subclass)
+        """
+        ir = options.copy()  # shallow copy, so the pops below do not mutate the caller's dictionary
+        columns = ir.pop("columns", [])
+        constraints = ir.pop("constraints", [])
+        return (  # NOTE(review): a "kind" entry written by toDict() is forwarded to the constructor - confirm it is accepted
+            cls(**{k: v for k, v in ir.items() if not isinstance(v, list)})  # list-valued options are not forwarded
+            .withColumnDefinitions(columns)
+            .withConstraintDefinitions(constraints)
+        )
+
+    def toDict(self):
+        """ Serializes this data generator into a plain Python dictionary.
+            :return: Dictionary of the mapped generator options plus `columns`, `constraints` and `kind` entries
+        """
+        d = {}
+        for constructor_key, object_key in self.getMapping().items():
+            d[constructor_key] = getattr(self, object_key)
+        column_dicts = (column.toDict() for column in self.getColumnGenerationSpecs())
+        d["columns"] = [{k: v for k, v in spec.items() if k != "kind"} for spec in column_dicts]
+        d["constraints"] = [constraint.toDict() for constraint in self.getConstraints()]
+        d["kind"] = self.__class__.__name__
+        return d
+
     @property
     def seedColumnName(self):
         """ return the name of data generation seed column"""
@@ -869,6 +918,26 @@ def withColumn(self, colName, colType=StringType(), minValue=None, maxValue=None
         self._inferredSchemaFields.append(StructField(colName, newColumn.datatype, nullable))
         return self
 
+    def withColumnDefinitions(self, columns):
+        """ Adds a set of columns to the synthetic generation specification.
+            :param columns: A list of column generation specifications as dictionaries
+            :returns:       The same data generator instance, modified in place, to allow chaining of calls
+            :raises ValueError: If a `dataRange` or `distribution` dictionary names an unknown `kind`
+        """
+        for column in columns:
+            internal_column = column.copy()
+            if "colName" not in internal_column:
+                internal_column["colName"] = internal_column.pop("name")
+            for key, base in (("dataRange", DataRange), ("distribution", DataDistribution)):
+                v = internal_column.get(key)
+                if isinstance(v, dict):  # None or an already-built object is passed through unchanged
+                    t = next((s for s in base.__subclasses__() if s.__name__ == v["kind"]), None)
+                    if t is None:
+                        raise ValueError(f"Unknown {key} kind `{v['kind']}`")
+                    internal_column[key] = t.fromDict(v)
+            self.withColumn(**internal_column)
+        return self
+
     def _mkSqlStructFromList(self, fields):
         """
         Create a SQL struct expression from a list of fields
@@ -1206,6 +1275,12 @@ def _getColumnDataTypes(self, columns):
         """
         return [self._columnSpecsByName[colspec].datatype for colspec in columns]
 
+    def getColumnGenerationSpecs(self):
+        return self._allColumnSpecs
+
+    def getConstraints(self):
+        return self._constraints
+
     def withConstraint(self, constraint):
         """Add a constraint to control the data generation
 
@@ -1255,6 +1330,18 @@ def withSqlConstraint(self, sqlExpression: str):
         self.withConstraint(SqlExpr(sqlExpression))
         return self
 
+    def withConstraintDefinitions(self, constraints):
+        """ Adds a set of constraints to the synthetic generation specification.
+            :param constraints: A list of constraints as dictionaries, each with a `kind` naming a Constraint subclass
+            :returns:           The same data generator instance, modified in place, to allow chaining of calls
+            :raises ValueError: If a `kind` does not match any direct subclass of Constraint """
+        for c in constraints:
+            t = next((s for s in Constraint.__subclasses__() if s.__name__ == c["kind"]), None)
+            if t is None:
+                raise ValueError(f"Unknown constraint kind `{c['kind']}`")
+            self.withConstraint(t.fromDict(c))
+        return self
+
     def computeBuildPlan(self):
         """ prepare for building by computing a pseudo build plan
 
@@ -1604,3 +1691,33 @@ def scriptMerge(self, tgtName=None, srcName=None, updateExpr=None, delExpr=None,
             result = HtmlUtils.formatCodeAsHtml(results)
 
         return result
+
+    @classmethod
+    def fromJson(cls, options):
+        """ Creates a data generator from a JSON string.
+            :param options: A JSON string containing data generation options
+            :return: A data generator (instance of the class this is called on) with the specified options
+        """
+        options = json.loads(options)
+        return cls.fromDict(options)
+
+    def toJson(self):
+        """ Returns the JSON string representation of a data generator.
+            :return: A JSON string representation of the DataGenerator
+        """
+        return json.dumps(self.toDict())
+
+    @classmethod
+    def fromYaml(cls, options):
+        """ Creates a data generator from a YAML string.
+            :param options: A YAML string containing data generation options
+            :return: A data generator (instance of the class this is called on) with the specified options
+        """
+        options = yaml.safe_load(options)  # safe_load: only standard YAML tags are constructed
+        return cls.fromDict(options)
+
+    def toYaml(self):
+        """ Returns the YAML string representation of a data generator.
+            :return: A YAML string representation of the DataGenerator, restricted to standard YAML tags
+        """
+        return yaml.safe_dump(self.toDict())  # standard tags only, so the safe_load in fromYaml can read it back
diff --git a/dbldatagen/datarange.py b/dbldatagen/datarange.py
@@ -10,10 +10,16 @@
 
 """
 
+from .serialization import Serializable
 
-class DataRange(object):
+
+class DataRange(Serializable):
     """ Abstract class used as base class for NRange and DateRange """
 
+    @classmethod
+    def getMapping(cls):
+        raise NotImplementedError("method not implemented")
+
     def isEmpty(self):
         """Check if object is empty (i.e all instance vars of note are `None`)"""
         raise NotImplementedError("method not implemented")
diff --git a/dbldatagen/daterange.py b/dbldatagen/daterange.py
@@ -44,6 +44,7 @@ def __init__(self, begin, end, interval=None, datetime_format=DEFAULT_UTC_TS_FOR
         assert begin is not None, "`begin` must be specified"
         assert end is not None, "`end` must be specified"
 
+        self.datetime_format = datetime_format
         self.begin = begin if not isinstance(begin, str) else self._datetime_from_string(begin, datetime_format)
         self.end = end if not isinstance(end, str) else self._datetime_from_string(end, datetime_format)
         self.interval = interval if not isinstance(interval, str) else self._timedelta_from_string(interval)
@@ -54,12 +55,37 @@ def __init__(self, begin, end, interval=None, datetime_format=DEFAULT_UTC_TS_FOR
                          * self.computeTimestampIntervals(self.begin, self.end, self.interval))
         self.step = self.interval.total_seconds()
 
+    @classmethod
+    def getMapping(cls):
+        return {
+            "begin": "begin_string",
+            "end": "end_string",
+            "interval": "interval_string",
+            "datetime_format": "datetime_format"
+        }
+
+    @property
+    def begin_string(self):
+        return self._string_from_datetime(self.begin, self.datetime_format)
+
+    @property
+    def end_string(self):
+        return self._string_from_datetime(self.end, self.datetime_format)
+
+    @property
+    def interval_string(self):
+        return self.formatInterval(int(self.interval.total_seconds()))  # NOTE(review): int() truncates fractional seconds - confirm acceptable
+
     @classmethod
     def _datetime_from_string(cls, date_str, date_format):
         """convert string to Python DateTime object using format"""
         result = datetime.strptime(date_str, date_format)
         return result
 
+    @classmethod
+    def _string_from_datetime(cls, date_str, date_format):  # NOTE: despite its name, `date_str` is the datetime object to format
+        return datetime.strftime(date_str, date_format)
+
     @classmethod
     def _timedelta_from_string(cls, interval):
         return cls.parseInterval(interval)
@@ -70,6 +96,11 @@ def parseInterval(cls, interval_str):
         assert interval_str is not None, "`interval_str` must be specified"
         return parse_time_interval(interval_str)
 
+    @classmethod
+    def formatInterval(cls, interval_time_seconds):  # renders a seconds count as the string "INTERVAL <n> SECONDS"
+        assert interval_time_seconds is not None, "`interval_time_seconds` must be specified"
+        return f"INTERVAL {interval_time_seconds} SECONDS"
+
     @classmethod
     def _getDateTime(cls, dt, datetime_format, default_value):
         if isinstance(dt, str):
diff --git a/dbldatagen/nrange.py b/dbldatagen/nrange.py
@@ -57,9 +57,18 @@ def __init__(self, minValue=None, maxValue=None, step=None, until=None, **kwArgs
         assert self.maxValue is None if until is not None else True, "Only one of maxValue or until can be specified"
 
         if until is not None:
+            self.until = until
             self.maxValue = until + 1
         self.step = step
 
+    @classmethod
+    def getMapping(cls):
+        return {
+            "minValue": "minValue",
+            "maxValue": "maxValue",
+            "step": "step"
+        }
+
     def __str__(self):
         return f"NRange({self.minValue}, {self.maxValue}, {self.step})"
 
diff --git a/python/dev_require.txt b/python/dev_require.txt
@@ -10,6 +10,7 @@ python-dateutil==2.8.1
 six==1.15.0
 pyparsing==2.4.7
 jmespath==0.10.0
+pyyaml>=6.0.2
 
 # The following packages are required for development only
 wheel==0.36.2
diff --git a/python/require.txt b/python/require.txt
@@ -10,6 +10,7 @@ python-dateutil==2.8.1
 six==1.15.0
 pyparsing==2.4.7
 jmespath==0.10.0
+pyyaml>=6.0.2
 
 # The following packages are required for development only
 wheel==0.36.2