|
6 | 6 | This file defines the `DataGenError` and `DataGenerator` classes |
7 | 7 | """ |
8 | 8 | import copy |
| 9 | +import json |
9 | 10 | import logging |
10 | 11 | import re |
11 | 12 |
|
| 13 | +import yaml |
12 | 14 | from pyspark.sql.types import LongType, IntegerType, StringType, StructType, StructField, DataType |
13 | 15 |
|
14 | 16 | from ._version import _get_spark_version |
15 | 17 | from .column_generation_spec import ColumnGenerationSpec |
16 | | -from .constraints.constraint import Constraint |
17 | | -from .constraints.sql_expr import SqlExpr |
| 18 | +from .constraints import Constraint, SqlExpr |
| 19 | +from .datarange import DataRange |
| 20 | +from .distributions import DataDistribution |
| 21 | + |
18 | 22 | from .datagen_constants import DEFAULT_RANDOM_SEED, RANDOM_SEED_FIXED, RANDOM_SEED_HASH_FIELD_NAME, \ |
19 | 23 | DEFAULT_SEED_COLUMN, SPARK_RANGE_COLUMN, MIN_SPARK_VERSION, \ |
20 | 24 | OPTION_RANDOM, OPTION_RANDOM_SEED, OPTION_RANDOM_SEED_METHOD, \ |
21 | 25 | INFER_DATATYPE, SPARK_DEFAULT_PARALLELISM |
22 | 26 | from .html_utils import HtmlUtils |
| 27 | +from .serialization import Serializable |
23 | 28 | from .schema_parser import SchemaParser |
24 | 29 | from .spark_singleton import SparkSingleton |
25 | 30 | from .utils import ensure, topologicalSort, DataGenError, deprecated, split_list_matching_condition |
|
30 | 35 | _STREAMING_TIMESTAMP_COLUMN = "_source_timestamp" |
31 | 36 |
|
32 | 37 |
|
33 | | -class DataGenerator: |
| 38 | +class DataGenerator(Serializable): |
34 | 39 | """ Main Class for test data set generation |
35 | 40 |
|
36 | 41 | This class acts as the entry point to all test data generation activities. |
@@ -173,6 +178,50 @@ def __init__(self, sparkSession=None, name=None, randomSeedMethod=None, |
173 | 178 | # set up use of pandas udfs |
174 | 179 | self._setupPandas(batchSize) |
175 | 180 |
|
| 181 | + @classmethod |
| 182 | + def getMapping(cls): |
| 183 | + return { |
| 184 | + "name": "name", |
| 185 | + "randomSeedMethod": "_seedMethod", |
| 186 | + "rows": "_rowCount", |
| 187 | + "startingId": "starting_id", |
| 188 | + "randomSeed": "_randomSeed", |
| 189 | + "partitions": "partitions", |
| 190 | + "verbose": "verbose", |
| 191 | + "batchSize": "_batchSize", |
| 192 | + "debug": "debug", |
| 193 | + "seedColumnName": "_seedColumnName", |
| 194 | + "random": "_defaultRandom" |
| 195 | + } |
| 196 | + |
| 197 | + @classmethod |
| 198 | + def fromDict(cls, options): |
| 199 | + """ Creates a DataGenerator instance from a Python dictionary. |
| 200 | + :param options: Python dictionary of options for the DataGenerator, ColumnGenerationSpecs, and Constraints |
| 201 | + :return: DataGenerator instance |
| 202 | + """ |
| 203 | + ir = options.copy() |
| 204 | + columns = ir.pop("columns") if "columns" in ir else [] |
| 205 | + constraints = ir.pop("constraints") if "constraints" in ir else [] |
| 206 | + return ( |
| 207 | + DataGenerator(**{k: v for k, v in ir.items() if not isinstance(v, list)}) |
| 208 | + .withColumnDefinitions(columns) |
| 209 | + .withConstraintDefinitions(constraints) |
| 210 | + ) |
| 211 | + |
| 212 | + def toDict(self): |
| 213 | + """ Creates a Python dictionary from a DataGenerator instance. |
| 214 | + :return: Python dictionary of options for the DataGenerator, ColumnGenerationSpecs, and Constraints |
| 215 | + """ |
| 216 | + d = {constructor_key: getattr(self, object_key) for constructor_key, object_key in self.getMapping().items()} |
| 217 | + d["columns"] = [{ |
| 218 | + k: v for k, v in column.toDict().items() |
| 219 | + if k != "kind"} |
| 220 | + for column in self.getColumnGenerationSpecs()] |
| 221 | + d["constraints"] = [constraint.toDict() for constraint in self.getConstraints()] |
| 222 | + d["kind"] = self.__class__.__name__ |
| 223 | + return d |
| 224 | + |
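A minimal usage sketch of the dictionary round-trip added above (the option values, column settings, and the `dbldatagen` import alias are illustrative assumptions; an active SparkSession is also assumed):

    import dbldatagen as dg

    # hypothetical options dict; top-level keys mirror the constructor arguments listed in getMapping()
    options = {
        "name": "example_data",
        "rows": 1000,
        "partitions": 4,
        "columns": [
            {"name": "id", "colType": "long", "minValue": 1, "maxValue": 1000},
        ],
    }
    gen = dg.DataGenerator.fromDict(options)   # builds the generator and its column specs
    as_dict = gen.toDict()                     # re-serializes, adding "kind" and "constraints" entries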
176 | 225 | @property |
177 | 226 | def seedColumnName(self): |
178 | 227 | """ return the name of data generation seed column""" |
@@ -869,6 +918,26 @@ def withColumn(self, colName, colType=StringType(), minValue=None, maxValue=None |
869 | 918 | self._inferredSchemaFields.append(StructField(colName, newColumn.datatype, nullable)) |
870 | 919 | return self |
871 | 920 |
|
| 921 | + def withColumnDefinitions(self, columns): |
| 922 | + """ Adds a set of columns to the synthetic generation specification. |
| 923 | + :param columns: A list of column generation specifications as dictionaries |
| 924 | + :returns: A modified in-place instance of a data generator allowing for chaining of calls |
| 925 | + following a builder pattern |
| 926 | + """ |
| 927 | + for column in columns: |
| 928 | + internal_column = column.copy() |
| 929 | + if "colName" not in internal_column: |
| 930 | + internal_column["colName"] = internal_column.pop("name") |
| 931 | + for k, v in internal_column.items(): |
| 932 | + if k == "dataRange": |
| 933 | + t = [s for s in DataRange.__subclasses__() if s.__name__ == v["kind"]][0] |
| 934 | + internal_column[k] = t.fromDict(v) |
| 935 | + if k == "distribution": |
| 936 | + t = [s for s in DataDistribution.__subclasses__() if s.__name__ == v["kind"]][0] |
| 937 | + internal_column[k] = t.fromDict(v) |
| 938 | + self.withColumn(**internal_column) |
| 939 | + return self |
| 940 | + |
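As a rough sketch, `withColumnDefinitions` could be driven by a plain list of dictionaries like the one below; the specific column options (`values`, `random`, the `spark` session variable) are assumptions based on the existing `withColumn` keyword options:

    # assumes `spark` is an active SparkSession and dbldatagen is imported as dg
    import dbldatagen as dg

    column_defs = [
        {"name": "customer_id", "colType": "long", "minValue": 1, "maxValue": 100000},
        {"name": "status", "colType": "string", "values": ["active", "inactive"], "random": True},
    ]
    gen = (dg.DataGenerator(sparkSession=spark, name="customers", rows=1000)
           .withColumnDefinitions(column_defs))
    df = gen.build()   # each dictionary becomes a withColumn() call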
872 | 941 | def _mkSqlStructFromList(self, fields): |
873 | 942 | """ |
874 | 943 | Create a SQL struct expression from a list of fields |
@@ -1206,6 +1275,12 @@ def _getColumnDataTypes(self, columns): |
1206 | 1275 | """ |
1207 | 1276 | return [self._columnSpecsByName[colspec].datatype for colspec in columns] |
1208 | 1277 |
|
| 1278 | + def getColumnGenerationSpecs(self): |
| 1279 | + return self._allColumnSpecs |
| 1280 | + |
| 1281 | + def getConstraints(self): |
| 1282 | + return self._constraints |
| 1283 | + |
1209 | 1284 | def withConstraint(self, constraint): |
1210 | 1285 | """Add a constraint to control the data generation |
1211 | 1286 |
|
@@ -1255,6 +1330,18 @@ def withSqlConstraint(self, sqlExpression: str): |
1255 | 1330 | self.withConstraint(SqlExpr(sqlExpression)) |
1256 | 1331 | return self |
1257 | 1332 |
|
| 1333 | + def withConstraintDefinitions(self, constraints): |
| 1334 | + """ Adds a set of constraints to the synthetic generation specification. |
| 1335 | +
|
| 1336 | + :param constraints: A list of constraints as dictionaries |
| 1337 | + :returns: A modified in-place instance of a data generator allowing for chaining of calls |
| 1338 | + following a builder pattern |
| 1339 | + """ |
| 1340 | + for c in constraints: |
| 1341 | + t = [s for s in Constraint.__subclasses__() if s.__name__ == c["kind"]][0] |
| 1342 | + self.withConstraint(t.fromDict(c))  # instantiate the matching Constraint subclass from its dict form |
| 1343 | + return self |
| 1344 | + |
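For illustration only, a constraint definition list might look like the sketch below; the exact keys each constraint accepts depend on that subclass's own `fromDict`, and the `expr` key shown for `SqlExpr` is an assumption based on its constructor:

    # "kind" selects the Constraint subclass by name; remaining keys are passed to its fromDict()
    constraint_defs = [
        {"kind": "SqlExpr", "expr": "customer_id > 0"},
    ]
    gen = gen.withConstraintDefinitions(constraint_defs)   # continuing the generator sketched earlier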
1258 | 1345 | def computeBuildPlan(self): |
1259 | 1346 | """ prepare for building by computing a pseudo build plan |
1260 | 1347 |
|
@@ -1604,3 +1691,33 @@ def scriptMerge(self, tgtName=None, srcName=None, updateExpr=None, delExpr=None, |
1604 | 1691 | result = HtmlUtils.formatCodeAsHtml(results) |
1605 | 1692 |
|
1606 | 1693 | return result |
| 1694 | + |
| 1695 | + @staticmethod |
| 1696 | + def fromJson(options): |
| 1697 | + """ Creates a data generator from a JSON string. |
| 1698 | + :param options: A JSON string containing data generation options |
| 1699 | + :return: A data generator with the specified options |
| 1700 | + """ |
| 1701 | + options = json.loads(options) |
| 1702 | + return DataGenerator.fromDict(options) |
| 1703 | + |
| 1704 | + def toJson(self): |
| 1705 | + """ Returns the JSON string representation of a data generator. |
| 1706 | + :return: A JSON string representation of the DataGenerator |
| 1707 | + """ |
| 1708 | + return json.dumps(self.toDict()) |
| 1709 | + |
| 1710 | + @staticmethod |
| 1711 | + def fromYaml(options): |
| 1712 | + """ Creates a data generator from a YAML string. |
| 1713 | + :param options: A YAML string containing data generation options |
| 1714 | + :return: A data generator with the specified options |
| 1715 | + """ |
| 1716 | + options = yaml.safe_load(options) |
| 1717 | + return DataGenerator.fromDict(options) |
| 1718 | + |
| 1719 | + def toYaml(self): |
| 1720 | + """ Returns the YAML string representation of a data generator. |
| 1721 | + :return: A YAML string representation of the DataGenerator |
| 1722 | + """ |
| 1723 | + return yaml.dump(self.toDict()) |
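A short sketch of the YAML round-trip these helpers enable; the field names and values are illustrative and simply mirror the options accepted by `fromDict` above, and an active SparkSession is assumed:

    import dbldatagen as dg

    # YAML keys mirror the fromDict/toDict options; uniform leading indentation is fine for yaml.safe_load
    yaml_spec = """
    name: example_data
    rows: 1000
    columns:
      - name: id
        colType: long
        minValue: 1
        maxValue: 1000
    """
    gen = dg.DataGenerator.fromYaml(yaml_spec)
    print(gen.toJson())   # the same definition re-serialized as JSON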