Skip to content

Commit 1eda552

Browse files
wip
2 parents 5f0ffc0 + 6c4702d commit 1eda552

20 files changed

+866
-292
lines changed

CHANGELOG.md

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,9 +8,19 @@ All notable changes to the Databricks Labs Data Generator will be documented in
88
#### Changed
99
* Fixed use of logger in _version.py and in spark_singleton.py
1010
* Fixed template issues
11-
* Added use of prospector to build process to validate common code issues
12-
* Apply pandas optimizations when generating multiple columns using same `withColumn` or `withColumnSpec`
1311
* Document reformatting and updates
12+
* Modified option to allow for range when specifying `numFeatures` with `structType='array'` to allow generation
13+
of varying number of columns
14+
* When generating multi-column or array valued columns, compute random seed with different name for each column
15+
16+
### Fixed
17+
* Apply pandas optimizations when generating multiple columns using same `withColumn` or `withColumnSpec`
18+
19+
### Added
20+
* Added use of prospector to build process to validate common code issues
21+
* Added top level `random` attribute to data generator specification constructor
22+
23+
1424

1525
### Version 0.3.2
1626

dbldatagen/column_generation_spec.py

Lines changed: 146 additions & 71 deletions
Large diffs are not rendered by default.

dbldatagen/column_spec_options.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,12 @@ class ColumnSpecOptions(object):
3636
3737
:param step: Step to use for range of generated value. As an alternative, you may use the `dataRange` parameter
3838
39+
:param numColumns: generate `n` columns numbered from 0 .. n-1 with same definition
40+
41+
:param numFeatures: generate `n` columns numbered from 0 .. n-1 with same definition. Alias for `numColumns`
42+
43+
:param structType: If specified as "array" and used with numColumns / numFeatures, will combine columns as array
44+
3945
:param random: If True, will generate random values for column value. Defaults to `False`
4046
4147
:param baseColumn: Either the string name of the base column, or a list of columns to use to

dbldatagen/data_generator.py

Lines changed: 48 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,9 @@
1313
from .spark_singleton import SparkSingleton
1414
from .column_generation_spec import ColumnGenerationSpec
1515
from .datagen_constants import DEFAULT_RANDOM_SEED, RANDOM_SEED_FIXED, RANDOM_SEED_HASH_FIELD_NAME, \
16-
DEFAULT_SEED_COLUMN, SPARK_RANGE_COLUMN, MIN_SPARK_VERSION
16+
DEFAULT_SEED_COLUMN, SPARK_RANGE_COLUMN, MIN_SPARK_VERSION, \
17+
OPTION_RANDOM, OPTION_RANDOM_SEED, OPTION_RANDOM_SEED_METHOD
18+
1719
from .utils import ensure, topologicalSort, DataGenError, deprecated, split_list_matching_condition
1820
from . _version import _get_spark_version
1921
from .schema_parser import SchemaParser
@@ -40,6 +42,7 @@ class DataGenerator:
4042
:param batchSize: = UDF batch number of rows to pass via Apache Arrow to Pandas UDFs
4143
:param debug: = if set to True, output debug level of information
4244
:param seedColumnName: = if set, this should be the name of the `seed` or logical `id` column. Defaults to `id`
45+
:param random: = if set, specifies default value of `random` attribute for all columns where not set
4346
4447
By default the seed column is named `id`. If you need to use this column name in your generated data,
4548
it is recommended that you use a different name for the seed column - for example `_id`.
@@ -63,6 +66,7 @@ class DataGenerator:
6366
def __init__(self, sparkSession=None, name=None, randomSeedMethod=None,
6467
rows=1000000, startingId=0, randomSeed=None, partitions=None, verbose=False,
6568
batchSize=None, debug=False, seedColumnName=DEFAULT_SEED_COLUMN,
69+
random=False,
6670
**kwargs):
6771
""" Constructor for data generator object """
6872

@@ -119,6 +123,9 @@ def __init__(self, sparkSession=None, name=None, randomSeedMethod=None,
119123

120124
self._seedMethod = randomSeedMethod
121125

126+
# set default random setting
127+
self._defaultRandom = random if random is not None else False
128+
122129
if randomSeed is None:
123130
self._instanceRandomSeed = self._randomSeed
124131

@@ -297,6 +304,13 @@ def randomSeed(self):
297304
""" return the data generation spec random seed"""
298305
return self._instanceRandomSeed
299306

307+
@property
308+
def random(self):
309+
""" return the data generation spec default random setting for columns to be used
310+
when an explicit `random` attribute setting is not supplied
311+
"""
312+
return self._defaultRandom
313+
300314
def _markForPlanRegen(self):
301315
"""Mark that build plan needs to be regenerated
302316
@@ -591,13 +605,19 @@ def withColumnSpecs(self, patterns=None, fields=None, matchTypes=None, **kwargs)
591605
:returns: modified in-place instance of test data generator allowing for chaining of calls following
592606
Builder pattern
593607
608+
.. note::
609+
matchTypes may also take SQL type strings or a list of SQL type strings such as "array<integer>"
610+
594611
You may also add a variety of options to further control the test data generation process.
595612
For full list of options, see :doc:`/reference/api/dbldatagen.column_spec_options`.
596613
597614
"""
598615
if fields is not None and type(fields) is str:
599616
fields = [fields]
600617

618+
if OPTION_RANDOM not in kwargs:
619+
kwargs[OPTION_RANDOM] = self._defaultRandom
620+
601621
# add support for deprecated legacy names
602622
if "match_types" in kwargs:
603623
assert matchTypes is None, "Argument 'match_types' is deprecated, use 'matchTypes' instead"
@@ -620,7 +640,15 @@ def withColumnSpecs(self, patterns=None, fields=None, matchTypes=None, **kwargs)
620640
effective_fields = [x for x in effective_fields for y in patterns if re.search(y, x) is not None]
621641

622642
if matchTypes is not None:
623-
effective_fields = [x for x in effective_fields for y in matchTypes
643+
effective_types = []
644+
645+
for typ in matchTypes:
646+
if isinstance(typ, str):
647+
effective_types.append(SchemaParser.columnTypeFromString(typ))
648+
else:
649+
effective_types.append(typ)
650+
651+
effective_fields = [x for x in effective_fields for y in effective_types
624652
if self.getColumnType(x) == y]
625653

626654
for f in effective_fields:
@@ -648,7 +676,7 @@ def _checkColumnOrColumnList(self, columns, allowId=False):
648676
return True
649677

650678
def withColumnSpec(self, colName, minValue=None, maxValue=None, step=1, prefix=None,
651-
random=False, distribution=None,
679+
random=None, distribution=None,
652680
implicit=False, dataRange=None, omit=False, baseColumn=None, **kwargs):
653681
""" add a column specification for an existing column
654682
@@ -670,6 +698,9 @@ def withColumnSpec(self, colName, minValue=None, maxValue=None, step=1, prefix=N
670698
Datatype parameter is only needed for `withColumn` and not permitted for `withColumnSpec`
671699
""")
672700

701+
if random is None:
702+
random = self._defaultRandom
703+
673704
# handle migration of old `min` and `max` options
674705
if _OLD_MIN_OPTION in kwargs:
675706
assert minValue is None, \
@@ -705,7 +736,7 @@ def hasColumnSpec(self, colName):
705736
return colName in self._columnSpecsByName
706737

707738
def withColumn(self, colName, colType=StringType(), minValue=None, maxValue=None, step=1,
708-
dataRange=None, prefix=None, random=False, distribution=None,
739+
dataRange=None, prefix=None, random=None, distribution=None,
709740
baseColumn=None, nullable=True,
710741
omit=False, implicit=False, noWarn=False,
711742
**kwargs):
@@ -756,6 +787,9 @@ def withColumn(self, colName, colType=StringType(), minValue=None, maxValue=None
756787
maxValue = kwargs[_OLD_MAX_OPTION]
757788
kwargs.pop(_OLD_MAX_OPTION, None)
758789

790+
if random is None:
791+
random = self._defaultRandom
792+
759793
new_props = {}
760794
new_props.update(kwargs)
761795

@@ -792,25 +826,25 @@ def _generateColumnDefinition(self, colName, colType=None, baseColumn=None,
792826
# if the column has the option `random` set to true
793827
# then use the instance level random seed
794828
# otherwise use the default random seed for the class
795-
if "randomSeed" in new_props:
796-
effective_random_seed = new_props["randomSeed"]
797-
new_props.pop("randomSeed")
798-
new_props["random"] = True
829+
if OPTION_RANDOM_SEED in new_props:
830+
effective_random_seed = new_props[OPTION_RANDOM_SEED]
831+
new_props.pop(OPTION_RANDOM_SEED)
832+
new_props[OPTION_RANDOM] = True
799833

800834
# if random seed has override but randomSeedMethod does not
801835
# set it to fixed
802-
if "randomSeedMethod" not in new_props:
803-
new_props["randomSeedMethod"] = RANDOM_SEED_FIXED
836+
if OPTION_RANDOM_SEED_METHOD not in new_props:
837+
new_props[OPTION_RANDOM_SEED_METHOD] = RANDOM_SEED_FIXED
804838

805-
elif "random" in new_props and new_props["random"]:
839+
elif OPTION_RANDOM in new_props and new_props[OPTION_RANDOM]:
806840
effective_random_seed = self._instanceRandomSeed
807841
else:
808842
effective_random_seed = self._randomSeed
809843

810844
# handle column level override
811-
if "randomSeedMethod" in new_props:
812-
effective_random_seed_method = new_props["randomSeedMethod"]
813-
new_props.pop("randomSeedMethod")
845+
if OPTION_RANDOM_SEED_METHOD in new_props:
846+
effective_random_seed_method = new_props[OPTION_RANDOM_SEED_METHOD]
847+
new_props.pop(OPTION_RANDOM_SEED_METHOD)
814848
else:
815849
effective_random_seed_method = self._seedMethod
816850

dbldatagen/datagen_constants.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,3 +36,8 @@
3636
# minimum versions for version checks
3737
MIN_PYTHON_VERSION = (3, 8)
3838
MIN_SPARK_VERSION = (3, 1, 2)
39+
40+
# options for random data generation
41+
OPTION_RANDOM = "random"
42+
OPTION_RANDOM_SEED_METHOD = "randomSeedMethod"
43+
OPTION_RANDOM_SEED = "randomSeed"

dbldatagen/text_generator_plugins.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -375,6 +375,7 @@ def fakerText(mname, *args, _lib=None, _rootClass=None, **kwargs):
375375
:param args: positional args to be passed to underlying Faker instance
376376
:param _lib: internal only param - library to load
377377
:param _rootClass: internal only param - root class to create
378+
378379
:returns: instance of PyfuncText for use with Faker
379380
380381
``fakerText("sentence")`` is same as ``FakerTextFactory()("sentence")``

dbldatagen/text_generators.py

Lines changed: 18 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -178,24 +178,24 @@ class TemplateGenerator(TextGenerator): # lgtm [py/missing-equals]
178178
179179
It uses the following special chars:
180180
181-
======== ======================================
182-
Chars Meaning
183-
======== ======================================
184-
``\\`` Apply escape to next char.
185-
v0,v1,..v9 Use base value as an array of values and substitute the `nth` element ( 0 .. 9). Always escaped.
186-
x Insert a random lowercase hex digit
187-
X Insert an uppercase random hex digit
188-
d Insert a random lowercase decimal digit
189-
D Insert an uppercase random decimal digit
190-
a Insert a random lowercase alphabetical character
191-
A Insert a random uppercase alphabetical character
192-
k Insert a random lowercase alphanumeric character
193-
K Insert a random uppercase alphanumeric character
194-
n Insert a random number between 0 .. 255 inclusive. This option must always be escaped
195-
N Insert a random number between 0 .. 65535 inclusive. This option must always be escaped
196-
w Insert a random lowercase word from the ipsum lorem word set. Always escaped
197-
W Insert a random uppercase word from the ipsum lorem word set. Always escaped
198-
======== ======================================
181+
========== ======================================
182+
Chars Meaning
183+
========== ======================================
184+
``\\`` Apply escape to next char.
185+
v0,v1,..v9 Use base value as an array of values and substitute the `nth` element ( 0 .. 9). Always escaped.
186+
x Insert a random lowercase hex digit
187+
X Insert an uppercase random hex digit
188+
d Insert a random lowercase decimal digit
189+
D Insert an uppercase random decimal digit
190+
a Insert a random lowercase alphabetical character
191+
A Insert a random uppercase alphabetical character
192+
k Insert a random lowercase alphanumeric character
193+
K Insert a random uppercase alphanumeric character
194+
n Insert a random number between 0 .. 255 inclusive. This option must always be escaped
195+
N Insert a random number between 0 .. 65535 inclusive. This option must always be escaped
196+
w Insert a random lowercase word from the ipsum lorem word set. Always escaped
197+
W Insert a random uppercase word from the ipsum lorem word set. Always escaped
198+
========== ======================================
199199
200200
.. note::
201201
If escape is used and `escapeSpecialChars` is False, then the following

dbldatagen/utils.py

Lines changed: 19 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -116,7 +116,13 @@ def topologicalSort(sources, initial_columns=None, flatten=True):
116116
:arg sources: list of ``(name, set(names of dependencies))`` pairs
117117
:arg initial_columns: force ``initial_columns`` to be computed first
118118
:arg flatten: if true, flatten output list
119-
:returns: list of names in dependency order. If not flattened, result will be list of lists
119+
:returns: list of names in dependency order separated into build phases
120+
121+
.. note::
122+
The algorithm will give preference to retaining order of inbound sequence
123+
over modifying order to produce a lower number of build phases.
124+
125+
Overall the effect is that the input build order should be retained unless there are forward references
120126
"""
121127
# generate a copy so that we can modify in place
122128
pending = [(name, set(deps)) for name, deps in sources]
@@ -127,27 +133,36 @@ def topologicalSort(sources, initial_columns=None, flatten=True):
127133
next_pending = []
128134
gen = []
129135
value_emitted = False
136+
defer_emitted = False
130137
gen_provided = []
131138
for entry in pending:
132139
name, deps = entry
133140
deps.difference_update(provided)
134141
if deps:
135142
next_pending.append((name, set(deps)))
143+
144+
# if dependencies will be satisfied by item emitted in this round, defer output
145+
if not deps.difference(gen_provided):
146+
defer_emitted = True
147+
elif defer_emitted:
148+
next_pending.append((name, set(deps)))
136149
elif name in provided:
137-
value_emitted |= True
150+
value_emitted = True
138151
else:
139152
gen.append(name)
140153
gen_provided.append(name)
141-
value_emitted |= True
154+
value_emitted = True
142155
provided.extend(gen_provided)
143156
build_orders.append(gen)
157+
144158
if not value_emitted:
145159
raise ValueError(f"cyclic or missing dependency detected [{next_pending}]")
146160

147161
pending = next_pending
148162

149163
if flatten:
150-
return [item for sublist in build_orders for item in sublist]
164+
flattened_list = [item for sublist in build_orders for item in sublist]
165+
return flattened_list
151166
else:
152167
return build_orders
153168

docs/source/APIDOCS.md

Lines changed: 18 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -165,13 +165,13 @@ testDataSpec = (
165165
numColumns=column_count,
166166
)
167167
.withColumn("code1", IntegerType(), minValue=100, maxValue=200)
168-
.withColumn("code2", IntegerType(), minValue=0, maxValue=10, random=True)
168+
.withColumn("code2", "integer", minValue=0, maxValue=10, random=True)
169169
.withColumn("code3", StringType(), values=["online", "offline", "unknown"])
170170
.withColumn(
171171
"code4", StringType(), values=["a", "b", "c"], random=True, percentNulls=0.05
172172
)
173173
.withColumn(
174-
"code5", StringType(), values=["a", "b", "c"], random=True, weights=[9, 1, 1]
174+
"code5", "string", values=["a", "b", "c"], random=True, weights=[9, 1, 1]
175175
)
176176
)
177177

@@ -193,7 +193,8 @@ column. Note this expression can refer to any preceding column including the `id
193193
inclusive. These will be computed using modulo arithmetic on the `id` column.
194194

195195
- The `withColumn` method call for the `code2` column specifies the generation of values between 0 and 10
196-
inclusive. These will be computed via a uniformly distributed random value.
196+
inclusive. These will be computed via a uniformly distributed random value. Note that type strings can be used
197+
in place of "IntegerType()"
197198

198199
> By default all random values are uniformly distributed
199200
> unless either the `weights` option is used or a specific distribution is used.
@@ -329,29 +330,29 @@ testDataSpec = (
329330
.withIdOutput()
330331
# we'll use hash of the base field to generate the ids to
331332
# avoid a simple incrementing sequence
332-
.withColumn("internal_device_id", LongType(), minValue=0x1000000000000,
333+
.withColumn("internal_device_id", "long", minValue=0x1000000000000,
333334
uniqueValues=device_population, omit=True, baseColumnType="hash",
334335
)
335336
# note for format strings, we must use "%lx" not "%x" as the
336337
# underlying value is a long
337338
.withColumn(
338-
"device_id", StringType(), format="0x%013x", baseColumn="internal_device_id"
339+
"device_id", "string", format="0x%013x", baseColumn="internal_device_id"
339340
)
340341
# the device / user attributes will be the same for the same device id
341342
# so let's use the internal device id as the base column for these attributes
342-
.withColumn("country", StringType(), values=country_codes, weights=country_weights,
343+
.withColumn("country", "string", values=country_codes, weights=country_weights,
343344
baseColumn="internal_device_id")
344-
.withColumn("manufacturer", StringType(), values=manufacturers,
345+
.withColumn("manufacturer", "string", values=manufacturers,
345346
baseColumn="internal_device_id", )
346347
# use omit = True if you don't want a column to appear in the final output
347348
# but just want to use it as part of generation of another column
348-
.withColumn("line", StringType(), values=lines, baseColumn="manufacturer",
349+
.withColumn("line", "string", values=lines, baseColumn="manufacturer",
349350
baseColumnType="hash", omit=True )
350-
.withColumn("model_ser", IntegerType(), minValue=1, maxValue=11, baseColumn="device_id",
351+
.withColumn("model_ser", "integer", minValue=1, maxValue=11, baseColumn="device_id",
351352
baseColumnType="hash", omit=True, )
352-
.withColumn("model_line", StringType(), expr="concat(line, '#', model_ser)",
353+
.withColumn("model_line", "string", expr="concat(line, '#', model_ser)",
353354
baseColumn=["line", "model_ser"] )
354-
.withColumn("event_type", StringType(),
355+
.withColumn("event_type", "string",
355356
values=["activation", "deactivation", "plan change", "telecoms activity",
356357
"internet activity", "device error", ],
357358
random=True)
@@ -379,6 +380,12 @@ of unique values.
379380
- The `withColumn` method call for the `line` column introduces a temporary column for purposes of
380381
generating other columns, but through the use of the `omit` option, omits it from the final data set.
381382

383+
> NOTE: Type strings can be used in place of instances of data type objects. Type strings use SQL data type syntax
384+
> and can be used to specify basic types, numeric types such as "decimal(10,3)" as well as complex structured types
385+
> such as "array<string>", "map<string, int>" and "struct<a:binary, b:int, c:float>".
386+
>
387+
> Type strings are case-insensitive.
388+
382389
### Scaling it up
383390

384391
When generating data, the number of rows to be generated is controlled by the `rows` parameter supplied to the

0 commit comments

Comments
 (0)