databrickslabs
diff --git a/‎CHANGELOG.md‎
Lines changed: 12 additions & 2 deletions b/‎CHANGELOG.md‎
Lines changed: 12 additions & 2 deletions
diff --git a/‎dbldatagen/column_generation_spec.py‎
Lines changed: 146 additions & 71 deletions b/‎dbldatagen/column_generation_spec.py‎
Lines changed: 146 additions & 71 deletions
diff --git a/‎dbldatagen/column_spec_options.py‎
Lines changed: 6 additions & 0 deletions b/‎dbldatagen/column_spec_options.py‎
Lines changed: 6 additions & 0 deletions
diff --git a/‎dbldatagen/data_generator.py‎
Lines changed: 48 additions & 14 deletions b/‎dbldatagen/data_generator.py‎
Lines changed: 48 additions & 14 deletions
diff --git a/‎dbldatagen/datagen_constants.py‎
Lines changed: 5 additions & 0 deletions b/‎dbldatagen/datagen_constants.py‎
Lines changed: 5 additions & 0 deletions
diff --git a/‎dbldatagen/text_generator_plugins.py‎
Lines changed: 1 addition & 0 deletions b/‎dbldatagen/text_generator_plugins.py‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎dbldatagen/text_generators.py‎
Lines changed: 18 additions & 18 deletions b/‎dbldatagen/text_generators.py‎
Lines changed: 18 additions & 18 deletions
diff --git a/‎dbldatagen/utils.py‎
Lines changed: 19 additions & 4 deletions b/‎dbldatagen/utils.py‎
Lines changed: 19 additions & 4 deletions
diff --git a/‎docs/source/APIDOCS.md‎
Lines changed: 18 additions & 11 deletions b/‎docs/source/APIDOCS.md‎
Lines changed: 18 additions & 11 deletions
@@ -8,9 +8,19 @@ All notable changes to the Databricks Labs Data Generator will be documented in
 #### Changed
 * Fixed use of logger in _version.py and in spark_singleton.py
 * Fixed template issues 
-* Added use of prospector to build process to validate common code issues
-* Apply pandas optimizations when generating multiple columns using same `withColumn` or `withColumnSpec`
 * Document reformatting and updates
+* Modified option to allow for range when specifying `numFeatures` with `structType='array'` to allow generation
+  of varying number of columns
+* When generating multi-column or array valued columns, compute random seed with different name for each column
+
+#### Fixed
+* Apply pandas optimizations when generating multiple columns using same `withColumn` or `withColumnSpec`
+
+#### Added
+* Added use of prospector to build process to validate common code issues
+* Added top level `random` attribute to data generator specification constructor
+
+
 
 ### Version 0.3.2
 
 
@@ -36,6 +36,12 @@ class ColumnSpecOptions(object):
 
     :param step: Step to use for range of generated value. As an alternative, you may use the `dataRange` parameter
 
+    :param numColumns: generate `n` columns numbered from 0 .. n-1 with same definition
+
+    :param numFeatures: generate `n` columns numbered from 0 .. n-1 with same definition. Alias for `numColumns`
+
+    :param structType: If specified as "array" and used with numColumns / numFeatures, will combine columns as array
+
     :param random: If True, will generate random values for column value. Defaults to `False`
 
     :param baseColumn: Either the string name of the base column, or a list of columns to use to
 
@@ -13,7 +13,9 @@
 from .spark_singleton import SparkSingleton
 from .column_generation_spec import ColumnGenerationSpec
 from .datagen_constants import DEFAULT_RANDOM_SEED, RANDOM_SEED_FIXED, RANDOM_SEED_HASH_FIELD_NAME, \
-                               DEFAULT_SEED_COLUMN, SPARK_RANGE_COLUMN, MIN_SPARK_VERSION
+                               DEFAULT_SEED_COLUMN, SPARK_RANGE_COLUMN, MIN_SPARK_VERSION, \
+                               OPTION_RANDOM, OPTION_RANDOM_SEED, OPTION_RANDOM_SEED_METHOD
+
 from .utils import ensure, topologicalSort, DataGenError, deprecated, split_list_matching_condition
 from . _version import _get_spark_version
 from .schema_parser import SchemaParser
@@ -40,6 +42,7 @@ class DataGenerator:
     :param batchSize: = UDF batch number of rows to pass via Apache Arrow to Pandas UDFs
     :param debug: = if set to True, output debug level of information
     :param seedColumnName: = if set, this should be the name of the `seed` or logical `id` column. Defaults to `id`
+    :param random: = if set, specifies default value of `random` attribute for all columns where not set
 
     By default the seed column is named `id`. If you need to use this column name in your generated data,
     it is recommended that you use a different name for the seed column - for example `_id`.
@@ -63,6 +66,7 @@ class DataGenerator:
     def __init__(self, sparkSession=None, name=None, randomSeedMethod=None,
                  rows=1000000, startingId=0, randomSeed=None, partitions=None, verbose=False,
                  batchSize=None, debug=False, seedColumnName=DEFAULT_SEED_COLUMN,
+                 random=False,
                  **kwargs):
         """ Constructor for data generator object """
 
@@ -119,6 +123,9 @@ def __init__(self, sparkSession=None, name=None, randomSeedMethod=None,
 
         self._seedMethod = randomSeedMethod
 
+        # set default random setting
+        self._defaultRandom = random if random is not None else False
+
         if randomSeed is None:
             self._instanceRandomSeed = self._randomSeed
 
@@ -297,6 +304,13 @@ def randomSeed(self):
         """ return the data generation spec random seed"""
         return self._instanceRandomSeed
 
+    @property
+    def random(self):
+        """ return the data generation spec default random setting for columns to be used
+            when an explicit `random` attribute setting is not supplied
+        """
+        return self._defaultRandom
+
     def _markForPlanRegen(self):
         """Mark that build plan needs to be regenerated
 
@@ -591,13 +605,19 @@ def withColumnSpecs(self, patterns=None, fields=None, matchTypes=None, **kwargs)
         :returns: modified in-place instance of test data generator allowing for chaining of calls following
                   Builder pattern
 
+        .. note::
+           matchTypes may also take SQL type strings or a list of SQL type strings such as "array<integer>"
+
         You may also add a variety of options to further control the test data generation process.
         For full list of options, see :doc:`/reference/api/dbldatagen.column_spec_options`.
 
         """
         if fields is not None and type(fields) is str:
             fields = [fields]
 
+        if OPTION_RANDOM not in kwargs:
+            kwargs[OPTION_RANDOM] = self._defaultRandom
+
         # add support for deprecated legacy names
         if "match_types" in kwargs:
             assert matchTypes is None, "Argument 'match_types' is deprecated, use 'matchTypes' instead"
@@ -620,7 +640,15 @@ def withColumnSpecs(self, patterns=None, fields=None, matchTypes=None, **kwargs)
             effective_fields = [x for x in effective_fields for y in patterns if re.search(y, x) is not None]
 
         if matchTypes is not None:
-            effective_fields = [x for x in effective_fields for y in matchTypes
+            effective_types = []
+
+            for typ in matchTypes:
+                if isinstance(typ, str):
+                    effective_types.append(SchemaParser.columnTypeFromString(typ))
+                else:
+                    effective_types.append(typ)
+
+            effective_fields = [x for x in effective_fields for y in effective_types
                                 if self.getColumnType(x) == y]
 
         for f in effective_fields:
@@ -648,7 +676,7 @@ def _checkColumnOrColumnList(self, columns, allowId=False):
         return True
 
     def withColumnSpec(self, colName, minValue=None, maxValue=None, step=1, prefix=None,
-                       random=False, distribution=None,
+                       random=None, distribution=None,
                        implicit=False, dataRange=None, omit=False, baseColumn=None, **kwargs):
         """ add a column specification for an existing column
 
@@ -670,6 +698,9 @@ def withColumnSpec(self, colName, minValue=None, maxValue=None, step=1, prefix=N
                     Datatype parameter is only needed for `withColumn` and not permitted for `withColumnSpec`
                """)
 
+        if random is None:
+            random = self._defaultRandom
+
         # handle migration of old `min` and `max` options
         if _OLD_MIN_OPTION in kwargs:
             assert minValue is None, \
@@ -705,7 +736,7 @@ def hasColumnSpec(self, colName):
         return colName in self._columnSpecsByName
 
     def withColumn(self, colName, colType=StringType(), minValue=None, maxValue=None, step=1,
-                   dataRange=None, prefix=None, random=False, distribution=None,
+                   dataRange=None, prefix=None, random=None, distribution=None,
                    baseColumn=None, nullable=True,
                    omit=False, implicit=False, noWarn=False,
                    **kwargs):
@@ -756,6 +787,9 @@ def withColumn(self, colName, colType=StringType(), minValue=None, maxValue=None
             maxValue = kwargs[_OLD_MAX_OPTION]
             kwargs.pop(_OLD_MAX_OPTION, None)
 
+        if random is None:
+            random = self._defaultRandom
+
         new_props = {}
         new_props.update(kwargs)
 
@@ -792,25 +826,25 @@ def _generateColumnDefinition(self, colName, colType=None, baseColumn=None,
         # if the column  has the option `random` set to true
         # then use the instance level random seed
         # otherwise use the default random seed for the class
-        if "randomSeed" in new_props:
-            effective_random_seed = new_props["randomSeed"]
-            new_props.pop("randomSeed")
-            new_props["random"] = True
+        if OPTION_RANDOM_SEED in new_props:
+            effective_random_seed = new_props[OPTION_RANDOM_SEED]
+            new_props.pop(OPTION_RANDOM_SEED)
+            new_props[OPTION_RANDOM] = True
 
             # if random seed has override but randomSeedMethod does not
             # set it to fixed
-            if "randomSeedMethod" not in new_props:
-                new_props["randomSeedMethod"] = RANDOM_SEED_FIXED
+            if OPTION_RANDOM_SEED_METHOD not in new_props:
+                new_props[OPTION_RANDOM_SEED_METHOD] = RANDOM_SEED_FIXED
 
-        elif "random" in new_props and new_props["random"]:
+        elif OPTION_RANDOM in new_props and new_props[OPTION_RANDOM]:
             effective_random_seed = self._instanceRandomSeed
         else:
             effective_random_seed = self._randomSeed
 
         # handle column level override
-        if "randomSeedMethod" in new_props:
-            effective_random_seed_method = new_props["randomSeedMethod"]
-            new_props.pop("randomSeedMethod")
+        if OPTION_RANDOM_SEED_METHOD in new_props:
+            effective_random_seed_method = new_props[OPTION_RANDOM_SEED_METHOD]
+            new_props.pop(OPTION_RANDOM_SEED_METHOD)
         else:
             effective_random_seed_method = self._seedMethod
 
 
@@ -36,3 +36,8 @@
 # minimum versions for version checks
 MIN_PYTHON_VERSION = (3, 8)
 MIN_SPARK_VERSION = (3, 1, 2)
+
+# options for random data generation
+OPTION_RANDOM = "random"
+OPTION_RANDOM_SEED_METHOD = "randomSeedMethod"
+OPTION_RANDOM_SEED = "randomSeed"
@@ -375,6 +375,7 @@ def fakerText(mname, *args, _lib=None, _rootClass=None, **kwargs):
        :param args: positional args to be passed to underlying Faker instance
        :param _lib: internal only param - library to load
        :param _rootClass: internal only param - root class to create
+       
        :returns: instance of PyfuncText for use with Faker
 
        ``fakerText("sentence")`` is same as ``FakerTextFactory()("sentence")``
 
@@ -178,24 +178,24 @@ class TemplateGenerator(TextGenerator):  # lgtm [py/missing-equals]
 
     It uses the following special chars:
 
-    ========   ======================================
-    Chars      Meaning
-    ========   ======================================
-    ``\\``     Apply escape to next char.
-    v0,v1,..v9 Use base value as an array of values and substitute the `nth` element ( 0 .. 9). Always escaped.
-    x          Insert a random lowercase hex digit
-    X          Insert an uppercase random hex digit
-    d          Insert a random lowercase decimal digit
-    D          Insert an uppercase random decimal digit
-    a          Insert a random lowercase alphabetical character
-    A          Insert a random uppercase alphabetical character
-    k          Insert a random lowercase alphanumeric character
-    K          Insert a random uppercase alphanumeric character
-    n          Insert a random number between 0 .. 255 inclusive. This option must always be escaped
-    N          Insert a random number between 0 .. 65535 inclusive. This option must always be escaped
-    w          Insert a random lowercase word from the ipsum lorem word set. Always escaped
-    W          Insert a random uppercase word from the ipsum lorem word set. Always escaped
-    ========   ======================================
+    ==========  ======================================
+    Chars       Meaning
+    ==========  ======================================
+    ``\\``       Apply escape to next char.
+    v0,v1,..v9  Use base value as an array of values and substitute the `nth` element ( 0 .. 9). Always escaped.
+    x           Insert a random lowercase hex digit
+    X           Insert an uppercase random hex digit
+    d           Insert a random lowercase decimal digit
+    D           Insert an uppercase random decimal digit
+    a           Insert a random lowercase alphabetical character
+    A           Insert a random uppercase alphabetical character
+    k           Insert a random lowercase alphanumeric character
+    K           Insert a random uppercase alphanumeric character
+    n           Insert a random number between 0 .. 255 inclusive. This option must always be escaped
+    N           Insert a random number between 0 .. 65535 inclusive. This option must always be escaped
+    w           Insert a random lowercase word from the ipsum lorem word set. Always escaped
+    W           Insert a random uppercase word from the ipsum lorem word set. Always escaped
+    ==========  ======================================
 
     .. note::
              If escape is used and `escapeSpecialChars` is False, then the following
 
@@ -116,7 +116,13 @@ def topologicalSort(sources, initial_columns=None, flatten=True):
     :arg sources: list of ``(name, set(names of dependencies))`` pairs
     :arg initial_columns: force ``initial_columns`` to be computed first
     :arg flatten: if true, flatten output list
-    :returns: list of names in dependency order. If not flattened, result will be list of lists
+    :returns: list of names in dependency order separated into build phases
+
+    .. note::
+       The algorithm will give preference to retaining order of inbound sequence
+       over modifying order to produce a lower number of build phases.
+
+       Overall the effect is that the input build order should be retained unless there are forward references
     """
     # generate a copy so that we can modify in place
     pending = [(name, set(deps)) for name, deps in sources]
@@ -127,27 +133,36 @@ def topologicalSort(sources, initial_columns=None, flatten=True):
         next_pending = []
         gen = []
         value_emitted = False
+        defer_emitted = False
         gen_provided = []
         for entry in pending:
             name, deps = entry
             deps.difference_update(provided)
             if deps:
                 next_pending.append((name, set(deps)))
+
+                # if dependencies will be satisfied by item emitted in this round, defer output
+                if not deps.difference(gen_provided):
+                    defer_emitted = True
+            elif defer_emitted:
+                next_pending.append((name, set(deps)))
             elif name in provided:
-                value_emitted |= True
+                value_emitted = True
             else:
                 gen.append(name)
                 gen_provided.append(name)
-                value_emitted |= True
+                value_emitted = True
         provided.extend(gen_provided)
         build_orders.append(gen)
+
         if not value_emitted:
             raise ValueError(f"cyclic or missing dependency detected [{next_pending}]")
 
         pending = next_pending
 
     if flatten:
-        return [item for sublist in build_orders for item in sublist]
+        flattened_list = [item for sublist in build_orders for item in sublist]
+        return flattened_list
     else:
         return build_orders
 
 
@@ -165,13 +165,13 @@ testDataSpec = (
         numColumns=column_count,
     )
     .withColumn("code1", IntegerType(), minValue=100, maxValue=200)
-    .withColumn("code2", IntegerType(), minValue=0, maxValue=10, random=True)
+    .withColumn("code2", "integer", minValue=0, maxValue=10, random=True)
     .withColumn("code3", StringType(), values=["online", "offline", "unknown"])
     .withColumn(
         "code4", StringType(), values=["a", "b", "c"], random=True, percentNulls=0.05
     )
     .withColumn(
-        "code5", StringType(), values=["a", "b", "c"], random=True, weights=[9, 1, 1]
+        "code5", "string", values=["a", "b", "c"], random=True, weights=[9, 1, 1]
     )
 )
 
@@ -193,7 +193,8 @@ column. Note this expression can refer to any preceding column including the `id
 inclusive. These will be computed using modulo arithmetic on the `id` column. 
 
 - The `withColumn` method call for the `code2` column specifies the generation of values between 0 and 10 
-inclusive. These will be computed via a uniformly distributed random value. 
+inclusive. These will be computed via a uniformly distributed random value. Note that type strings can be used
+in place of data type instances such as `IntegerType()`.
 
 > By default all random values are uniformly distributed
 > unless either the `weights` option is used or a specific distribution is used. 
@@ -329,29 +330,29 @@ testDataSpec = (
     .withIdOutput()
     # we'll use hash of the base field to generate the ids to
     # avoid a simple incrementing sequence
-    .withColumn("internal_device_id", LongType(), minValue=0x1000000000000, 
+    .withColumn("internal_device_id", "long", minValue=0x1000000000000, 
                 uniqueValues=device_population, omit=True, baseColumnType="hash",
     )
     # note for format strings, we must use "%lx" not "%x" as the
     # underlying value is a long
     .withColumn(
-        "device_id", StringType(), format="0x%013x", baseColumn="internal_device_id"
+        "device_id", "string", format="0x%013x", baseColumn="internal_device_id"
     )
     # the device / user attributes will be the same for the same device id
     # so lets use the internal device id as the base column for these attribute
-    .withColumn("country", StringType(), values=country_codes, weights=country_weights, 
+    .withColumn("country", "string", values=country_codes, weights=country_weights, 
                 baseColumn="internal_device_id")
-    .withColumn("manufacturer", StringType(), values=manufacturers, 
+    .withColumn("manufacturer", "string", values=manufacturers, 
                 baseColumn="internal_device_id", )
     # use omit = True if you don't want a column to appear in the final output
     # but just want to use it as part of generation of another column
-    .withColumn("line", StringType(), values=lines, baseColumn="manufacturer", 
+    .withColumn("line", "string", values=lines, baseColumn="manufacturer", 
                 baseColumnType="hash", omit=True )
-    .withColumn("model_ser", IntegerType(), minValue=1, maxValue=11, baseColumn="device_id", 
+    .withColumn("model_ser", "integer", minValue=1, maxValue=11, baseColumn="device_id", 
                 baseColumnType="hash", omit=True, )
-    .withColumn("model_line", StringType(), expr="concat(line, '#', model_ser)", 
+    .withColumn("model_line", "string", expr="concat(line, '#', model_ser)", 
                 baseColumn=["line", "model_ser"] )
-    .withColumn("event_type", StringType(), 
+    .withColumn("event_type", "string", 
                 values=["activation", "deactivation", "plan change", "telecoms activity", 
                         "internet activity", "device error", ],
                 random=True)
@@ -379,6 +380,12 @@ of unique values.
 - The `withColumn` method call for the `line` column introduces a temporary column for purposes of 
 generating other columns, but through the use of the `omit` option, omits it from the final data set.
 
+> NOTE: Type strings can be used in place of instances of data type objects. Type strings use SQL data type syntax
+> and can be used to specify basic types, numeric types such as "decimal(10,3)" as well as complex structured types
+> such as "array<string>", "map<string, int>" and "struct<a:binary, b:int, c:float>".
+> 
+> Type strings are case-insensitive.
+
 ### Scaling it up
 
 When generating data, the number of rows to be generated is controlled by the `rows` parameter supplied to the