1313from .spark_singleton import SparkSingleton
1414from .column_generation_spec import ColumnGenerationSpec
1515from .datagen_constants import DEFAULT_RANDOM_SEED , RANDOM_SEED_FIXED , RANDOM_SEED_HASH_FIELD_NAME , \
16- DEFAULT_SEED_COLUMN , SPARK_RANGE_COLUMN , MIN_SPARK_VERSION
16+ DEFAULT_SEED_COLUMN , SPARK_RANGE_COLUMN , MIN_SPARK_VERSION , \
17+ OPTION_RANDOM , OPTION_RANDOM_SEED , OPTION_RANDOM_SEED_METHOD
18+
1719from .utils import ensure , topologicalSort , DataGenError , deprecated , split_list_matching_condition
1820from . _version import _get_spark_version
1921from .schema_parser import SchemaParser
@@ -40,6 +42,7 @@ class DataGenerator:
4042 :param batchSize: = UDF batch number of rows to pass via Apache Arrow to Pandas UDFs
4143 :param debug: = if set to True, output debug level of information
4244 :param seedColumnName: = if set, this should be the name of the `seed` or logical `id` column. Defaults to `id`
45+ :param random: = if set, specifies the default value of the `random` attribute for all columns where it is not explicitly set
4346
4447 By default the seed column is named `id`. If you need to use this column name in your generated data,
4548 it is recommended that you use a different name for the seed column - for example `_id`.
@@ -63,6 +66,7 @@ class DataGenerator:
6366 def __init__ (self , sparkSession = None , name = None , randomSeedMethod = None ,
6467 rows = 1000000 , startingId = 0 , randomSeed = None , partitions = None , verbose = False ,
6568 batchSize = None , debug = False , seedColumnName = DEFAULT_SEED_COLUMN ,
69+ random = False ,
6670 ** kwargs ):
6771 """ Constructor for data generator object """
6872
@@ -119,6 +123,9 @@ def __init__(self, sparkSession=None, name=None, randomSeedMethod=None,
119123
120124 self ._seedMethod = randomSeedMethod
121125
126+ # set default random setting
127+ self ._defaultRandom = random if random is not None else False
128+
122129 if randomSeed is None :
123130 self ._instanceRandomSeed = self ._randomSeed
124131
@@ -297,6 +304,13 @@ def randomSeed(self):
297304 """ return the data generation spec random seed"""
298305 return self ._instanceRandomSeed
299306
307+ @property
308+ def random (self ):
309+ """ Return the generator-level default for the `random` column option, i.e. the value applied
310+ to a column when no explicit `random` attribute setting is supplied for it
311+ """
312+ return self ._defaultRandom
313+
300314 def _markForPlanRegen (self ):
301315 """Mark that build plan needs to be regenerated
302316
@@ -591,13 +605,19 @@ def withColumnSpecs(self, patterns=None, fields=None, matchTypes=None, **kwargs)
591605 :returns: modified in-place instance of test data generator allowing for chaining of calls following
592606 Builder pattern
593607
608+ .. note::
609+ matchTypes may also take SQL type strings or a list of SQL type strings such as "array<integer>"
610+
594611 You may also add a variety of options to further control the test data generation process.
595612 For full list of options, see :doc:`/reference/api/dbldatagen.column_spec_options`.
596613
597614 """
598615 if fields is not None and type (fields ) is str :
599616 fields = [fields ]
600617
618+ if OPTION_RANDOM not in kwargs :
619+ kwargs [OPTION_RANDOM ] = self ._defaultRandom
620+
601621 # add support for deprecated legacy names
602622 if "match_types" in kwargs :
603623 assert matchTypes is None , "Argument 'match_types' is deprecated, use 'matchTypes' instead"
@@ -620,7 +640,15 @@ def withColumnSpecs(self, patterns=None, fields=None, matchTypes=None, **kwargs)
620640 effective_fields = [x for x in effective_fields for y in patterns if re .search (y , x ) is not None ]
621641
622642 if matchTypes is not None :
623- effective_fields = [x for x in effective_fields for y in matchTypes
643+ effective_types = []
644+
645+ for typ in matchTypes :
646+ if isinstance (typ , str ):
647+ effective_types .append (SchemaParser .columnTypeFromString (typ ))
648+ else :
649+ effective_types .append (typ )
650+
651+ effective_fields = [x for x in effective_fields for y in effective_types
624652 if self .getColumnType (x ) == y ]
625653
626654 for f in effective_fields :
@@ -648,7 +676,7 @@ def _checkColumnOrColumnList(self, columns, allowId=False):
648676 return True
649677
650678 def withColumnSpec (self , colName , minValue = None , maxValue = None , step = 1 , prefix = None ,
651- random = False , distribution = None ,
679+ random = None , distribution = None ,
652680 implicit = False , dataRange = None , omit = False , baseColumn = None , ** kwargs ):
653681 """ add a column specification for an existing column
654682
@@ -670,6 +698,9 @@ def withColumnSpec(self, colName, minValue=None, maxValue=None, step=1, prefix=N
670698 Datatype parameter is only needed for `withColumn` and not permitted for `withColumnSpec`
671699 """ )
672700
701+ if random is None :
702+ random = self ._defaultRandom
703+
673704 # handle migration of old `min` and `max` options
674705 if _OLD_MIN_OPTION in kwargs :
675706 assert minValue is None , \
@@ -705,7 +736,7 @@ def hasColumnSpec(self, colName):
705736 return colName in self ._columnSpecsByName
706737
707738 def withColumn (self , colName , colType = StringType (), minValue = None , maxValue = None , step = 1 ,
708- dataRange = None , prefix = None , random = False , distribution = None ,
739+ dataRange = None , prefix = None , random = None , distribution = None ,
709740 baseColumn = None , nullable = True ,
710741 omit = False , implicit = False , noWarn = False ,
711742 ** kwargs ):
@@ -756,6 +787,9 @@ def withColumn(self, colName, colType=StringType(), minValue=None, maxValue=None
756787 maxValue = kwargs [_OLD_MAX_OPTION ]
757788 kwargs .pop (_OLD_MAX_OPTION , None )
758789
790+ if random is None :
791+ random = self ._defaultRandom
792+
759793 new_props = {}
760794 new_props .update (kwargs )
761795
@@ -792,25 +826,25 @@ def _generateColumnDefinition(self, colName, colType=None, baseColumn=None,
792826 # if the column has the option `random` set to true
793827 # then use the instance level random seed
794828 # otherwise use the default random seed for the class
795- if "randomSeed" in new_props :
796- effective_random_seed = new_props ["randomSeed" ]
797- new_props .pop ("randomSeed" )
798- new_props ["random" ] = True
829+ if OPTION_RANDOM_SEED in new_props :
830+ effective_random_seed = new_props [OPTION_RANDOM_SEED ]
831+ new_props .pop (OPTION_RANDOM_SEED )
832+ new_props [OPTION_RANDOM ] = True
799833
800834 # if random seed has override but randomSeedMethod does not
801835 # set it to fixed
802- if "randomSeedMethod" not in new_props :
803- new_props ["randomSeedMethod" ] = RANDOM_SEED_FIXED
836+ if OPTION_RANDOM_SEED_METHOD not in new_props :
837+ new_props [OPTION_RANDOM_SEED_METHOD ] = RANDOM_SEED_FIXED
804838
805- elif "random" in new_props and new_props ["random" ]:
839+ elif OPTION_RANDOM in new_props and new_props [OPTION_RANDOM ]:
806840 effective_random_seed = self ._instanceRandomSeed
807841 else :
808842 effective_random_seed = self ._randomSeed
809843
810844 # handle column level override
811- if "randomSeedMethod" in new_props :
812- effective_random_seed_method = new_props ["randomSeedMethod" ]
813- new_props .pop ("randomSeedMethod" )
845+ if OPTION_RANDOM_SEED_METHOD in new_props :
846+ effective_random_seed_method = new_props [OPTION_RANDOM_SEED_METHOD ]
847+ new_props .pop (OPTION_RANDOM_SEED_METHOD )
814848 else :
815849 effective_random_seed_method = self ._seedMethod
816850
0 commit comments