Skip to content

Commit 0f89fd8

Browse files
authored
initial code refactoring (#74)
Changes include: * Use f-strings instead of `.format`, which is harder to read * Use `key in dict` instead of `key in dict.keys()` * Fix the warning in tests about the deprecated Spark property for Arrow
1 parent 10a5519 commit 0f89fd8

16 files changed

+108
-127
lines changed

dbldatagen/column_generation_spec.py

Lines changed: 36 additions & 43 deletions
Original file line number | Diff line number | Diff line change
@@ -103,7 +103,7 @@ def __init__(self, name, colType=None, minValue=0, maxValue=None, step=1, prefix
103103
if colType is None: # default to integer field if none specified
104104
colType = IntegerType()
105105

106-
assert isinstance(colType, DataType), "colType `{}` is not instance of DataType".format(colType)
106+
assert isinstance(colType, DataType), f"colType `{colType}` is not instance of DataType"
107107

108108
self._initialBuildPlan = [] # the build plan for the column - descriptive only
109109
self.executionHistory = [] # the execution history for the column
@@ -375,19 +375,19 @@ def _setupTemporaryColumns(self):
375375
ensure(self['numColumns'] is None or self['numColumns'] <= 1,
376376
"weighted columns not supported for multi-column or multi-feature values")
377377
if self.random:
378-
temp_name = "_rnd_{}".format(self.name)
378+
temp_name = f"_rnd_{self.name}"
379379
self.dependencies.append(temp_name)
380-
desc = "adding temporary column {} required by {}".format(temp_name, self.name)
380+
desc = f"adding temporary column {temp_name} required by {self.name}"
381381
self._initialBuildPlan.append(desc)
382382
sql_random_generator = self._getUniformRandomSQLExpression(self.name)
383383
self.temporaryColumns.append((temp_name, DoubleType(), {'expr': sql_random_generator, 'omit': True,
384384
'description': desc}))
385385
self._weightedBaseColumn = temp_name
386386
else:
387387
# create temporary expression mapping values to range of weights
388-
temp_name = "_scaled_{}".format(self.name)
388+
temp_name = f"_scaled_{self.name}"
389389
self.dependencies.append(temp_name)
390-
desc = "adding temporary column {} required by {}".format(temp_name, self.name)
390+
desc = f"adding temporary column {temp_name} required by {self.name}"
391391
self._initialBuildPlan.append(desc)
392392

393393
# use a base expression based on mapping base column to size of data
@@ -511,10 +511,10 @@ def _getUniformRandomExpression(self, col_name):
511511
"""
512512
assert col_name is not None, "`col_name` must not be None"
513513
if self._randomSeedMethod == RANDOM_SEED_FIXED and self._randomSeed != RANDOM_SEED_RANDOM:
514-
return expr("rand({})".format(self._randomSeed))
514+
return expr(f"rand({self._randomSeed})")
515515
elif self._randomSeedMethod == RANDOM_SEED_HASH_FIELD_NAME:
516516
assert self.name is not None, " `self.name` must not be none"
517-
return expr("rand(hash('{}'))".format(self.name))
517+
return expr(f"rand(hash('{self.name}'))")
518518
else:
519519
return rand()
520520

@@ -530,8 +530,7 @@ def _getRandomExpressionForDistribution(self, col_name, col_distribution):
530530
assert isinstance(col_distribution, DataDistribution), \
531531
"`distribution` object must be an instance of data distribution"
532532

533-
self.executionHistory.append(".. random number generation via distribution `{}`"
534-
.format(str(col_distribution)))
533+
self.executionHistory.append(f".. random number generation via distribution `{col_distribution}`")
535534

536535
return col_distribution.generateNormalizedDistributionSample()
537536

@@ -543,10 +542,10 @@ def _getUniformRandomSQLExpression(self, col_name):
543542
assert col_name is not None, " `col_name` must not be None"
544543
if self._randomSeedMethod == RANDOM_SEED_FIXED and self._randomSeed != RANDOM_SEED_RANDOM:
545544
assert self._randomSeed is not None, "`randomSeed` must not be None"
546-
return "rand({})".format(self._randomSeed)
545+
return f"rand({self._randomSeed})"
547546
elif self._randomSeedMethod == RANDOM_SEED_HASH_FIELD_NAME:
548547
assert self.name is not None, "`self.name` must not be none"
549-
return "rand(hash('{}'))".format(self.name)
548+
return f"rand(hash('{self.name}'))"
550549
else:
551550
return "rand()"
552551

@@ -597,7 +596,7 @@ def _getScaledIntSQLExpression(self, col_name, scale, base_columns, base_datatyp
597596
result = f"cast( ( floor(({column_set} % {scale}) + {scale}) % {scale}) as double) "
598597

599598
if normalize:
600-
result = "({} / {})".format(result, (scale * 1.0) - 1.0)
599+
result = f"({result} / {(scale * 1.0) - 1.0})"
601600

602601
self.logger.debug("computing scaled field [%s] as expression [%s]", col_name, result)
603602
return result
@@ -613,7 +612,7 @@ def getNames(self):
613612
struct_type = self._csOptions.getOrElse('structType', None)
614613

615614
if num_columns > 1 and struct_type is None:
616-
return ["{0}_{1}".format(self.name, x) for x in range(0, num_columns)]
615+
return [f"{self.name}_{x}" for x in range(0, num_columns)]
617616
else:
618617
return [self.name]
619618

@@ -623,7 +622,7 @@ def getNamesAndTypes(self):
623622
struct_type = self._csOptions.getOrElse('structType', None)
624623

625624
if num_columns > 1 and struct_type is None:
626-
return [("{0}_{1}".format(self.name, x), self.datatype) for x in range(0, num_columns)]
625+
return [(f"{self.name}_{x}", self.datatype) for x in range(0, num_columns)]
627626
else:
628627
return [(self.name, self.datatype)]
629628

@@ -786,26 +785,22 @@ def _checkProps(self, column_props):
786785
raise ValueError("Effective range greater than range of type")
787786

788787
for k in column_props.keys():
789-
ensure(k in ColumnSpecOptions._ALLOWED_PROPERTIES, 'invalid column option {0}'.format(k))
788+
ensure(k in ColumnSpecOptions._ALLOWED_PROPERTIES, f'invalid column option {k}')
790789

791790
for arg in ColumnSpecOptions._REQUIRED_PROPERTIES:
792-
ensure(arg in column_props.keys() and column_props[arg] is not None,
793-
'missing column option {0}'.format(arg))
791+
ensure(column_props.get(arg) is not None, f'missing column option {arg}')
794792

795793
for arg in ColumnSpecOptions._FORBIDDEN_PROPERTIES:
796-
ensure(arg not in column_props.keys(),
797-
'forbidden column option {0}'.format(arg))
794+
ensure(arg not in column_props, f'forbidden column option {arg}')
798795

799796
# check weights and values
800-
if 'weights' in column_props.keys():
801-
ensure('values' in column_props.keys(),
802-
"weights are only allowed for columns with values - column '{}' ".format(column_props['name']))
797+
if 'weights' in column_props:
798+
ensure('values' in column_props,
799+
f"weights are only allowed for columns with values - column '{column_props['name']}' ")
803800
ensure(column_props['values'] is not None and len(column_props['values']) > 0,
804-
"weights must be associated with non-empty list of values - column '{}' ".format(
805-
column_props['name']))
801+
f"weights must be associated with non-empty list of values - column '{column_props['name']}' ")
806802
ensure(len(column_props['values']) == len(column_props['weights']),
807-
"length of list of weights must be equal to length of list of values - column '{}' ".format(
808-
column_props['name']))
803+
f"length of list of weights must be equal to length of list of values - column '{column_props['name']}' ")
809804

810805
def getPlanEntry(self):
811806
""" Get execution plan entry for object
@@ -816,7 +811,7 @@ def getPlanEntry(self):
816811
if desc is not None:
817812
return " |-- " + desc
818813
else:
819-
return " |-- building column generator for column {}".format(self.name)
814+
return f" |-- building column generator for column {self.name}"
820815

821816
def _makeWeightedColumnValuesExpression(self, values, weights, seed_column_name):
822817
"""make SQL expression to compute the weighted values expression
@@ -872,17 +867,17 @@ def _getSeedExpression(self, base_column):
872867
assert len(base_column) > 0, "`baseColumn` must be list of column names"
873868
if len(base_column) == 1:
874869
if self._baseColumnComputeMethod == HASH_COMPUTE_METHOD:
875-
return expr("hash({})".format(base_column[0]))
870+
return expr(f"hash({base_column[0]})")
876871
else:
877872
return col(base_column[0])
878873
elif self._baseColumnComputeMethod == VALUES_COMPUTE_METHOD:
879-
base_values = ["string(ifnull(`{}`, 'null'))".format(x) for x in base_column]
880-
return expr("array({})".format(",".join(base_values)))
874+
base_values = [f"string(ifnull(`{x}`, 'null'))" for x in base_column]
875+
return expr(f"array({','.join(base_values)})")
881876
else:
882-
return expr("hash({})".format(",".join(base_column)))
877+
return expr(f"hash({','.join(base_column)})")
883878
else:
884879
if self._baseColumnComputeMethod == HASH_COMPUTE_METHOD:
885-
return expr("hash({})".format(base_column))
880+
return expr(f"hash({base_column})")
886881
else:
887882
return col(base_column)
888883

@@ -1002,7 +997,7 @@ def _makeSingleGenerationExpression(self, index=None, use_pandas_optimizations=F
1002997
self.executionHistory.append(f".. using SQL expression `{self.expr}` as base")
1003998
self.executionHistory.append(f".. casting to `{self.datatype}`")
1004999
elif self._dataRange is not None and self._dataRange.isFullyPopulated():
1005-
self.executionHistory.append(".. computing ranged value: {}".format(self._dataRange))
1000+
self.executionHistory.append(f".. computing ranged value: {self._dataRange}")
10061001
new_def = self._computeRangedColumn(base_column=self.baseColumn, datarange=self._dataRange,
10071002
is_random=col_is_rand)
10081003
elif type(self.datatype) is DateType:
@@ -1011,7 +1006,7 @@ def _makeSingleGenerationExpression(self, index=None, use_pandas_optimizations=F
10111006
# record execution history
10121007
self.executionHistory.append(".. using random date expression")
10131008
sql_random_generator = self._getUniformRandomSQLExpression(self.name)
1014-
new_def = expr("date_sub(current_date, rounding({}*1024))".format(sql_random_generator)).astype(
1009+
new_def = expr(f"date_sub(current_date, rounding({sql_random_generator}*1024))").astype(
10151010
self.datatype)
10161011
else:
10171012
if self._baseColumnComputeMethod == VALUES_COMPUTE_METHOD:
@@ -1051,7 +1046,7 @@ def _applyTextFormatExpression(self, new_def, sformat):
10511046
# note :
10521047
# while it seems like this could use a shared instance, this does not work if initialized
10531048
# in a class method
1054-
self.executionHistory.append(".. applying column format `{}`".format(sformat))
1049+
self.executionHistory.append(f".. applying column format `{sformat}`")
10551050
new_def = format_string(sformat, new_def)
10561051
return new_def
10571052

@@ -1083,13 +1078,11 @@ def _applyTextGenerationExpression(self, new_def, use_pandas_optimizations):
10831078
# in a class method
10841079
tg = self.textGenerator
10851080
if use_pandas_optimizations:
1086-
self.executionHistory.append(".. text generation via pandas scalar udf `{}`"
1087-
.format(str(tg)))
1081+
self.executionHistory.append(f".. text generation via pandas scalar udf `{tg}`")
10881082
u_value_from_generator = pandas_udf(tg.pandasGenerateText,
10891083
returnType=StringType()).asNondeterministic()
10901084
else:
1091-
self.executionHistory.append(".. text generation via udf `{}`"
1092-
.format(str(tg)))
1085+
self.executionHistory.append(f".. text generation via udf `{tg}`")
10931086
u_value_from_generator = udf(tg.classicGenerateText,
10941087
StringType()).asNondeterministic()
10951088
new_def = u_value_from_generator(new_def)
@@ -1102,7 +1095,7 @@ def _applyFinalCastExpression(self, col_type, new_def):
11021095
:param new_def: column definition being created
11031096
:returns: new column definition
11041097
"""
1105-
self.executionHistory.append(".. casting column [{}] to `{}`".format(self.name, col_type))
1098+
self.executionHistory.append(f".. casting column [{self.name}] to `{col_type}`")
11061099

11071100
# cast the result to the appropriate type. For dates, cast first to timestamp, then to date
11081101
if type(col_type) is DateType:
@@ -1119,7 +1112,7 @@ def _applyComputePercentNullsExpression(self, newDef, probabilityNulls):
11191112
:param probabilityNulls: Probability of nulls to be generated for particular column. Values can be 0.0 - 1.0
11201113
:returns: new column definition with probability of nulls applied
11211114
"""
1122-
assert self.nullable, "Column `{}` must be nullable for `percent_nulls` option".format(self.name)
1115+
assert self.nullable, f"Column `{self.name}` must be nullable for `percent_nulls` option"
11231116
self.executionHistory.append(".. applying null generator - `when rnd > prob then value - else null`")
11241117

11251118
assert probabilityNulls is not None, "option 'percent_nulls' must not be null value or None"
@@ -1140,7 +1133,7 @@ def _computeImpliedRangeIfNeeded(self, col_type):
11401133
self._dataRange = NRange(0, len(self.values) - 1, 1)
11411134
elif type(col_type) is BooleanType:
11421135
self._dataRange = NRange(0, 1, 1)
1143-
self.executionHistory.append(".. using adjusted effective range: {}".format(self._dataRange))
1136+
self.executionHistory.append(f".. using adjusted effective range: {self._dataRange}")
11441137

11451138
def makeGenerationExpressions(self):
11461139
""" Generate structured column if multiple columns or features are specified
@@ -1171,7 +1164,7 @@ def makeGenerationExpressions(self):
11711164
exec_step_history += f"`{self.baseColumn}`, method: `{self._baseColumnComputeMethod}`"
11721165
self.executionHistory.append(exec_step_history)
11731166
else:
1174-
self.executionHistory.append("generating multiple columns {0} - `{1}`".format(num_columns, self['name']))
1167+
self.executionHistory.append(f"generating multiple columns {num_columns} - `{self['name']}`")
11751168
retval = [self._makeSingleGenerationExpression(x) for x in range(num_columns)]
11761169

11771170
if struct_type == 'array':

dbldatagen/column_spec_options.py

Lines changed: 10 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -204,10 +204,10 @@ def checkBoolOption(self, v, name=None, optional=True):
204204
assert name is not None, "`name` must be specified"
205205
if optional:
206206
ensure(v is None or type(v) is bool,
207-
"Option `{}` must be boolean if specified - value: {}, type: {}".format(name, v, type(v)))
207+
f"Option `{name}` must be boolean if specified - value: {v}, type: {type(v)}")
208208
else:
209209
ensure(type(v) is bool,
210-
"Option `{}` must be boolean - value: {}, type: {}".format(name, v, type(v)))
210+
f"Option `{name}` must be boolean - value: {v}, type: {type(v)}")
211211

212212
def checkExclusiveOptions(self, options):
213213
"""check if the options are exclusive - i.e only one is not None
@@ -227,8 +227,7 @@ def checkOptionValues(self, option, option_values):
227227
"""
228228
assert option is not None and len(option.strip()) > 0, "option must be non empty"
229229
assert type(option_values) is list, "`option_values` must be list"
230-
assert self[option] in option_values, "option: `{}` must have one of the values {}".format(option,
231-
option_values)
230+
assert self[option] in option_values, f"option: `{option}` must have one of the values {option_values}"
232231

233232
def checkValidColumnProperties(self, columnProps):
234233
"""
@@ -254,20 +253,16 @@ def checkValidColumnProperties(self, columnProps):
254253
f"invalid column option {k}")
255254

256255
for arg in self._REQUIRED_PROPERTIES:
257-
ensure(arg in columnProps.keys() and columnProps[arg] is not None,
258-
f"missing column option {arg}")
256+
ensure(columnProps.get(arg) is not None, f"missing column option {arg}")
259257

260258
for arg in self._FORBIDDEN_PROPERTIES:
261-
ensure(arg not in columnProps.keys(),
262-
f"forbidden column option {arg}")
259+
ensure(arg not in columnProps, f"forbidden column option {arg}")
263260

264261
# check weights and values
265-
if 'weights' in columnProps.keys():
266-
ensure('values' in columnProps.keys(),
267-
"weights are only allowed for columns with values - column '{0}' ".format(columnProps['name']))
262+
if 'weights' in columnProps:
263+
ensure('values' in columnProps,
264+
f"weights are only allowed for columns with values - column '{columnProps['name']}' ")
268265
ensure(columnProps['values'] is not None and len(columnProps['values']) > 0,
269-
"weights must be associated with non-empty list of values - column '{0}' ".format(
270-
columnProps['name']))
266+
f"weights must be associated with non-empty list of values - column '{columnProps['name']}' ")
271267
ensure(len(columnProps['values']) == len(columnProps['weights']),
272-
"length of list of weights must be equal to length of list of values - column '{0}' ".format(
273-
columnProps['name']))
268+
f"length of list of weights must be equal to length of list of values - column '{columnProps['name']}' ")

dbldatagen/data_analyzer.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -57,7 +57,7 @@ def _lookupFieldType(self, typ):
5757
def _summarizeField(self, field):
5858
"""Generate summary for individual field"""
5959
if isinstance(field, StructField):
60-
return "{} {}".format(field.name, self._lookupFieldType(str(field.dataType)))
60+
return f"{field.name} {self._lookupFieldType(str(field.dataType))}"
6161
else:
6262
return str(field)
6363

@@ -86,7 +86,7 @@ def _displayRow(self, row):
8686
results = []
8787
row_key_pairs = row.asDict()
8888
for x in row_key_pairs:
89-
results.append("{}: {}".format(str(x), str(row[x])))
89+
results.append(f"{x}: {row[x]}")
9090

9191
return ", ".join(results)
9292

0 commit comments

Comments
 (0)