@@ -426,8 +426,8 @@ dfTestData.write.format("delta").mode("overwrite").save(
 ## Using SQL in data generation
 Any column specification can use arbitrary SQL expressions during data generation via the `expr` parameter.
 
-The following example shows generation of synthetic names, email addresses, payment instruments and
-use of a SQL expression to compute MD5 hashes of synthetic credit card numbers:
+The following example shows generation of synthetic names and email addresses, and the
+use of a SQL expression to compute MD5 hashes of hypothetical synthetic credit card numbers:
 
 ```python
 import dbldatagen as dg
@@ -440,8 +440,14 @@ spark.conf.set("spark.sql.shuffle.partitions", shuffle_partitions_requested)
 
 dataspec = (dg.DataGenerator(spark, rows=data_rows, partitions=8, randomSeedMethod="hash_fieldname")
             .withColumn("name", percentNulls=0.01, template=r'\\w \\w|\\w a. \\w')
-            .withColumn("payment_instrument_type", values=['paypal', 'visa', 'mastercard', 'amex'], random=True)
-            .withColumn("payment_instrument", minValue=1000000, maxValue=10000000, template="dddd dddddd ddddd")
+            .withColumn("payment_instrument_type", values=['paypal', 'visa', 'mastercard', 'amex'],
+                        random=True)
+            .withColumn("int_payment_instrument", "int", minValue=0000, maxValue=9999,
+                        baseColumn="name",
+                        baseColumnType="hash", omit=True)
+            .withColumn("payment_instrument",
+                        expr="format_number(int_payment_instrument, '**** ****** *####')",
+                        baseColumn="int_payment_instrument")
             .withColumn("email", template=r'\\w.\\w@\\w.com')
             .withColumn("md5_payment_instrument",
                         expr="md5(concat(payment_instrument_type, ':', payment_instrument))",
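The `expr` parameter accepts any Spark SQL expression and may reference previously defined columns, as the `format_number(...)` and `md5(concat(...))` expressions above do. As a rough illustration (not part of this change), the sketch below evaluates the same MD5 expression with plain PySpark on a hand-built row; the sample values and standalone SparkSession are assumptions for demonstration only:

```python
# Illustrative sketch only: evaluates the same SQL expression used for the
# `md5_payment_instrument` column spec above, outside of dbldatagen.
from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.getOrCreate()

# Hypothetical sample row; the values are made up for demonstration.
df = spark.createDataFrame(
    [("visa", "0012 345678 90123")],
    ["payment_instrument_type", "payment_instrument"],
)

df.select(
    F.expr("md5(concat(payment_instrument_type, ':', payment_instrument))")
     .alias("md5_payment_instrument")
).show(truncate=False)
```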
@@ -483,12 +489,17 @@ data_rows = 10000000
 spark.conf.set("spark.sql.shuffle.partitions", shuffle_partitions_requested)
 
 dataspec = (
-    dg.DataGenerator(spark, rows=data_rows, partitions=8, randomSeedMethod="hash_fieldname", randomSeed=42)
+    dg.DataGenerator(spark, rows=data_rows, partitions=8, randomSeedMethod="hash_fieldname",
+                     randomSeed=42)
     .withColumn("name", percentNulls=0.01, template=r'\\w \\w|\\w a. \\w')
     .withColumn("payment_instrument_type", values=['paypal', 'visa', 'mastercard', 'amex'],
                 random=True)
-    .withColumn("payment_instrument", minValue=1000000, maxValue=10000000,
-                template="dddd dddddd ddddd")
+    .withColumn("int_payment_instrument", "int", minValue=0000, maxValue=9999,
+                baseColumn="name",
+                baseColumnType="hash", omit=True)
+    .withColumn("payment_instrument",
+                expr="format_number(int_payment_instrument, '**** ****** *####')",
+                baseColumn="int_payment_instrument")
     .withColumn("email", template=r'\\w.\\w@\\w.com')
     .withColumn("md5_payment_instrument",
                 expr="md5(concat(payment_instrument_type, ':', payment_instrument))",
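For completeness, a minimal sketch (not shown in this diff) of how a dataspec like the ones above is typically materialized. It assumes `dataspec` is the generator defined in the documented example and uses dbldatagen's `build()` method, which returns a Spark DataFrame:

```python
# Minimal sketch: materialize the generated data and inspect a few rows.
# Assumes `dataspec` is the DataGenerator defined in the example above.
df = dataspec.build()

df.select("name", "payment_instrument_type", "payment_instrument",
          "md5_payment_instrument").show(5, truncate=False)

# With randomSeed=42 and randomSeedMethod="hash_fieldname" set on the spec,
# re-running build() should produce the same data on each run.
```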