-
Notifications
You must be signed in to change notification settings - Fork 86
Closed
Labels
Milestone
Description
Want an option to generate a varying number of elements in array-valued columns.
Current Behavior
import dbldatagen as dg
from pyspark.sql.types import ArrayType, StringType
# Current workaround for variable-length array columns: generate a fixed-size
# helper array, then trim it with a SQL expression into the column that is kept.
# `spark` is not defined in this snippet; presumably the notebook's SparkSession.
dataspec = dg.DataGenerator(spark, rows=10 * 1000000)
dataspec = (dataspec
.withColumn("name", "string", percentNulls=0.01, template=r'\\w \\w|\\w A. \\w|test')
.withColumn("serial_number", "string", minValue=1000000, maxValue=10000000,
prefix="dr", random=True)
# Helper column: 5 templated values combined into one array (numColumns=5,
# structType="array"). omit=True is presumably what keeps it out of the final
# output -- confirm against the dbldatagen documentation.
.withColumn("email", "string", template=r'\\w.\\w@\\w.com', random=True, numColumns=5, structType="array",
omit=True)
# Derived column: slice the helper array starting at position 1. The length is
# (abs(hash(id)) % 4)+1, i.e. 1 to 4 elements, so the 5th generated value is never used.
# NOTE(review): `id` is not declared in this snippet; presumably the generator's built-in id column.
.withColumn("emails", ArrayType(StringType()), expr="slice(email, 1, (abs(hash(id)) % 4)+1)",
baseColumns=["email"])
.withColumn("license_plate", "string", template=r'\\n-\\n')
)
dfTestData = dataspec.build()
# `display` is not defined in this snippet; presumably the Databricks notebook helper.
display(dfTestData)
Future Behavior
import dbldatagen as dg
from pyspark.sql.types import ArrayType, StringType
# Proposed API: the same data spec as the current-behavior example, but the
# variable-length array is requested directly on the column definition, with
# no omitted helper column and no slice() expression.
# `spark` is not defined in this snippet; presumably the notebook's SparkSession.
dataspec = dg.DataGenerator(spark, rows=10 * 1000000)
dataspec = (dataspec
.withColumn("name", "string", percentNulls=0.01, template=r'\\w \\w|\\w A. \\w|test')
.withColumn("serial_number", "string", minValue=1000000, maxValue=10000000,
prefix="dr", random=True)
# Requested syntax: numColumns takes a tuple instead of a single count.
# NOTE(review): (1,5) presumably means "between 1 and 5 elements per row";
# whether the upper bound is inclusive is not stated in the request.
.withColumn("emails", "string", template=r'\\w.\\w@\\w.com', random=True, numColumns=(1,5), structType="array")
.withColumn("license_plate", "string", template=r'\\n-\\n')
)
dfTestData = dataspec.build()
# `display` is not defined in this snippet; presumably the Databricks notebook helper.
display(dfTestData)
Context
Your Environment
- dbldatagen version used:
- Databricks Runtime version:
- Cloud environment used: