Skip to content

Commit 621ef47

Browse files
authored
Merge pull request #416 from ldbc/factor-format
Add factor table format option to executables
2 parents 1551688 + 3c95382 commit 621ef47

File tree

3 files changed

+19
-5
lines changed

3 files changed

+19
-5
lines changed

src/main/scala/ldbc/snb/datagen/LdbcDatagen.scala

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -26,10 +26,11 @@ object LdbcDatagen extends SparkApp {
2626
batchPeriod: String = "day",
2727
numThreads: Option[Int] = None,
2828
format: String = "csv",
29-
generateFactors: Boolean = false,
3029
formatOptions: Map[String, String] = Map.empty,
3130
oversizeFactor: Option[Double] = None,
32-
epochMillis: Boolean = false
31+
epochMillis: Boolean = false,
32+
generateFactors: Boolean = false,
33+
factorFormat: String = "parquet"
3334
)
3435

3536
override type ArgsType = Args
@@ -118,6 +119,10 @@ object LdbcDatagen extends SparkApp {
118119
.action((x, c) => args.generateFactors.set(c)(true))
119120
.text("Generate factor tables")
120121

122+
opt[String]("factor-format")
123+
.action((x, c) => args.factorFormat.set(c)(x))
124+
.text("Output format of factor tables")
125+
121126
help('h', "help").text("prints this usage text")
122127

123128
opt[Unit]("epoch-millis")
@@ -146,7 +151,11 @@ object LdbcDatagen extends SparkApp {
146151
GenerationStage.run(generatorArgs)
147152

148153
if (args.generateFactors) {
149-
val factorArgs = FactorGenerationStage.Args(outputDir = args.outputDir, irFormat = irFormat)
154+
val factorArgs = FactorGenerationStage.Args(
155+
outputDir = args.outputDir,
156+
irFormat = irFormat,
157+
format = args.factorFormat
158+
)
150159
FactorGenerationStage.run(factorArgs)
151160
}
152161

src/main/scala/ldbc/snb/datagen/factors/FactorGenerationStage.scala

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ object FactorGenerationStage extends DatagenStage with Logging {
3131
case class Args(
3232
outputDir: String = "out",
3333
irFormat: String = "parquet",
34+
format: String = "parquet",
3435
only: Option[Regex] = None,
3536
force: Boolean = false
3637
)
@@ -54,6 +55,10 @@ object FactorGenerationStage extends DatagenStage with Logging {
5455
.action((x, c) => args.irFormat.set(c)(x))
5556
.text("Format of the raw input")
5657

58+
opt[String]("format")
59+
.action((x, c) => args.format.set(c)(x))
60+
.text("Output format")
61+
5762
opt[String]("only")
5863
.action((x, c) => args.only.set(c)(Some(x.r.anchored)))
5964
.text("Only generate factor tables whose name matches the supplied regex")
@@ -87,7 +92,7 @@ object FactorGenerationStage extends DatagenStage with Logging {
8792
FactorTable(name, calc(resolvedEntities), g)
8893
}
8994
)
90-
.foreach(_.write(FactorTableSink(args.outputDir, overwrite = args.force)))
95+
.foreach(_.write(FactorTableSink(args.outputDir, format = args.format, overwrite = args.force)))
9196
}
9297

9398
private def frequency(df: DataFrame, value: Column, by: Seq[Column], agg: Column => Column = count) =

src/main/scala/ldbc/snb/datagen/factors/io/package.scala

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ import ldbc.snb.datagen.util.Logging
88
import org.apache.spark.sql.SaveMode
99

1010
package object io {
11-
case class FactorTableSink(path: String, format: String = "csv", overwrite: Boolean = false)
11+
case class FactorTableSink(path: String, format: String = "parquet", overwrite: Boolean = false)
1212

1313
import ldbc.snb.datagen.io.Writer.ops._
1414
import ldbc.snb.datagen.io.dataframes.instances._

0 commit comments

Comments
 (0)