Skip to content

Commit 10aa0c9

Browse files
committed
Fix factor generation: use sampling for people2Hops and people4Hops
1 parent 07a3d18 commit 10aa0c9

File tree

1 file changed

+20
-16
lines changed

1 file changed

+20
-16
lines changed

src/main/scala/ldbc/snb/datagen/factors/FactorGenerationStage.scala

Lines changed: 20 additions & 16 deletions
Original file line number | Diff line number | Diff line change
@@ -385,12 +385,13 @@ object FactorGenerationStage extends DatagenStage with Logging {
385385
.select($"knows.Person1Id".alias("Person1Id"), $"knows.Person2Id".alias("Person2Id"))
386386
}
387387

388-
nHops(
389-
allKnows,
390-
n = 4,
391-
joinKeys = ("Person2Id", "Person1Id"),
392-
sample = Some(chinesePeopleSample)
393-
).join(person.as("Person1"), $"Person1.id" === $"Person1Id")
388+
val personPairs = nHops(
389+
allKnows,
390+
n = 4,
391+
joinKeys = ("Person2Id", "Person1Id"),
392+
sample = Some(chinesePeopleSample)
393+
)
394+
.join(person.as("Person1"), $"Person1.id" === $"Person1Id")
394395
.join(person.as("Person2"), $"Person2.id" === $"Person1Id")
395396
.select(
396397
$"Person1Id",
@@ -400,8 +401,9 @@ object FactorGenerationStage extends DatagenStage with Logging {
400401
$"Person2.creationDate".as("Person2CreationDate"),
401402
$"Person2.deletionDate".as("Person2DeletionDate")
402403
)
403-
.sort($"Person1Id", $"Person2Id")
404-
.limit(10000)
404+
405+
val sampleFractionPersonPairs = 10000.0 / personPairs.count()
406+
personPairs.sample(sampleFractionPersonPairs, 42)
405407
},
406408
"people2Hops" -> Factor(PersonType, PlaceType, PersonKnowsPersonType) { case Seq(person, place, knows) =>
407409
val cities = place.where($"type" === "City").cache()
@@ -431,12 +433,13 @@ object FactorGenerationStage extends DatagenStage with Logging {
431433
.select($"knows.Person1Id".alias("Person1Id"), $"knows.Person2Id".alias("Person2Id"))
432434
}
433435

434-
nHops(
435-
allKnows,
436-
n = 2,
437-
joinKeys = ("Person2Id", "Person1Id"),
438-
sample = Some(chinesePeopleSample)
439-
).join(person.as("Person1"), $"Person1.id" === $"Person1Id")
436+
val personPairs = nHops(
437+
allKnows,
438+
n = 2,
439+
joinKeys = ("Person2Id", "Person1Id"),
440+
sample = Some(chinesePeopleSample)
441+
)
442+
.join(person.as("Person1"), $"Person1.id" === $"Person1Id")
440443
.join(person.as("Person2"), $"Person2.id" === $"Person1Id")
441444
.select(
442445
$"Person1Id",
@@ -446,8 +449,9 @@ object FactorGenerationStage extends DatagenStage with Logging {
446449
$"Person2.creationDate".as("Person2CreationDate"),
447450
$"Person2.deletionDate".as("Person2DeletionDate")
448451
)
449-
.sort($"Person1Id", $"Person2Id")
450-
.limit(10000)
452+
453+
val sampleFractionPersonPairs = 10000.0 / personPairs.count()
454+
personPairs.sample(sampleFractionPersonPairs, 42)
451455
},
452456
"sameUniversityKnows" -> LargeFactor(PersonKnowsPersonType, PersonStudyAtUniversityType) { case Seq(personKnowsPerson, studyAt) =>
453457
val size = Math.max(Math.ceil(personKnowsPerson.rdd.getNumPartitions / 10).toInt, 1)

0 commit comments

Comments
 (0)