Skip to content

Commit ac62f92

Browse files
committed
rank by credset count and then by nsamples to create score
1 parent 9c9b2a4 commit ac62f92

File tree

1 file changed

+6
-32
lines changed

1 file changed

+6
-32
lines changed

src/main/scala/io/opentargets/etl/backend/Search.scala

Lines changed: 6 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
package io.opentargets.etl.backend
22

33
import com.typesafe.scalalogging.LazyLogging
4-
import io.opentargets.etl.backend.spark.Helpers.{columnExpr, flattenCat, nest}
4+
import io.opentargets.etl.backend.spark.Helpers.{columnExpr, flattenCat, nest, safeArrayUnion}
55
import io.opentargets.etl.backend.spark.IoHelpers.IOResources
66
import io.opentargets.etl.backend.spark.{IOResource, IoHelpers, Helpers => C}
77
import org.apache.spark.sql._
@@ -578,39 +578,13 @@ object Transformers {
578578
.withColumnRenamed("geneId", "targetId")
579579
.join(targets, Seq("targetId"), "left_outer")
580580

581+
val window = Window.orderBy(col("credibleSetCount").desc, col("nSamples").desc)
581582
val studiesWithTargetsAndCredSets = studiesWithTargets
582583
.join(credibleSets, Seq("studyId"), "left_outer")
583-
val credibleSetCountMax = credibleSets
584-
.select(max(col("credibleSetCount")))
585-
.first()
586-
.getDouble(0)
587-
val credibleSetCountMin = credibleSets
588-
.select(min(col("credibleSetCount")))
589-
.first()
590-
.getDouble(0)
591-
val credibleSetMultiplier = when(
592-
col("credibleSetCount").isNotNull,
593-
(col(
594-
"credibleSetCount"
595-
) - credibleSetCountMin) / (credibleSetCountMax - (credibleSetCountMin))
596-
).otherwise(0.01d)
597-
val nSamplesMax = studiesWithTargetsAndCredSets
598-
.select(max(col("nSamples").cast(DoubleType)))
599-
.first()
600-
.getDouble(0)
601-
val nSamplesMin = studiesWithTargetsAndCredSets
602-
.select(min(col("nSamples").cast(DoubleType)))
603-
.first()
604-
.getDouble(0)
605-
val nSamplesMultiplier = when(
606-
col("nSamples").isNotNull,
607-
(col("nSamples") - nSamplesMin) / (nSamplesMax - nSamplesMin)
608-
).otherwise(0.01d)
609-
// multiplier is a function of credible set count and nSamples giving double the weight to credible set counts
610-
val multiplier =
611-
lit(1d) + (
612-
((lit(2d) * credibleSetMultiplier) + nSamplesMultiplier) / lit(3d)
613-
)
584+
.withColumn("rank", rank().over(window))
585+
586+
val max = studiesWithTargetsAndCredSets.agg(functions.max("rank")).first().getInt(0)
587+
// Map rank linearly onto [1, 2]: rank 1 -> 2.0, rank == max -> 1.0.
// When every row ties at rank 1 (max == 1) the denominator would be zero, which Spark
// evaluates to null (or an error under ANSI mode), so give all rows the top multiplier.
val multiplier =
  if (max > 1) expr(s"1 + (($max - rank) / ($max - 1))")
  else lit(2d)
614588

615589
SearchIndex(
616590
id = col("studyId"),

0 commit comments

Comments
 (0)