|
1 | 1 | package io.opentargets.etl.backend |
2 | 2 |
|
3 | 3 | import com.typesafe.scalalogging.LazyLogging |
4 | | -import io.opentargets.etl.backend.spark.Helpers.{columnExpr, flattenCat, nest} |
| 4 | +import io.opentargets.etl.backend.spark.Helpers.{columnExpr, flattenCat, nest, safeArrayUnion} |
5 | 5 | import io.opentargets.etl.backend.spark.IoHelpers.IOResources |
6 | 6 | import io.opentargets.etl.backend.spark.{IOResource, IoHelpers, Helpers => C} |
7 | 7 | import org.apache.spark.sql._ |
@@ -578,39 +578,13 @@ object Transformers { |
578 | 578 | .withColumnRenamed("geneId", "targetId") |
579 | 579 | .join(targets, Seq("targetId"), "left_outer") |
580 | 580 |
|
| 581 | + val window = Window.orderBy(col("credibleSetCount").desc, col("nSamples").desc) |
581 | 582 | val studiesWithTargetsAndCredSets = studiesWithTargets |
582 | 583 | .join(credibleSets, Seq("studyId"), "left_outer") |
583 | | - val credibleSetCountMax = credibleSets |
584 | | - .select(max(col("credibleSetCount"))) |
585 | | - .first() |
586 | | - .getDouble(0) |
587 | | - val credibleSetCountMin = credibleSets |
588 | | - .select(min(col("credibleSetCount"))) |
589 | | - .first() |
590 | | - .getDouble(0) |
591 | | - val credibleSetMultiplier = when( |
592 | | - col("credibleSetCount").isNotNull, |
593 | | - (col( |
594 | | - "credibleSetCount" |
595 | | - ) - credibleSetCountMin) / (credibleSetCountMax - (credibleSetCountMin)) |
596 | | - ).otherwise(0.01d) |
597 | | - val nSamplesMax = studiesWithTargetsAndCredSets |
598 | | - .select(max(col("nSamples").cast(DoubleType))) |
599 | | - .first() |
600 | | - .getDouble(0) |
601 | | - val nSamplesMin = studiesWithTargetsAndCredSets |
602 | | - .select(min(col("nSamples").cast(DoubleType))) |
603 | | - .first() |
604 | | - .getDouble(0) |
605 | | - val nSamplesMultiplier = when( |
606 | | - col("nSamples").isNotNull, |
607 | | - (col("nSamples") - nSamplesMin) / (nSamplesMax - nSamplesMin) |
608 | | - ).otherwise(0.01d) |
609 | | - // multiplier is a function of credible set count and nSamples giving double the weight to credible set counts |
610 | | - val multiplier = |
611 | | - lit(1d) + ( |
612 | | - ((lit(2d) * credibleSetMultiplier) + nSamplesMultiplier) / lit(3d) |
613 | | - ) |
| 584 | + .withColumn("rank", rank().over(window)) |
| 585 | + |
| 586 | + val max = studiesWithTargetsAndCredSets.agg(functions.max("rank")).first().getInt(0) |
| 587 | + val multiplier = expr(s"1 + (($max - rank) / ($max - 1))") |
614 | 588 |
|
615 | 589 | SearchIndex( |
616 | 590 | id = col("studyId"), |
|
0 commit comments