Skip to content

Commit 9c9b2a4

Browse files
committed
add credible set count to search ranking
add all search steps back in
1 parent a8c3ef3 commit 9c9b2a4

File tree

3 files changed

+83
-11
lines changed

3 files changed

+83
-11
lines changed

src/main/resources/reference.conf

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1198,6 +1198,10 @@ search {
11981198
targets = ${target.outputs.target}
11991199
drugs = ${drug.outputs}
12001200
associations = ${associations.outputs.indirect-by-overall}
1201+
credible-sets = {
1202+
format = "parquet"
1203+
path = ${common.path}"/output/credible_set"
1204+
}
12011205
studies = {
12021206
format = "parquet"
12031207
path = ${common.path}"/output/study"

src/main/scala/io/opentargets/etl/backend/Configuration.scala

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -211,7 +211,8 @@ object Configuration extends LazyLogging {
211211
drugs: DrugOutputs,
212212
associations: IOResourceConfig,
213213
variants: IOResourceConfig,
214-
studies: IOResourceConfig
214+
studies: IOResourceConfig,
215+
credibleSets: IOResourceConfig
215216
)
216217

217218
case class SearchOutputsSection(

src/main/scala/io/opentargets/etl/backend/Search.scala

Lines changed: 77 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -146,7 +146,7 @@ object Transformers {
146146
"consequenceScore",
147147
when(col("transcriptConsequences.consequenceScore").isNotNull,
148148
col("transcriptConsequences.consequenceScore")
149-
).otherwise(lit(0) + lit(1))
149+
).otherwise(lit(1))
150150
)
151151
.withColumn("targetId", col("transcriptConsequences.targetId"))
152152
.withColumn("transcriptScore",
@@ -568,11 +568,50 @@ object Transformers {
568568
"rsIds",
569569
"array(locationColon)"
570570
),
571-
ngrams = C.flattenCat("array(variantId)", "dbXrefs.id")
571+
ngrams = C.flattenCat("array(variantId)", "dbXrefs.id"),
572+
multiplier = lit(1.0d)
572573
)(variants).output
573574
}
574575

575-
def setIdAndSelectFromStudies(): DataFrame =
576+
/** Prepares the study rows for the search index and computes their ranking multiplier.
  *
  * Studies are left-joined to `targets` on `targetId` (the studies' `geneId`, renamed) and to
  * `credibleSets` on `studyId`. The multiplier combines the min-max scaled credible-set count
  * and the min-max scaled `nSamples`, with the credible-set count weighted twice as much.
  *
  * @param targets      target rows; must contain a `targetId` column to join on
  * @param credibleSets one row per study; must contain `studyId` and a Double `credibleSetCount`
  * @return the `output` of the `SearchIndex` built from the joined studies
  */
def setIdAndSelectFromStudies(targets: DataFrame, credibleSets: DataFrame): DataFrame = {
  // Score given to a study whose metric is missing, or when the metric cannot be scaled at all.
  val fallbackScore = 0.01d

  // Smallest and largest value of `column` over `data`, computed in one Spark job.
  // A global aggregate always yields exactly one row; its cells are null when `data` has no
  // non-null values, in which case there is no range and None is returned instead of letting
  // Row.getDouble throw a NullPointerException.
  def minMaxOf(data: DataFrame, column: org.apache.spark.sql.Column): Option[(Double, Double)] = {
    val stats = data.agg(min(column), max(column)).first()
    if (stats.isNullAt(0) || stats.isNullAt(1)) None
    else Some((stats.getDouble(0), stats.getDouble(1)))
  }

  // Min-max scales `value` into [0, 1] using `range`; null values get `fallbackScore`.
  // When there is no usable range (no data, or max == min so the denominator would be zero)
  // every row gets `fallbackScore` rather than a null produced by dividing by zero.
  // NOTE(review): a study sitting exactly at the minimum scores 0, which is below the 0.01
  // given to studies with no value at all — confirm that ordering is intended.
  def minMaxScaled(
      value: org.apache.spark.sql.Column,
      range: Option[(Double, Double)]
  ): org.apache.spark.sql.Column =
    range match {
      case Some((lowest, highest)) if highest > lowest =>
        when(value.isNotNull, (value - lowest) / (highest - lowest)).otherwise(fallbackScore)
      case _ => lit(fallbackScore)
    }

  val studiesWithTargets = df
    .withColumnRenamed("geneId", "targetId")
    .join(targets, Seq("targetId"), "left_outer")

  val studiesWithTargetsAndCredSets = studiesWithTargets
    .join(credibleSets, Seq("studyId"), "left_outer")

  // The credible-set range is taken from `credibleSets` itself, the nSamples range from the
  // joined studies, matching where each statistic was originally read from.
  val credibleSetMultiplier = minMaxScaled(
    col("credibleSetCount"),
    minMaxOf(credibleSets, col("credibleSetCount"))
  )
  val nSamplesMultiplier = minMaxScaled(
    col("nSamples"),
    minMaxOf(studiesWithTargetsAndCredSets, col("nSamples").cast(DoubleType))
  )

  // The multiplier is 1 plus the weighted mean of the two scaled scores, with the credible-set
  // count carrying double the weight of nSamples, so it always lies in [1, 2].
  val multiplier =
    lit(1d) + (
      ((lit(2d) * credibleSetMultiplier) + nSamplesMultiplier) / lit(3d)
    )
614+
576615
SearchIndex(
577616
id = col("studyId"),
578617
name = col("studyId"),
@@ -583,10 +622,24 @@ object Transformers {
583622
prefixes =
584623
C.flattenCat("array(studyId)", "array(pubmedId)", "array(publicationFirstAuthor)"),
585624
ngrams = C.flattenCat("array(studyId)"),
586-
terms5 = C.flattenCat("array(traitFromSource)", "diseaseIds"),
587-
terms25 = C.flattenCat("array(traitFromSource)", "diseaseIds"),
588-
terms = C.flattenCat("array(traitFromSource)", "diseaseIds")
589-
)(df).output
625+
terms5 = C.flattenCat("array(traitFromSource)",
626+
"diseaseIds",
627+
"array(approvedSymbol)",
628+
"array(targetId)"
629+
),
630+
terms25 = C.flattenCat("array(traitFromSource)",
631+
"diseaseIds",
632+
"array(approvedSymbol)",
633+
"array(targetId)"
634+
),
635+
terms = C.flattenCat("array(traitFromSource)",
636+
"diseaseIds",
637+
"array(approvedSymbol)",
638+
"array(targetId)"
639+
),
640+
multiplier = multiplier
641+
)(studiesWithTargetsAndCredSets).output
642+
}
590643
}
591644
}
592645

@@ -607,7 +660,8 @@ object Search extends LazyLogging {
607660
"target" -> searchSec.inputs.targets,
608661
"association" -> searchSec.inputs.associations,
609662
"variants" -> searchSec.inputs.variants,
610-
"studies" -> searchSec.inputs.studies
663+
"studies" -> searchSec.inputs.studies,
664+
"credibleSets" -> searchSec.inputs.credibleSets
611665
)
612666

613667
val inputDataFrame = IoHelpers.readFrom(mappedInputs)
@@ -682,7 +736,20 @@ object Search extends LazyLogging {
682736
)
683737

684738
val studies = inputDataFrame("studies").data
685-
.select("studyId", "traitFromSource", "pubmedId", "publicationFirstAuthor", "diseaseIds")
739+
.select("studyId",
740+
"traitFromSource",
741+
"pubmedId",
742+
"publicationFirstAuthor",
743+
"diseaseIds",
744+
"nSamples",
745+
"geneId"
746+
)
747+
748+
// Count the credible sets per study: one row per studyId, with the count stored as a Double in "credibleSetCount".
749+
val credibleSets = inputDataFrame("credibleSets").data
750+
.select("studyId")
751+
.groupBy("studyId")
752+
.agg(count("studyId").cast(DoubleType) as "credibleSetCount")
686753

687754
val dLUT = diseases
688755
.withColumn(
@@ -809,7 +876,7 @@ object Search extends LazyLogging {
809876
.setIdAndSelectFromVariants()
810877
.repartition(100)
811878

812-
val searchStudies = studies.setIdAndSelectFromStudies().repartition(100)
879+
val searchStudies = studies.setIdAndSelectFromStudies(targets, credibleSets).repartition(100)
813880

814881
val conf = context.configuration.search
815882
val outputs = Map(

0 commit comments

Comments
 (0)