@@ -146,7 +146,7 @@ object Transformers {
146146 " consequenceScore" ,
147147 when(col(" transcriptConsequences.consequenceScore" ).isNotNull,
148148 col(" transcriptConsequences.consequenceScore" )
149- ).otherwise(lit(0 ) + lit( 1 ))
149+ ).otherwise(lit(1 ))
150150 )
151151 .withColumn(" targetId" , col(" transcriptConsequences.targetId" ))
152152 .withColumn(" transcriptScore" ,
@@ -568,11 +568,50 @@ object Transformers {
568568 " rsIds" ,
569569 " array(locationColon)"
570570 ),
571- ngrams = C .flattenCat(" array(variantId)" , " dbXrefs.id" )
571+ ngrams = C .flattenCat(" array(variantId)" , " dbXrefs.id" ),
572+ multiplier = lit(1.0d )
572573 )(variants).output
573574 }
574575
575- def setIdAndSelectFromStudies (): DataFrame =
576+ def setIdAndSelectFromStudies (targets : DataFrame , credibleSets : DataFrame ): DataFrame = {
577+ val studiesWithTargets = df
578+ .withColumnRenamed(" geneId" , " targetId" )
579+ .join(targets, Seq (" targetId" ), " left_outer" )
580+
581+ val studiesWithTargetsAndCredSets = studiesWithTargets
582+ .join(credibleSets, Seq (" studyId" ), " left_outer" )
583+ val credibleSetCountMax = credibleSets
584+ .select(max(col(" credibleSetCount" )))
585+ .first()
586+ .getDouble(0 )
587+ val credibleSetCountMin = credibleSets
588+ .select(min(col(" credibleSetCount" )))
589+ .first()
590+ .getDouble(0 )
591+ val credibleSetMultiplier = when(
592+ col(" credibleSetCount" ).isNotNull,
593+ (col(
594+ " credibleSetCount"
595+ ) - credibleSetCountMin) / (credibleSetCountMax - (credibleSetCountMin))
596+ ).otherwise(0.01d )
597+ val nSamplesMax = studiesWithTargetsAndCredSets
598+ .select(max(col(" nSamples" ).cast(DoubleType )))
599+ .first()
600+ .getDouble(0 )
601+ val nSamplesMin = studiesWithTargetsAndCredSets
602+ .select(min(col(" nSamples" ).cast(DoubleType )))
603+ .first()
604+ .getDouble(0 )
605+ val nSamplesMultiplier = when(
606+ col(" nSamples" ).isNotNull,
607+ (col(" nSamples" ) - nSamplesMin) / (nSamplesMax - nSamplesMin)
608+ ).otherwise(0.01d )
609+ // multiplier = 1 + (2 * credibleSetMultiplier + nSamplesMultiplier) / 3, i.e. a weighted mean of the two multipliers above in which the credible set count carries twice the weight of nSamples
610+ val multiplier =
611+ lit(1d ) + (
612+ ((lit(2d ) * credibleSetMultiplier) + nSamplesMultiplier) / lit(3d )
613+ )
614+
576615 SearchIndex (
577616 id = col(" studyId" ),
578617 name = col(" studyId" ),
@@ -583,10 +622,24 @@ object Transformers {
583622 prefixes =
584623 C .flattenCat(" array(studyId)" , " array(pubmedId)" , " array(publicationFirstAuthor)" ),
585624 ngrams = C .flattenCat(" array(studyId)" ),
586- terms5 = C .flattenCat(" array(traitFromSource)" , " diseaseIds" ),
587- terms25 = C .flattenCat(" array(traitFromSource)" , " diseaseIds" ),
588- terms = C .flattenCat(" array(traitFromSource)" , " diseaseIds" )
589- )(df).output
625+ terms5 = C .flattenCat(" array(traitFromSource)" ,
626+ " diseaseIds" ,
627+ " array(approvedSymbol)" ,
628+ " array(targetId)"
629+ ),
630+ terms25 = C .flattenCat(" array(traitFromSource)" ,
631+ " diseaseIds" ,
632+ " array(approvedSymbol)" ,
633+ " array(targetId)"
634+ ),
635+ terms = C .flattenCat(" array(traitFromSource)" ,
636+ " diseaseIds" ,
637+ " array(approvedSymbol)" ,
638+ " array(targetId)"
639+ ),
640+ multiplier = multiplier
641+ )(studiesWithTargetsAndCredSets).output
642+ }
590643 }
591644}
592645
@@ -607,7 +660,8 @@ object Search extends LazyLogging {
607660 " target" -> searchSec.inputs.targets,
608661 " association" -> searchSec.inputs.associations,
609662 " variants" -> searchSec.inputs.variants,
610- " studies" -> searchSec.inputs.studies
663+ " studies" -> searchSec.inputs.studies,
664+ " credibleSets" -> searchSec.inputs.credibleSets
611665 )
612666
613667 val inputDataFrame = IoHelpers .readFrom(mappedInputs)
@@ -682,7 +736,20 @@ object Search extends LazyLogging {
682736 )
683737
684738 val studies = inputDataFrame(" studies" ).data
685- .select(" studyId" , " traitFromSource" , " pubmedId" , " publicationFirstAuthor" , " diseaseIds" )
739+ .select(" studyId" ,
740+ " traitFromSource" ,
741+ " pubmedId" ,
742+ " publicationFirstAuthor" ,
743+ " diseaseIds" ,
744+ " nSamples" ,
745+ " geneId"
746+ )
747+
748+ // Read the credible sets, keep only the studyId field, then count the rows per studyId into a Double column named credibleSetCount
749+ val credibleSets = inputDataFrame(" credibleSets" ).data
750+ .select(" studyId" )
751+ .groupBy(" studyId" )
752+ .agg(count(" studyId" ).cast(DoubleType ) as " credibleSetCount" )
686753
687754 val dLUT = diseases
688755 .withColumn(
@@ -809,7 +876,7 @@ object Search extends LazyLogging {
809876 .setIdAndSelectFromVariants()
810877 .repartition(100 )
811878
812- val searchStudies = studies.setIdAndSelectFromStudies().repartition(100 )
879+ val searchStudies = studies.setIdAndSelectFromStudies(targets, credibleSets ).repartition(100 )
813880
814881 val conf = context.configuration.search
815882 val outputs = Map (
0 commit comments