Skip to content

Commit 538bff7

Browse files
committed
formatting
1 parent 4d13f70 commit 538bff7

File tree

2 files changed

+64
-58
lines changed

2 files changed

+64
-58
lines changed

src/main/scala/io/opentargets/etl/backend/evidence/Evidence.scala

Lines changed: 17 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -197,18 +197,21 @@ object Evidence extends LazyLogging {
197197
resolved
198198
}
199199

200-
/**
201-
* Resolves publication dates for evidence based on literature identifiers.
202-
*
203-
* This function takes evidence records with literature arrays and matches them
204-
* against a publication date mapping to add publicationDate and evidenceDate columns.
205-
* The evidenceDate uses publicationDate when available, falling back to releaseDate.
206-
*
207-
* @param df the evidence DataFrame containing literature arrays
208-
* @param publication_date_mapping DataFrame with publication dates mapped to identifiers
209-
* @param context the ETL session context containing Spark session
210-
* @return DataFrame with added publicationDate and evidenceDate columns
211-
*/
200+
/** Resolves publication dates for evidence based on literature identifiers.
201+
*
202+
* This function takes evidence records with literature arrays and matches them against a
203+
* publication date mapping to add publicationDate and evidenceDate columns. The evidenceDate
204+
* uses publicationDate when available, falling back to releaseDate.
205+
*
206+
* @param df
207+
* the evidence DataFrame containing literature arrays
208+
* @param publication_date_mapping
209+
* DataFrame with publication dates mapped to identifiers
210+
* @param context
211+
* the ETL session context containing Spark session
212+
* @return
213+
* DataFrame with added publicationDate and evidenceDate columns
214+
*/
212215
def resolvePublicationDates(
213216
df: DataFrame,
214217
publication_date_mapping: DataFrame
@@ -218,7 +221,7 @@ object Evidence extends LazyLogging {
218221
implicit val session: SparkSession = context.sparkSession
219222

220223
// Filter for MED, AGR and pre-prints (PPR) and create temp view called pub_data:
221-
val processedPublicationData = publication_date_mapping
224+
val processedPublicationData = publication_date_mapping
222225
.filter(col("source").isin("MED", "PPR", "AGR"))
223226
.select(
224227
col("firstPublicationDate").alias("publicationDate"),
@@ -260,7 +263,7 @@ object Evidence extends LazyLogging {
260263
.withColumn(
261264
"evidenceDate",
262265
coalesce(
263-
col("publicationDate"),
266+
col("publicationDate"),
264267
col("releaseDate")
265268
)
266269
)

src/test/scala/io/opentargets/etl/backend/evidence/EvidenceTest.scala

Lines changed: 47 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,5 @@
11
package io.opentargets.etl.backend.evidence
22

3-
4-
53
import io.opentargets.etl.backend.{Configuration, ETLSessionContext}
64
import io.opentargets.etl.backend.Configuration.OTConfig
75
import io.opentargets.etl.backend.spark.{IOResource, IoHelpers}
@@ -28,54 +26,59 @@ class EvidenceDatingTest extends AnyFlatSpec with Matchers {
2826
)
2927

3028
// Shared test data available to all tests
31-
val evidenceSchema = StructType(Array(
32-
StructField("id", StringType, nullable = false),
33-
StructField("releaseDate", StringType, nullable = true),
34-
StructField("literature", ArrayType(StringType), nullable = true)
35-
))
36-
37-
val testEvidenceData = sparkSession.createDataFrame(
38-
sparkSession.sparkContext.parallelize(
39-
Seq(
40-
Row("e1", null, Array.empty[String]), // No dates, empty array instead of null
41-
Row("e2", "2021-02-03", Array.empty[String]), // Release date is given, empty array
42-
Row("e3", "2021-02-03", Array("123", "PMC456")), // Both release date and literature is given
43-
Row("e4", null, Array("123", "PMC456")), // Only literature is given
44-
Row("e5", null, Array("PMC456")) // Only literature but only one source.
45-
)
46-
),
47-
evidenceSchema
29+
val evidenceSchema = StructType(
30+
Array(
31+
StructField("id", StringType, nullable = false),
32+
StructField("releaseDate", StringType, nullable = true),
33+
StructField("literature", ArrayType(StringType), nullable = true)
4834
)
35+
)
4936

50-
val literatureMapSchema = StructType(
51-
Array(
52-
StructField("source", StringType, nullable = false),
53-
StructField("firstPublicationDate", StringType, nullable = true),
54-
StructField("pmid", StringType, nullable = true),
55-
StructField("id", StringType, nullable = true),
56-
StructField("pmcid", StringType, nullable = true)
57-
)
58-
)
59-
60-
val testPublicationData = sparkSession.createDataFrame(
61-
sparkSession.sparkContext.parallelize(
62-
Seq(
63-
Row("MED", "2021-06-15", "123", "123", "PMC9936"),
64-
Row("MED", "2021-08-15", null, "PMC456", "PMC456"),
65-
Row("AGR", "2021-07-30", "AGR001", "AGR001", null)
66-
)
67-
),
68-
literatureMapSchema
37+
val testEvidenceData = sparkSession.createDataFrame(
38+
sparkSession.sparkContext.parallelize(
39+
Seq(
40+
Row("e1", null, Array.empty[String]), // No dates, empty array instead of null
41+
Row("e2", "2021-02-03", Array.empty[String]), // Release date is given, empty array
42+
Row("e3",
43+
"2021-02-03",
44+
Array("123", "PMC456")
45+
), // Both release date and literature is given
46+
Row("e4", null, Array("123", "PMC456")), // Only literature is given
47+
Row("e5", null, Array("PMC456")) // Only literature but only one source.
48+
)
49+
),
50+
evidenceSchema
51+
)
52+
53+
val literatureMapSchema = StructType(
54+
Array(
55+
StructField("source", StringType, nullable = false),
56+
StructField("firstPublicationDate", StringType, nullable = true),
57+
StructField("pmid", StringType, nullable = true),
58+
StructField("id", StringType, nullable = true),
59+
StructField("pmcid", StringType, nullable = true)
6960
)
61+
)
7062

71-
// Apply the function using shared test data
72-
val result = Evidence.resolvePublicationDates(testEvidenceData, testPublicationData)
63+
val testPublicationData = sparkSession.createDataFrame(
64+
sparkSession.sparkContext.parallelize(
65+
Seq(
66+
Row("MED", "2021-06-15", "123", "123", "PMC9936"),
67+
Row("MED", "2021-08-15", null, "PMC456", "PMC456"),
68+
Row("AGR", "2021-07-30", "AGR001", "AGR001", null)
69+
)
70+
),
71+
literatureMapSchema
72+
)
73+
74+
// Apply the function using shared test data
75+
val result = Evidence.resolvePublicationDates(testEvidenceData, testPublicationData)
7376

7477
"resolvePublicationDates" should "return dataframe" in {
7578

7679
// Compile-time type assertion
7780
implicitly[result.type <:< DataFrame]
78-
81+
7982
}
8083

8184
it should "return all evidence" in {
@@ -84,7 +87,7 @@ class EvidenceDatingTest extends AnyFlatSpec with Matchers {
8487

8588
// Should have all expected columns
8689
val expectedColumns = testEvidenceData.columns
87-
result.columns should contain allElementsOf(expectedColumns)
90+
result.columns should contain allElementsOf (expectedColumns)
8891

8992
}
9093

@@ -108,12 +111,12 @@ class EvidenceDatingTest extends AnyFlatSpec with Matchers {
108111

109112
it should "correctly resolve publication dates for specific evidence" in {
110113
// Test specific evidence records
111-
114+
112115
// e1: No literature, no releaseDate - publicationDate should be null, evidenceDate should be null
113116
val evidence1 = result.filter(col("id") === "e1").collect().head
114117
evidence1.getString(evidence1.fieldIndex("publicationDate")) should be(null)
115118
evidence1.getString(evidence1.fieldIndex("evidenceDate")) should be(null)
116-
119+
117120
// e2: No literature, has releaseDate - publicationDate should be null, evidenceDate should be releaseDate
118121
val evidence2 = result.filter(col("id") === "e2").collect().head
119122
evidence2.getString(evidence2.fieldIndex("publicationDate")) should be(null)

0 commit comments

Comments (0)