Skip to content

Commit 538bff7

Browse files
committed
formatting
1 parent 4d13f70 commit 538bff7

File tree

2 files changed

+64
-58
lines changed

2 files changed

+64
-58
lines changed

src/main/scala/io/opentargets/etl/backend/evidence/Evidence.scala

Lines changed: 17 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -197,18 +197,21 @@ object Evidence extends LazyLogging {
197197
resolved
198198
}
199199

200-
/**
201-
* Resolves publication dates for evidence based on literature identifiers.
202-
*
203-
* This function takes evidence records with literature arrays and matches them
204-
* against a publication date mapping to add publicationDate and evidenceDate columns.
205-
* The evidenceDate uses publicationDate when available, falling back to releaseDate.
206-
*
207-
* @param df the evidence DataFrame containing literature arrays
208-
* @param publication_date_mapping DataFrame with publication dates mapped to identifiers
209-
* @param context the ETL session context containing Spark session
210-
* @return DataFrame with added publicationDate and evidenceDate columns
211-
*/
200+
/** Resolves publication dates for evidence based on literature identifiers.
201+
*
202+
* This function takes evidence records with literature arrays and matches them against a
203+
* publication date mapping to add publicationDate and evidenceDate columns. The evidenceDate
204+
* uses publicationDate when available, falling back to releaseDate.
205+
*
206+
* @param df
207+
* the evidence DataFrame containing literature arrays
208+
* @param publication_date_mapping
209+
* DataFrame with publication dates mapped to identifiers
210+
* @param context
211+
* the ETL session context containing Spark session
212+
* @return
213+
* DataFrame with added publicationDate and evidenceDate columns
214+
*/
212215
def resolvePublicationDates(
213216
df: DataFrame,
214217
publication_date_mapping: DataFrame
@@ -218,7 +221,7 @@ object Evidence extends LazyLogging {
218221
implicit val session: SparkSession = context.sparkSession
219222

220223
// Filter for MED, AGR and pre-prints (PPR) and create temp view called pub_data:
221-
val processedPublicationData = publication_date_mapping
224+
val processedPublicationData = publication_date_mapping
222225
.filter(col("source").isin("MED", "PPR", "AGR"))
223226
.select(
224227
col("firstPublicationDate").alias("publicationDate"),
@@ -260,7 +263,7 @@ object Evidence extends LazyLogging {
260263
.withColumn(
261264
"evidenceDate",
262265
coalesce(
263-
col("publicationDate"),
266+
col("publicationDate"),
264267
col("releaseDate")
265268
)
266269
)

src/test/scala/io/opentargets/etl/backend/evidence/EvidenceTest.scala

Lines changed: 47 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,5 @@
11
package io.opentargets.etl.backend.evidence
22

3-
4-
53
import io.opentargets.etl.backend.{Configuration, ETLSessionContext}
64
import io.opentargets.etl.backend.Configuration.OTConfig
75
import io.opentargets.etl.backend.spark.{IOResource, IoHelpers}
@@ -28,54 +26,59 @@ class EvidenceDatingTest extends AnyFlatSpec with Matchers {
2826
)
2927

3028
// Shared test data available to all tests
31-
val evidenceSchema = StructType(Array(
32-
StructField("id", StringType, nullable = false),
33-
StructField("releaseDate", StringType, nullable = true),
34-
StructField("literature", ArrayType(StringType), nullable = true)
35-
))
36-
37-
val testEvidenceData = sparkSession.createDataFrame(
38-
sparkSession.sparkContext.parallelize(
39-
Seq(
40-
Row("e1", null, Array.empty[String]), // No dates, empty array instead of null
41-
Row("e2", "2021-02-03", Array.empty[String]), // Release date is given, empty array
42-
Row("e3", "2021-02-03", Array("123", "PMC456")), // Both release date and literature is given
43-
Row("e4", null, Array("123", "PMC456")), // Only literature is given
44-
Row("e5", null, Array("PMC456")) // Only literature but only one source.
45-
)
46-
),
47-
evidenceSchema
29+
val evidenceSchema = StructType(
30+
Array(
31+
StructField("id", StringType, nullable = false),
32+
StructField("releaseDate", StringType, nullable = true),
33+
StructField("literature", ArrayType(StringType), nullable = true)
4834
)
35+
)
4936

50-
val literatureMapSchema = StructType(
51-
Array(
52-
StructField("source", StringType, nullable = false),
53-
StructField("firstPublicationDate", StringType, nullable = true),
54-
StructField("pmid", StringType, nullable = true),
55-
StructField("id", StringType, nullable = true),
56-
StructField("pmcid", StringType, nullable = true)
57-
)
58-
)
59-
60-
val testPublicationData = sparkSession.createDataFrame(
61-
sparkSession.sparkContext.parallelize(
62-
Seq(
63-
Row("MED", "2021-06-15", "123", "123", "PMC9936"),
64-
Row("MED", "2021-08-15", null, "PMC456", "PMC456"),
65-
Row("AGR", "2021-07-30", "AGR001", "AGR001", null)
66-
)
67-
),
68-
literatureMapSchema
37+
val testEvidenceData = sparkSession.createDataFrame(
38+
sparkSession.sparkContext.parallelize(
39+
Seq(
40+
Row("e1", null, Array.empty[String]), // No dates, empty array instead of null
41+
Row("e2", "2021-02-03", Array.empty[String]), // Release date is given, empty array
42+
Row("e3",
43+
"2021-02-03",
44+
Array("123", "PMC456")
45+
), // Both release date and literature is given
46+
Row("e4", null, Array("123", "PMC456")), // Only literature is given
47+
Row("e5", null, Array("PMC456")) // Only literature but only one source.
48+
)
49+
),
50+
evidenceSchema
51+
)
52+
53+
val literatureMapSchema = StructType(
54+
Array(
55+
StructField("source", StringType, nullable = false),
56+
StructField("firstPublicationDate", StringType, nullable = true),
57+
StructField("pmid", StringType, nullable = true),
58+
StructField("id", StringType, nullable = true),
59+
StructField("pmcid", StringType, nullable = true)
6960
)
61+
)
7062

71-
// Apply the function using shared test data
72-
val result = Evidence.resolvePublicationDates(testEvidenceData, testPublicationData)
63+
val testPublicationData = sparkSession.createDataFrame(
64+
sparkSession.sparkContext.parallelize(
65+
Seq(
66+
Row("MED", "2021-06-15", "123", "123", "PMC9936"),
67+
Row("MED", "2021-08-15", null, "PMC456", "PMC456"),
68+
Row("AGR", "2021-07-30", "AGR001", "AGR001", null)
69+
)
70+
),
71+
literatureMapSchema
72+
)
73+
74+
// Apply the function using shared test data
75+
val result = Evidence.resolvePublicationDates(testEvidenceData, testPublicationData)
7376

7477
"resolvePublicationDates" should "return dataframe" in {
7578

7679
// Compile-time type assertion
7780
implicitly[result.type <:< DataFrame]
78-
81+
7982
}
8083

8184
it should "return all evidence" in {
@@ -84,7 +87,7 @@ class EvidenceDatingTest extends AnyFlatSpec with Matchers {
8487

8588
// Should have all expected columns
8689
val expectedColumns = testEvidenceData.columns
87-
result.columns should contain allElementsOf(expectedColumns)
90+
result.columns should contain allElementsOf (expectedColumns)
8891

8992
}
9093

@@ -108,12 +111,12 @@ class EvidenceDatingTest extends AnyFlatSpec with Matchers {
108111

109112
it should "correctly resolve publication dates for specific evidence" in {
110113
// Test specific evidence records
111-
114+
112115
// e1: No literature, no releaseDate - publicationDate should be null, evidenceDate should be null
113116
val evidence1 = result.filter(col("id") === "e1").collect().head
114117
evidence1.getString(evidence1.fieldIndex("publicationDate")) should be(null)
115118
evidence1.getString(evidence1.fieldIndex("evidenceDate")) should be(null)
116-
119+
117120
// e2: No literature, has releaseDate - publicationDate should be null, evidenceDate should be releaseDate
118121
val evidence2 = result.filter(col("id") === "e2").collect().head
119122
evidence2.getString(evidence2.fieldIndex("publicationDate")) should be(null)

0 commit comments

Comments (0)