11package io .opentargets .etl .backend .evidence
22
3-
4-
53import io .opentargets .etl .backend .{Configuration , ETLSessionContext }
64import io .opentargets .etl .backend .Configuration .OTConfig
75import io .opentargets .etl .backend .spark .{IOResource , IoHelpers }
@@ -28,54 +26,59 @@ class EvidenceDatingTest extends AnyFlatSpec with Matchers {
2826 )
2927
3028 // Shared test data available to all tests
31- val evidenceSchema = StructType (Array (
32- StructField (" id" , StringType , nullable = false ),
33- StructField (" releaseDate" , StringType , nullable = true ),
34- StructField (" literature" , ArrayType (StringType ), nullable = true )
35- ))
36-
37- val testEvidenceData = sparkSession.createDataFrame(
38- sparkSession.sparkContext.parallelize(
39- Seq (
40- Row (" e1" , null , Array .empty[String ]), // No dates, empty array instead of null
41- Row (" e2" , " 2021-02-03" , Array .empty[String ]), // Release date is given, empty array
42- Row (" e3" , " 2021-02-03" , Array (" 123" , " PMC456" )), // Both release date and literature is given
43- Row (" e4" , null , Array (" 123" , " PMC456" )), // Only literature is given
44- Row (" e5" , null , Array (" PMC456" )) // Only literature but only one source.
45- )
46- ),
47- evidenceSchema
29+ val evidenceSchema = StructType (
30+ Array (
31+ StructField (" id" , StringType , nullable = false ),
32+ StructField (" releaseDate" , StringType , nullable = true ),
33+ StructField (" literature" , ArrayType (StringType ), nullable = true )
4834 )
35+ )
4936
50- val literatureMapSchema = StructType (
51- Array (
52- StructField (" source" , StringType , nullable = false ),
53- StructField (" firstPublicationDate" , StringType , nullable = true ),
54- StructField (" pmid" , StringType , nullable = true ),
55- StructField (" id" , StringType , nullable = true ),
56- StructField (" pmcid" , StringType , nullable = true )
57- )
58- )
59-
60- val testPublicationData = sparkSession.createDataFrame(
61- sparkSession.sparkContext.parallelize(
62- Seq (
63- Row (" MED" , " 2021-06-15" , " 123" , " 123" , " PMC9936" ),
64- Row (" MED" , " 2021-08-15" , null , " PMC456" , " PMC456" ),
65- Row (" AGR" , " 2021-07-30" , " AGR001" , " AGR001" , null )
66- )
67- ),
68- literatureMapSchema
37+ val testEvidenceData = sparkSession.createDataFrame(
38+ sparkSession.sparkContext.parallelize(
39+ Seq (
40+ Row (" e1" , null , Array .empty[String ]), // No dates, empty array instead of null
41+ Row (" e2" , " 2021-02-03" , Array .empty[String ]), // Release date is given, empty array
42+ Row (" e3" ,
43+ " 2021-02-03" ,
44+ Array (" 123" , " PMC456" )
45+ ), // Both release date and literature is given
46+ Row (" e4" , null , Array (" 123" , " PMC456" )), // Only literature is given
47+ Row (" e5" , null , Array (" PMC456" )) // Only literature but only one source.
48+ )
49+ ),
50+ evidenceSchema
51+ )
52+
53+ val literatureMapSchema = StructType (
54+ Array (
55+ StructField (" source" , StringType , nullable = false ),
56+ StructField (" firstPublicationDate" , StringType , nullable = true ),
57+ StructField (" pmid" , StringType , nullable = true ),
58+ StructField (" id" , StringType , nullable = true ),
59+ StructField (" pmcid" , StringType , nullable = true )
6960 )
61+ )
7062
71- // Apply the function using shared test data
72- val result = Evidence .resolvePublicationDates(testEvidenceData, testPublicationData)
63+ val testPublicationData = sparkSession.createDataFrame(
64+ sparkSession.sparkContext.parallelize(
65+ Seq (
66+ Row (" MED" , " 2021-06-15" , " 123" , " 123" , " PMC9936" ),
67+ Row (" MED" , " 2021-08-15" , null , " PMC456" , " PMC456" ),
68+ Row (" AGR" , " 2021-07-30" , " AGR001" , " AGR001" , null )
69+ )
70+ ),
71+ literatureMapSchema
72+ )
73+
74+ // Apply the function using shared test data
75+ val result = Evidence .resolvePublicationDates(testEvidenceData, testPublicationData)
7376
7477 " resolvePublicationDates" should " return dataframe" in {
7578
7679 // Compile-time type assertion
7780 implicitly[result.type <:< DataFrame ]
78-
81+
7982 }
8083
8184 it should " return all evidence" in {
@@ -84,7 +87,7 @@ class EvidenceDatingTest extends AnyFlatSpec with Matchers {
8487
8588 // Should have all expected columns
8689 val expectedColumns = testEvidenceData.columns
87- result.columns should contain allElementsOf(expectedColumns)
90+ result.columns should contain allElementsOf (expectedColumns)
8891
8992 }
9093
@@ -108,12 +111,12 @@ class EvidenceDatingTest extends AnyFlatSpec with Matchers {
108111
109112 it should " correctly resolve publication dates for specific evidence" in {
110113 // Test specific evidence records
111-
114+
112115 // e1: No literature, no releaseDate - publicationDate should be null, evidenceDate should be null
113116 val evidence1 = result.filter(col(" id" ) === " e1" ).collect().head
114117 evidence1.getString(evidence1.fieldIndex(" publicationDate" )) should be(null )
115118 evidence1.getString(evidence1.fieldIndex(" evidenceDate" )) should be(null )
116-
119+
117120 // e2: No literature, has releaseDate - publicationDate should be null, evidenceDate should be releaseDate
118121 val evidence2 = result.filter(col(" id" ) === " e2" ).collect().head
119122 evidence2.getString(evidence2.fieldIndex(" publicationDate" )) should be(null )
0 commit comments