Skip to content

Commit bccbaa7

Browse files
committed
CNAM-154 Added filter to remove patients without outcomes
CNAM-154 Added comments to config file
1 parent 8521f6b commit bccbaa7

File tree

4 files changed

+81
-22
lines changed

4 files changed

+81
-22
lines changed

src/main/resources/config/filtering-default.conf

Lines changed: 15 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -41,22 +41,23 @@ default = {
4141

4242
}
4343
mlpp_parameters = {
44-
bucket_size = [30] # in days
45-
lag_count = [10]
46-
min_timestamp = ${default.dates.study_start}
47-
max_timestamp = ${default.dates.study_end}
48-
include_death_bucket = false
44+
bucket_size = [30] # Number of days of each bucket of time
45+
lag_count = [10] # Number of lags to be created
46+
min_timestamp = ${default.dates.study_start} # Min timestamp to be considered by the time buckets
47+
max_timestamp = ${default.dates.study_end} # Max timestamp to be considered by the time buckets
48+
include_death_bucket = false # If false, the row corresponding to the death bucket is filled with 0s
4949

5050
exposures = {
51-
min_purchases = 1
52-
start_delay = 0
53-
purchases_window = 0
54-
only_first = false
55-
filter_lost_patients = false
56-
filter_diagnosed_patients = true
57-
diagnosed_patients_threshold = 0
58-
filter_delayed_entries = true
59-
delayed_entry_threshold = 12
51+
min_purchases = 1 # Minimum number of drug purchases within <purchases_window> required to form an exposure
52+
start_delay = 0 # Months after the drug purchases for the exposure to start
53+
purchases_window = 0 # Period in months to look for multiple drug purchases
54+
only_first = false # If true, only the first exposure is kept for each <patient, molecule> pair
55+
filter_never_sick_patients = false # if true, patients who never got a target disease are removed
56+
filter_lost_patients = false # if true, patients with a trackloss are removed
57+
filter_diagnosed_patients = true # If true, patients with an early diagnosis are removed
58+
diagnosed_patients_threshold = 0 # Number of months after study start within which a diagnosis is considered "early"
59+
filter_delayed_entries = true # if true, patients who entered the study (i.e. first drug purchase) after a delay are removed
60+
delayed_entry_threshold = 12 # delay in months for the delayed entries filter
6061
}
6162
}
6263
}

src/main/scala/fr/polytechnique/cmap/cnam/filtering/mlpp/MLPPConfig.scala

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,8 +13,9 @@ object MLPPConfig {
1313
startDelay: Int,
1414
purchasesWindow: Int,
1515
onlyFirst: Boolean,
16+
filterNeverSickPatients: Boolean,
1617
filterLostPatients: Boolean,
17-
filterDiagnosedPatients: Boolean,
18+
filterEarlyDiagnosedPatients: Boolean,
1819
diagnosedPatientsThreshold: Int,
1920
filterDelayedEntries: Boolean,
2021
delayedEntryThreshold: Int
@@ -33,8 +34,9 @@ object MLPPConfig {
3334
startDelay = conf.getInt("exposures.start_delay"),
3435
purchasesWindow = conf.getInt("exposures.purchases_window"),
3536
onlyFirst = conf.getBoolean("exposures.only_first"),
37+
filterNeverSickPatients = conf.getBoolean("exposures.filter_never_sick_patients"),
3638
filterLostPatients = conf.getBoolean("exposures.filter_lost_patients"),
37-
filterDiagnosedPatients = conf.getBoolean("exposures.filter_diagnosed_patients"),
39+
filterEarlyDiagnosedPatients = conf.getBoolean("exposures.filter_diagnosed_patients"),
3840
diagnosedPatientsThreshold = conf.getInt("exposures.diagnosed_patients_threshold"),
3941
filterDelayedEntries = conf.getBoolean("exposures.filter_delayed_entries"),
4042
delayedEntryThreshold = conf.getInt("exposures.delayed_entry_threshold")

src/main/scala/fr/polytechnique/cmap/cnam/filtering/mlpp/MLPPExposuresTransformer.scala

Lines changed: 29 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -15,8 +15,9 @@ object MLPPExposuresTransformer extends ExposuresTransformer {
1515
private def filterLostPatients = MLPPConfig.exposureDefinition.filterLostPatients
1616
private def filterDelayedEntries = MLPPConfig.exposureDefinition.filterDelayedEntries
1717
private def delayedEntryThreshold = MLPPConfig.exposureDefinition.delayedEntryThreshold
18-
private def filterDiagnosedPatients = MLPPConfig.exposureDefinition.filterDiagnosedPatients
18+
private def filterEarlyDiagnosedPatients = MLPPConfig.exposureDefinition.filterEarlyDiagnosedPatients
1919
private def diagnosedPatientsThreshold = MLPPConfig.exposureDefinition.diagnosedPatientsThreshold
20+
private def filterNeverSickPatients = MLPPConfig.exposureDefinition.filterNeverSickPatients
2021

2122
val outputColumns = List(
2223
col("patientID"),
@@ -33,9 +34,9 @@ object MLPPExposuresTransformer extends ExposuresTransformer {
3334
implicit class ExposuresDataFrame(data: DataFrame) {
3435

3536
/**
36-
* Drops patients whose got a target disease before periodStart + delay (default = 0)
37+
* Drops patients who got a target disease before periodStart + delay (default = 0)
3738
*/
38-
def filterDiagnosedPatients(doFilter: Boolean): DataFrame = {
39+
def filterEarlyDiagnosedPatients(doFilter: Boolean): DataFrame = {
3940

4041
if (doFilter) {
4142
val window = Window.partitionBy("patientID")
@@ -104,6 +105,29 @@ object MLPPExposuresTransformer extends ExposuresTransformer {
104105
}
105106
}
106107

108+
/**
109+
* Drops patients who have no target disease event starting on or before MLPPConfig.maxTimestamp
110+
*/
111+
def filterNeverSickPatients(doFilter: Boolean): DataFrame = {
112+
113+
if (doFilter) {
114+
val window = Window.partitionBy("patientID")
115+
116+
val filterColumn: Column = max(
117+
when(
118+
(col("category") === "disease") &&
119+
(col("eventId") === "targetDisease") &&
120+
(col("start") <= MLPPConfig.maxTimestamp), lit(1)
121+
).otherwise(lit(0))
122+
).over(window).cast(BooleanType)
123+
124+
data.withColumn("filter", filterColumn).where(col("filter")).drop("filter")
125+
}
126+
else {
127+
data
128+
}
129+
}
130+
107131
def withExposureStart(minPurchases: Int = 1, intervalSize: Int = 6,
108132
startDelay: Int = 0, firstOnly: Boolean = false): DataFrame = {
109133

@@ -142,8 +166,9 @@ object MLPPExposuresTransformer extends ExposuresTransformer {
142166

143167
input.toDF
144168
.filterDelayedEntries(filterDelayedEntries)
145-
.filterDiagnosedPatients(filterDiagnosedPatients)
169+
.filterEarlyDiagnosedPatients(filterEarlyDiagnosedPatients)
146170
.filterLostPatients(filterLostPatients)
171+
.filterNeverSickPatients(filterNeverSickPatients)
147172
.where(col("category") === "molecule")
148173
.withExposureStart(
149174
minPurchases = minPurchases,

src/test/scala/fr/polytechnique/cmap/cnam/filtering/mlpp/MLPPExposuresTransformerSuite.scala

Lines changed: 33 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -77,7 +77,7 @@ class MLPPExposuresTransformerSuite extends SharedContext {
7777

7878
// When
7979
import MLPPExposuresTransformer.ExposuresDataFrame
80-
val result = input.filterDiagnosedPatients(true).select("patientID", "category")
80+
val result = input.filterEarlyDiagnosedPatients(true).select("patientID", "category")
8181

8282
// Then
8383
import RichDataFrames._
@@ -100,7 +100,7 @@ class MLPPExposuresTransformerSuite extends SharedContext {
100100

101101
// When
102102
import MLPPExposuresTransformer.ExposuresDataFrame
103-
val result = input.filterDiagnosedPatients(false)
103+
val result = input.filterEarlyDiagnosedPatients(false)
104104

105105
// Then
106106
import RichDataFrames._
@@ -135,6 +135,37 @@ class MLPPExposuresTransformerSuite extends SharedContext {
135135
assert(result === expected)
136136
}
137137

138+
"filterNeverSickPatients" should "remove patients who never have a target disease" in {
139+
val sqlCtx = sqlContext
140+
import sqlCtx.implicits._
141+
142+
// Given
143+
val input = Seq(
144+
("Patient_A", "molecule", "", makeTS(2006, 1, 1)),
145+
("Patient_A", "molecule", "", makeTS(2006, 3, 1)),
146+
("Patient_A", "disease", "targetDisease", makeTS(2006, 6, 1)),
147+
("Patient_B", "molecule", "", makeTS(2006, 5, 1)),
148+
("Patient_B", "molecule", "", makeTS(2007, 1, 1)),
149+
("Patient_C", "molecule", "", makeTS(2006, 11, 1))
150+
).toDF("patientID", "category", "eventId", "start")
151+
152+
val expected = Seq(
153+
("Patient_A", "molecule", "", makeTS(2006, 1, 1)),
154+
("Patient_A", "molecule", "", makeTS(2006, 3, 1)),
155+
("Patient_A", "disease", "targetDisease", makeTS(2006, 6, 1))
156+
).toDF("patientID", "category", "eventId", "start")
157+
158+
// When
159+
import MLPPExposuresTransformer.ExposuresDataFrame
160+
val result = input.filterNeverSickPatients(true)
161+
162+
// Then
163+
import RichDataFrames._
164+
result.show
165+
expected.show
166+
assert(result === expected)
167+
}
168+
138169
"withExposureStart" should "add a column with the start of the default MLPP exposure definition" in {
139170
val sqlCtx = sqlContext
140171
import sqlCtx.implicits._

0 commit comments

Comments
 (0)