Skip to content

Commit 24d96c7

Browse files
authored
Merge pull request #418 from PDCMFinder/develop
Develop
2 parents 638aa2d + eb68556 commit 24d96c7

File tree

9 files changed

+285
-143
lines changed

9 files changed

+285
-143
lines changed

etl/entities_registry.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -366,6 +366,7 @@ def get_all_entities_names_to_store_db():
366366
"spark_job": etl.jobs.transformation.platform_transformer_job.main,
367367
"expected_database_columns": [
368368
"id",
369+
"platform_id",
369370
"library_strategy",
370371
"provider_group_id",
371372
"instrument_model",
@@ -695,6 +696,7 @@ def get_all_entities_names_to_store_db():
695696
"cancer_annotation_resources",
696697
"model_availability",
697698
"date_submitted",
699+
"email_list",
698700
"model_generator",
699701
"view_data_at",
700702
"scores",

etl/jobs/transformation/model_metadata_transformer_job.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -171,6 +171,7 @@ def get_formatted_model(model_df: DataFrame) -> DataFrame:
171171
"drug_concentration",
172172
"model_availability",
173173
"date_submitted",
174+
"email_list",
174175
Constants.DATA_SOURCE_COLUMN,
175176
)
176177
return model_df

etl/jobs/transformation/model_transformer_job.py

Lines changed: 77 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010

1111
def main(argv):
1212
"""
13-
13+
1414
:param list argv: the list elements should be:
1515
[1]: Parquet file path with raw external_model_ids configuration
1616
[2]: Parquet file path with initial df with model_information data
@@ -33,8 +33,12 @@ def main(argv):
3333
output_path = argv[9]
3434

3535
spark = SparkSession.builder.getOrCreate()
36-
raw_external_model_ids_df = spark.read.parquet(raw_external_model_ids_resources_parquet_path)
37-
initial_model_information_df = spark.read.parquet(initial_model_information_parquet_path)
36+
raw_external_model_ids_df = spark.read.parquet(
37+
raw_external_model_ids_resources_parquet_path
38+
)
39+
initial_model_information_df = spark.read.parquet(
40+
initial_model_information_parquet_path
41+
)
3842
publication_group_df = spark.read.parquet(publication_group_parquet_path)
3943
accessibility_group_df = spark.read.parquet(accessibility_group_parquet_path)
4044
contact_people_df = spark.read.parquet(contact_people_parquet_path)
@@ -50,21 +54,22 @@ def main(argv):
5054
contact_people_df,
5155
contact_form_df,
5256
source_database_df,
53-
license_df)
57+
license_df,
58+
)
5459

5560
model_df.write.mode("overwrite").parquet(output_path)
5661

5762

5863
def transform_model(
59-
raw_external_model_ids_df: DataFrame,
60-
initial_model_information_df: DataFrame,
61-
publication_group_df: DataFrame,
62-
accessibility_group_df: DataFrame,
63-
contact_people_df: DataFrame,
64-
contact_form_df: DataFrame,
65-
source_database_df: DataFrame,
66-
license_df: DataFrame) -> DataFrame:
67-
64+
raw_external_model_ids_df: DataFrame,
65+
initial_model_information_df: DataFrame,
66+
publication_group_df: DataFrame,
67+
accessibility_group_df: DataFrame,
68+
contact_people_df: DataFrame,
69+
contact_form_df: DataFrame,
70+
source_database_df: DataFrame,
71+
license_df: DataFrame,
72+
) -> DataFrame:
6873
model_df = initial_model_information_df
6974
model_df = set_fk_publication_group(model_df, publication_group_df)
7075
model_df = set_fk_accessibility_group(model_df, accessibility_group_df)
@@ -73,51 +78,85 @@ def transform_model(
7378
model_df = set_fk_source_database(model_df, source_database_df)
7479
model_df = set_fk_license(model_df, license_df)
7580
model_df = add_model_links(model_df, raw_external_model_ids_df)
76-
81+
7782
model_df = get_columns_expected_order(model_df)
7883

7984
return model_df
8085

8186

82-
def set_fk_publication_group(model_df: DataFrame, publication_group_df: DataFrame) -> DataFrame:
87+
def set_fk_publication_group(
88+
model_df: DataFrame, publication_group_df: DataFrame
89+
) -> DataFrame:
8390
model_df = transform_to_fk(
84-
model_df, publication_group_df, "publications", "pubmed_ids", "id", "publication_group_id")
91+
model_df,
92+
publication_group_df,
93+
"publications",
94+
"pubmed_ids",
95+
"id",
96+
"publication_group_id",
97+
)
8598
return model_df
8699

87100

88-
def set_fk_accessibility_group(model_df: DataFrame, accessibility_group_df: DataFrame) -> DataFrame:
89-
model_df = model_df.withColumnRenamed("europdx_access_modality", "europdx_access_modalities")
90-
accessibility_group_df = accessibility_group_df.withColumnRenamed("id", "accessibility_group_id")
101+
def set_fk_accessibility_group(
102+
model_df: DataFrame, accessibility_group_df: DataFrame
103+
) -> DataFrame:
104+
model_df = model_df.withColumnRenamed(
105+
"europdx_access_modality", "europdx_access_modalities"
106+
)
107+
accessibility_group_df = accessibility_group_df.withColumnRenamed(
108+
"id", "accessibility_group_id"
109+
)
91110
model_df = model_df.join(
92111
accessibility_group_df,
93-
on=['accessibility', 'europdx_access_modalities'], how='left')
112+
on=["accessibility", "europdx_access_modalities"],
113+
how="left",
114+
)
94115
return model_df
95116

96117

97-
def set_fk_contact_people(model_df: DataFrame, contact_people_df: DataFrame) -> DataFrame:
98-
contact_people_df = contact_people_df.select("id", "email_list", "name_list", Constants.DATA_SOURCE_COLUMN)
118+
def set_fk_contact_people(
119+
model_df: DataFrame, contact_people_df: DataFrame
120+
) -> DataFrame:
121+
contact_people_df = contact_people_df.select(
122+
"id", "email_list", "name_list", Constants.DATA_SOURCE_COLUMN
123+
)
99124
model_df = model_df.withColumnRenamed("email", "email_list")
100125
model_df = model_df.withColumnRenamed("name", "name_list")
101126
contact_people_df = contact_people_df.withColumnRenamed("id", "contact_people_id")
102127

103-
cond = [model_df.name_list.eqNullSafe(contact_people_df.name_list),
104-
model_df.email_list.eqNullSafe(contact_people_df.email_list),
105-
model_df[Constants.DATA_SOURCE_COLUMN] == contact_people_df[Constants.DATA_SOURCE_COLUMN]]
128+
cond = [
129+
model_df.name_list.eqNullSafe(contact_people_df.name_list),
130+
model_df.email_list.eqNullSafe(contact_people_df.email_list),
131+
model_df[Constants.DATA_SOURCE_COLUMN]
132+
== contact_people_df[Constants.DATA_SOURCE_COLUMN],
133+
]
106134

107-
model_df = model_df.join(contact_people_df, cond, how='left')
135+
model_df = model_df.join(contact_people_df, cond, how="left")
136+
model_df = model_df.drop(contact_people_df.email_list)
137+
model_df = model_df.drop(contact_people_df.name_list)
108138
model_df = model_df.drop(contact_people_df[Constants.DATA_SOURCE_COLUMN])
109139
return model_df
110140

111141

112142
def set_fk_contact_form(model_df: DataFrame, contact_form_df: DataFrame) -> DataFrame:
113143
model_df = transform_to_fk(
114-
model_df, contact_form_df, "form_url", "form_url", "id", "contact_form_id")
144+
model_df, contact_form_df, "form_url", "form_url", "id", "contact_form_id"
145+
)
115146
return model_df
116147

117148

118-
def set_fk_source_database(model_df: DataFrame, source_database_df: DataFrame) -> DataFrame:
149+
def set_fk_source_database(
150+
model_df: DataFrame, source_database_df: DataFrame
151+
) -> DataFrame:
119152
model_df = transform_to_fk(
120-
model_df, source_database_df, "database_url", "database_url", "id", "source_database_id")
153+
model_df,
154+
source_database_df,
155+
"database_url",
156+
"database_url",
157+
"id",
158+
"source_database_id",
159+
)
121160
return model_df
122161

123162

@@ -126,12 +165,16 @@ def set_fk_license(model_df: DataFrame, license_df: DataFrame) -> DataFrame:
126165
license_df = license_df.withColumnRenamed("name", "license_name")
127166
license_df = license_df.withColumnRenamed("url", "license_url")
128167

129-
model_df = model_df.join(license_df, model_df.license == license_df.license_name, how='left')
168+
model_df = model_df.join(
169+
license_df, model_df.license == license_df.license_name, how="left"
170+
)
130171
return model_df
131172

132173

133174
def get_provider_type_from_sharing(raw_sharing_df: DataFrame) -> DataFrame:
134-
provider_type_df = raw_sharing_df.select(format_name_column("provider_type").alias("name"))
175+
provider_type_df = raw_sharing_df.select(
176+
format_name_column("provider_type").alias("name")
177+
)
135178
provider_type_df = provider_type_df.select("name").where("name is not null")
136179
provider_type_df = provider_type_df.drop_duplicates()
137180
return provider_type_df
@@ -179,8 +222,9 @@ def get_columns_expected_order(model_df: DataFrame) -> DataFrame:
179222
"drug_concentration",
180223
"other_model_links",
181224
"date_submitted",
182-
"model_availability"
183-
)
225+
"model_availability",
226+
"email_list",
227+
)
184228

185229

186230
if __name__ == "__main__":

etl/jobs/transformation/scoring/calculation_methods/generic_metadata_calculator.py

Lines changed: 57 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
raw_data_score_weight = 0.07
1111
cancer_annotation_score_weight = 0.03
1212

13-
columns_with_multiple_values = ['quality_assurance', 'xenograft_model_specimens']
13+
columns_with_multiple_values = ["quality_assurance", "xenograft_model_specimens"]
1414

1515

1616
def get_list_resources_available_molecular_data(resources_df: DataFrame):
@@ -33,29 +33,35 @@ def get_metadata_max_score(column_weights):
3333
if value is None:
3434
value = 0
3535
total_score += value
36-
36+
3737
return total_score
3838

3939

4040
def is_valid_value(attribute_value: str) -> bool:
41-
lc_attribute_value = attribute_value.lower() if attribute_value is not None else ''
42-
return (lc_attribute_value != ''
43-
and lc_attribute_value != 'not provided'
44-
and lc_attribute_value != 'not collected'
45-
and lc_attribute_value != 'unknown')
46-
47-
48-
def calculate_score_single_value_column(column_name: str, column_value: str, column_weights) -> float:
41+
lc_attribute_value = attribute_value.lower() if attribute_value is not None else ""
42+
return (
43+
lc_attribute_value != ""
44+
and lc_attribute_value != "not provided"
45+
and lc_attribute_value != "not collected"
46+
and lc_attribute_value != "unknown"
47+
)
48+
49+
50+
def calculate_score_single_value_column(
51+
column_name: str, column_value: str, column_weights
52+
) -> float:
4953
column_weight = column_weights.get(column_name)
5054
if is_valid_value(column_value):
5155
return column_weight
5256
else:
5357
return 0
5458

5559

56-
def calculate_score_multiple_value_column(column_name: str, column_value: str, column_weights) -> float:
60+
def calculate_score_multiple_value_column(
61+
column_name: str, column_value: str, column_weights
62+
) -> float:
5763
score = 0
58-
if column_value == '[]' or column_value is None:
64+
if column_value == "[]" or column_value is None:
5965
return score
6066

6167
# `column_value` is expected to be a string representing a JSON array with
@@ -67,7 +73,6 @@ def calculate_score_multiple_value_column(column_name: str, column_value: str, c
6773
rows_count = len(json_array)
6874
for obj in json_array:
6975
for attribute, value in obj.items():
70-
7176
if attribute not in valid_elements_per_column:
7277
valid_elements_per_column[attribute] = 0
7378

@@ -86,21 +91,29 @@ def calculate_score_multiple_value_column(column_name: str, column_value: str, c
8691
return score
8792

8893

89-
def calculate_score_by_column(column_name: str, column_value: str, column_weights) -> float:
94+
def calculate_score_by_column(
95+
column_name: str, column_value: str, column_weights
96+
) -> float:
9097
score = 0
9198
if column_name in column_weights.keys():
9299
if is_valid_value(column_value):
93-
score += calculate_score_single_value_column(column_name, column_value, column_weights)
100+
score += calculate_score_single_value_column(
101+
column_name, column_value, column_weights
102+
)
94103
elif column_name in columns_with_multiple_values:
95-
score += calculate_score_multiple_value_column(column_name, column_value, column_weights)
104+
score += calculate_score_multiple_value_column(
105+
column_name, column_value, column_weights
106+
)
96107
return score
97108

98109

99110
def calculate_metadata_score(row, column_weights):
100111
score = 0
101112
row_as_dict = row.asDict()
102113
for column_name in row_as_dict:
103-
score += calculate_score_by_column(column_name, row_as_dict[column_name], column_weights)
114+
score += calculate_score_by_column(
115+
column_name, row_as_dict[column_name], column_weights
116+
)
104117
return score / get_metadata_max_score(column_weights) * 100
105118

106119

@@ -130,10 +143,14 @@ def calculate_cancer_annotation_score(row, total_cancer_annotation_resources):
130143
def calculate_score_for_row(row, total_cancer_annotation_resources, column_weights):
131144
columns = {"pdcm_model_id": row["pdcm_model_id"]}
132145

133-
metadata_score = calculate_metadata_score(row, column_weights) * metadata_score_weight
146+
metadata_score = (
147+
calculate_metadata_score(row, column_weights) * metadata_score_weight
148+
)
134149
raw_data_score = calculate_raw_data_score(row) * raw_data_score_weight
135-
cancer_annotation_score = calculate_cancer_annotation_score(
136-
row, total_cancer_annotation_resources) * cancer_annotation_score_weight
150+
cancer_annotation_score = (
151+
calculate_cancer_annotation_score(row, total_cancer_annotation_resources)
152+
* cancer_annotation_score_weight
153+
)
137154

138155
score = int(metadata_score + raw_data_score + cancer_annotation_score)
139156

@@ -142,21 +159,31 @@ def calculate_score_for_row(row, total_cancer_annotation_resources, column_weigh
142159
return output
143160

144161

145-
def calculate_model_metadata_score(input_df: DataFrame, raw_external_resources_df: DataFrame, column_weights: dict) -> DataFrame:
162+
def calculate_model_metadata_score(
163+
input_df: DataFrame, raw_external_resources_df: DataFrame, column_weights: dict
164+
) -> DataFrame:
146165
"""
147-
Calculates metadata score. It receives a dataframe `input_df` (a subset of `search_index_df` filtered by a model type)
166+
Calculates metadata score. It receives a dataframe `input_df` (a subset of `search_index_df` filtered by a model type)
148167
and returns a dataset with (pdcm_model_id, score)
149168
"""
150169
input_df = input_df.drop_duplicates()
151-
152-
total_cancer_annotation_resources = count_cancer_annotation_resources(raw_external_resources_df)
153-
154-
rdd_with_score = input_df.rdd.map(lambda x: calculate_score_for_row(x, total_cancer_annotation_resources, column_weights))
155170

156-
score_schema = StructType([
157-
StructField('pdcm_model_id', LongType(), True),
158-
StructField('score', IntegerType(), True)
159-
])
171+
total_cancer_annotation_resources = count_cancer_annotation_resources(
172+
raw_external_resources_df
173+
)
174+
175+
rdd_with_score = input_df.rdd.map(
176+
lambda x: calculate_score_for_row(
177+
x, total_cancer_annotation_resources, column_weights
178+
)
179+
)
180+
181+
score_schema = StructType(
182+
[
183+
StructField("pdcm_model_id", LongType(), True),
184+
StructField("score", IntegerType(), True),
185+
]
186+
)
160187

161188
score_df = rdd_with_score.toDF(score_schema)
162189

etl/jobs/transformation/scoring/weights_per_fields.py

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -30,8 +30,7 @@
3030
"quality_assurance.STR_analysis": 0,
3131
"quality_assurance.comments": 0,
3232
"supplier": 0,
33-
"supplier_type": 0
34-
33+
"supplier_type": 0,
3534
}
3635

3736
# Weights for fields that only apply to PDX models
@@ -54,8 +53,8 @@
5453
"growth_properties": 1,
5554
"growth_media": 1,
5655
"media_id": 1,
57-
"plate_coating": 1,
58-
"other_plate_coating" :1,
56+
"plate_coating": 1,
57+
"other_plate_coating": 1,
5958
"passage_number": 1,
6059
"contaminated": 1,
6160
"contamination_details": 0.5,
@@ -66,4 +65,3 @@
6665
"quality_assurance.tumour_status": 1,
6766
"quality_assurance.model_purity": 1,
6867
}
69-

0 commit comments

Comments (0)