Commit f67b773

Rename score and count metadata variables for consistency and update score_set routes to process scores, counts, and related metadata file uploads

1 parent: b537f6e

5 files changed: +174 −154 lines changed


src/mavedb/lib/validation/dataframe/dataframe.py

Lines changed: 15 additions & 15 deletions

@@ -32,8 +32,8 @@
 def validate_and_standardize_dataframe_pair(
     scores_df: pd.DataFrame,
     counts_df: Optional[pd.DataFrame],
-    scores_column_metadata: Optional[dict[str, DatasetColumnMetadata]],
-    counts_column_metadata: Optional[dict[str, DatasetColumnMetadata]],
+    score_columns_metadata: Optional[dict[str, DatasetColumnMetadata]],
+    count_columns_metadata: Optional[dict[str, DatasetColumnMetadata]],
     targets: list[TargetGene],
     hdp: Optional["RESTDataProvider"],
 ) -> Tuple[pd.DataFrame, Optional[pd.DataFrame], Optional[dict[str, DatasetColumnMetadata]], Optional[dict[str, DatasetColumnMetadata]]]:
@@ -46,9 +46,9 @@ def validate_and_standardize_dataframe_pair(
         The scores dataframe
     counts_df : Optional[pandas.DataFrame]
         The counts dataframe, can be None if not present
-    scores_column_metadata: Optional[dict[str, DatasetColumnMetadata]]
+    score_columns_metadata: Optional[dict[str, DatasetColumnMetadata]]
         The scores column metadata, can be None if not present
-    counts_column_metadata: Optional[dict[str, DatasetColumnMetadata]]
+    count_columns_metadata: Optional[dict[str, DatasetColumnMetadata]]
         The counts column metadata, can be None if not present
     targets : str
         The target genes on which to validate dataframes
@@ -73,26 +73,26 @@ def validate_and_standardize_dataframe_pair(
 
     validate_dataframe(standardized_scores_df, "scores", targets, hdp)
 
-    if scores_column_metadata is not None:
-        standardized_scores_column_metadata = standardize_dict_keys(scores_column_metadata)
-        validate_df_column_metadata_match(standardized_scores_df, standardized_scores_column_metadata)
+    if score_columns_metadata is not None:
+        standardized_score_columns_metadata = standardize_dict_keys(score_columns_metadata)
+        validate_df_column_metadata_match(standardized_scores_df, standardized_score_columns_metadata)
     else:
-        standardized_scores_column_metadata = None
+        standardized_score_columns_metadata = None
 
     if standardized_counts_df is not None:
         validate_dataframe(standardized_counts_df, "counts", targets, hdp)
         validate_variant_columns_match(standardized_scores_df, standardized_counts_df)
-        if counts_column_metadata is not None:
-            standardized_scores_column_metadata = standardize_dict_keys(counts_column_metadata)
-            validate_df_column_metadata_match(standardized_counts_df, standardized_scores_column_metadata)
+        if count_columns_metadata is not None:
+            standardized_count_columns_metadata = standardize_dict_keys(count_columns_metadata)
+            validate_df_column_metadata_match(standardized_counts_df, standardized_count_columns_metadata)
         else:
-            standardized_counts_column_metadata = None
+            standardized_count_columns_metadata = None
     else:
-        if counts_column_metadata is not None:
+        if count_columns_metadata is not None and len(count_columns_metadata.keys()) > 0:
             raise ValidationError("Counts column metadata provided without counts dataframe")
-        standardized_counts_column_metadata = None
+        standardized_count_columns_metadata = None
 
-    return standardized_scores_df, standardized_counts_df, standardized_scores_column_metadata, standardized_counts_column_metadata
+    return standardized_scores_df, standardized_counts_df, standardized_score_columns_metadata, standardized_count_columns_metadata
 
 
 def validate_dataframe(
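
For orientation, a minimal sketch of a call using the renamed keyword arguments. The column names, metadata values, and the targets/hdp stand-ins below are invented for the example and are not taken from this diff:

import pandas as pd

scores_df = pd.DataFrame({"hgvs_pro": ["p.Ala1Gly"], "score": [0.42]})

scores, counts, score_meta, count_meta = validate_and_standardize_dataframe_pair(
    scores_df=scores_df,
    counts_df=None,
    score_columns_metadata={"score": score_metadata},  # score_metadata: a DatasetColumnMetadata instance (construction not shown)
    count_columns_metadata=None,  # with counts_df=None, a non-empty dict here now raises ValidationError
    targets=targets,  # list[TargetGene], resolved elsewhere
    hdp=None,
)

Note the sharpened guard on the counts-less branch: an empty metadata dict no longer raises, only a non-empty one does.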

src/mavedb/routers/score_sets.py

Lines changed: 129 additions & 110 deletions

@@ -15,6 +15,7 @@
 from mavedb.view_models.contributor import ContributorCreate
 from mavedb.view_models.doi_identifier import DoiIdentifierCreate
 from mavedb.view_models.publication_identifier import PublicationIdentifierCreate
+from mavedb.view_models.score_set_dataset_columns import DatasetColumnMetadata
 from mavedb.view_models.target_gene import TargetGeneCreate
 from sqlalchemy import null, or_, select
 from sqlalchemy.exc import MultipleResultsFound, NoResultFound
@@ -88,56 +89,54 @@ async def enqueue_variant_creation(
     *,
     item: ScoreSet,
     user_data: UserData,
+    new_scores_df: Optional[pd.DataFrame] = None,
+    new_counts_df: Optional[pd.DataFrame] = None,
+    new_score_columns_metadata: Optional[dict[str, DatasetColumnMetadata]] = None,
+    new_count_columns_metadata: Optional[dict[str, DatasetColumnMetadata]] = None,
     worker: ArqRedis,
-) -> None:
+) -> str | None:
     assert item.dataset_columns is not None
 
-    # score_columns_metadata and count_columns_metadata are the only values of dataset_columns that can be set manually.
-    # The others, scores_columns and count_columns, are calculated based on the uploaded data and should not be changed here.
-    # if item_update.dataset_columns.get("countColumnsMetadata") is not None:
-    #     item.dataset_columns= {**item.dataset_columns, "count_columns_metadata": item_update.dataset_columns["countColumnsMetadata"]}
-    # if item_update.dataset_columns.get("scoreColumnsMetadata") is not None:
-    #     item.dataset_columns = {**item.dataset_columns, "score_columns_metadata": item_update.dataset_columns["scoreColumnsMetadata"]}
-
-    score_columns = [
-        "hgvs_nt",
-        "hgvs_splice",
-        "hgvs_pro",
-    ] + item.dataset_columns["score_columns"]
-    count_columns = [
-        "hgvs_nt",
-        "hgvs_splice",
-        "hgvs_pro",
-    ] + item.dataset_columns["count_columns"]
-
-    scores_data = pd.DataFrame(
-        variants_to_csv_rows(item.variants, columns=score_columns, dtype="score_data")
-    ).replace("NA", pd.NA)
-
-    if item.dataset_columns["count_columns"]:
-        count_data = pd.DataFrame(
-            variants_to_csv_rows(item.variants, columns=count_columns, dtype="count_data")
+    # create CSV from existing variants on the score set if no new dataframe provided
+    existing_scores_df = None
+    if new_scores_df is None:
+        score_columns = [
+            "hgvs_nt",
+            "hgvs_splice",
+            "hgvs_pro",
+        ] + item.dataset_columns.get("score_columns", [])
+        existing_scores_df = pd.DataFrame(
+            variants_to_csv_rows(item.variants, columns=score_columns, dtype="score_data")
         ).replace("NA", pd.NA)
-    else:
-        count_data = None
 
-    scores_column_metadata = item.dataset_columns.get("scores_column_metadata")
-    counts_column_metadata = item.dataset_columns.get("counts_column_metadata")
+    # create CSV from existing variants on the score set if no new dataframe provided
+    existing_counts_df = None
+    if new_counts_df is None and item.dataset_columns.get("count_columns") is not None:
+        count_columns = [
+            "hgvs_nt",
+            "hgvs_splice",
+            "hgvs_pro",
+        ] + item.dataset_columns["count_columns"]
+        existing_counts_df = pd.DataFrame(
+            variants_to_csv_rows(item.variants, columns=count_columns, dtype="count_data")
+        ).replace("NA", pd.NA)
 
-    # await the insertion of this job into the worker queue, not the job itself.
+    # Await the insertion of this job into the worker queue, not the job itself.
+    # Uses provided score and counts dataframes and metadata files, or falls back to existing data on the score set if not provided.
     job = await worker.enqueue_job(
         "create_variants_for_score_set",
         correlation_id_for_context(),
         item.id,
         user_data.user.id,
-        scores_data,
-        count_data,
-        scores_column_metadata,
-        counts_column_metadata,
+        existing_scores_df if new_scores_df is None else new_scores_df,
+        existing_counts_df if new_counts_df is None else new_counts_df,
+        item.dataset_columns.get("score_columns_metadata") if new_score_columns_metadata is None else new_score_columns_metadata,
+        item.dataset_columns.get("count_columns_metadata") if new_count_columns_metadata is None else new_count_columns_metadata,
     )
     if job is not None:
-        save_to_logging_context({"worker_job_id": job.job_id})
-        logger.info(msg="Enqueud variant creation job.", extra=logging_context())
+        return job.job_id
+    else:
+        return None
 
 
 class ScoreSetUpdateResult(TypedDict):
     item: ScoreSet
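
The function now returns the enqueued job's id, or None when enqueueing fails, instead of logging internally, so each route decides how to react. A minimal sketch of the two call patterns; names other than the function's own parameters are illustrative:

# Re-create variants from data already stored on the score set
# (the fallback form used when no new files are uploaded):
job_id = await enqueue_variant_creation(item=item, user_data=user_data, worker=worker)

# Or pass freshly uploaded data, skipping the rebuild from item.variants:
job_id = await enqueue_variant_creation(
    item=item,
    user_data=user_data,
    new_scores_df=scores_df,          # pd.DataFrame parsed from the uploaded CSV
    new_counts_df=None,               # optional
    new_score_columns_metadata=meta,  # dict parsed from the uploaded JSON, if any
    worker=worker,
)
if job_id is None:
    ...  # enqueueing failed; callers mark the score set ProcessingState.failed
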
@@ -252,6 +251,7 @@ async def score_set_update(
         item.score_ranges = item_update_dict.get("score_ranges", null())
 
     if "target_genes" in item_update_dict:
+        # stash existing target gene ids to compare after update, to determine if variants need to be re-created
         assert all(tg.id is not None for tg in item.target_genes)
         existing_target_ids: list[int] = [tg.id for tg in item.target_genes if tg.id is not None]
 
@@ -371,6 +371,59 @@ async def score_set_update(
     save_to_logging_context({"updated_resource": item.urn})
     return {"item": item, "should_create_variants": should_create_variants}
 
+class ParseScoreSetUpdate(TypedDict):
+    scores_df: Optional[pd.DataFrame]
+    counts_df: Optional[pd.DataFrame]
+    score_columns_metadata: Optional[dict[str, DatasetColumnMetadata]]
+    count_columns_metadata: Optional[dict[str, DatasetColumnMetadata]]
+
+async def parse_score_set_variants_uploads(
+    scores_file: Optional[UploadFile] = File(None),
+    counts_file: Optional[UploadFile] = File(None),
+    score_columns_metadata_file: Optional[UploadFile] = File(None),
+    count_columns_metadata_file: Optional[UploadFile] = File(None),
+) -> ParseScoreSetUpdate:
+    if scores_file and scores_file.file:
+        try:
+            scores_df = csv_data_to_df(scores_file.file)
+        # Handle non-utf8 file problem.
+        except UnicodeDecodeError as e:
+            raise HTTPException(status_code=400, detail=f"Error decoding file: {e}. Ensure the file has correct values.")
+    else:
+        scores_df = None
+
+    if counts_file and counts_file.file:
+        try:
+            counts_df = csv_data_to_df(counts_file.file)
+        # Handle non-utf8 file problem.
+        except UnicodeDecodeError as e:
+            raise HTTPException(status_code=400, detail=f"Error decoding file: {e}. Ensure the file has correct values.")
+    else:
+        counts_df = None
+
+    if score_columns_metadata_file and score_columns_metadata_file.file:
+        try:
+            score_columns_metadata = json.load(score_columns_metadata_file.file)
+        except json.JSONDecodeError as e:
+            raise HTTPException(status_code=400, detail=f"Error decoding scores metadata file: {e}. Ensure the file is valid JSON.")
+    else:
+        score_columns_metadata = None
+
+    if count_columns_metadata_file and count_columns_metadata_file.file:
+        try:
+            count_columns_metadata = json.load(count_columns_metadata_file.file)
+        except json.JSONDecodeError as e:
+            raise HTTPException(status_code=400, detail=f"Error decoding counts metadata file: {e}. Ensure the file is valid JSON.")
+    else:
+        count_columns_metadata = None
+
+    return {
+        "scores_df": scores_df,
+        "counts_df": counts_df,
+        "score_columns_metadata": score_columns_metadata,
+        "count_columns_metadata": count_columns_metadata,
+    }
+
 async def fetch_score_set_by_urn(
     db, urn: str, user: Optional[UserData], owner_or_contributor: Optional[UserData], only_published: bool
 ) -> ScoreSet:
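
Since the parser takes FastAPI UploadFile/File parameters, the upload route can hand its form fields straight through. A hypothetical client-side call against the upload endpoint; the URL, auth header, and response field name are assumptions for illustration and are not taken from this diff:

import requests

# Multipart form field names mirror the route's parameter names.
files = {
    "scores_file": ("scores.csv", open("scores.csv", "rb"), "text/csv"),
    "counts_file": ("counts.csv", open("counts.csv", "rb"), "text/csv"),
    "score_columns_metadata_file": ("score_meta.json", open("score_meta.json", "rb"), "application/json"),
}
resp = requests.post(
    f"{API_BASE}/score-sets/{urn}/variants/data",  # assumed path for upload_score_set_variant_data
    files=files,
    headers={"X-API-Key": api_key},  # assumed auth scheme
)
resp.raise_for_status()
print(resp.json()["processingState"])  # assumed camelCase field; "processing" while variants are created
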
@@ -1261,88 +1314,47 @@ async def upload_score_set_variant_data(
     assert_permission(user_data, item, Action.UPDATE)
     assert_permission(user_data, item, Action.SET_SCORES)
 
-    # get existing column metadata for scores if no new file is provided
-    if score_columns_metadata_file and score_columns_metadata_file.file:
-        try:
-            scores_column_metadata = json.load(score_columns_metadata_file.file)
-        except json.JSONDecodeError as e:
-            raise HTTPException(status_code=400, detail=f"Error decoding scores metadata file: {e}. Ensure the file is valid JSON.")
-    else:
-        scores_column_metadata = item.dataset_columns.get("scores_column_metadata") if item.dataset_columns else None
-
-    # get existing column metadata for counts if no new file is provided
-    if count_columns_metadata_file and count_columns_metadata_file.file:
-        try:
-            counts_column_metadata = json.load(count_columns_metadata_file.file)
-        except json.JSONDecodeError as e:
-            raise HTTPException(status_code=400, detail=f"Error decoding counts metadata file: {e}. Ensure the file is valid JSON.")
-    else:
-        counts_column_metadata = item.dataset_columns.get("counts_column_metadata") if item.dataset_columns else None
+    score_set_variants_data = await parse_score_set_variants_uploads(
+        scores_file,
+        counts_file,
+        score_columns_metadata_file,
+        count_columns_metadata_file,
+    )
 
-    if scores_file and scores_file.file:
-        try:
-            scores_df = csv_data_to_df(scores_file.file)
-            counts_df = None
-            if counts_file and counts_file.filename:
-                counts_df = csv_data_to_df(counts_file.file)
-        # Handle non-utf8 file problem.
-        except UnicodeDecodeError as e:
-            raise HTTPException(status_code=400, detail=f"Error decoding file: {e}. Ensure the file has correct values.")
-    elif item.variants:
-        assert item.dataset_columns is not None
-        score_columns = [
-            "hgvs_nt",
-            "hgvs_splice",
-            "hgvs_pro",
-        ] + item.dataset_columns["score_columns"]
-        count_columns = [
-            "hgvs_nt",
-            "hgvs_splice",
-            "hgvs_pro",
-        ] + item.dataset_columns["count_columns"]
+    for key, val in score_set_variants_data.items():
+        logger.info(msg=f"{key}: {val}", extra=logging_context())
+
+    # Although this is also updated within the variant creation job, update it here
+    # as well so that we can display the proper UI components (queue invocation delay
+    # races the score set GET request).
+    item.processing_state = ProcessingState.processing
+
+    logger.info(msg="Enqueuing variant creation job.", extra=logging_context())
+    jobId = await enqueue_variant_creation(
+        item=item,
+        user_data=user_data,
+        new_scores_df=score_set_variants_data["scores_df"],
+        new_counts_df=score_set_variants_data["counts_df"],
+        new_score_columns_metadata=score_set_variants_data["score_columns_metadata"],
+        new_count_columns_metadata=score_set_variants_data["count_columns_metadata"],
+        worker=worker
+    )
 
-        scores_df = pd.DataFrame(
-            variants_to_csv_rows(item.variants, columns=score_columns, dtype="score_data")
-        ).replace("NA", pd.NA)
 
-        if item.dataset_columns["count_columns"]:
-            counts_df = pd.DataFrame(
-                variants_to_csv_rows(item.variants, columns=count_columns, dtype="count_data")
-            ).replace("NA", pd.NA)
-        else:
-            counts_df = None
+    if jobId is None:
+        item.processing_state = ProcessingState.failed
+        logger.warning(msg="Failed to enqueue variant creation job.", extra=logging_context())
     else:
-        scores_df = pd.DataFrame()
-
-    if not scores_df.empty:
-        # Although this is also updated within the variant creation job, update it here
-        # as well so that we can display the proper UI components (queue invocation delay
-        # races the score set GET request).
-        item.processing_state = ProcessingState.processing
-
-        # await the insertion of this job into the worker queue, not the job itself.
-        job = await worker.enqueue_job(
-            "create_variants_for_score_set",
-            correlation_id_for_context(),
-            item.id,
-            user_data.user.id,
-            scores_df,
-            counts_df,
-            scores_column_metadata,
-            counts_column_metadata,
-        )
-        if job is not None:
-            save_to_logging_context({"worker_job_id": job.job_id})
-            logger.info(msg="Enqueud variant creation job.", extra=logging_context())
+        save_to_logging_context({"worker_job_id": jobId})
+        logger.info(msg="Enqueued variant creation job.", extra=logging_context())
 
     db.add(item)
     db.commit()
     db.refresh(item)
+
     enriched_experiment = enrich_experiment_with_num_score_sets(item.experiment, user_data)
     return score_set.ScoreSet.model_validate(item).copy(update={"experiment": enriched_experiment})
 
-
 @router.post(
     "/score-sets/{urn}/ranges/data",
     response_model=score_set.ScoreSet,
@@ -1419,7 +1431,7 @@ async def update_score_set_with_variants(
 
     itemUpdateResult = await score_set_update(db=db, urn=urn, item_update=item_update, exclude_unset=True, user_data=user_data)
     updatedItem = itemUpdateResult["item"]
-    # should_create_variants = itemUpdateResult["should_create_variants"]
+    should_create_variants = itemUpdateResult["should_create_variants"]
 
     # TODO handle uploaded files
 
@@ -1454,8 +1466,15 @@ async def update_score_set(
     # races the score set GET request).
     updatedItem.processing_state = ProcessingState.processing
 
-    await enqueue_variant_creation(item=updatedItem, user_data=user_data, worker=worker)
+    logger.info(msg="Enqueuing variant creation job.", extra=logging_context())
+    jobId = await enqueue_variant_creation(item=updatedItem, user_data=user_data, worker=worker)
 
+    if jobId is None:
+        updatedItem.processing_state = ProcessingState.failed
+        logger.warning(msg="Failed to enqueue variant creation job.", extra=logging_context())
+    else:
+        save_to_logging_context({"worker_job_id": jobId})
+        logger.info(msg="Enqueued variant creation job.", extra=logging_context())
     db.add(updatedItem)
     db.commit()
     db.refresh(updatedItem)
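
Because enqueue failures now surface as a None job id, this branch is easy to exercise in isolation. A hypothetical unit-style sketch; the StubWorker and the fixtures are invented for illustration, and pytest-asyncio is assumed:

import pytest

class StubWorker:
    """Stands in for ArqRedis; simulates the queue rejecting the job."""
    async def enqueue_job(self, *args, **kwargs):
        return None

@pytest.mark.asyncio
async def test_enqueue_failure_returns_none(populated_score_set, user_data):
    # populated_score_set: assumed fixture with dataset_columns and variants set,
    # so the fallback dataframe rebuild inside enqueue_variant_creation succeeds.
    job_id = await enqueue_variant_creation(
        item=populated_score_set, user_data=user_data, worker=StubWorker()
    )
    # Callers (upload_score_set_variant_data, update_score_set) react to None by
    # setting ProcessingState.failed instead of leaving the item in "processing".
    assert job_id is None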
