|
15 | 15 | from google.cloud import storage |
16 | 16 | import highdicom as hd |
17 | 17 | from oauthlib.oauth2 import BackendApplicationClient |
| 18 | +import pandas as pd |
18 | 19 | from requests_oauthlib import OAuth2Session |
19 | 20 |
|
20 | 21 | from idc_annotation_conversion import cloud_io |
@@ -339,6 +340,8 @@ def run( |
339 | 340 | "Unsuccessful connecting to requested DICOM archive." |
340 | 341 | ) from e |
341 | 342 |
|
| 343 | + errors = [] |
| 344 | + |
342 | 345 | # Loop over requested collections |
343 | 346 | for collection in collections: |
344 | 347 | prefix = f'cnn-nuclear-segmentations-2019/data-files/{collection}/' |
@@ -376,141 +379,156 @@ def run( |
376 | 379 |
|
377 | 380 | logging.info(f"Processing container: {container_id}") |
378 | 381 |
|
379 | | - selection_query = f""" |
380 | | - SELECT |
381 | | - gcs_url, |
382 | | - Cast(NumberOfFrames AS int) AS NumberOfFrames |
383 | | - FROM |
384 | | - bigquery-public-data.idc_current.dicom_all |
385 | | - WHERE |
386 | | - ContainerIdentifier='{container_id}' |
387 | | - ORDER BY |
388 | | - NumberOfFrames DESC |
389 | | - """ |
390 | | - selection_result = bq_client.query(selection_query) |
391 | | - selection_df = selection_result.result().to_dataframe() |
392 | | - |
393 | | - if len(selection_df) == 0: |
394 | | - # No image found, skip this for now |
395 | | - logging.error( |
396 | | - f"Could not locate image for container {container_id}." |
397 | | - ) |
398 | | - continue |
399 | | - |
400 | | - source_images = [] |
401 | | - for i, url in enumerate(selection_df.gcs_url): |
402 | | - blob_name = "/".join(url.split("/")[3:]) |
403 | | - wsi_dcm = cloud_io.read_dataset_from_blob( |
404 | | - bucket=public_bucket, |
405 | | - blob_name=blob_name, |
| 382 | + try: |
| 383 | + |
| 384 | + selection_query = f""" |
| 385 | + SELECT |
| 386 | + gcs_url, |
| 387 | + Cast(NumberOfFrames AS int) AS NumberOfFrames |
| 388 | + FROM |
| 389 | + bigquery-public-data.idc_current.dicom_all |
| 390 | + WHERE |
| 391 | + ContainerIdentifier='{container_id}' |
| 392 | + ORDER BY |
| 393 | + NumberOfFrames DESC |
| 394 | + """ |
| 395 | + selection_result = bq_client.query(selection_query) |
| 396 | + selection_df = selection_result.result().to_dataframe() |
| 397 | + |
| 398 | + if len(selection_df) == 0: |
| 399 | + # No image found, skip this for now |
| 400 | + logging.error( |
| 401 | + f"Could not locate image for container {container_id}." |
| 402 | + ) |
| 403 | + continue |
| 404 | + |
| 405 | + source_images = [] |
| 406 | + for i, url in enumerate(selection_df.gcs_url): |
| 407 | + blob_name = "/".join(url.split("/")[3:]) |
| 408 | + wsi_dcm = cloud_io.read_dataset_from_blob( |
| 409 | + bucket=public_bucket, |
| 410 | + blob_name=blob_name, |
| 411 | + ) |
| 412 | + source_images.append(wsi_dcm) |
| 413 | + |
| 414 | + # Store to disk |
| 415 | + if output_dir is not None and store_wsi_dicom: |
| 416 | + wsi_path = ( |
| 417 | + collection_dir / f"{container_id}_im_{i}.dcm" |
| 418 | + ) |
| 419 | + wsi_dcm.save_as(wsi_path) |
| 420 | + |
| 421 | + # Store to DICOM archive |
| 422 | + if dicom_archive is not None: |
| 423 | + web_client = get_dicom_web_client( |
| 424 | + url=dicom_archive, |
| 425 | + token_url=archive_token_url, |
| 426 | + client_id=archive_client_id, |
| 427 | + client_secret=archive_client_secret, |
| 428 | + ) |
| 429 | + web_client.store_instances([wsi_dcm]) |
| 430 | + |
| 431 | + ann_dcm, seg_dcms = convert_annotations( |
| 432 | + annotation_csvs=iter_csvs(ann_blob), |
| 433 | + source_images=source_images, |
| 434 | + include_segmentation=with_segmentation, |
| 435 | + segmentation_type=segmentation_type, |
| 436 | + annotation_coordinate_type=annotation_coordinate_type, |
| 437 | + dimension_organization_type=dimension_organization_type, |
| 438 | + create_pyramid=create_pyramid, |
| 439 | + graphic_type=graphic_type, |
| 440 | + workers=workers, |
406 | 441 | ) |
407 | | - source_images.append(wsi_dcm) |
408 | 442 |
|
409 | | - # Store to disk |
410 | | - if output_dir is not None and store_wsi_dicom: |
411 | | - wsi_path = ( |
412 | | - collection_dir / f"{container_id}_im_{i}.dcm" |
| 443 | + # Store objects to bucket |
| 444 | + if store_bucket: |
| 445 | + if output_bucket is None: |
| 446 | + data_str = (datetime.date.today()) |
| 447 | + output_bucket = ( |
| 448 | + "pan_cancer_nuclei_seg_annotation_" |
| 449 | + f"conversion_{data_str}" |
| 450 | + ) |
| 451 | + output_bucket_obj = output_client.bucket(output_bucket) |
| 452 | + |
| 453 | + if not output_bucket_obj.exists(): |
| 454 | + output_bucket_obj.create( |
| 455 | + location=cloud_config.GCP_DEFAULT_LOCATION |
| 456 | + ) |
| 457 | + |
| 458 | + blob_root = ( |
| 459 | + "" if output_prefix is None else f"{output_prefix}/" |
| 460 | + ) |
| 461 | + ann_blob_name = ( |
| 462 | + f"{blob_root}{collection}/{container_id}_ann.dcm" |
413 | 463 | ) |
414 | | - wsi_dcm.save_as(wsi_path) |
415 | 464 |
|
416 | | - # Store to DICOM archive |
| 465 | + logging.info(f"Uploading annotation to {ann_blob_name}.") |
| 466 | + cloud_io.write_dataset_to_blob( |
| 467 | + ann_dcm, |
| 468 | + output_bucket_obj, |
| 469 | + ann_blob_name, |
| 470 | + ) |
| 471 | + if with_segmentation: |
| 472 | + for s, seg_dcm in enumerate(seg_dcms): |
| 473 | + seg_blob_name = ( |
| 474 | + f"{blob_root}{collection}/{container_id}_seg_{s}.dcm" |
| 475 | + ) |
| 476 | + logging.info( |
| 477 | + f"Uploading segmentation to {seg_blob_name}." |
| 478 | + ) |
| 479 | + cloud_io.write_dataset_to_blob( |
| 480 | + seg_dcm, |
| 481 | + output_bucket_obj, |
| 482 | + seg_blob_name, |
| 483 | + ) |
| 484 | + |
| 485 | + # Store objects to filesystem |
| 486 | + if output_dir is not None: |
| 487 | + ann_path = collection_dir / f"{container_id}_ann.dcm" |
| 488 | + |
| 489 | + logging.info(f"Writing annotation to {str(ann_path)}.") |
| 490 | + ann_dcm.save_as(ann_path) |
| 491 | + |
| 492 | + if with_segmentation: |
| 493 | + for s, seg_dcm in enumerate(seg_dcms): |
| 494 | + seg_path = collection_dir / f"{container_id}_seg_{s}.dcm" |
| 495 | + logging.info(f"Writing segmentation to {str(seg_path)}.") |
| 496 | + seg_dcm.save_as(seg_path) |
| 497 | + |
| 498 | + # Store objects to DICOM archive |
417 | 499 | if dicom_archive is not None: |
| 500 | + # Recreate client each time to deal with token expiration |
418 | 501 | web_client = get_dicom_web_client( |
419 | 502 | url=dicom_archive, |
420 | 503 | token_url=archive_token_url, |
421 | 504 | client_id=archive_client_id, |
422 | 505 | client_secret=archive_client_secret, |
423 | 506 | ) |
424 | | - web_client.store_instances([wsi_dcm]) |
425 | | - |
426 | | - ann_dcm, seg_dcms = convert_annotations( |
427 | | - annotation_csvs=iter_csvs(ann_blob), |
428 | | - source_images=source_images, |
429 | | - include_segmentation=with_segmentation, |
430 | | - segmentation_type=segmentation_type, |
431 | | - annotation_coordinate_type=annotation_coordinate_type, |
432 | | - dimension_organization_type=dimension_organization_type, |
433 | | - create_pyramid=create_pyramid, |
434 | | - graphic_type=graphic_type, |
435 | | - workers=workers, |
436 | | - ) |
437 | 507 |
|
438 | | - # Store objects to bucket |
439 | | - if store_bucket: |
440 | | - if output_bucket is None: |
441 | | - data_str = (datetime.date.today()) |
442 | | - output_bucket = ( |
443 | | - "pan_cancer_nuclei_seg_annotation_" |
444 | | - f"conversion_{data_str}" |
445 | | - ) |
446 | | - output_bucket_obj = output_client.bucket(output_bucket) |
447 | | - |
448 | | - if not output_bucket_obj.exists(): |
449 | | - output_bucket_obj.create( |
450 | | - location=cloud_config.GCP_DEFAULT_LOCATION |
451 | | - ) |
452 | | - |
453 | | - blob_root = ( |
454 | | - "" if output_prefix is None else f"{output_prefix}/" |
455 | | - ) |
456 | | - ann_blob_name = ( |
457 | | - f"{blob_root}{collection}/{container_id}_ann.dcm" |
458 | | - ) |
459 | | - |
460 | | - logging.info(f"Uploading annotation to {ann_blob_name}.") |
461 | | - cloud_io.write_dataset_to_blob( |
462 | | - ann_dcm, |
463 | | - output_bucket_obj, |
464 | | - ann_blob_name, |
465 | | - ) |
466 | | - if with_segmentation: |
467 | | - for s, seg_dcm in enumerate(seg_dcms): |
468 | | - seg_blob_name = ( |
469 | | - f"{blob_root}{collection}/{container_id}_seg_{s}.dcm" |
470 | | - ) |
471 | | - logging.info( |
472 | | - f"Uploading segmentation to {seg_blob_name}." |
473 | | - ) |
474 | | - cloud_io.write_dataset_to_blob( |
475 | | - seg_dcm, |
476 | | - output_bucket_obj, |
477 | | - seg_blob_name, |
478 | | - ) |
479 | | - |
480 | | - # Store objects to filesystem |
481 | | - if output_dir is not None: |
482 | | - ann_path = collection_dir / f"{container_id}_ann.dcm" |
483 | | - |
484 | | - logging.info(f"Writing annotation to {str(ann_path)}.") |
485 | | - ann_dcm.save_as(ann_path) |
486 | | - |
487 | | - if with_segmentation: |
488 | | - for s, seg_dcm in enumerate(seg_dcms): |
489 | | - seg_path = collection_dir / f"{container_id}_seg_{s}.dcm" |
490 | | - logging.info(f"Writing segmentation to {str(seg_path)}.") |
491 | | - seg_dcm.save_as(seg_path) |
492 | | - |
493 | | - # Store objects to DICOM archive |
494 | | - if dicom_archive is not None: |
495 | | - # Recreate client each time to deal with token expiration |
496 | | - web_client = get_dicom_web_client( |
497 | | - url=dicom_archive, |
498 | | - token_url=archive_token_url, |
499 | | - client_id=archive_client_id, |
500 | | - client_secret=archive_client_secret, |
| 508 | + logging.info(f"Writing annotation to {dicom_archive}.") |
| 509 | + web_client.store_instances([ann_dcm]) |
| 510 | + |
| 511 | + if with_segmentation: |
| 512 | + logging.info(f"Writing segmentation(s) to {dicom_archive}.") |
| 513 | + for seg_dcm in seg_dcms: |
| 514 | + web_client.store_instances([seg_dcm]) |
| 515 | + |
| 516 | + image_stop_time = time() |
| 517 | + time_for_image = image_stop_time - image_start_time |
| 518 | + logging.info(f"Processed {container_id} in {time_for_image:.2f}s") |
| 519 | + |
| 520 | + except Exception as e: |
| 521 | + logging.error(f"Error {str(e)}") |
| 522 | + errors.append( |
| 523 | + { |
| 524 | + "collection": collection, |
| 525 | + "container_id": container_id, |
| 526 | + "error_message": str(e), |
| 527 | + "datetime": str(datetime.datetime.now()), |
| 528 | + } |
501 | 529 | ) |
502 | | - |
503 | | - logging.info(f"Writing annotation to {dicom_archive}.") |
504 | | - web_client.store_instances([ann_dcm]) |
505 | | - |
506 | | - if with_segmentation: |
507 | | - logging.info(f"Writing segmentation(s) to {dicom_archive}.") |
508 | | - for seg_dcm in seg_dcms: |
509 | | - web_client.store_instances([seg_dcm]) |
510 | | - |
511 | | - image_stop_time = time() |
512 | | - time_for_image = image_stop_time - image_start_time |
513 | | - logging.info(f"Processed {container_id} in {time_for_image:.2f}s") |
| 530 | + errors_df = pd.DataFrame(errors) |
| 531 | + errors_df.to_csv("error_log.csv") |
514 | 532 |
|
515 | 533 |
|
516 | 534 | if __name__ == "__main__": |
|
0 commit comments