diff --git a/src/app.py b/src/app.py
index 26d907ef..41df4c29 100644
--- a/src/app.py
+++ b/src/app.py
@@ -3378,353 +3378,6 @@ def get_associated_donors_from_dataset(id):
     return jsonify(final_result)
 
 
-"""
-Get the complete provenance info for all datasets
-
-Authentication
--------
-No token is required; however, if a token is given it must be valid or an error will be raised. If no token with
-HuBMAP Read Group access is given, only datasets designated as "published" will be returned
-
-Query Parameters
--------
-    format : string
-        Designates the output format of the returned data. Accepted values are "json" and "tsv". If no format is
-        provided, a tsv will be returned by default.
-    group_uuid : string
-        Filters returned datasets by a given group uuid.
-    organ : string
-        Filters returned datasets related to samples of the given organ. Accepts 2 character organ codes. These codes
-        must match the organ types yaml at https://raw.githubusercontent.com/hubmapconsortium/search-api/main/src/search-schema/data/definitions/enums/organ_types.yaml
-        or an error will be raised
-    has_rui_info : string
-        Accepts strings "true" or "false". Any other value will result in an error. If true, only datasets connected to
-        a sample that contains rui info will be returned. If false, only datasets that are NOT connected to samples
-        containing rui info will be returned. By default, no filtering is performed.
-    dataset_status : string
-        Filters results by dataset status. Accepted values are "Published", "QA", and "New". If a user only has access
-        to published datasets and enters QA or New, an error will be raised. By default, no filtering is performed
-
-Returns
--------
-If the response is small enough to be returned directly through the gateway, an HTTP 200 response code will be
-returned. If the response is too large to pass through the gateway, an HTTP 303 response code will be returned, and
-the response body will contain a URL to an AWS S3 Object. The Object must be retrieved by following the URL before
-it expires.
-
-json - an array of each dataset's provenance info
-tsv - a text file of tab separated values where each row is a dataset and the columns include all its prov info
-"""
-@app.route('/datasets/prov-info', methods=['GET'])
-def get_prov_info():
-    global anS3Worker
-
-    # String constants
-    HEADER_DATASET_UUID = 'dataset_uuid'
-    HEADER_DATASET_HUBMAP_ID = 'dataset_hubmap_id'
-    HEADER_DATASET_STATUS = 'dataset_status'
-    HEADER_DATASET_GROUP_NAME = 'dataset_group_name'
-    HEADER_DATASET_GROUP_UUID = 'dataset_group_uuid'
-    HEADER_DATASET_DATE_TIME_CREATED = 'dataset_date_time_created'
-    HEADER_DATASET_CREATED_BY_EMAIL = 'dataset_created_by_email'
-    HEADER_DATASET_DATE_TIME_MODIFIED = 'dataset_date_time_modified'
-    HEADER_DATASET_MODIFIED_BY_EMAIL = 'dataset_modified_by_email'
-    HEADER_DATASET_LAB_ID = 'lab_id_or_name'
-    HEADER_DATASET_DATASET_TYPE = 'dataset_dataset_type'
-    HEADER_DATASET_PORTAL_URL = 'dataset_portal_url'
-    HEADER_FIRST_SAMPLE_HUBMAP_ID = 'first_sample_hubmap_id'
-    HEADER_FIRST_SAMPLE_SUBMISSION_ID = 'first_sample_submission_id'
-    HEADER_FIRST_SAMPLE_UUID = 'first_sample_uuid'
-    HEADER_FIRST_SAMPLE_TYPE = 'first_sample_type'
-    HEADER_FIRST_SAMPLE_PORTAL_URL = 'first_sample_portal_url'
-    HEADER_ORGAN_HUBMAP_ID = 'organ_hubmap_id'
-    HEADER_ORGAN_SUBMISSION_ID = 'organ_submission_id'
-    HEADER_ORGAN_UUID = 'organ_uuid'
-    HEADER_ORGAN_TYPE = 'organ_type'
-    HEADER_DONOR_HUBMAP_ID = 'donor_hubmap_id'
-    HEADER_DONOR_SUBMISSION_ID = 'donor_submission_id'
-    HEADER_DONOR_UUID = 'donor_uuid'
-    HEADER_DONOR_GROUP_NAME = 'donor_group_name'
-    HEADER_RUI_LOCATION_HUBMAP_ID = 'rui_location_hubmap_id'
-    HEADER_RUI_LOCATION_SUBMISSION_ID = 'rui_location_submission_id'
-    HEADER_RUI_LOCATION_UUID = 'rui_location_uuid'
-    HEADER_SAMPLE_METADATA_HUBMAP_ID = 'sample_metadata_hubmap_id'
-    HEADER_SAMPLE_METADATA_SUBMISSION_ID = 'sample_metadata_submission_id'
-    HEADER_SAMPLE_METADATA_UUID = 'sample_metadata_uuid'
-    HEADER_PROCESSED_DATASET_UUID = 'processed_dataset_uuid'
-    HEADER_PROCESSED_DATASET_HUBMAP_ID = 'processed_dataset_hubmap_id'
-    HEADER_PROCESSED_DATASET_STATUS = 'processed_dataset_status'
-    HEADER_PROCESSED_DATASET_PORTAL_URL = 'processed_dataset_portal_url'
-    HEADER_PREVIOUS_VERSION_HUBMAP_IDS = 'previous_version_hubmap_ids'
-
-    headers = [
-        HEADER_DATASET_UUID, HEADER_DATASET_HUBMAP_ID, HEADER_DATASET_STATUS, HEADER_DATASET_GROUP_NAME,
-        HEADER_DATASET_GROUP_UUID, HEADER_DATASET_DATE_TIME_CREATED, HEADER_DATASET_CREATED_BY_EMAIL,
-        HEADER_DATASET_DATE_TIME_MODIFIED, HEADER_DATASET_MODIFIED_BY_EMAIL, HEADER_DATASET_LAB_ID,
-        HEADER_DATASET_DATASET_TYPE, HEADER_DATASET_PORTAL_URL, HEADER_FIRST_SAMPLE_HUBMAP_ID,
-        HEADER_FIRST_SAMPLE_SUBMISSION_ID, HEADER_FIRST_SAMPLE_UUID, HEADER_FIRST_SAMPLE_TYPE,
-        HEADER_FIRST_SAMPLE_PORTAL_URL, HEADER_ORGAN_HUBMAP_ID, HEADER_ORGAN_SUBMISSION_ID, HEADER_ORGAN_UUID,
-        HEADER_ORGAN_TYPE, HEADER_DONOR_HUBMAP_ID, HEADER_DONOR_SUBMISSION_ID, HEADER_DONOR_UUID,
-        HEADER_DONOR_GROUP_NAME, HEADER_RUI_LOCATION_HUBMAP_ID, HEADER_RUI_LOCATION_SUBMISSION_ID,
-        HEADER_RUI_LOCATION_UUID, HEADER_SAMPLE_METADATA_HUBMAP_ID, HEADER_SAMPLE_METADATA_SUBMISSION_ID,
-        HEADER_SAMPLE_METADATA_UUID, HEADER_PROCESSED_DATASET_UUID, HEADER_PROCESSED_DATASET_HUBMAP_ID,
-        HEADER_PROCESSED_DATASET_STATUS, HEADER_PROCESSED_DATASET_PORTAL_URL, HEADER_PREVIOUS_VERSION_HUBMAP_IDS
-    ]
-    published_only = True
-
-    # Token is not required, but if an invalid token is provided,
-    # we need to tell the client with a 401 error
-    validate_token_if_auth_header_exists(request)
-    organ_types_dict = schema_manager.get_organ_types()
-    if user_in_hubmap_read_group(request):
-        published_only = False
-
-    # Processing and validating query parameters
-    accepted_arguments = ['format', 'organ', 'has_rui_info', 'dataset_status', 'group_uuid']
-    return_json = False
-    param_dict = {}
-    if bool(request.args):
-        for argument in request.args:
-            if argument not in accepted_arguments:
-                bad_request_error(f"{argument} is an unrecognized argument.")
-        return_format = request.args.get('format')
-        if return_format is not None:
-            if return_format.lower() not in ['json', 'tsv']:
-                bad_request_error(
-                    "Invalid format. Accepted formats are json and tsv. If no format is given, tsv is the default")
-            if return_format.lower() == 'json':
-                return_json = True
-        group_uuid = request.args.get('group_uuid')
-        if group_uuid is not None:
-            groups_by_id_dict = auth_helper_instance.get_globus_groups_info()['by_id']
-            if group_uuid not in groups_by_id_dict:
-                bad_request_error("Invalid Group UUID.")
-            if not groups_by_id_dict[group_uuid]['data_provider']:
-                bad_request_error("Invalid Group UUID. Group must be a data provider")
-            param_dict['group_uuid'] = group_uuid
-        organ = request.args.get('organ')
-        if organ is not None:
-            validate_organ_code(organ)
-            param_dict['organ'] = organ
-        has_rui_info = request.args.get('has_rui_info')
-        if has_rui_info is not None:
-            if has_rui_info.lower() not in ['true', 'false']:
-                bad_request_error("Invalid value for 'has_rui_info'. Only values of true or false are acceptable")
-            param_dict['has_rui_info'] = has_rui_info
-        dataset_status = request.args.get('dataset_status')
-        if dataset_status is not None:
-            if dataset_status.lower() not in ['new', 'qa', 'published']:
-                bad_request_error("Invalid dataset status. Must be 'new', 'qa', or 'published' (case-insensitive)")
-            if published_only and dataset_status.lower() != 'published':
-                bad_request_error("Invalid dataset status. No auth token was given, or the token is not a member of"
-                                  " the HuBMAP-Read Group. If no token with HuBMAP-Read Group access is given, only"
-                                  " datasets marked 'Published' are available. Try again with a proper token, or"
-                                  " change/remove dataset_status")
-            if not published_only:
-                param_dict['dataset_status'] = dataset_status
-
-    # Instantiation of the list dataset_prov_list
-    dataset_prov_list = []
-
-    # Call to app_neo4j_queries to prepare and execute the database query
-    prov_info = app_neo4j_queries.get_prov_info(neo4j_driver_instance, param_dict, published_only)
-
-    # Each dataset's provenance info is placed into a dictionary
-    for dataset in prov_info:
-        internal_dict = collections.OrderedDict()
-        internal_dict[HEADER_DATASET_UUID] = dataset['uuid']
-        internal_dict[HEADER_DATASET_HUBMAP_ID] = dataset['hubmap_id']
-        internal_dict[HEADER_DATASET_STATUS] = dataset['status']
-        internal_dict[HEADER_DATASET_GROUP_NAME] = dataset['group_name']
-        internal_dict[HEADER_DATASET_GROUP_UUID] = dataset['group_uuid']
-        internal_dict[HEADER_DATASET_DATE_TIME_CREATED] = str(datetime.fromtimestamp(int(dataset['created_timestamp'] / 1000.0)))
-        internal_dict[HEADER_DATASET_CREATED_BY_EMAIL] = dataset['created_by_user_email']
-        internal_dict[HEADER_DATASET_DATE_TIME_MODIFIED] = str(datetime.fromtimestamp(int(dataset['last_modified_timestamp'] / 1000.0)))
-        internal_dict[HEADER_DATASET_MODIFIED_BY_EMAIL] = dataset['last_modified_user_email']
-        internal_dict[HEADER_DATASET_LAB_ID] = dataset['lab_dataset_id']
-        internal_dict[HEADER_DATASET_DATASET_TYPE] = dataset['dataset_dataset_type']
-        internal_dict[HEADER_DATASET_PORTAL_URL] = app.config['DOI_REDIRECT_URL'].replace('<entity_type>', 'dataset').replace('<identifier>', dataset['uuid'])
-
-        # first_sample properties are retrieved from its own dictionary
-        if dataset['first_sample'] is not None:
-            first_sample_hubmap_id_list = []
-            first_sample_submission_id_list = []
-            first_sample_uuid_list = []
-            first_sample_type_list = []
-            first_sample_portal_url_list = []
-            for item in dataset['first_sample']:
-                first_sample_hubmap_id_list.append(item['hubmap_id'])
-                first_sample_submission_id_list.append(item['submission_id'])
-                first_sample_uuid_list.append(item['uuid'])
-                first_sample_type_list.append(item['sample_category'])
-                first_sample_portal_url_list.append(app.config['DOI_REDIRECT_URL'].replace('<entity_type>', 'sample').replace('<identifier>', item['uuid']))
-            internal_dict[HEADER_FIRST_SAMPLE_HUBMAP_ID] = first_sample_hubmap_id_list
-            internal_dict[HEADER_FIRST_SAMPLE_SUBMISSION_ID] = first_sample_submission_id_list
-            internal_dict[HEADER_FIRST_SAMPLE_UUID] = first_sample_uuid_list
-            internal_dict[HEADER_FIRST_SAMPLE_TYPE] = first_sample_type_list
-            internal_dict[HEADER_FIRST_SAMPLE_PORTAL_URL] = first_sample_portal_url_list
-            if return_json is False:
-                internal_dict[HEADER_FIRST_SAMPLE_HUBMAP_ID] = ",".join(first_sample_hubmap_id_list)
-                internal_dict[HEADER_FIRST_SAMPLE_SUBMISSION_ID] = ",".join(first_sample_submission_id_list)
-                internal_dict[HEADER_FIRST_SAMPLE_UUID] = ",".join(first_sample_uuid_list)
-                internal_dict[HEADER_FIRST_SAMPLE_TYPE] = ",".join(first_sample_type_list)
-                internal_dict[HEADER_FIRST_SAMPLE_PORTAL_URL] = ",".join(first_sample_portal_url_list)
-
-        # distinct_organ properties are retrieved from its own dictionary
-        if dataset['distinct_organ'] is not None:
-            distinct_organ_hubmap_id_list = []
-            distinct_organ_submission_id_list = []
-            distinct_organ_uuid_list = []
-            distinct_organ_type_list = []
-            for item in dataset['distinct_organ']:
-                distinct_organ_hubmap_id_list.append(item['hubmap_id'])
-                distinct_organ_submission_id_list.append(item['submission_id'])
-                distinct_organ_uuid_list.append(item['uuid'])
-
-                organ_code = item['organ'].upper()
-                validate_organ_code(organ_code)
-
-                distinct_organ_type_list.append(organ_types_dict[organ_code].lower())
-            internal_dict[HEADER_ORGAN_HUBMAP_ID] = distinct_organ_hubmap_id_list
-            internal_dict[HEADER_ORGAN_SUBMISSION_ID] = distinct_organ_submission_id_list
-            internal_dict[HEADER_ORGAN_UUID] = distinct_organ_uuid_list
-            internal_dict[HEADER_ORGAN_TYPE] = distinct_organ_type_list
-            if return_json is False:
-                internal_dict[HEADER_ORGAN_HUBMAP_ID] = ",".join(distinct_organ_hubmap_id_list)
-                internal_dict[HEADER_ORGAN_SUBMISSION_ID] = ",".join(distinct_organ_submission_id_list)
-                internal_dict[HEADER_ORGAN_UUID] = ",".join(distinct_organ_uuid_list)
-                internal_dict[HEADER_ORGAN_TYPE] = ",".join(distinct_organ_type_list)
-
-        # distinct_donor properties are retrieved from its own dictionary
-        if dataset['distinct_donor'] is not None:
-            distinct_donor_hubmap_id_list = []
-            distinct_donor_submission_id_list = []
-            distinct_donor_uuid_list = []
-            distinct_donor_group_name_list = []
-            for item in dataset['distinct_donor']:
-                distinct_donor_hubmap_id_list.append(item['hubmap_id'])
-                distinct_donor_submission_id_list.append(item['submission_id'])
-                distinct_donor_uuid_list.append(item['uuid'])
-                distinct_donor_group_name_list.append(item['group_name'])
-            internal_dict[HEADER_DONOR_HUBMAP_ID] = distinct_donor_hubmap_id_list
-            internal_dict[HEADER_DONOR_SUBMISSION_ID] = distinct_donor_submission_id_list
-            internal_dict[HEADER_DONOR_UUID] = distinct_donor_uuid_list
-            internal_dict[HEADER_DONOR_GROUP_NAME] = distinct_donor_group_name_list
-            if return_json is False:
-                internal_dict[HEADER_DONOR_HUBMAP_ID] = ",".join(distinct_donor_hubmap_id_list)
-                internal_dict[HEADER_DONOR_SUBMISSION_ID] = ",".join(distinct_donor_submission_id_list)
-                internal_dict[HEADER_DONOR_UUID] = ",".join(distinct_donor_uuid_list)
-                internal_dict[HEADER_DONOR_GROUP_NAME] = ",".join(distinct_donor_group_name_list)
-
-        # distinct_rui_sample properties are retrieved from its own dictionary
-        if dataset['distinct_rui_sample'] is not None:
-            rui_location_hubmap_id_list = []
-            rui_location_submission_id_list = []
-            rui_location_uuid_list = []
-            for item in dataset['distinct_rui_sample']:
-                rui_location_hubmap_id_list.append(item['hubmap_id'])
-                rui_location_submission_id_list.append(item['submission_id'])
-                rui_location_uuid_list.append(item['uuid'])
-            internal_dict[HEADER_RUI_LOCATION_HUBMAP_ID] = rui_location_hubmap_id_list
-            internal_dict[HEADER_RUI_LOCATION_SUBMISSION_ID] = rui_location_submission_id_list
-            internal_dict[HEADER_RUI_LOCATION_UUID] = rui_location_uuid_list
-            if return_json is False:
-                internal_dict[HEADER_RUI_LOCATION_HUBMAP_ID] = ",".join(rui_location_hubmap_id_list)
-                internal_dict[HEADER_RUI_LOCATION_SUBMISSION_ID] = ",".join(rui_location_submission_id_list)
-                internal_dict[HEADER_RUI_LOCATION_UUID] = ",".join(rui_location_uuid_list)
-
-        # distinct_metasample properties are retrieved from its own dictionary
-        if dataset['distinct_metasample'] is not None:
-            metasample_hubmap_id_list = []
-            metasample_submission_id_list = []
-            metasample_uuid_list = []
-            for item in dataset['distinct_metasample']:
-                metasample_hubmap_id_list.append(item['hubmap_id'])
-                metasample_submission_id_list.append(item['submission_id'])
-                metasample_uuid_list.append(item['uuid'])
-            internal_dict[HEADER_SAMPLE_METADATA_HUBMAP_ID] = metasample_hubmap_id_list
-            internal_dict[HEADER_SAMPLE_METADATA_SUBMISSION_ID] = metasample_submission_id_list
-            internal_dict[HEADER_SAMPLE_METADATA_UUID] = metasample_uuid_list
-            if return_json is False:
-                internal_dict[HEADER_SAMPLE_METADATA_HUBMAP_ID] = ",".join(metasample_hubmap_id_list)
-                internal_dict[HEADER_SAMPLE_METADATA_SUBMISSION_ID] = ",".join(metasample_submission_id_list)
-                internal_dict[HEADER_SAMPLE_METADATA_UUID] = ",".join(metasample_uuid_list)
-
-        # processed_dataset properties are retrieved from its own dictionary
-        if dataset['processed_dataset'] is not None:
-            processed_dataset_uuid_list = []
-            processed_dataset_hubmap_id_list = []
-            processed_dataset_status_list = []
-            processed_dataset_portal_url_list = []
-            for item in dataset['processed_dataset']:
-                processed_dataset_uuid_list.append(item['uuid'])
-                processed_dataset_hubmap_id_list.append(item['hubmap_id'])
-                processed_dataset_status_list.append(item['status'])
-                processed_dataset_portal_url_list.append(app.config['DOI_REDIRECT_URL'].replace('<entity_type>', 'dataset').replace('<identifier>', item['uuid']))
-            internal_dict[HEADER_PROCESSED_DATASET_UUID] = processed_dataset_uuid_list
-            internal_dict[HEADER_PROCESSED_DATASET_HUBMAP_ID] = processed_dataset_hubmap_id_list
-            internal_dict[HEADER_PROCESSED_DATASET_STATUS] = processed_dataset_status_list
-            internal_dict[HEADER_PROCESSED_DATASET_PORTAL_URL] = processed_dataset_portal_url_list
-            if return_json is False:
-                internal_dict[HEADER_PROCESSED_DATASET_UUID] = ",".join(processed_dataset_uuid_list)
-                internal_dict[HEADER_PROCESSED_DATASET_HUBMAP_ID] = ",".join(processed_dataset_hubmap_id_list)
-                internal_dict[HEADER_PROCESSED_DATASET_STATUS] = ",".join(processed_dataset_status_list)
-                internal_dict[HEADER_PROCESSED_DATASET_PORTAL_URL] = ",".join(processed_dataset_portal_url_list)
-
-        if dataset['previous_version_hubmap_ids'] is not None:
-            previous_version_hubmap_ids_list = []
-            for item in dataset['previous_version_hubmap_ids']:
-                previous_version_hubmap_ids_list.append(item)
-            internal_dict[HEADER_PREVIOUS_VERSION_HUBMAP_IDS] = previous_version_hubmap_ids_list
-            if return_json is False:
-                internal_dict[HEADER_PREVIOUS_VERSION_HUBMAP_IDS] = ",".join(previous_version_hubmap_ids_list)
-
-        # Each dataset's dictionary is added to the list to be returned
-        dataset_prov_list.append(internal_dict)
-
-    # Establish a string for the Response which can be checked to
-    # see if it is small enough to return directly or must be stashed in S3.
-    if return_json:
-        resp_body = json.dumps(dataset_prov_list).encode('utf-8')
-    else:
-        # If return_json is false, convert the data to a TSV
-        new_tsv_file = StringIO()
-        writer = csv.DictWriter(new_tsv_file, fieldnames=headers, delimiter='\t')
-        writer.writeheader()
-        writer.writerows(dataset_prov_list)
-        new_tsv_file.seek(0)
-        resp_body = new_tsv_file.read()
-
-    # Check the size of what is to be returned through the AWS Gateway, and replace it with
-    # a response that links to an Object in the AWS S3 Bucket, if appropriate.
-    try:
-        s3_url = anS3Worker.stash_response_body_if_big(resp_body)
-        if s3_url is not None:
-            return Response(response=s3_url
-                            , status=303)  # See Other
-    except Exception as s3exception:
-        logger.error(f"Error using anS3Worker to handle len(resp_body)="
-                     f"{len(resp_body)}.")
-        logger.error(s3exception, exc_info=True)
-        return Response(response="Unexpected error storing large results in S3. See logs."
-                        , status=500)
-
-    # Return a regular response through the AWS Gateway
-    if return_json:
-        return jsonify(dataset_prov_list)
-    else:
-        # Return the TSV as an attachment, since it is small enough to fit through the AWS Gateway.
-        new_tsv_file.seek(0)
-        output = Response(new_tsv_file, mimetype='text/tsv')
-        output.headers['Content-Disposition'] = 'attachment; filename=prov-info.tsv'
-        return output
-
-
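
The docstring above specifies the 200-vs-303 contract but gives no client example. The following is a minimal client sketch of one way a caller might consume the removed endpoint; the base URL, the helper name fetch_prov_info, and the 'HT' organ code are illustrative assumptions, not part of the removed code.

    import requests

    BASE_URL = 'https://entity.api.hubmapconsortium.org'  # hypothetical deployment URL

    def fetch_prov_info(params=None, token=None):
        # Bearer token is optional; per the docstring above, without HuBMAP-Read
        # Group access only 'Published' datasets are returned
        headers = {'Authorization': f'Bearer {token}'} if token else {}
        resp = requests.get(f'{BASE_URL}/datasets/prov-info', params=params, headers=headers)
        if resp.status_code == 303:
            # The body is a pre-signed AWS S3 URL; follow it before it expires
            return requests.get(resp.text.strip()).text
        resp.raise_for_status()
        return resp.text

    # tsv is the default format; here json is requested explicitly, and 'HT' is
    # assumed to be a valid 2 character organ code from the organ types yaml
    body = fetch_prov_info(params={'format': 'json', 'organ': 'HT'})
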
 """
 Get the complete provenance info for a given dataset
@@ -4115,168 +3768,6 @@ def sankey_data():
     return jsonify(dataset_sankey_list)
 
 
-"""
-Get the complete provenance info for all samples
-
-Authentication
--------
-Allow no-token/public access
-
-Query Parameters
--------
-    group_uuid : string
-        Filters returned samples by a given group uuid.
-
-Returns
--------
-If the response is small enough to be returned directly through the gateway, an HTTP 200 response code will be
-returned. If the response is too large to pass through the gateway, an HTTP 303 response code will be returned, and
-the response body will contain a URL to an AWS S3 Object. The Object must be retrieved by following the URL before
-it expires.
-
-json - an array of each sample's provenance info
-"""
-@app.route('/samples/prov-info', methods=['GET'])
-def get_sample_prov_info():
-    global anS3Worker
-
-    # String Constants
-    HEADER_SAMPLE_UUID = "sample_uuid"
-    HEADER_SAMPLE_LAB_ID = "lab_id_or_name"
-    HEADER_SAMPLE_GROUP_NAME = "sample_group_name"
-    HEADER_SAMPLE_CREATED_BY_EMAIL = "sample_created_by_email"
-    HEADER_SAMPLE_HAS_METADATA = "sample_has_metadata"
-    HEADER_SAMPLE_HAS_RUI_INFO = "sample_has_rui_info"
-    HEADER_SAMPLE_DIRECT_ANCESTOR_ID = "sample_ancestor_id"
-    HEADER_SAMPLE_DIRECT_ANCESTOR_ENTITY_TYPE = "sample_ancestor_entity"
-    HEADER_SAMPLE_HUBMAP_ID = "sample_hubmap_id"
-    HEADER_SAMPLE_SUBMISSION_ID = "sample_submission_id"
-    HEADER_SAMPLE_TYPE = "sample_type"
-    HEADER_DONOR_UUID = "donor_uuid"
-    HEADER_DONOR_SUBMISSION_ID = "donor_submission_id"
-    HEADER_DONOR_HUBMAP_ID = "donor_hubmap_id"
-    HEADER_DONOR_HAS_METADATA = "donor_has_metadata"
-    HEADER_ORGAN_UUID = "organ_uuid"
-    HEADER_ORGAN_TYPE = "organ_type"
-    HEADER_ORGAN_HUBMAP_ID = "organ_hubmap_id"
-    HEADER_ORGAN_SUBMISSION_ID = "organ_submission_id"
-
-    public_only = True
-
-    # Token is not required, but if an invalid token is provided,
-    # we need to tell the client with a 401 error
-    validate_token_if_auth_header_exists(request)
-
-    if user_in_hubmap_read_group(request):
-        public_only = False
-
-    organ_types_dict = schema_manager.get_organ_types()
-
-    # Processing and validating query parameters
-    accepted_arguments = ['group_uuid']
-    param_dict = {}  # currently the only filter is group_uuid, but in case this grows, we're using a dictionary
-    if bool(request.args):
-        for argument in request.args:
-            if argument not in accepted_arguments:
-                bad_request_error(f"{argument} is an unrecognized argument.")
-        group_uuid = request.args.get('group_uuid')
-        if group_uuid is not None:
-            groups_by_id_dict = auth_helper_instance.get_globus_groups_info()['by_id']
-            if group_uuid not in groups_by_id_dict:
-                bad_request_error("Invalid Group UUID.")
-            if not groups_by_id_dict[group_uuid]['data_provider']:
-                bad_request_error("Invalid Group UUID. Group must be a data provider")
-            param_dict['group_uuid'] = group_uuid
-
-    # Instantiation of the list sample_prov_list
-    sample_prov_list = []
-
-    # Call to app_neo4j_queries to prepare and execute database query
-    prov_info = app_neo4j_queries.get_sample_prov_info(neo4j_driver_instance, param_dict, public_only)
-
-    for sample in prov_info:
-        # For cases where there is no sample of type organ above a given sample in the provenance, we check to see if
-        # the given sample is itself an organ.
-        organ_uuid = None
-        organ_type = None
-        organ_hubmap_id = None
-        organ_submission_id = None
-        if sample['organ_uuid'] is not None:
-            organ_uuid = sample['organ_uuid']
-
-            organ_code = sample['organ_organ_type'].upper()
-            validate_organ_code(organ_code)
-
-            organ_type = organ_types_dict[organ_code].lower()
-            organ_hubmap_id = sample['organ_hubmap_id']
-            organ_submission_id = sample['organ_submission_id']
-        else:
-            if sample['sample_category'] == "organ":
-                organ_uuid = sample['sample_uuid']
-
-                organ_code = sample['sample_organ'].upper()
-                validate_organ_code(organ_code)
-
-                organ_type = organ_types_dict[organ_code].lower()
-                organ_hubmap_id = sample['sample_hubmap_id']
-                organ_submission_id = sample['sample_submission_id']
-
-        sample_has_metadata = False
-        if sample['sample_metadata'] is not None:
-            sample_has_metadata = True
-
-        sample_has_rui_info = False
-        if sample['sample_rui_info'] is not None:
-            sample_has_rui_info = True
-
-        donor_has_metadata = False
-        if sample['donor_metadata'] is not None:
-            donor_has_metadata = True
-
-        internal_dict = collections.OrderedDict()
-        internal_dict[HEADER_SAMPLE_UUID] = sample['sample_uuid']
-        internal_dict[HEADER_SAMPLE_LAB_ID] = sample['lab_sample_id']
-        internal_dict[HEADER_SAMPLE_GROUP_NAME] = sample['sample_group_name']
-        internal_dict[HEADER_SAMPLE_CREATED_BY_EMAIL] = sample['sample_created_by_email']
-        internal_dict[HEADER_SAMPLE_HAS_METADATA] = sample_has_metadata
-        internal_dict[HEADER_SAMPLE_HAS_RUI_INFO] = sample_has_rui_info
-        internal_dict[HEADER_SAMPLE_DIRECT_ANCESTOR_ID] = sample['sample_ancestor_id']
-        internal_dict[HEADER_SAMPLE_TYPE] = sample['sample_category']
-        internal_dict[HEADER_SAMPLE_HUBMAP_ID] = sample['sample_hubmap_id']
-        internal_dict[HEADER_SAMPLE_SUBMISSION_ID] = sample['sample_submission_id']
-        internal_dict[HEADER_SAMPLE_DIRECT_ANCESTOR_ENTITY_TYPE] = sample['sample_ancestor_entity']
-        internal_dict[HEADER_DONOR_UUID] = sample['donor_uuid']
-        internal_dict[HEADER_DONOR_HAS_METADATA] = donor_has_metadata
-        internal_dict[HEADER_DONOR_HUBMAP_ID] = sample['donor_hubmap_id']
-        internal_dict[HEADER_DONOR_SUBMISSION_ID] = sample['donor_submission_id']
-        internal_dict[HEADER_ORGAN_UUID] = organ_uuid
-        internal_dict[HEADER_ORGAN_TYPE] = organ_type
-        internal_dict[HEADER_ORGAN_HUBMAP_ID] = organ_hubmap_id
-        internal_dict[HEADER_ORGAN_SUBMISSION_ID] = organ_submission_id
-
-        # Each sample's dictionary is added to the list to be returned
-        sample_prov_list.append(internal_dict)
-
-    # Check the size of what is to be returned through the AWS Gateway, and replace it with
-    # a response that links to an Object in the AWS S3 Bucket, if appropriate.
-    try:
-        s3_url = anS3Worker.stash_response_body_if_big(json.dumps(sample_prov_list).encode('utf-8'))
-        if s3_url is not None:
-            return Response(response=s3_url
-                            , status=303)  # See Other
-    except Exception as s3exception:
-        logger.error(f"Error using anS3Worker to handle len(json.dumps(sample_prov_list).encode('utf-8'))="
-                     f"{len(json.dumps(sample_prov_list).encode('utf-8'))}.")
-        logger.error(s3exception, exc_info=True)
-        return Response(response="Unexpected error storing large results in S3. See logs."
-                        , status=500)
-
-    # Return a regular response through the AWS Gateway
-    return jsonify(sample_prov_list)
-
-
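
Both endpoints delegate the size check to anS3Worker.stash_response_body_if_big, whose implementation is not shown in this diff. The following is a minimal sketch of that stash-if-big pattern, assuming boto3 and a hypothetical bucket and payload cap; the real S3Worker may differ.

    import uuid
    import boto3

    GATEWAY_LIMIT_BYTES = 10 * 1024 * 1024  # assumed gateway payload cap

    def stash_response_body_if_big(resp_body, bucket='hypothetical-bucket', expires_in=3600):
        # Small payloads are returned through the gateway directly
        if len(resp_body) <= GATEWAY_LIMIT_BYTES:
            return None
        # Large payloads are stashed in S3 under a unique key
        s3 = boto3.client('s3')
        key = f'prov-info/{uuid.uuid4()}'
        s3.put_object(Bucket=bucket, Key=key, Body=resp_body)
        # Pre-signed URL that the endpoint returns with the 303; it expires,
        # so the client must follow it promptly
        return s3.generate_presigned_url(
            'get_object', Params={'Bucket': bucket, 'Key': key}, ExpiresIn=expires_in)
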
 """
 Retrieve all unpublished datasets (datasets with status value other than 'Published' or 'Hold')
diff --git a/src/app_neo4j_queries.py b/src/app_neo4j_queries.py
index 602444f3..b127c122 100644
--- a/src/app_neo4j_queries.py
+++ b/src/app_neo4j_queries.py
@@ -746,136 +746,6 @@ def get_associated_donors_from_dataset(neo4j_driver, dataset_uuid):
     return results
 
 
-"""
-Retrieve all the provenance information about each dataset. Each dataset's prov-info is given by a dictionary.
-For fields such as first sample, where there can be multiple nearest samples in the provenance above a given
-dataset, the field is a list inside its dictionary. Results can be filtered with certain parameters:
-has_rui_info (true or false), organ (organ type), group_uuid, and dataset_status. These are passed in as a
-dictionary if they are present.
-
-Parameters
-----------
-neo4j_driver : neo4j.Driver object
-    The neo4j database connection pool
-param_dict : dictionary
-    Dictionary containing any parameters desired to filter for certain results
-published_only : boolean
-    If a user does not have a token with HuBMAP-Read Group access, published_only is set to true. This will cause only
-    datasets with status = 'Published' to be included in the result.
-"""
-def get_prov_info(neo4j_driver, param_dict, published_only):
-    group_uuid_query_string = ''
-    organ_query_string = 'OPTIONAL MATCH'
-    organ_where_clause = ""
-    rui_info_query_string = 'OPTIONAL MATCH (ds)<-[*]-(ruiSample:Sample)'
-    rui_info_where_clause = "WHERE NOT ruiSample.rui_location IS NULL AND NOT trim(ruiSample.rui_location) = '' "
-    dataset_status_query_string = ''
-    published_only_query_string = ''
-    published_only_revisions_string = ''
-    if 'group_uuid' in param_dict:
-        group_uuid_query_string = f" AND toUpper(ds.group_uuid) = '{param_dict['group_uuid'].upper()}'"
-    if 'organ' in param_dict:
-        organ_query_string = 'MATCH'
-        organ_where_clause = f" WHERE toUpper(organ.organ) = '{param_dict['organ'].upper()}'"
-    if 'has_rui_info' in param_dict:
-        rui_info_query_string = 'MATCH (ds)<-[*]-(ruiSample:Sample)'
-        if param_dict['has_rui_info'].lower() == 'false':
-            rui_info_query_string = 'MATCH (ds:Dataset)'
-            rui_info_where_clause = "WHERE NOT EXISTS {MATCH (ds)<-[*]-(ruiSample:Sample) WHERE NOT ruiSample.rui_location IS NULL AND NOT TRIM(ruiSample.rui_location) = ''} MATCH (ds)<-[*]-(ruiSample:Sample)"
-    if 'dataset_status' in param_dict:
-        dataset_status_query_string = f" AND toUpper(ds.status) = '{param_dict['dataset_status'].upper()}'"
-    if published_only:
-        published_only_query_string = " AND toUpper(ds.status) = 'PUBLISHED'"
-        published_only_revisions_string = " WHERE toUpper(rev.status) = 'PUBLISHED'"
-    query = (f"MATCH (ds:Dataset)<-[:ACTIVITY_OUTPUT]-(a:Activity)<-[*]-(firstSample:Sample)<-[*]-(donor:Donor)"
-             f" WHERE not (ds)-[:REVISION_OF]->(:Dataset)"
-             f" AND NOT toLower(a.creation_action) ENDS WITH 'process'"
-             f"{group_uuid_query_string}"
-             f"{dataset_status_query_string}"
-             f"{published_only_query_string}"
-             f" WITH ds, COLLECT(distinct donor) AS DONOR, COLLECT(distinct firstSample) AS FIRSTSAMPLE"
-             f" OPTIONAL MATCH (ds)<-[:REVISION_OF]-(rev:Dataset)"
-             f"{published_only_revisions_string}"
-             f" WITH ds, DONOR, FIRSTSAMPLE, COLLECT(rev.hubmap_id) as REVISIONS"
-             f" OPTIONAL MATCH (ds)<-[*]-(metaSample:Sample)"
-             f" WHERE NOT metaSample.metadata IS NULL AND NOT TRIM(metaSample.metadata) = ''"
-             f" WITH ds, FIRSTSAMPLE, DONOR, REVISIONS, collect(distinct metaSample) as METASAMPLE"
-             f" {rui_info_query_string}"
-             f" {rui_info_where_clause}"
-             f" WITH ds, FIRSTSAMPLE, DONOR, REVISIONS, METASAMPLE, collect(distinct ruiSample) as RUISAMPLE"
-             # specimen_type -> sample_category 12/15/2022
-             f" {organ_query_string} (donor)-[:ACTIVITY_INPUT]->(oa)-[:ACTIVITY_OUTPUT]->(organ:Sample {{sample_category:'organ'}})-[*]->(ds)"
-             f" {organ_where_clause}"
-             f" WITH ds, FIRSTSAMPLE, DONOR, REVISIONS, METASAMPLE, RUISAMPLE, COLLECT(DISTINCT organ) AS ORGAN"
-             f" OPTIONAL MATCH (ds)-[*]->(a3)-[:ACTIVITY_OUTPUT]->(processed_dataset:Dataset)"
-             f" WHERE toLower(a3.creation_action) ENDS WITH 'process'"
-             f" WITH ds, FIRSTSAMPLE, DONOR, REVISIONS, METASAMPLE, RUISAMPLE, ORGAN, COLLECT(distinct processed_dataset) AS PROCESSED_DATASET"
-             f" RETURN ds.uuid, FIRSTSAMPLE, DONOR, RUISAMPLE, ORGAN, ds.hubmap_id, ds.status, ds.group_name,"
-             f" ds.group_uuid, ds.created_timestamp, ds.created_by_user_email, ds.last_modified_timestamp,"
-             f" ds.last_modified_user_email, ds.lab_dataset_id, ds.dataset_type, METASAMPLE, PROCESSED_DATASET, REVISIONS")
-
-    logger.info("======get_prov_info() query======")
-    logger.info(query)
-
-    with neo4j_driver.session() as session:
-        # Because we're returning multiple things, we use session.run rather than session.read_transaction
-        result = session.run(query)
-        list_of_dictionaries = []
-        for record in result:
-            record_dict = {}
-            record_contents = []
-            # Individual items within a record are not subscriptable. By putting them in a small list,
-            # we can address each item in a record.
-            for item in record:
-                record_contents.append(item)
-            record_dict['uuid'] = record_contents[0]
-            content_one = []
-            for entry in record_contents[1]:
-                node_dict = schema_neo4j_queries.node_to_dict(entry)
-                content_one.append(node_dict)
-            record_dict['first_sample'] = content_one
-            content_two = []
-            for entry in record_contents[2]:
-                node_dict = schema_neo4j_queries.node_to_dict(entry)
-                content_two.append(node_dict)
-            record_dict['distinct_donor'] = content_two
-            content_three = []
-            for entry in record_contents[3]:
-                node_dict = schema_neo4j_queries.node_to_dict(entry)
-                content_three.append(node_dict)
-            record_dict['distinct_rui_sample'] = content_three
-            content_four = []
-            for entry in record_contents[4]:
-                node_dict = schema_neo4j_queries.node_to_dict(entry)
-                content_four.append(node_dict)
-            record_dict['distinct_organ'] = content_four
-            record_dict['hubmap_id'] = record_contents[5]
-            record_dict['status'] = record_contents[6]
-            record_dict['group_name'] = record_contents[7]
-            record_dict['group_uuid'] = record_contents[8]
-            record_dict['created_timestamp'] = record_contents[9]
-            record_dict['created_by_user_email'] = record_contents[10]
-            record_dict['last_modified_timestamp'] = record_contents[11]
-            record_dict['last_modified_user_email'] = record_contents[12]
-            record_dict['lab_dataset_id'] = record_contents[13]
-            record_dict['dataset_dataset_type'] = record_contents[14]
-            content_fifteen = []
-            for entry in record_contents[15]:
-                node_dict = schema_neo4j_queries.node_to_dict(entry)
-                content_fifteen.append(node_dict)
-            record_dict['distinct_metasample'] = content_fifteen
-            content_sixteen = []
-            for entry in record_contents[16]:
-                node_dict = schema_neo4j_queries.node_to_dict(entry)
-                content_sixteen.append(node_dict)
-            record_dict['processed_dataset'] = content_sixteen
-            content_seventeen = []
-            for entry in record_contents[17]:
-                content_seventeen.append(entry)
-            record_dict['previous_version_hubmap_ids'] = content_seventeen
-            list_of_dictionaries.append(record_dict)
-    return list_of_dictionaries
-
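
get_prov_info and get_sample_prov_info both splice caller-supplied filter values (group_uuid, organ, dataset_status) into the Cypher text with f-strings. The neo4j driver also accepts bound query parameters, which sidesteps quoting and injection concerns. The following is a minimal sketch of the same group_uuid filter under that approach; the function name and RETURN clause are illustrative, not the removed functions' query.

    def count_datasets_for_group(neo4j_driver, group_uuid):
        # $group_uuid is bound by the driver instead of being interpolated
        query = ("MATCH (ds:Dataset) "
                 "WHERE toUpper(ds.group_uuid) = toUpper($group_uuid) "
                 "RETURN count(ds) AS dataset_count")
        with neo4j_driver.session() as session:
            record = session.run(query, group_uuid=group_uuid).single()
            return record['dataset_count'] if record else 0
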
 """
 Returns all of the same information as get_prov_info however only for a single dataset at a time. Returns a dictionary
@@ -1033,89 +903,6 @@ def get_sankey_info(neo4j_driver):
     return list_of_dictionaries
 
 
-"""
-Returns sample uuid, sample rui location, sample metadata, sample group name, sample created_by_email, sample ancestor
-uuid, sample ancestor entity type, organ uuid, organ type, lab tissue sample id, donor uuid, donor
-metadata, sample_hubmap_id, organ_hubmap_id, donor_hubmap_id, sample_submission_id, organ_submission_id,
-donor_submission_id, and sample_type, all in a dictionary
-
-Parameters
-----------
-neo4j_driver : neo4j.Driver object
-    The neo4j database connection pool
-param_dict : dictionary
-    dictionary containing any filters to be applied in the samples-prov-info query
-public_only : boolean
-    This value indicates whether the query should return all samples, or only samples where data_access_level = 'Public'
-"""
-def get_sample_prov_info(neo4j_driver, param_dict, public_only):
-    group_uuid_query_string = ''
-    public_only_query_string = ''
-    clause_modifier = "WHERE"
-    if 'group_uuid' in param_dict:
-        group_uuid_query_string = f" WHERE toUpper(s.group_uuid) = '{param_dict['group_uuid'].upper()}'"
-        clause_modifier = "AND"
-    if public_only:
-        public_only_query_string = f" {clause_modifier} toUpper(s.data_access_level) = 'PUBLIC'"
-    query = (
-        f" MATCH (s:Sample)<-[*]-(d:Donor)"
-        f" {group_uuid_query_string}"
-        f" {public_only_query_string}"
-        f" WITH s, d"
-        # specimen_type -> sample_category 12/15/2022
-        f" OPTIONAL MATCH (s)<-[*]-(organ:Sample{{sample_category: 'organ'}})"
-        f" WITH s, organ, d"
-        f" MATCH (s)<-[:ACTIVITY_OUTPUT]-(:Activity)<-[:ACTIVITY_INPUT]-(da)"
-        f" RETURN s.uuid, s.lab_tissue_sample_id, s.group_name, s.created_by_user_email, s.metadata, s.rui_location,"
-        f" d.uuid, d.metadata, organ.uuid, organ.sample_category, organ.metadata, da.uuid, da.entity_type,"
-        f" s.sample_category, organ.organ, s.organ, s.hubmap_id, s.submission_id, organ.hubmap_id, organ.submission_id,"
-        f" d.hubmap_id, d.submission_id"
-    )
-
-    logger.info("======get_sample_prov_info() query======")
-    logger.info(query)
-
-    with neo4j_driver.session() as session:
-        # Because we're returning multiple things, we use session.run rather than session.read_transaction
-        result = session.run(query)
-        list_of_dictionaries = []
-        for record in result:
-            record_dict = {}
-            record_contents = []
-            # Individual items within a record are not subscriptable. By putting them in a small list,
-            # we can address each item in a record.
-            for item in record:
-                record_contents.append(item)
-            record_dict['sample_uuid'] = record_contents[0]
-            record_dict['lab_sample_id'] = record_contents[1]
-            record_dict['sample_group_name'] = record_contents[2]
-            record_dict['sample_created_by_email'] = record_contents[3]
-            record_dict['sample_metadata'] = record_contents[4]
-            record_dict['sample_rui_info'] = record_contents[5]
-            record_dict['donor_uuid'] = record_contents[6]
-            record_dict['donor_metadata'] = record_contents[7]
-            record_dict['organ_uuid'] = record_contents[8]
-            record_dict['organ_type'] = record_contents[9]
-            record_dict['organ_metadata'] = record_contents[10]
-            record_dict['sample_ancestor_id'] = record_contents[11]
-            record_dict['sample_ancestor_entity'] = record_contents[12]
-
-            # sample_specimen_type -> sample_category 12/15/2022
-            record_dict['sample_category'] = record_contents[13]
-
-            record_dict['organ_organ_type'] = record_contents[14]
-            record_dict['sample_organ'] = record_contents[15]
-            record_dict['sample_hubmap_id'] = record_contents[16]
-            record_dict['sample_submission_id'] = record_contents[17]
-            record_dict['organ_hubmap_id'] = record_contents[18]
-            record_dict['organ_submission_id'] = record_contents[19]
-            record_dict['donor_hubmap_id'] = record_contents[20]
-            record_dict['donor_submission_id'] = record_contents[21]
-
-            list_of_dictionaries.append(record_dict)
-    return list_of_dictionaries
-
-
 """
 Returns "data_types", "donor_hubmap_id", "donor_submission_id", "hubmap_id", "organ", "organization",
 "provider_experiment_id", "uuid" in a dictionary
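
Both removed query functions copy each neo4j Record into a list and then read fields by position, which is fragile against reordering of the RETURN clause. neo4j Records also support lookup by RETURN alias. The following is a small sketch under that assumption; the query and key names are illustrative, not the removed functions' query.

    def get_sample_ids(neo4j_driver):
        query = ("MATCH (s:Sample) "
                 "RETURN s.uuid AS sample_uuid, s.hubmap_id AS sample_hubmap_id")
        with neo4j_driver.session() as session:
            # rec['sample_uuid'] reads the RETURN alias directly, with no
            # positional index bookkeeping
            return [{'sample_uuid': rec['sample_uuid'],
                     'sample_hubmap_id': rec['sample_hubmap_id']}
                    for rec in session.run(query)]
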