From b461958921f00290377581e69ae462dc29408a36 Mon Sep 17 00:00:00 2001 From: DerekFurstPitt Date: Mon, 27 Oct 2025 17:33:53 -0400 Subject: [PATCH 1/3] modified /ancestors/parents/siblings/tuplets/entities/ endpoints to return a url to s3 if the size of the data exceeds 10mb --- src/app.py | 241 ++++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 184 insertions(+), 57 deletions(-) diff --git a/src/app.py b/src/app.py index 3a9dafe3..51a4a32c 100644 --- a/src/app.py +++ b/src/app.py @@ -95,7 +95,7 @@ # Read the secret key which may be submitted in HTTP Request Headers to override the lockout of # updates to entities with characteristics prohibiting their modification. -LOCKED_ENTITY_UPDATE_OVERRIDE_KEY = app.config['LOCKED_ENTITY_UPDATE_OVERRIDE_KEY'] +# LOCKED_ENTITY_UPDATE_OVERRIDE_KEY = app.config['LOCKED_ENTITY_UPDATE_OVERRIDE_KEY'] # Suppress InsecureRequestWarning warning when requesting status on https with ssl cert verify disabled requests.packages.urllib3.disable_warnings(category = InsecureRequestWarning) @@ -762,6 +762,8 @@ def get_provenance_metadata_by_id_for_auth_level(id): """ @app.route('/entities/', methods = ['GET']) def get_entity_by_id(id): + global anS3Worker + # Token is not required, but if an invalid token provided, # we need to tell the client with a 401 error validate_token_if_auth_header_exists(request) @@ -900,6 +902,23 @@ def get_entity_by_id(id): if public_entity and not user_in_hubmap_read_group(request): final_result = schema_manager.exclude_properties_from_response(fields_to_exclude, final_result) + try: + resp_body = json.dumps(final_result).encode('utf-8') + s3_url = anS3Worker.stash_response_body_if_big(resp_body) + if s3_url is not None: + return Response(response=s3_url + , status=303) # See Other + # The HuBMAP Commons S3Worker will return None for a URL when the response body is + # smaller than it is configured to store, so the response should be returned through + # the AWS Gateway + except Exception as 
s3exception: + logger.error(f"Error using anS3Worker to handle len(resp_body)=" + f"{len(resp_body)}.") + logger.error(s3exception, exc_info=True) + return Response(response=f"Unexpected error storing large results in S3. See logs." + , status=500) + + # Return a regular response through the AWS Gateway return jsonify(final_result) @@ -1616,6 +1635,8 @@ def update_entity(id): """ @app.route('/ancestors/', methods = ['GET']) def get_ancestors(id): + global anS3Worker + final_result = [] # Token is not required, but if an invalid token provided, @@ -1706,6 +1727,26 @@ def get_ancestors(id): else: filtered_final_result.append(ancestor) final_result = filtered_final_result + + # Check the size of what is to be returned through the AWS Gateway, and replace it with + # a response that links to an Object in the AWS S3 Bucket, if appropriate. + try: + resp_body = json.dumps(final_result).encode('utf-8') + s3_url = anS3Worker.stash_response_body_if_big(resp_body) + if s3_url is not None: + return Response(response=s3_url + , status=303) # See Other + # The HuBMAP Commons S3Worker will return None for a URL when the response body is + # smaller than it is configured to store, so the response should be returned through + # the AWS Gateway + except Exception as s3exception: + logger.error(f"Error using anS3Worker to handle len(resp_body)=" + f"{len(resp_body)}.") + logger.error(s3exception, exc_info=True) + return Response(response=f"Unexpected error storing large results in S3. See logs." 
+ , status=500) + + # Return a regular response through the AWS Gateway return jsonify(final_result) @@ -1824,6 +1865,7 @@ def get_descendants(id): """ @app.route('/parents/', methods = ['GET']) def get_parents(id): + global anS3Worker final_result = [] # Token is not required, but if an invalid token provided, @@ -1915,6 +1957,25 @@ def get_parents(id): filtered_final_result.append(parent) final_result = filtered_final_result + # Check the size of what is to be returned through the AWS Gateway, and replace it with + # a response that links to an Object in the AWS S3 Bucket, if appropriate. + try: + resp_body = json.dumps(final_result).encode('utf-8') + s3_url = anS3Worker.stash_response_body_if_big(resp_body) + if s3_url is not None: + return Response(response=s3_url + , status=303) # See Other + # The HuBMAP Commons S3Worker will return None for a URL when the response body is + # smaller than it is configured to store, so the response should be returned through + # the AWS Gateway + except Exception as s3exception: + logger.error(f"Error using anS3Worker to handle len(resp_body)=" + f"{len(resp_body)}.") + logger.error(s3exception, exc_info=True) + return Response(response=f"Unexpected error storing large results in S3. See logs." + , status=500) + + # Return a regular response through the AWS Gateway return jsonify(final_result) @@ -1935,6 +1996,8 @@ def get_parents(id): """ @app.route('/children/', methods = ['GET']) def get_children(id): + global anS3Worker + final_result = [] # Get user token from Authorization header @@ -1989,6 +2052,25 @@ def get_children(id): # Final result after normalization final_result = schema_manager.normalize_entities_list_for_response(complete_entities_list) + # Check the size of what is to be returned through the AWS Gateway, and replace it with + # a response that links to an Object in the AWS S3 Bucket, if appropriate. 
+ try: + resp_body = json.dumps(final_result).encode('utf-8') + s3_url = anS3Worker.stash_response_body_if_big(resp_body) + if s3_url is not None: + return Response(response=s3_url + , status=303) # See Other + # The HuBMAP Commons S3Worker will return None for a URL when the response body is + # smaller than it is configured to store, so the response should be returned through + # the AWS Gateway + except Exception as s3exception: + logger.error(f"Error using anS3Worker to handle len(resp_body)=" + f"{len(resp_body)}.") + logger.error(s3exception, exc_info=True) + return Response(response=f"Unexpected error storing large results in S3. See logs." + , status=500) + + # Return a regular response through the AWS Gateway return jsonify(final_result) @@ -2012,6 +2094,8 @@ def get_children(id): """ @app.route('/entities//siblings', methods = ['GET']) def get_siblings(id): + global anS3Worker + final_result = [] # Token is not required, but if an invalid token provided, @@ -2081,39 +2165,60 @@ def get_siblings(id): include_revisions = False sibling_list = app_neo4j_queries.get_siblings(neo4j_driver_instance, uuid, status, property_key, include_revisions) if property_key is not None: - return jsonify(sibling_list) + final_result = sibling_list # Generate trigger data # Skip some of the properties that are time-consuming to generate via triggers # Also skip next_revision_uuid and previous_revision_uuid for Dataset to avoid additional # checks when the target Dataset is public but the revisions are not public - properties_to_skip = [ - # Properties to skip for Sample - 'direct_ancestor', - # Properties to skip for Dataset - 'direct_ancestors', - 'collections', - 'upload', - 'title', - 'next_revision_uuid', - 'previous_revision_uuid', - 'associated_collection', - 'creation_action', - 'local_directory_rel_path' - ] + else: + properties_to_skip = [ + # Properties to skip for Sample + 'direct_ancestor', + # Properties to skip for Dataset + 'direct_ancestors', + 'collections', + 
'upload', + 'title', + 'next_revision_uuid', + 'previous_revision_uuid', + 'associated_collection', + 'creation_action', + 'local_directory_rel_path' + ] - complete_entities_list = schema_manager.get_complete_entities_list(request.args, token, sibling_list, properties_to_skip) - # Final result after normalization - final_result = schema_manager.normalize_entities_list_for_response(complete_entities_list) - filtered_final_result = [] - for sibling in final_result: - sibling_entity_type = sibling.get('entity_type') - fields_to_exclude = schema_manager.get_fields_to_exclude(sibling_entity_type) - if public_entity and not user_in_hubmap_read_group(request): - filtered_sibling = schema_manager.exclude_properties_from_response(fields_to_exclude, sibling) - filtered_final_result.append(filtered_sibling) - else: - filtered_final_result.append(sibling) - final_result = filtered_final_result + complete_entities_list = schema_manager.get_complete_entities_list(request.args, token, sibling_list, properties_to_skip) + # Final result after normalization + output = schema_manager.normalize_entities_list_for_response(complete_entities_list) + filtered_final_result = [] + for sibling in output: + sibling_entity_type = sibling.get('entity_type') + fields_to_exclude = schema_manager.get_fields_to_exclude(sibling_entity_type) + if public_entity and not user_in_hubmap_read_group(request): + filtered_sibling = schema_manager.exclude_properties_from_response(fields_to_exclude, sibling) + filtered_final_result.append(filtered_sibling) + else: + filtered_final_result.append(sibling) + final_result = filtered_final_result + + # Check the size of what is to be returned through the AWS Gateway, and replace it with + # a response that links to an Object in the AWS S3 Bucket, if appropriate. 
+ try: + resp_body = json.dumps(final_result).encode('utf-8') + s3_url = anS3Worker.stash_response_body_if_big(resp_body) + if s3_url is not None: + return Response(response=s3_url + , status=303) # See Other + # The HuBMAP Commons S3Worker will return None for a URL when the response body is + # smaller than it is configured to store, so the response should be returned through + # the AWS Gateway + except Exception as s3exception: + logger.error(f"Error using anS3Worker to handle len(resp_body)=" + f"{len(resp_body)}.") + logger.error(s3exception, exc_info=True) + return Response(response=f"Unexpected error storing large results in S3. See logs." + , status=500) + + # Return a regular response through the AWS Gateway return jsonify(final_result) @@ -2137,6 +2242,7 @@ def get_siblings(id): """ @app.route('/entities//tuplets', methods = ['GET']) def get_tuplets(id): + global anS3Worker final_result = [] # Token is not required, but if an invalid token provided, @@ -2196,39 +2302,60 @@ def get_tuplets(id): bad_request_error(f"Only the following property keys are supported in the query string: {COMMA_SEPARATOR.join(result_filtering_accepted_property_keys)}") tuplet_list = app_neo4j_queries.get_tuplets(neo4j_driver_instance, uuid, status, property_key) if property_key is not None: - return jsonify(tuplet_list) + final_result = tuplet_list # Generate trigger data # Skip some of the properties that are time-consuming to generate via triggers # Also skip next_revision_uuid and previous_revision_uuid for Dataset to avoid additional # checks when the target Dataset is public but the revisions are not public - properties_to_skip = [ - # Properties to skip for Sample - 'direct_ancestor', - # Properties to skip for Dataset - 'direct_ancestors', - 'collections', - 'upload', - 'title', - 'next_revision_uuid', - 'previous_revision_uuid', - 'associated_collection', - 'creation_action', - 'local_directory_rel_path' - ] + else: + properties_to_skip = [ + # Properties to skip for 
Sample + 'direct_ancestor', + # Properties to skip for Dataset + 'direct_ancestors', + 'collections', + 'upload', + 'title', + 'next_revision_uuid', + 'previous_revision_uuid', + 'associated_collection', + 'creation_action', + 'local_directory_rel_path' + ] - complete_entities_list = schema_manager.get_complete_entities_list(request.args, token, tuplet_list, properties_to_skip) - # Final result after normalization - final_result = schema_manager.normalize_entities_list_for_response(complete_entities_list) - filtered_final_result = [] - for tuplet in final_result: - tuple_entity_type = tuplet.get('entity_type') - fields_to_exclude = schema_manager.get_fields_to_exclude(tuple_entity_type) - if public_entity and not user_in_hubmap_read_group(request): - filtered_tuplet = schema_manager.exclude_properties_from_response(fields_to_exclude, tuplet) - filtered_final_result.append(filtered_tuplet) - else: - filtered_final_result.append(tuplet) - final_result = filtered_final_result + complete_entities_list = schema_manager.get_complete_entities_list(request.args, token, tuplet_list, properties_to_skip) + # Final result after normalization + output = schema_manager.normalize_entities_list_for_response(complete_entities_list) + filtered_final_result = [] + for tuplet in output: + tuple_entity_type = tuplet.get('entity_type') + fields_to_exclude = schema_manager.get_fields_to_exclude(tuple_entity_type) + if public_entity and not user_in_hubmap_read_group(request): + filtered_tuplet = schema_manager.exclude_properties_from_response(fields_to_exclude, tuplet) + filtered_final_result.append(filtered_tuplet) + else: + filtered_final_result.append(tuplet) + final_result = filtered_final_result + + # Check the size of what is to be returned through the AWS Gateway, and replace it with + # a response that links to an Object in the AWS S3 Bucket, if appropriate. 
+ try: + resp_body = json.dumps(final_result).encode('utf-8') + s3_url = anS3Worker.stash_response_body_if_big(resp_body) + if s3_url is not None: + return Response(response=s3_url + , status=303) # See Other + # The HuBMAP Commons S3Worker will return None for a URL when the response body is + # smaller than it is configured to store, so the response should be returned through + # the AWS Gateway + except Exception as s3exception: + logger.error(f"Error using anS3Worker to handle len(resp_body)=" + f"{len(resp_body)}.") + logger.error(s3exception, exc_info=True) + return Response(response=f"Unexpected error storing large results in S3. See logs." + , status=500) + + # Return a regular response through the AWS Gateway return jsonify(final_result) From 984724d7665c9c7b86c9164436f7c40dbcbbf685 Mon Sep 17 00:00:00 2001 From: DerekFurstPitt Date: Mon, 27 Oct 2025 17:35:37 -0400 Subject: [PATCH 2/3] uncommented locked_entity_update_override_key from testing --- src/app.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/app.py b/src/app.py index 51a4a32c..a4a81c8f 100644 --- a/src/app.py +++ b/src/app.py @@ -95,7 +95,7 @@ # Read the secret key which may be submitted in HTTP Request Headers to override the lockout of # updates to entities with characteristics prohibiting their modification. 
-# LOCKED_ENTITY_UPDATE_OVERRIDE_KEY = app.config['LOCKED_ENTITY_UPDATE_OVERRIDE_KEY'] +LOCKED_ENTITY_UPDATE_OVERRIDE_KEY = app.config['LOCKED_ENTITY_UPDATE_OVERRIDE_KEY'] # Suppress InsecureRequestWarning warning when requesting status on https with ssl cert verify disabled requests.packages.urllib3.disable_warnings(category = InsecureRequestWarning) From 654a4a1eb6ea877f027bbff5a1449207f937bac9 Mon Sep 17 00:00:00 2001 From: DerekFurstPitt Date: Thu, 30 Oct 2025 11:32:28 -0400 Subject: [PATCH 3/3] split out the try/except handling of s3 responses into its own helper function and replaced its usage in the 8 or so places it occurred --- src/app.py | 178 +++++++++++++++++------------------------------------ 1 file changed, 56 insertions(+), 122 deletions(-) diff --git a/src/app.py b/src/app.py index a4a81c8f..87c65fd4 100644 --- a/src/app.py +++ b/src/app.py @@ -902,21 +902,12 @@ def get_entity_by_id(id): if public_entity and not user_in_hubmap_read_group(request): final_result = schema_manager.exclude_properties_from_response(fields_to_exclude, final_result) - try: - resp_body = json.dumps(final_result).encode('utf-8') - s3_url = anS3Worker.stash_response_body_if_big(resp_body) - if s3_url is not None: - return Response(response=s3_url - , status=303) # See Other - # The HuBMAP Commons S3Worker will return None for a URL when the response body is - # smaller than it is configured to store, so the response should be returned through - # the AWS Gateway - except Exception as s3exception: - logger.error(f"Error using anS3Worker to handle len(resp_body)=" - f"{len(resp_body)}.") - logger.error(s3exception, exc_info=True) - return Response(response=f"Unexpected error storing large results in S3. See logs." - , status=500) + # Check the size of what is to be returned through the AWS Gateway, and replace it with + # a response that links to an Object in the AWS S3 Bucket, if appropriate. 
+ resp_body = json.dumps(final_result).encode('utf-8') + try_resp = try_stash_response_body(resp_body) + if try_resp is not None: + return try_resp # Return a regular response through the AWS Gateway return jsonify(final_result) @@ -1730,21 +1721,10 @@ def get_ancestors(id): # Check the size of what is to be returned through the AWS Gateway, and replace it with # a response that links to an Object in the AWS S3 Bucket, if appropriate. - try: - resp_body = json.dumps(final_result).encode('utf-8') - s3_url = anS3Worker.stash_response_body_if_big(resp_body) - if s3_url is not None: - return Response(response=s3_url - , status=303) # See Other - # The HuBMAP Commons S3Worker will return None for a URL when the response body is - # smaller than it is configured to store, so the response should be returned through - # the AWS Gateway - except Exception as s3exception: - logger.error(f"Error using anS3Worker to handle len(resp_body)=" - f"{len(resp_body)}.") - logger.error(s3exception, exc_info=True) - return Response(response=f"Unexpected error storing large results in S3. See logs." - , status=500) + resp_body = json.dumps(final_result).encode('utf-8') + try_resp = try_stash_response_body(resp_body) + if try_resp is not None: + return try_resp # Return a regular response through the AWS Gateway return jsonify(final_result) @@ -1825,22 +1805,11 @@ def get_descendants(id): # Check the size of what is to be returned through the AWS Gateway, and replace it with # a response that links to an Object in the AWS S3 Bucket, if appropriate. 
- try: - resp_body = json.dumps(final_result).encode('utf-8') - s3_url = anS3Worker.stash_response_body_if_big(resp_body) - if s3_url is not None: - return Response(response=s3_url - , status=303) # See Other - # The HuBMAP Commons S3Worker will return None for a URL when the response body is - # smaller than it is configured to store, so the response should be returned through - # the AWS Gateway - except Exception as s3exception: - logger.error(f"Error using anS3Worker to handle len(resp_body)=" - f"{len(resp_body)}.") - logger.error(s3exception, exc_info=True) - return Response(response=f"Unexpected error storing large results in S3. See logs." - , status=500) - + resp_body = json.dumps(final_result).encode('utf-8') + try_resp = try_stash_response_body(resp_body) + if try_resp is not None: + return try_resp + # Return a regular response through the AWS Gateway return jsonify(final_result) @@ -1959,22 +1928,11 @@ def get_parents(id): # Check the size of what is to be returned through the AWS Gateway, and replace it with # a response that links to an Object in the AWS S3 Bucket, if appropriate. - try: - resp_body = json.dumps(final_result).encode('utf-8') - s3_url = anS3Worker.stash_response_body_if_big(resp_body) - if s3_url is not None: - return Response(response=s3_url - , status=303) # See Other - # The HuBMAP Commons S3Worker will return None for a URL when the response body is - # smaller than it is configured to store, so the response should be returned through - # the AWS Gateway - except Exception as s3exception: - logger.error(f"Error using anS3Worker to handle len(resp_body)=" - f"{len(resp_body)}.") - logger.error(s3exception, exc_info=True) - return Response(response=f"Unexpected error storing large results in S3. See logs." 
- , status=500) - + resp_body = json.dumps(final_result).encode('utf-8') + try_resp = try_stash_response_body(resp_body) + if try_resp is not None: + return try_resp + # Return a regular response through the AWS Gateway return jsonify(final_result) @@ -2054,21 +2012,10 @@ def get_children(id): # Check the size of what is to be returned through the AWS Gateway, and replace it with # a response that links to an Object in the AWS S3 Bucket, if appropriate. - try: - resp_body = json.dumps(final_result).encode('utf-8') - s3_url = anS3Worker.stash_response_body_if_big(resp_body) - if s3_url is not None: - return Response(response=s3_url - , status=303) # See Other - # The HuBMAP Commons S3Worker will return None for a URL when the response body is - # smaller than it is configured to store, so the response should be returned through - # the AWS Gateway - except Exception as s3exception: - logger.error(f"Error using anS3Worker to handle len(resp_body)=" - f"{len(resp_body)}.") - logger.error(s3exception, exc_info=True) - return Response(response=f"Unexpected error storing large results in S3. See logs." - , status=500) + resp_body = json.dumps(final_result).encode('utf-8') + try_resp = try_stash_response_body(resp_body) + if try_resp is not None: + return try_resp # Return a regular response through the AWS Gateway return jsonify(final_result) @@ -2202,21 +2149,10 @@ def get_siblings(id): # Check the size of what is to be returned through the AWS Gateway, and replace it with # a response that links to an Object in the AWS S3 Bucket, if appropriate. 
- try: - resp_body = json.dumps(final_result).encode('utf-8') - s3_url = anS3Worker.stash_response_body_if_big(resp_body) - if s3_url is not None: - return Response(response=s3_url - , status=303) # See Other - # The HuBMAP Commons S3Worker will return None for a URL when the response body is - # smaller than it is configured to store, so the response should be returned through - # the AWS Gateway - except Exception as s3exception: - logger.error(f"Error using anS3Worker to handle len(resp_body)=" - f"{len(resp_body)}.") - logger.error(s3exception, exc_info=True) - return Response(response=f"Unexpected error storing large results in S3. See logs." - , status=500) + resp_body = json.dumps(final_result).encode('utf-8') + try_resp = try_stash_response_body(resp_body) + if try_resp is not None: + return try_resp # Return a regular response through the AWS Gateway return jsonify(final_result) @@ -2339,21 +2275,10 @@ def get_tuplets(id): # Check the size of what is to be returned through the AWS Gateway, and replace it with # a response that links to an Object in the AWS S3 Bucket, if appropriate. - try: - resp_body = json.dumps(final_result).encode('utf-8') - s3_url = anS3Worker.stash_response_body_if_big(resp_body) - if s3_url is not None: - return Response(response=s3_url - , status=303) # See Other - # The HuBMAP Commons S3Worker will return None for a URL when the response body is - # smaller than it is configured to store, so the response should be returned through - # the AWS Gateway - except Exception as s3exception: - logger.error(f"Error using anS3Worker to handle len(resp_body)=" - f"{len(resp_body)}.") - logger.error(s3exception, exc_info=True) - return Response(response=f"Unexpected error storing large results in S3. See logs." 
- , status=500) + resp_body = json.dumps(final_result).encode('utf-8') + try_resp = try_stash_response_body(resp_body) + if try_resp is not None: + return try_resp # Return a regular response through the AWS Gateway return jsonify(final_result) @@ -3846,21 +3771,11 @@ def get_prov_info_for_dataset(id): writer.writerows(dataset_prov_list) new_tsv_file.seek(0) resp_body = new_tsv_file.read() - - # Check the size of what is to be returned through the AWS Gateway, and replace it with - # a response that links to an Object in the AWS S3 Bucket, if appropriate. - try: - s3_url = anS3Worker.stash_response_body_if_big(resp_body) - if s3_url is not None: - return Response(response=s3_url - , status=303) # See Other - except Exception as s3exception: - logger.error(f"Error using anS3Worker to handle len(resp_body)=" - f"{len(resp_body)}.") - logger.error(s3exception, exc_info=True) - return Response(response=f"Unexpected error storing large results in S3. See logs." - , status=500) - + + try_resp = try_stash_response_body(resp_body) + if try_resp is not None: + return try_resp + # Return a regular response through the AWS Gateway if return_json: return jsonify(dataset_prov_list[0]) @@ -4590,6 +4505,25 @@ def validate_token_if_auth_header_exists(request): unauthorized_error(user_info.get_data().decode()) +def try_stash_response_body(resp_body): + try: + s3_url = anS3Worker.stash_response_body_if_big(resp_body) + if s3_url is not None: + return Response(response=s3_url + , status=303) # See Other + # The HuBMAP Commons S3Worker will return None for a URL when the response body is + # smaller than it is configured to store, so the response should be returned through + # the AWS Gateway + except Exception as s3exception: + logger.error(f"Error using anS3Worker to handle len(resp_body)=" + f"{len(resp_body)}.") + logger.error(s3exception, exc_info=True) + return Response(response=f"Unexpected error storing large results in S3. See logs." 
+ , status=500) + return None + + + """ Get the token for internal use only