From 68829e2eb9bdf400c19c84a27fab6f028d2aabaf Mon Sep 17 00:00:00 2001 From: yuanzhou Date: Tue, 30 Sep 2025 18:59:35 -0400 Subject: [PATCH 1/9] Bump version to 2.6.1 --- VERSION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/VERSION b/VERSION index e70b4523..6a6a3d8e 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -2.6.0 +2.6.1 From bd11d07655f00f7f11e2ce693bb14ff40edc8201 Mon Sep 17 00:00:00 2001 From: yuanzhou Date: Wed, 1 Oct 2025 19:52:13 -0400 Subject: [PATCH 2/9] Initial implementation of property exclusion from URL --- src/app.py | 111 ++++++++++++++++++++-------- src/schema/schema_manager.py | 115 ++++++++++++++++++++++++++++- src/schema/schema_neo4j_queries.py | 14 +++- src/schema/schema_triggers.py | 23 +++++- 4 files changed, 227 insertions(+), 36 deletions(-) diff --git a/src/app.py b/src/app.py index f118f000..d4cb9f19 100644 --- a/src/app.py +++ b/src/app.py @@ -774,11 +774,82 @@ def get_entity_by_id(id): # Otherwise query against uuid-api and neo4j to get the entity dict if the id exists entity_dict = query_target_entity(id, token) normalized_entity_type = entity_dict['entity_type'] + + # These are the top-level fields and nested fields defined in the schema yaml fields_to_exclude = schema_manager.get_fields_to_exclude(normalized_entity_type) + + + ###### + + # Only support defined query string parameters for filtering purposes + # 'property' was initially introduced to return a single field + # 'exclude' is newly added as a short-term workaround otherwise AWS API Gateway + # returns 500 error when the large paylod >10 MB + # When both 'property' and 'exclude' are specified in the URL, 'property' dominates + # since the final result is a single field value - Zhou 10/1/2025 + supported_qs_params = ['property', 'exclude'] + + triggered_properties_to_skip = [] + neo4j_properties_to_skip = [] + + if bool(request.args): + # First make sure the user provided query string params are valid + for param in request.args: + if param not in 
supported_qs_params: + bad_request_error(f"Only the following URL query string parameters (case-sensitive) are supported: {COMMA_SEPARATOR.join(supported_qs_params)}") + + # Return a single property key and value using ?property= + if 'property' in request.args: + single_property_key = request.args.get('property') + + # Single property key that is immediately avaibale in Neo4j without running any triggers + # The `data_access_level` property is available in all entities Donor/Sample/Dataset + # and this filter is being used by gateway to check the data_access_level for file assets + # The `status` property is only available in Dataset and being used by search-api for revision + supported_property_keys = ['data_access_level', 'status'] + + # Validate the target property + if single_property_key not in supported_property_keys: + bad_request_error(f"Only the following property keys are supported in the query string: {COMMA_SEPARATOR.join(supported_property_keys)}") + + if single_property_key == 'status' and \ + not schema_manager.entity_type_instanceof(normalized_entity_type, 'Dataset'): + bad_request_error(f"Only Dataset or Publication supports 'status' property key in the query string") + + # Response with the property value directly + # Don't use jsonify() on string value + return entity_dict[single_property_key] + + # Exclude fields—either top-level or nested—specified by the user via the URL query string, + # using the format `?exclude=a.b,a.c,x`, where: + # - `x` is a top-level property + # - `a.b` and `a.c` are nested fields (dot-notated) + # + # Note: This is not the most efficient approach, as exclusion is performed after the Neo4j query + # rather than within it. However, it leverages the existing `exclude_properties_from_response()` + # function for simplicity and maintainability. 
- Zhou 10/1/2025 + if 'exclude' in request.args: + properties_to_exclude_str = request.args.get('exclude') + + if properties_to_exclude_str is not None: + flat_list = [item.strip() for item in properties_to_exclude_str.split(",")] + + logger.info(f"User specified flat_list: {flat_list}") + + # Determine which properties to exclude from triggers and which to exclude directly from the resulting Neo4j `entity_dict` + triggered_properties_to_skip, neo4j_properties_to_skip = schema_manager.determine_property_exclusion_type(normalized_entity_type, flat_list) + else: + bad_request_error("Must specify the properties to exclude in the form of exclude=[a, b, c, d.e]") + + ###### + + # Get the generated complete entity result from cache if exists # Otherwise re-generate on the fly - complete_dict = schema_manager.get_complete_entity_result(request, token, entity_dict) + # NOTE: top-level properties in `triggered_properties_to_skip` will skip the trigger methods + # Nested properties like `direct_ancestors.files` will be handled by the trigger method - Zhou 10/1/2025 + complete_dict = schema_manager.get_complete_entity_result(request, token, entity_dict, triggered_properties_to_skip) # Determine if the entity is publicly visible base on its data, only. # To verify if a Collection is public, it is necessary to have its Datasets, which @@ -813,37 +884,19 @@ def get_entity_by_id(id): forbidden_error(f"The requested {normalized_entity_type} has non-public data." 
f" A Globus token with access permission is required.") + ######## + # Remove the top-level properties that are directly available in the resulting Neo4j `entity_dict` + for item in neo4j_properties_to_skip: + complete_dict.pop(item) + # Also normalize the result based on schema final_result = schema_manager.normalize_entity_result_for_response(complete_dict) - # Result filtering based on query string - # The `data_access_level` property is available in all entities Donor/Sample/Dataset - # and this filter is being used by gateway to check the data_access_level for file assets - # The `status` property is only available in Dataset and being used by search-api for revision - result_filtering_accepted_property_keys = ['data_access_level', 'status'] - - if bool(request.args): - property_key = request.args.get('property') - - if property_key is not None: - # Validate the target property - if property_key not in result_filtering_accepted_property_keys: - bad_request_error(f"Only the following property keys are supported in the query string: {COMMA_SEPARATOR.join(result_filtering_accepted_property_keys)}") - - if property_key == 'status' and \ - not schema_manager.entity_type_instanceof(normalized_entity_type, 'Dataset'): - bad_request_error(f"Only Dataset or Publication supports 'status' property key in the query string") - - # Response with the property value directly - # Don't use jsonify() on string value - return complete_dict[property_key] - else: - bad_request_error("The specified query string is not supported. 
Use '?property=' to filter the result") - else: - # Response with the dict - if public_entity and not user_in_hubmap_read_group(request): - final_result = schema_manager.exclude_properties_from_response(fields_to_exclude, final_result) - return jsonify(final_result) + # Response with the dict + if public_entity and not user_in_hubmap_read_group(request): + final_result = schema_manager.exclude_properties_from_response(fields_to_exclude, final_result) + + return jsonify(final_result) """ diff --git a/src/schema/schema_manager.py b/src/schema/schema_manager.py index 7be00fb3..b49e372b 100644 --- a/src/schema/schema_manager.py +++ b/src/schema/schema_manager.py @@ -305,7 +305,7 @@ def get_fields_to_exclude(normalized_class=None): Parameters ---------- excluded_fields : list - A list of the fields to be excluded + A JSON list of the fields to be excluded, may have nested fields output_dict : dictionary A dictionary representing the data to be modified @@ -349,6 +349,116 @@ def delete_nested_field(data, nested_path): return output_dict +""" +Transform a flat list of dot-notated strings into a hybrid list that: +- keeps plain strings as-is +- converts entries with dot-notation (like 'direct_ancestors.files') into a dictionary, grouping by the prefix + +Example: ['a.b', 'a.c', 'x'] -> ['x', {'a': ['b', 'c']}] + +Used by `GET /entities/?exclude=a.b, a.c, x` to build a JSON list +that can be futher processed by `exclude_properties_from_response()`. + +Parameters +---------- +flat_list : list + A flat list of strings, dot-notated strings are optional and can be used to indicate nested fields + Example: ['a.b', 'a.c', 'x'] + +Returns +------- +list + A list mixing strings and grouped dicts +""" +def flatten_and_group_dot_notation_fields(flat_list): + output_list = [] + grouped_dict = {} + + for item in flat_list: + if '.' 
in item: + prefix, field = item.split('.', 1) + grouped_dict.setdefault(prefix, []).append(field) + else: + output_list.append(item) + + # Add grouped items as dictionaries + for prefix, fields in grouped_dict.items(): + output_list.append({prefix: fields}) + + return output_list + + +""" +Transform a flat list of dot-notated strings into a hybrid list that: +- keeps plain strings as-is +- converts entries with dot-notation (like 'direct_ancestors.files') into a dictionary, grouping by the prefix + +Example: ['a.b', 'a.c', 'x'] -> ['x', {'a': ['b', 'c']}] + +Used by `GET /entities/?exclude=a.b, a.c, x` to build a JSON list +that can be futher processed by `exclude_properties_from_response()`. + +Parameters +---------- +flat_list : list + A flat list of strings, dot-notated strings are optional and can be used to indicate nested fields + Example: ['a.b', 'a.c', 'x'] + +Returns +------- +list + A list mixing strings and grouped dicts +""" +def determine_property_exclusion_type(normalized_entity_type, flat_list): + global _schema + + triggered_properties_to_skip = [] + neo4j_properties_to_skip = [] + + properties = _schema['ENTITIES'][normalized_entity_type]['properties'] + + top_level_list = [] + + for item in flat_list: + # Only target at properties don't use the dot notation + if '.' not in item: + top_level_list.append(item) + + + for item in top_level_list: + if item in properties and 'on_read_trigger' in properties[item]: + triggered_properties_to_skip.append(item) + else: + neo4j_properties_to_skip.append(item) + + + return triggered_properties_to_skip, neo4j_properties_to_skip + + +""" +Use the Flask request.args MultiDict to see if 'reindex' is a URL parameter passed in with the +request and if it indicates reindexing should be supressed. Default to reindexing in all other cases. 
+ +Parameters +---------- +request: Flask request object + The instance of Flask request passed in from application request + +Returns +------- +bool +""" +def get_fields_to_exclude_from_query_string(request): + properties_to_exclude_str = request.args.get('exclude') + + properties_to_exclude_list = [item.strip() for item in properties_to_exclude_str.split(",")] + + # Transform the flat JSON string list to a Python list mixing strings and grouped dicts + prepared_list = flatten_and_group_dot_notation_fields(properties_to_exclude_list) + + return properties_to_exclude_list + + """ Generating triggered data based on the target events and methods @@ -396,6 +506,8 @@ def generate_triggered_data(trigger_type: TriggerTypeEnum, normalized_class, req # decides the ordering of which trigger method gets to run first properties = schema_section[normalized_class]['properties'] + logger.info(f"Skipping triggered data generation for the following properties: {properties_to_skip}") + # Set each property value and put all resulting data into a dictionary for: # before_create_trigger|before_update_trigger|on_read_trigger # No property value to be set for: after_create_trigger|after_update_trigger @@ -2001,7 +2113,6 @@ def convert_str_literal(data_str): data = ast.literal_eval(data_str) if isinstance(data, (list, dict)): - logger.info(f"The input string literal has been converted to {type(data)} successfully") return data else: logger.info(f"The input string literal is not list or dict after evaluation, return the original string input") diff --git a/src/schema/schema_neo4j_queries.py b/src/schema/schema_neo4j_queries.py index 738ce81d..575ec6a8 100644 --- a/src/schema/schema_neo4j_queries.py +++ b/src/schema/schema_neo4j_queries.py @@ -572,7 +572,7 @@ def get_uploads(neo4j_driver, uuid, property_key = None): list A unique list of uuids of source entities """ -def get_dataset_direct_ancestors(neo4j_driver, uuid, property_key = None): +def get_dataset_direct_ancestors(neo4j_driver, 
uuid, property_key = None, properties_to_exclude = []): results = [] if property_key: @@ -580,9 +580,15 @@ def get_dataset_direct_ancestors(neo4j_driver, uuid, property_key = None): f"WHERE t.uuid = '{uuid}' " f"RETURN apoc.coll.toSet(COLLECT(s.{property_key})) AS {record_field_name}") else: - query = (f"MATCH (s:Entity)-[:ACTIVITY_INPUT]->(a:Activity)-[:ACTIVITY_OUTPUT]->(t:Dataset) " - f"WHERE t.uuid = '{uuid}' " - f"RETURN apoc.coll.toSet(COLLECT(s)) AS {record_field_name}") + if properties_to_exclude: + query = (f"MATCH (s:Entity)-[:ACTIVITY_INPUT]->(a:Activity)-[:ACTIVITY_OUTPUT]->(t:Dataset) " + f"WHERE t.uuid = '{uuid}' " + f"WITH apoc.coll.toSet(COLLECT(s)) AS uniqueDirectAncestors " + f"RETURN [a IN uniqueDirectAncestors | apoc.create.vNode(labels(a), apoc.map.removeKeys(properties(a), {properties_to_exclude}))] AS {record_field_name}") + else: + query = (f"MATCH (s:Entity)-[:ACTIVITY_INPUT]->(a:Activity)-[:ACTIVITY_OUTPUT]->(t:Dataset) " + f"WHERE t.uuid = '{uuid}' " + f"RETURN apoc.coll.toSet(COLLECT(s)) AS {record_field_name}") logger.info("======get_dataset_direct_ancestors() query======") logger.debug(query) diff --git a/src/schema/schema_triggers.py b/src/schema/schema_triggers.py index 4445718e..ac3020a8 100644 --- a/src/schema/schema_triggers.py +++ b/src/schema/schema_triggers.py @@ -987,7 +987,28 @@ def get_dataset_direct_ancestors(property_key, normalized_type, request, user_to logger.info(f"Executing 'get_dataset_direct_ancestors()' trigger method on uuid: {existing_data_dict['uuid']}") - direct_ancestors_list = schema_neo4j_queries.get_dataset_direct_ancestors(schema_manager.get_neo4j_driver_instance(), existing_data_dict['uuid']) + # Get all the mixed fields either top-level or nested from the original query string in request URL + all_properties_to_exclude = schema_manager.get_fields_to_exclude_from_query_string(request) + + logger.info(f"all_properties_to_exclude: {all_properties_to_exclude}") + + # Find the specific sub list, depth is 
limited to 2 + # For example, direct_ancestors.files is supported, but direct_ancestors.metadata.acquisition_id is not - Zhou 10/1/2025 + neo4j_properties_to_exclude = [] + + parsed_fields = schema_manager.flatten_and_group_dot_notation_fields(all_properties_to_exclude) + + for item in parsed_fields: + # Find the depth 2 properties (top-level to this triggered entity) + if isinstance(item, dict) and property_key in item: + neo4j_properties_to_exclude = item[property_key] + + logger.info(f"neo4j_properties_to_exclude: {neo4j_properties_to_exclude}") + + # Stop after finding the first match + break + + direct_ancestors_list = schema_neo4j_queries.get_dataset_direct_ancestors(schema_manager.get_neo4j_driver_instance(), existing_data_dict['uuid'], property_key = None, properties_to_exclude = neo4j_properties_to_exclude) # Get rid of the entity node properties that are not defined in the yaml schema # as well as the ones defined as `exposed: false` in the yaml schema From 671d3e9ce373b5a35ae6356b35e5798ade239646 Mon Sep 17 00:00:00 2001 From: yuanzhou Date: Wed, 1 Oct 2025 23:24:58 -0400 Subject: [PATCH 3/9] Handle exclusion by types --- src/app.py | 59 ++++++++++--------- src/schema/schema_manager.py | 106 ++++++++++++++++++++-------------- src/schema/schema_triggers.py | 21 ++++--- 3 files changed, 104 insertions(+), 82 deletions(-) diff --git a/src/app.py b/src/app.py index d4cb9f19..e7854975 100644 --- a/src/app.py +++ b/src/app.py @@ -778,20 +778,20 @@ def get_entity_by_id(id): # These are the top-level fields and nested fields defined in the schema yaml fields_to_exclude = schema_manager.get_fields_to_exclude(normalized_entity_type) - - - ###### - # Only support defined query string parameters for filtering purposes - # 'property' was initially introduced to return a single field - # 'exclude' is newly added as a short-term workaround otherwise AWS API Gateway - # returns 500 error when the large paylod >10 MB + # 'property' was initially introduced to return one 
of the single fields ['data_access_level', 'status'] + # 'exclude' is newly added to reduce the large paylod caused by certain fields (`direct_ancestors.files` for instance) # When both 'property' and 'exclude' are specified in the URL, 'property' dominates # since the final result is a single field value - Zhou 10/1/2025 supported_qs_params = ['property', 'exclude'] - triggered_properties_to_skip = [] - neo4j_properties_to_skip = [] + # There are three types of properties that can be excluded from the GET response + # - properties generated by trigger methods + # - properties returned as part of Neo4j node properties + # - properties returned by Neo4j but nested + triggered_top_properties_to_skip = [] + neo4j_top_properties_to_skip = [] + neo4j_nested_properties_to_skip = [] if bool(request.args): # First make sure the user provided query string params are valid @@ -823,33 +823,26 @@ def get_entity_by_id(id): # Exclude fields—either top-level or nested—specified by the user via the URL query string, # using the format `?exclude=a.b,a.c,x`, where: - # - `x` is a top-level property - # - `a.b` and `a.c` are nested fields (dot-notated) + # - `x` is a top-level property of the target entity + # - `a.b` and `a.c` are nested fields in a dot-notated form (b and c could be from a different entity type) # # Note: This is not the most efficient approach, as exclusion is performed after the Neo4j query # rather than within it. However, it leverages the existing `exclude_properties_from_response()` # function for simplicity and maintainability. 
- Zhou 10/1/2025 - if 'exclude' in request.args: - properties_to_exclude_str = request.args.get('exclude') - - if properties_to_exclude_str is not None: - flat_list = [item.strip() for item in properties_to_exclude_str.split(",")] - - logger.info(f"User specified flat_list: {flat_list}") - - # Determine which properties to exclude from triggers and which to exclude directly from the resulting Neo4j `entity_dict` - triggered_properties_to_skip, neo4j_properties_to_skip = schema_manager.determine_property_exclusion_type(normalized_entity_type, flat_list) - else: - bad_request_error("Must specify the properties to exclude in the form of exclude=[a, b, c, d.e]") - - ###### - + try: + all_properties_to_exclude = schema_manager.get_all_fields_to_exclude_from_query_string(request) + + # Determine which top-level properties to exclude from triggers and which to exclude directly from the resulting Neo4j `entity_dict` + # Also get nested properties that are directly returned from Neo4j, which will be handled differently + triggered_top_properties_to_skip, neo4j_top_properties_to_skip, neo4j_nested_properties_to_skip = schema_manager.determine_property_exclusion_type(normalized_entity_type, all_properties_to_exclude) + except Exception as e: + bad_request_error(e) # Get the generated complete entity result from cache if exists # Otherwise re-generate on the fly - # NOTE: top-level properties in `triggered_properties_to_skip` will skip the trigger methods + # NOTE: top-level properties in `triggered_top_properties_to_skip` will skip the trigger methods # Nested properties like `direct_ancestors.files` will be handled by the trigger method - Zhou 10/1/2025 - complete_dict = schema_manager.get_complete_entity_result(request, token, entity_dict, triggered_properties_to_skip) + complete_dict = schema_manager.get_complete_entity_result(request, token, entity_dict, triggered_top_properties_to_skip) # Determine if the entity is publicly visible base on its data, only. 
# To verify if a Collection is public, it is necessary to have its Datasets, which @@ -884,14 +877,20 @@ def get_entity_by_id(id): forbidden_error(f"The requested {normalized_entity_type} has non-public data." f" A Globus token with access permission is required.") - ######## # Remove the top-level properties that are directly available in the resulting Neo4j `entity_dict` - for item in neo4j_properties_to_skip: + # Due to the use of entity cache from `query_target_entity()`, we don't want to exclude the `neo4j_top_properties_to_skip` + # from actual Neo4j query. And it's not s performance concern neither. - Zhou 10/1/2025 + for item in neo4j_top_properties_to_skip: complete_dict.pop(item) # Also normalize the result based on schema final_result = schema_manager.normalize_entity_result_for_response(complete_dict) + # In addition, there may be nested fields like `ingest_metadata.dag_provenance_list` (for Dataset) + # where that `ingest_metadata` is an actual Neo4j node property containing `dag_provenance_list` + # For such cases, we can't handle via Neo4j query. Instead, exclude at Python app level. 
- Zhou 10/1/2025 + final_result = schema_manager.exclude_properties_from_response(neo4j_nested_properties_to_skip, final_result) + # Response with the dict if public_entity and not user_in_hubmap_read_group(request): final_result = schema_manager.exclude_properties_from_response(fields_to_exclude, final_result) diff --git a/src/schema/schema_manager.py b/src/schema/schema_manager.py index b49e372b..fc9de819 100644 --- a/src/schema/schema_manager.py +++ b/src/schema/schema_manager.py @@ -349,6 +349,40 @@ def delete_nested_field(data, nested_path): return output_dict +""" +Use the Flask request.args MultiDict to see if 'exclude' is a URL parameter passed in with the +request and parse the comma-separated properties to be excluded from final response + +For now, only support one dot for nested fields (depth 2) + +Parameters +---------- +request: Flask request object + The instance of Flask request passed in from application request + +Returns +------- +list + A flat list of strings containing top-level and/or nested dot-notated properties + Example: ['a.b', 'a.c', 'x'] +""" +def get_all_fields_to_exclude_from_query_string(request): + all_properties_to_exclude = [] + + if 'exclude' in request.args: + # Treat query string value as case-insensitive + properties_to_exclude_str = request.args.get('exclude').lower() + + if properties_to_exclude_str is not None: + all_properties_to_exclude = [item.strip() for item in properties_to_exclude_str.split(",")] + + logger.info(f"User specified properties to exclude in request URL: {all_properties_to_exclude}") + else: + raise Exception(f"The value of the 'exclude' query string arameter can not be empty and must be similar to the form of 'a, b, c, d.e, ...' 
(case-insensitive).") + + return all_properties_to_exclude + + """ Transform a flat list of dot-notated strings into a hybrid list that: - keeps plain strings as-is @@ -368,13 +402,14 @@ def delete_nested_field(data, nested_path): Returns ------- list - A list mixing strings and grouped dicts + A list mixing strings and grouped dicts, like ['x', {'a': ['b', 'c']}] """ -def flatten_and_group_dot_notation_fields(flat_list): +def group_dot_notation_fields(flat_list): output_list = [] grouped_dict = {} for item in flat_list: + # For now, only support one dot for nested fields (depth 2) if '.' in item: prefix, field = item.split('.', 1) grouped_dict.setdefault(prefix, []).append(field) @@ -389,17 +424,17 @@ def flatten_and_group_dot_notation_fields(flat_list): """ -Transform a flat list of dot-notated strings into a hybrid list that: -- keeps plain strings as-is -- converts entries with dot-notation (like 'direct_ancestors.files') into a dictionary, grouping by the prefix - -Example: ['a.b', 'a.c', 'x'] -> ['x', {'a': ['b', 'c']}] +Group properties by exclusion type -Used by `GET /entities/?exclude=a.b, a.c, x` to build a JSON list -that can be futher processed by `exclude_properties_from_response()`. 
+Example: ['a.b', 'a.c', 'x', 'y'] where +- x and y are top-level properties +- x is Neo4j node property, and y is generated via trigger method +- a.b and a.c are nested properties while a is a top-level property of either type Parameters ---------- +normalized_entity_type : str + One of the normalized entity types: Dataset, Collection, Sample, Donor, Upload, Publication flat_list : list A flat list of strings, dot-notated strings are optional and can be used to indicate nested fields Example: ['a.b', 'a.c', 'x'] @@ -407,56 +442,39 @@ def flatten_and_group_dot_notation_fields(flat_list): Returns ------- list - A list mixing strings and grouped dicts + Three lists - one for triggered properties and one for Neo4j node properties + Example for Dataset: ['direct_ancestors', 'title'], ['dataset_type'], ['ingest_metadata.dag_provenance_list'] """ def determine_property_exclusion_type(normalized_entity_type, flat_list): global _schema - triggered_properties_to_skip = [] - neo4j_properties_to_skip = [] - - properties = _schema['ENTITIES'][normalized_entity_type]['properties'] - + triggered_top_properties_to_skip = [] + neo4j_top_properties_to_skip = [] + neo4j_nested_properties_to_skip =[] top_level_list = [] + second_level_list = [] + properties = _schema['ENTITIES'][normalized_entity_type]['properties'] + # First find the top-level properties for item in flat_list: - # Only target at properties don't use the dot notation if '.' 
not in item: top_level_list.append(item) + else: + second_level_list.append(item) - + # Only care about the properties defined in schema yaml for item in top_level_list: if item in properties and 'on_read_trigger' in properties[item]: - triggered_properties_to_skip.append(item) + triggered_top_properties_to_skip.append(item) else: - neo4j_properties_to_skip.append(item) - - - return triggered_properties_to_skip, neo4j_properties_to_skip - - -""" -Use the Flask request.args MultiDict to see if 'reindex' is a URL parameter passed in with the -request and if it indicates reindexing should be supressed. Default to reindexing in all other cases. - -Parameters ----------- -request: Flask request object - The instance of Flask request passed in from application request - -Returns -------- -bool -""" -def get_fields_to_exclude_from_query_string(request): - properties_to_exclude_str = request.args.get('exclude') - - properties_to_exclude_list = [item.strip() for item in properties_to_exclude_str.split(",")] + neo4j_top_properties_to_skip.append(item) - # Transform the flat JSON string list to a Python list mixing strings and grouped dicts - prepared_list = flatten_and_group_dot_notation_fields(properties_to_exclude_list) + # # In addition, there may be nested fields like `ingest_metadata.dag_provenance_list` (for Dataset) + # where that `ingest_metadata` is an actual Neo4j node property containing `dag_provenance_list` + # For such cases, exclude via `exclude_properties_from_response()` at Python app level. 
+ neo4j_nested_properties_to_skip = group_dot_notation_fields(second_level_list) - return properties_to_exclude_list + return triggered_top_properties_to_skip, neo4j_top_properties_to_skip, neo4j_nested_properties_to_skip """ diff --git a/src/schema/schema_triggers.py b/src/schema/schema_triggers.py index ac3020a8..f1210c54 100644 --- a/src/schema/schema_triggers.py +++ b/src/schema/schema_triggers.py @@ -987,23 +987,28 @@ def get_dataset_direct_ancestors(property_key, normalized_type, request, user_to logger.info(f"Executing 'get_dataset_direct_ancestors()' trigger method on uuid: {existing_data_dict['uuid']}") - # Get all the mixed fields either top-level or nested from the original query string in request URL - all_properties_to_exclude = schema_manager.get_fields_to_exclude_from_query_string(request) - - logger.info(f"all_properties_to_exclude: {all_properties_to_exclude}") + # Get all the user specified fields either top-level or nested from the original query string in request URL + try: + all_properties_to_exclude = schema_manager.get_all_fields_to_exclude_from_query_string(request) + except Exception as e: + raise Exception(e) # Find the specific sub list, depth is limited to 2 + # We only care about the Neo4j node properties as we don't run nested triggers inside a trigger method # For example, direct_ancestors.files is supported, but direct_ancestors.metadata.acquisition_id is not - Zhou 10/1/2025 neo4j_properties_to_exclude = [] + grouped_fields = schema_manager.group_dot_notation_fields(all_properties_to_exclude) - parsed_fields = schema_manager.flatten_and_group_dot_notation_fields(all_properties_to_exclude) - - for item in parsed_fields: + for item in grouped_fields: # Find the depth 2 properties (top-level to this triggered entity) if isinstance(item, dict) and property_key in item: + for field in item[property_key]: + if not isinstance(field, str): + item[property_key].pop(field) + neo4j_properties_to_exclude = item[property_key] - 
logger.info(f"neo4j_properties_to_exclude: {neo4j_properties_to_exclude}") + logger.info(f"User specified neo4j properties to exclude in request URL: {neo4j_properties_to_exclude}") # Stop after finding the first match break From 01efb5f677546e7fd487a11c38f08d545c5d9ef6 Mon Sep 17 00:00:00 2001 From: yuanzhou Date: Wed, 1 Oct 2025 23:47:01 -0400 Subject: [PATCH 4/9] Apply to sample.direct_ancestor trigger --- src/schema/schema_neo4j_queries.py | 19 +++++--- src/schema/schema_triggers.py | 75 +++++++++++++++++++++--------- 2 files changed, 66 insertions(+), 28 deletions(-) diff --git a/src/schema/schema_neo4j_queries.py b/src/schema/schema_neo4j_queries.py index 575ec6a8..43ce4a29 100644 --- a/src/schema/schema_neo4j_queries.py +++ b/src/schema/schema_neo4j_queries.py @@ -566,6 +566,8 @@ def get_uploads(neo4j_driver, uuid, property_key = None): The uuid of target entity property_key : str A target property key for result filtering +properties_to_exclude : list + A list of node properties to exclude from result Returns ------- @@ -1557,28 +1559,33 @@ def count_attached_published_datasets(neo4j_driver, entity_type, uuid): The uuid of target entity property_key : str A target property key for result filtering +properties_to_exclude : list + A list of node properties to exclude from result Returns ------- dict The parent dict, can either be a Sample or Donor """ -def get_sample_direct_ancestor(neo4j_driver, uuid, property_key = None): +def get_sample_direct_ancestor(neo4j_driver, uuid, property_key = None, properties_to_exclude = []): result = {} if property_key: query = (f"MATCH (s:Sample)<-[:ACTIVITY_OUTPUT]-(:Activity)<-[:ACTIVITY_INPUT]-(parent:Entity) " # Filter out the Lab entity if it's the ancestor f"WHERE s.uuid='{uuid}' AND parent.entity_type <> 'Lab' " - # COLLECT() returns a list - # apoc.coll.toSet() reruns a set containing unique nodes f"RETURN parent.{property_key} AS {record_field_name}") else: - query = (f"MATCH 
(s:Sample)<-[:ACTIVITY_OUTPUT]-(:Activity)<-[:ACTIVITY_INPUT]-(parent:Entity) " + if properties_to_exclude: + query = (f"MATCH (s:Sample)<-[:ACTIVITY_OUTPUT]-(:Activity)<-[:ACTIVITY_INPUT]-(parent:Entity) " + # Filter out the Lab entity if it's the ancestor + f"WHERE s.uuid='{uuid}' AND parent.entity_type <> 'Lab' " + f"WITH parent AS p " + f"RETURN apoc.create.vNode(labels(p), apoc.map.removeKeys(properties(p), {properties_to_exclude})) AS {record_field_name}") + else: + query = (f"MATCH (s:Sample)<-[:ACTIVITY_OUTPUT]-(:Activity)<-[:ACTIVITY_INPUT]-(parent:Entity) " # Filter out the Lab entity if it's the ancestor f"WHERE s.uuid='{uuid}' AND parent.entity_type <> 'Lab' " - # COLLECT() returns a list - # apoc.coll.toSet() reruns a set containing unique nodes f"RETURN parent AS {record_field_name}") logger.info("======get_sample_direct_ancestor() query======") diff --git a/src/schema/schema_triggers.py b/src/schema/schema_triggers.py index f1210c54..6a4dd750 100644 --- a/src/schema/schema_triggers.py +++ b/src/schema/schema_triggers.py @@ -988,30 +988,10 @@ def get_dataset_direct_ancestors(property_key, normalized_type, request, user_to logger.info(f"Executing 'get_dataset_direct_ancestors()' trigger method on uuid: {existing_data_dict['uuid']}") # Get all the user specified fields either top-level or nested from the original query string in request URL - try: - all_properties_to_exclude = schema_manager.get_all_fields_to_exclude_from_query_string(request) - except Exception as e: - raise Exception(e) - # Find the specific sub list, depth is limited to 2 # We only care about the Neo4j node properties as we don't run nested triggers inside a trigger method # For example, direct_ancestors.files is supported, but direct_ancestors.metadata.acquisition_id is not - Zhou 10/1/2025 - neo4j_properties_to_exclude = [] - grouped_fields = schema_manager.group_dot_notation_fields(all_properties_to_exclude) - - for item in grouped_fields: - # Find the depth 2 properties 
(top-level to this triggered entity) - if isinstance(item, dict) and property_key in item: - for field in item[property_key]: - if not isinstance(field, str): - item[property_key].pop(field) - - neo4j_properties_to_exclude = item[property_key] - - logger.info(f"User specified neo4j properties to exclude in request URL: {neo4j_properties_to_exclude}") - - # Stop after finding the first match - break + neo4j_properties_to_exclude = _get_neo4j_properties_to_exclude(property_key, request) direct_ancestors_list = schema_neo4j_queries.get_dataset_direct_ancestors(schema_manager.get_neo4j_driver_instance(), existing_data_dict['uuid'], property_key = None, properties_to_exclude = neo4j_properties_to_exclude) @@ -2029,7 +2009,13 @@ def get_sample_direct_ancestor(property_key, normalized_type, request, user_toke logger.info(f"Executing 'get_sample_direct_ancestor()' trigger method on uuid: {existing_data_dict['uuid']}") - direct_ancestor_dict = schema_neo4j_queries.get_sample_direct_ancestor(schema_manager.get_neo4j_driver_instance(), existing_data_dict['uuid']) + # Get all the user specified fields either top-level or nested from the original query string in request URL + # Find the specific sub list, depth is limited to 2 + # We only care about the Neo4j node properties as we don't run nested triggers inside a trigger method + # For example, direct_ancestors.files is supported, but direct_ancestors.metadata.acquisition_id is not - Zhou 10/1/2025 + neo4j_properties_to_exclude = _get_neo4j_properties_to_exclude(property_key, request) + + direct_ancestor_dict = schema_neo4j_queries.get_sample_direct_ancestor(schema_manager.get_neo4j_driver_instance(), existing_data_dict['uuid'], property_key = None, properties_to_exclude = neo4j_properties_to_exclude) # Get rid of the entity node properties that are not defined in the yaml schema # as well as the ones defined as `exposed: false` in the yaml schema @@ -2695,3 +2681,48 @@ def _get_age_age_units_race_sex_phrase(age:str=None, 
age_units:str='units', race return f"{age}-{age_units}-old {race} {sex}" +""" +Parse the original user request to determine the Neo4j properties to exclude from trigger generated data + +Parameters +---------- +property_key : str + The target property key of the value to be generated +request: Flask request object + The instance of Flask request passed in from application request + +Returns +------- +list: A list containing Neo4j node properties to exclude +""" +def _get_neo4j_properties_to_exclude(property_key, request): + neo4j_properties_to_exclude = [] + + # Get all the user specified fields either top-level or nested from the original query string in request URL + try: + all_properties_to_exclude = schema_manager.get_all_fields_to_exclude_from_query_string(request) + except Exception as e: + raise Exception(e) + + # Find the specific sub list, depth is limited to 2 + # We only care about the Neo4j node properties as we don't run nested triggers inside a trigger method + # For example, direct_ancestors.files is supported, but direct_ancestors.metadata.acquisition_id is not - Zhou 10/1/2025 + grouped_fields = schema_manager.group_dot_notation_fields(all_properties_to_exclude) + + for item in grouped_fields: + # Find the depth 2 properties (top-level to this triggered entity) + if isinstance(item, dict) and property_key in item: + for field in item[property_key]: + if not isinstance(field, str): + item[property_key].pop(field) + + neo4j_properties_to_exclude = item[property_key] + + logger.info(f"User specified neo4j properties to exclude in request URL: {neo4j_properties_to_exclude}") + + # Stop after finding the first match + break + + return neo4j_properties_to_exclude + + From 111e80a705b0259920563f90362f83cf89217f52 Mon Sep 17 00:00:00 2001 From: yuanzhou Date: Thu, 2 Oct 2025 01:02:49 -0400 Subject: [PATCH 5/9] Fix to exclusion types and more trigger filters --- src/app.py | 8 ++++--- src/schema/schema_manager.py | 36 +++++++++++++++++++++--------- 
src/schema/schema_neo4j_queries.py | 32 +++++++++++++++++++------- src/schema/schema_triggers.py | 16 +++++++++++-- 4 files changed, 69 insertions(+), 23 deletions(-) diff --git a/src/app.py b/src/app.py index e7854975..a032a026 100644 --- a/src/app.py +++ b/src/app.py @@ -887,9 +887,11 @@ def get_entity_by_id(id): final_result = schema_manager.normalize_entity_result_for_response(complete_dict) # In addition, there may be nested fields like `ingest_metadata.dag_provenance_list` (for Dataset) - # where that `ingest_metadata` is an actual Neo4j node property containing `dag_provenance_list` - # For such cases, we can't handle via Neo4j query. Instead, exclude at Python app level. - Zhou 10/1/2025 - final_result = schema_manager.exclude_properties_from_response(neo4j_nested_properties_to_skip, final_result) + # where `ingest_metadata` is an actual Neo4j node string property containing `dag_provenance_list` + # For such cases, we can't handle via simple Neo4j query. Instead, exclude at Python app level. 
+ # NOTE: need to convert the `neo4j_nested_properties_to_skip` to a format that can be used by + # `exclude_properties_from_response()` - Zhou 10/1/2025 + final_result = schema_manager.exclude_properties_from_response(schema_manager.group_dot_notation_fields(neo4j_nested_properties_to_skip), final_result) # Response with the dict if public_entity and not user_in_hubmap_read_group(request): diff --git a/src/schema/schema_manager.py b/src/schema/schema_manager.py index fc9de819..d8c41e6d 100644 --- a/src/schema/schema_manager.py +++ b/src/schema/schema_manager.py @@ -443,7 +443,11 @@ def group_dot_notation_fields(flat_list): ------- list Three lists - one for triggered properties and one for Neo4j node properties - Example for Dataset: ['direct_ancestors', 'title'], ['dataset_type'], ['ingest_metadata.dag_provenance_list'] + + Example for Dataset: + - triggered_top_properties_to_skip: ['direct_ancestors.files', 'direct_ancestors.ingest_metadata', 'upload.title'] + - neo4j_top_properties_to_skip: ['data_access_level'] + - neo4j_nested_properties_to_skip: ['status_history.status'] """ def determine_property_exclusion_type(normalized_entity_type, flat_list): global _schema @@ -455,7 +459,7 @@ def determine_property_exclusion_type(normalized_entity_type, flat_list): second_level_list = [] properties = _schema['ENTITIES'][normalized_entity_type]['properties'] - # First find the top-level properties + # First find the top-level properties without using dot-notation for item in flat_list: if '.' 
not in item: top_level_list.append(item) @@ -464,16 +468,28 @@ def determine_property_exclusion_type(normalized_entity_type, flat_list): # Only care about the properties defined in schema yaml for item in top_level_list: - if item in properties and 'on_read_trigger' in properties[item]: - triggered_top_properties_to_skip.append(item) - else: - neo4j_top_properties_to_skip.append(item) + if item in properties: + if 'on_read_trigger' in properties[item]: + triggered_top_properties_to_skip.append(item) + else: + neo4j_top_properties_to_skip.append(item) + + # Nested second-level properties, such as `direct_ancestors.files`, belong to `triggered_top_properties_to_skip` + # `ingest_metadata.dag_provenance_list` belongs to `neo4j_nested_properties_to_skip` + for item in second_level_list: + prefix = item.split('.')[0] + if prefix in properties: + if 'on_read_trigger' in properties[prefix]: + triggered_top_properties_to_skip.append(item) + else: + neo4j_nested_properties_to_skip.append(item) - # # In addition, there may be nested fields like `ingest_metadata.dag_provenance_list` (for Dataset) - # where that `ingest_metadata` is an actual Neo4j node property containing `dag_provenance_list` - # For such cases, exclude via `exclude_properties_from_response()` at Python app level. 
- neo4j_nested_properties_to_skip = group_dot_notation_fields(second_level_list) + logger.info(f"Determined property exclusion type - triggered_top_properties_to_skip: {triggered_top_properties_to_skip}") + logger.info(f"Determined property exclusion type - neo4j_top_properties_to_skip: {neo4j_top_properties_to_skip}") + logger.info(f"Determined property exclusion type - neo4j_nested_properties_to_skip: {neo4j_nested_properties_to_skip}") + # NOTE: Will need to convert the `neo4j_nested_properties_to_skip` to a format that can be used by + # `exclude_properties_from_response()` - Zhou 10/1/2025 return triggered_top_properties_to_skip, neo4j_top_properties_to_skip, neo4j_nested_properties_to_skip diff --git a/src/schema/schema_neo4j_queries.py b/src/schema/schema_neo4j_queries.py index 43ce4a29..223ab58f 100644 --- a/src/schema/schema_neo4j_queries.py +++ b/src/schema/schema_neo4j_queries.py @@ -1083,13 +1083,15 @@ def get_collection_associated_datasets(neo4j_driver, uuid, property_key = None): The uuid of dataset or publication property_key : str A target property key for result filtering +properties_to_exclude : list + A list of node properties to exclude from result Returns ------- list A list of collection uuids """ -def get_dataset_collections(neo4j_driver, uuid, property_key = None): +def get_dataset_collections(neo4j_driver, uuid, property_key = None, properties_to_exclude = []): results = [] if property_key: @@ -1097,9 +1099,15 @@ def get_dataset_collections(neo4j_driver, uuid, property_key = None): f"WHERE e.uuid = '{uuid}' " f"RETURN apoc.coll.toSet(COLLECT(c.{property_key})) AS {record_field_name}") else: - query = (f"MATCH (e:Entity)-[:IN_COLLECTION]->(c:Collection) " - f"WHERE e.uuid = '{uuid}' " - f"RETURN apoc.coll.toSet(COLLECT(c)) AS {record_field_name}") + if properties_to_exclude: + query = (f"MATCH (e:Entity)-[:IN_COLLECTION]->(c:Collection) " + f"WHERE e.uuid = '{uuid}' " + f"WITH apoc.coll.toSet(COLLECT(c)) AS uniqueCollections " + f"RETURN [c 
IN uniqueCollections | apoc.create.vNode(labels(c), apoc.map.removeKeys(properties(c), {properties_to_exclude}))] AS {record_field_name}") + else: + query = (f"MATCH (e:Entity)-[:IN_COLLECTION]->(c:Collection) " + f"WHERE e.uuid = '{uuid}' " + f"RETURN apoc.coll.toSet(COLLECT(c)) AS {record_field_name}") logger.info("======get_dataset_collections() query======") logger.debug(query) @@ -1164,18 +1172,26 @@ def get_publication_associated_collection(neo4j_driver, uuid): The neo4j database connection pool uuid : str The uuid of dataset +properties_to_exclude : list + A list of node properties to exclude from result Returns ------- dict A Upload dict """ -def get_dataset_upload(neo4j_driver, uuid): +def get_dataset_upload(neo4j_driver, uuid, properties_to_exclude = []): result = {} - query = (f"MATCH (e:Entity)-[:IN_UPLOAD]->(s:Upload) " - f"WHERE e.uuid = '{uuid}' " - f"RETURN s AS {record_field_name}") + if properties_to_exclude: + query = (f"MATCH (e:Entity)-[:IN_UPLOAD]->(s:Upload) " + f"WHERE e.uuid = '{uuid}' " + f"WITH s AS up " + f"RETURN apoc.create.vNode(labels(up), apoc.map.removeKeys(properties(up), {properties_to_exclude})) AS {record_field_name}") + else: + query = (f"MATCH (e:Entity)-[:IN_UPLOAD]->(s:Upload) " + f"WHERE e.uuid = '{uuid}' " + f"RETURN s AS {record_field_name}") logger.info("======get_dataset_upload() query======") logger.debug(query) diff --git a/src/schema/schema_triggers.py b/src/schema/schema_triggers.py index 6a4dd750..c44fe255 100644 --- a/src/schema/schema_triggers.py +++ b/src/schema/schema_triggers.py @@ -781,7 +781,13 @@ def get_dataset_collections(property_key, normalized_type, request, user_token, logger.info(f"Executing 'get_dataset_collections()' trigger method on uuid: {existing_data_dict['uuid']}") - collections_list = schema_neo4j_queries.get_dataset_collections(schema_manager.get_neo4j_driver_instance(), existing_data_dict['uuid']) + # Get all the user specified fields either top-level or nested from the original query 
string in request URL + # Find the specific sub list, depth is limited to 2 + # We only care about the Neo4j node properties as we don't run nested triggers inside a trigger method + # For example, direct_ancestors.files is supported, but direct_ancestors.metadata.acquisition_id is not - Zhou 10/1/2025 + neo4j_properties_to_exclude = _get_neo4j_properties_to_exclude(property_key, request) + + collections_list = schema_neo4j_queries.get_dataset_collections(schema_manager.get_neo4j_driver_instance(), existing_data_dict['uuid'], property_key = None, properties_to_exclude = neo4j_properties_to_exclude) # Get rid of the entity node properties that are not defined in the yaml schema # as well as the ones defined as `exposed: false` in the yaml schema @@ -859,7 +865,13 @@ def get_dataset_upload(property_key, normalized_type, request, user_token, exist logger.info(f"Executing 'get_dataset_upload()' trigger method on uuid: {existing_data_dict['uuid']}") - upload_dict = schema_neo4j_queries.get_dataset_upload(schema_manager.get_neo4j_driver_instance(), existing_data_dict['uuid']) + # Get all the user specified fields either top-level or nested from the original query string in request URL + # Find the specific sub list, depth is limited to 2 + # We only care about the Neo4j node properties as we don't run nested triggers inside a trigger method + # For example, direct_ancestors.files is supported, but direct_ancestors.metadata.acquisition_id is not - Zhou 10/1/2025 + neo4j_properties_to_exclude = _get_neo4j_properties_to_exclude(property_key, request) + + upload_dict = schema_neo4j_queries.get_dataset_upload(schema_manager.get_neo4j_driver_instance(), existing_data_dict['uuid'], properties_to_exclude = neo4j_properties_to_exclude) # Get rid of the entity node properties that are not defined in the yaml schema # as well as the ones defined as `exposed: false` in the yaml schema From f6af9a52bb3da816ecfdb07e32b995a9f3687bcd Mon Sep 17 00:00:00 2001 From: yuanzhou Date: Thu, 2 
Oct 2025 01:15:49 -0400 Subject: [PATCH 6/9] Enhanced validation --- src/schema/schema_manager.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/src/schema/schema_manager.py b/src/schema/schema_manager.py index d8c41e6d..44733fd8 100644 --- a/src/schema/schema_manager.py +++ b/src/schema/schema_manager.py @@ -373,12 +373,18 @@ def get_all_fields_to_exclude_from_query_string(request): # Treat query string value as case-insensitive properties_to_exclude_str = request.args.get('exclude').lower() - if properties_to_exclude_str is not None: + if properties_to_exclude_str: all_properties_to_exclude = [item.strip() for item in properties_to_exclude_str.split(",")] logger.info(f"User specified properties to exclude in request URL: {all_properties_to_exclude}") else: - raise Exception(f"The value of the 'exclude' query string arameter can not be empty and must be similar to the form of 'a, b, c, d.e, ...' (case-insensitive).") + raise Exception("The value of the 'exclude' query string parameter can not be empty and must be similar to the form of 'a, b, c, d.e, ...' (case-insensitive).") + + # A bit more validation to limit to depth 2 + for item in all_properties_to_exclude: + if '.' in item: + if len(item.split('.')) > 2: + raise Exception("Only single dot-separated keys are allowed in `exclude` (e.g., a.b). 
Keys with multiple dots like a.b.c are not supported.") return all_properties_to_exclude From 2666f49012d1ca7d51355f90e24cfa60c71557fb Mon Sep 17 00:00:00 2001 From: yuanzhou Date: Thu, 2 Oct 2025 11:05:19 -0400 Subject: [PATCH 7/9] Update to use commons 2.1.21 --- src/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/requirements.txt b/src/requirements.txt index 6026e84d..ae3fa02e 100644 --- a/src/requirements.txt +++ b/src/requirements.txt @@ -18,7 +18,7 @@ PyYAML==5.4.1 # Use the branch name of commons from github for testing new changes made in commons from different branch # Default is main branch specified in docker-compose.development.yml if not set # git+https://github.com/hubmapconsortium/commons.git@${COMMONS_BRANCH}#egg=hubmap-commons -hubmap-commons==2.1.19 +hubmap-commons==2.1.21 # For unit test nose2==0.10.0 From 9f4f106bf29f893629ead576ba73327f7d8c8c9e Mon Sep 17 00:00:00 2001 From: yuanzhou Date: Thu, 2 Oct 2025 11:47:14 -0400 Subject: [PATCH 8/9] Support exclude to upload.datasets and collection.datasets with lowercase validation --- src/app.py | 6 ++--- src/schema/schema_manager.py | 14 +++++++--- src/schema/schema_neo4j_queries.py | 42 +++++++++++++++++++++++------- src/schema/schema_triggers.py | 16 ++++++++++-- 4 files changed, 59 insertions(+), 19 deletions(-) diff --git a/src/app.py b/src/app.py index a032a026..240a13c7 100644 --- a/src/app.py +++ b/src/app.py @@ -786,9 +786,9 @@ def get_entity_by_id(id): supported_qs_params = ['property', 'exclude'] # There are three types of properties that can be excluded from the GET response - # - properties generated by trigger methods - # - properties returned as part of Neo4j node properties - # - properties returned by Neo4j but nested + # - top-level properties generated by trigger methods + # - top-level properties returned as part of Neo4j node properties + # - second-level properties returned by Neo4j but nested and can't be skipped in Cypher query 
triggered_top_properties_to_skip = [] neo4j_top_properties_to_skip = [] neo4j_nested_properties_to_skip = [] diff --git a/src/schema/schema_manager.py b/src/schema/schema_manager.py index 44733fd8..ef3d7590 100644 --- a/src/schema/schema_manager.py +++ b/src/schema/schema_manager.py @@ -370,21 +370,27 @@ def get_all_fields_to_exclude_from_query_string(request): all_properties_to_exclude = [] if 'exclude' in request.args: - # Treat query string value as case-insensitive - properties_to_exclude_str = request.args.get('exclude').lower() + # The query string values are case-sensitive as the property keys in schema yaml are case-sensitive + properties_to_exclude_str = request.args.get('exclude') if properties_to_exclude_str: + # Must be all lowercase values + has_upper = any(c.isupper() for c in properties_to_exclude_str) + + if has_upper: + raise Exception("All the properties specified in 'exclude' query string in URL must be lowercase.") + all_properties_to_exclude = [item.strip() for item in properties_to_exclude_str.split(",")] logger.info(f"User specified properties to exclude in request URL: {all_properties_to_exclude}") else: - raise Exception("The value of the 'exclude' query string parameter can not be empty and must be similar to the form of 'a, b, c, d.e, ...' (case-insensitive).") + raise Exception("The value of the 'exclude' query string parameter can not be empty and must be similar to the form of 'a, b, c, d.e, ...' (case-sensitive).") # A bit more validation to limit to depth 2 for item in all_properties_to_exclude: if '.' in item: if len(item.split('.')) > 2: - raise Exception("Only single dot-separated keys are allowed in `exclude` (e.g., a.b). Keys with multiple dots like a.b.c are not supported.") + raise Exception("Only single dot-separated keys are allowed in 'exclude' (e.g., a.b). 
Keys with multiple dots like a.b.c are not supported.") return all_properties_to_exclude diff --git a/src/schema/schema_neo4j_queries.py b/src/schema/schema_neo4j_queries.py index 223ab58f..5add3874 100644 --- a/src/schema/schema_neo4j_queries.py +++ b/src/schema/schema_neo4j_queries.py @@ -1215,20 +1215,32 @@ def get_dataset_upload(neo4j_driver, uuid, properties_to_exclude = []): The neo4j database connection pool uuid : str The uuid of collection +properties_to_exclude : list + A list of node properties to exclude from result Returns ------- list The list containing associated dataset dicts """ -def get_collection_datasets(neo4j_driver, uuid): +def get_collection_datasets(neo4j_driver, uuid, properties_to_exclude = []): results = [] fields_to_omit = SchemaConstants.OMITTED_FIELDS - query = (f"MATCH (e:Dataset)-[:IN_COLLECTION]->(c:Collection) " - f"WHERE c.uuid = '{uuid}' " - f"WITH COLLECT(DISTINCT e) AS uniqueDataset " - f"RETURN [a IN uniqueDataset | apoc.create.vNode(labels(a), apoc.map.removeKeys(properties(a), {fields_to_omit}))] AS {record_field_name}") + + + if properties_to_exclude: + merged_list = properties_to_exclude + fields_to_omit + + query = (f"MATCH (e:Dataset)-[:IN_COLLECTION]->(c:Collection) " + f"WHERE c.uuid = '{uuid}' " + f"WITH COLLECT(DISTINCT e) AS uniqueDataset " + f"RETURN [a IN uniqueDataset | apoc.create.vNode(labels(a), apoc.map.removeKeys(properties(a), {merged_list}))] AS {record_field_name}") + else: + query = (f"MATCH (e:Dataset)-[:IN_COLLECTION]->(c:Collection) " + f"WHERE c.uuid = '{uuid}' " + f"WITH COLLECT(DISTINCT e) AS uniqueDataset " + f"RETURN [a IN uniqueDataset | apoc.create.vNode(labels(a), apoc.map.removeKeys(properties(a), {fields_to_omit}))] AS {record_field_name}") logger.info("======get_collection_datasets() query======") logger.debug(query) @@ -1424,13 +1436,15 @@ def unlink_datasets_from_upload(neo4j_driver, upload_uuid, dataset_uuids_list): The uuid of Upload property_key : str A target property key for result 
filtering +properties_to_exclude : list + A list of node properties to exclude from result Returns ------- list The list containing associated dataset dicts """ -def get_upload_datasets(neo4j_driver, uuid, property_key = None): +def get_upload_datasets(neo4j_driver, uuid, property_key = None, properties_to_exclude = []): results = [] fields_to_omit = SchemaConstants.OMITTED_FIELDS if property_key: @@ -1440,10 +1454,18 @@ def get_upload_datasets(neo4j_driver, uuid, property_key = None): # apoc.coll.toSet() reruns a set containing unique nodes f"RETURN apoc.coll.toSet(COLLECT(e.{property_key})) AS {record_field_name}") else: - query = (f"MATCH (e:Dataset)-[:IN_UPLOAD]->(s:Upload) " - f"WHERE s.uuid = '{uuid}' " - f"WITH COLLECT(DISTINCT e) AS uniqueUploads " - f"RETURN [a IN uniqueUploads | apoc.create.vNode(labels(a), apoc.map.removeKeys(properties(a), {fields_to_omit}))] AS {record_field_name}") + if properties_to_exclude: + merged_list = properties_to_exclude + fields_to_omit + + query = (f"MATCH (e:Dataset)-[:IN_UPLOAD]->(s:Upload) " + f"WHERE s.uuid = '{uuid}' " + f"WITH COLLECT(DISTINCT e) AS uniqueUploads " + f"RETURN [a IN uniqueUploads | apoc.create.vNode(labels(a), apoc.map.removeKeys(properties(a), {merged_list}))] AS {record_field_name}") + else: + query = (f"MATCH (e:Dataset)-[:IN_UPLOAD]->(s:Upload) " + f"WHERE s.uuid = '{uuid}' " + f"WITH COLLECT(DISTINCT e) AS uniqueUploads " + f"RETURN [a IN uniqueUploads | apoc.create.vNode(labels(a), apoc.map.removeKeys(properties(a), {fields_to_omit}))] AS {record_field_name}") logger.info("======get_upload_datasets() query======") logger.debug(query) diff --git a/src/schema/schema_triggers.py b/src/schema/schema_triggers.py index c44fe255..f5299595 100644 --- a/src/schema/schema_triggers.py +++ b/src/schema/schema_triggers.py @@ -708,8 +708,14 @@ def get_collection_datasets(property_key, normalized_type, request, user_token, raise KeyError("Missing 'uuid' key in 'existing_data_dict' during calling 
'get_collection_datasets()' trigger method.") logger.info(f"Executing 'get_collection_datasets()' trigger method on uuid: {existing_data_dict['uuid']}") + + # Get all the user specified fields either top-level or nested from the original query string in request URL + # Find the specific sub list, depth is limited to 2 + # We only care about the Neo4j node properties as we don't run nested triggers inside a trigger method + # For example, direct_ancestors.files is supported, but direct_ancestors.metadata.acquisition_id is not - Zhou 10/1/2025 + neo4j_properties_to_exclude = _get_neo4j_properties_to_exclude(property_key, request) - datasets_list = schema_neo4j_queries.get_collection_datasets(schema_manager.get_neo4j_driver_instance(), existing_data_dict['uuid']) + datasets_list = schema_neo4j_queries.get_collection_datasets(schema_manager.get_neo4j_driver_instance(), existing_data_dict['uuid'], properties_to_exclude = neo4j_properties_to_exclude) # Get rid of the entity node properties that are not defined in the yaml schema # as well as the ones defined as `exposed: false` in the yaml schema @@ -2294,7 +2300,13 @@ def get_upload_datasets(property_key, normalized_type, request, user_token, exis logger.info(f"Executing 'get_upload_datasets()' trigger method on uuid: {existing_data_dict['uuid']}") - datasets_list = schema_neo4j_queries.get_upload_datasets(schema_manager.get_neo4j_driver_instance(), existing_data_dict['uuid']) + # Get all the user specified fields either top-level or nested from the original query string in request URL + # Find the specific sub list, depth is limited to 2 + # We only care about the Neo4j node properties as we don't run nested triggers inside a trigger method + # For example, direct_ancestors.files is supported, but direct_ancestors.metadata.acquisition_id is not - Zhou 10/1/2025 + neo4j_properties_to_exclude = _get_neo4j_properties_to_exclude(property_key, request) + + datasets_list = 
schema_neo4j_queries.get_upload_datasets(schema_manager.get_neo4j_driver_instance(), existing_data_dict['uuid'], property_key = None, properties_to_exclude = neo4j_properties_to_exclude) # Get rid of the entity node properties that are not defined in the yaml schema # as well as the ones defined as `exposed: false` in the yaml schema From 7e411f14f81f2118dfc6abb3243767a16fec7471 Mon Sep 17 00:00:00 2001 From: yuanzhou Date: Thu, 2 Oct 2025 14:52:39 -0400 Subject: [PATCH 9/9] Add validation on prohibited properties --- src/app.py | 4 +++- src/schema/schema_manager.py | 15 ++++++++++++--- src/schema/schema_triggers.py | 2 ++ 3 files changed, 17 insertions(+), 4 deletions(-) diff --git a/src/app.py b/src/app.py index 240a13c7..22013627 100644 --- a/src/app.py +++ b/src/app.py @@ -835,8 +835,10 @@ def get_entity_by_id(id): # Determine which top-level properties to exclude from triggers and which to exclude directly from the resulting Neo4j `entity_dict` # Also get nested properties that are directly returned from Neo4j, which will be handled differently triggered_top_properties_to_skip, neo4j_top_properties_to_skip, neo4j_nested_properties_to_skip = schema_manager.determine_property_exclusion_type(normalized_entity_type, all_properties_to_exclude) - except Exception as e: + except ValueError as e: bad_request_error(e) + except Exception as e: + internal_server_error(e) # Get the generated complete entity result from cache if exists # Otherwise re-generate on the fly diff --git a/src/schema/schema_manager.py b/src/schema/schema_manager.py index ef3d7590..0f50201c 100644 --- a/src/schema/schema_manager.py +++ b/src/schema/schema_manager.py @@ -378,19 +378,28 @@ def get_all_fields_to_exclude_from_query_string(request): has_upper = any(c.isupper() for c in properties_to_exclude_str) if has_upper: - raise Exception("All the properties specified in 'exclude' query string in URL must be lowercase.") + raise ValueError("All the properties specified in 'exclude' query string in 
URL must be lowercase.") all_properties_to_exclude = [item.strip() for item in properties_to_exclude_str.split(",")] logger.info(f"User specified properties to exclude in request URL: {all_properties_to_exclude}") else: - raise Exception("The value of the 'exclude' query string parameter can not be empty and must be similar to the form of 'a, b, c, d.e, ...' (case-sensitive).") + raise ValueError("The value of the 'exclude' query string parameter can not be empty and must be similar to the form of 'a, b, c, d.e, ...' (case-sensitive).") # A bit more validation to limit to depth 2 for item in all_properties_to_exclude: if '.' in item: if len(item.split('.')) > 2: - raise Exception("Only single dot-separated keys are allowed in 'exclude' (e.g., a.b). Keys with multiple dots like a.b.c are not supported.") + raise ValueError("Only single dot-separated keys are allowed in 'exclude' (e.g., a.b). Keys with multiple dots like a.b.c are not supported.") + + # More validation - ensure prohibited properties are not accepted + # These two properties are required internally by `normalize_entity_result_for_response()` + prohibited_properties = ['uuid', 'entity_type'] + second_level_list = [] + + for item in all_properties_to_exclude: + if item in prohibited_properties or item.split('.')[1] in prohibited_properties: + raise ValueError(f"Entity property '{item}' is not allowed in the 'exclude' query parameter.") return all_properties_to_exclude diff --git a/src/schema/schema_triggers.py b/src/schema/schema_triggers.py index f5299595..87211de4 100644 --- a/src/schema/schema_triggers.py +++ b/src/schema/schema_triggers.py @@ -2725,6 +2725,8 @@ def _get_neo4j_properties_to_exclude(property_key, request): # Get all the user specified fields either top-level or nested from the original query string in request URL try: all_properties_to_exclude = schema_manager.get_all_fields_to_exclude_from_query_string(request) + except ValueError as e: + raise ValueError(e) except Exception as e: 
raise Exception(e)