diff --git a/src/app.py b/src/app.py index 22013627..dcc4042e 100644 --- a/src/app.py +++ b/src/app.py @@ -783,21 +783,21 @@ def get_entity_by_id(id): # 'exclude' is newly added to reduce the large paylod caused by certain fields (`direct_ancestors.files` for instance) # When both 'property' and 'exclude' are specified in the URL, 'property' dominates # since the final result is a single field value - Zhou 10/1/2025 - supported_qs_params = ['property', 'exclude'] + supported_query_params = ['property', 'exclude'] # There are three types of properties that can be excluded from the GET response # - top-level properties generated by trigger methods # - top-level properties returned as part of Neo4j node properties # - second-level properties returned by Neo4j but nested and can't be skipped in Cypher query - triggered_top_properties_to_skip = [] - neo4j_top_properties_to_skip = [] - neo4j_nested_properties_to_skip = [] + triggered_top_props_to_skip = [] + neo4j_top_props_to_skip = [] + neo4j_nested_props_to_skip = [] if bool(request.args): - # First make sure the user provided query string params are valid + # First make sure the user provided query params are valid for param in request.args: - if param not in supported_qs_params: - bad_request_error(f"Only the following URL query string parameters (case-sensitive) are supported: {COMMA_SEPARATOR.join(supported_qs_params)}") + if param not in supported_query_params: + bad_request_error(f"Only the following URL query parameters (case-sensitive) are supported: {COMMA_SEPARATOR.join(supported_query_params)}") # Return a single property key and value using ?property= if 'property' in request.args: @@ -811,11 +811,11 @@ def get_entity_by_id(id): # Validate the target property if single_property_key not in supported_property_keys: - bad_request_error(f"Only the following property keys are supported in the query string: {COMMA_SEPARATOR.join(supported_property_keys)}") + bad_request_error(f"Only the following property keys are supported in the query parameter: {COMMA_SEPARATOR.join(supported_property_keys)}") if single_property_key == 'status' and \ not schema_manager.entity_type_instanceof(normalized_entity_type, 'Dataset'): - bad_request_error(f"Only Dataset or Publication supports 'status' property key in the query string") + bad_request_error(f"Only Dataset or Publication supports 'status' property key in the query parameter") # Response with the property value directly # Don't use jsonify() on string value @@ -830,11 +830,11 @@ def get_entity_by_id(id): # rather than within it. However, it leverages the existing `exclude_properties_from_response()` # function for simplicity and maintainability. - Zhou 10/1/2025 try: - all_properties_to_exclude = schema_manager.get_all_fields_to_exclude_from_query_string(request) + all_props_to_exclude = schema_manager.get_excluded_query_props(request) # Determine which top-level properties to exclude from triggers and which to exclude directly from the resulting Neo4j `entity_dict` # Also get nested properties that are directly returned from Neo4j, which will be handled differently - triggered_top_properties_to_skip, neo4j_top_properties_to_skip, neo4j_nested_properties_to_skip = schema_manager.determine_property_exclusion_type(normalized_entity_type, all_properties_to_exclude) + triggered_top_props_to_skip, neo4j_top_props_to_skip, neo4j_nested_props_to_skip = schema_manager.get_exclusion_types(normalized_entity_type, all_props_to_exclude) except ValueError as e: bad_request_error(e) except Exception as e: @@ -842,9 +842,9 @@ def get_entity_by_id(id): # Get the generated complete entity result from cache if exists # Otherwise re-generate on the fly - # NOTE: top-level properties in `triggered_top_properties_to_skip` will skip the trigger methods + # NOTE: top-level properties in `triggered_top_props_to_skip` will skip the trigger methods # Nested properties like `direct_ancestors.files` will be handled by the trigger method - Zhou 10/1/2025 - complete_dict = schema_manager.get_complete_entity_result(request, token, entity_dict, triggered_top_properties_to_skip) + complete_dict = schema_manager.get_complete_entity_result(request, token, entity_dict, triggered_top_props_to_skip) # Determine if the entity is publicly visible base on its data, only. # To verify if a Collection is public, it is necessary to have its Datasets, which @@ -880,9 +880,9 @@ def get_entity_by_id(id): f" A Globus token with access permission is required.") # Remove the top-level properties that are directly available in the resulting Neo4j `entity_dict` - # Due to the use of entity cache from `query_target_entity()`, we don't want to exclude the `neo4j_top_properties_to_skip` + # Due to the use of entity cache from `query_target_entity()`, we don't want to exclude the `neo4j_top_props_to_skip` # from actual Neo4j query. And it's not s performance concern neither. - Zhou 10/1/2025 - for item in neo4j_top_properties_to_skip: + for item in neo4j_top_props_to_skip: complete_dict.pop(item) # Also normalize the result based on schema @@ -891,9 +891,9 @@ def get_entity_by_id(id): # In addition, there may be nested fields like `ingest_metadata.dag_provenance_list` (for Dataset) # where `ingest_metadata` is an actual Neo4j node string property containing `dag_provenance_list` # For such cases, we can't handle via simple Neo4j query. Instead, exclude at Python app level. - # NOTE: need to convert the `neo4j_nested_properties_to_skip` to a format that can be used by + # NOTE: need to convert the `neo4j_nested_props_to_skip` to a format that can be used by # `exclude_properties_from_response()` - Zhou 10/1/2025 - final_result = schema_manager.exclude_properties_from_response(schema_manager.group_dot_notation_fields(neo4j_nested_properties_to_skip), final_result) + final_result = schema_manager.exclude_properties_from_response(schema_manager.group_dot_notation_props(neo4j_nested_props_to_skip), final_result) # Response with the dict if public_entity and not user_in_hubmap_read_group(request): diff --git a/src/schema/schema_manager.py b/src/schema/schema_manager.py index 724989cf..07783d5d 100644 --- a/src/schema/schema_manager.py +++ b/src/schema/schema_manager.py @@ -331,7 +331,7 @@ def delete_nested_field(data, nested_path): for item in data[key]: if nested_field in item: del item[nested_field] - elif nested_field in data[key]: + elif isinstance(data[key], dict) and nested_field in data[key]: del data[key][nested_field] elif isinstance(value, dict): delete_nested_field(data[key], value) @@ -366,42 +366,75 @@ def delete_nested_field(data, nested_path): A flat list of strings containing top-level and/or nested dot-notated properties Example: ['a.b', 'a.c', 'x'] """ -def get_all_fields_to_exclude_from_query_string(request): - all_properties_to_exclude = [] +def get_excluded_query_props(request): + all_props_to_exclude = [] if 'exclude' in request.args: # The query string values are case-sensitive as the property keys in schema yaml are case-sensitive - properties_to_exclude_str = request.args.get('exclude') - - if properties_to_exclude_str: - # Must all lowercase values - has_upper = any(c.isupper() for c in properties_to_exclude_str) - - if has_upper: - raise ValueError("All the properties specified in 'exclude' query string in URL must be lowercase.") + props_to_exclude_str = request.args.get('exclude') - all_properties_to_exclude = [item.strip() for item in properties_to_exclude_str.split(",")] + if not validate_comma_separated_exclude_str(props_to_exclude_str): + raise ValueError( + "The 'exclude' query parameter must be a comma-separated list of properties that follow these rules: " + "[1] Each property must include at least one letter; " + "[2] Only lowercase letters and underscores '_' are allowed; " + "[3] Nested property is limited to 2 depths and must use single dot '.' for dot-notation (like 'a.b')." + ) - logger.info(f"User specified properties to exclude in request URL: {all_properties_to_exclude}") - else: - raise ValueError("The value of the 'exclude' query string parameter can not be empty and must be similar to the form of 'a, b, c, d.e, ...' (case-sensitive).") + all_props_to_exclude = [item.strip() for item in props_to_exclude_str.split(",")] + + logger.info(f"User specified properties to exclude in request URL: {all_props_to_exclude}") - # A bit more validation to limit to depth 2 - for item in all_properties_to_exclude: - if '.' in item: - if len(item.split('.')) > 2: - raise ValueError("Only single dot-separated keys are allowed in 'exclude' (e.g., a.b). Keys with multiple dots like a.b.c are not supported.") - # More validation - ensure prohibited properties are not accepted # This two properties are required internally by `normalize_entity_result_for_response()` prohibited_properties = ['uuid', 'entity_type'] second_level_list = [] - for item in all_properties_to_exclude: + for item in all_props_to_exclude: if item in prohibited_properties or ('.' in item and item.split('.')[1] in prohibited_properties): raise ValueError(f"Entity property '{item}' is not allowed in the 'exclude' query parameter.") - return all_properties_to_exclude + return all_props_to_exclude + + +""" +The 'exclude' query parameter must be a comma-separated list of properties that follow these rules: + +[1] Each property must include at least one letter; +[2] Only lowercase letters and underscores '_' are allowed; +[3] Nested property is limited to 2 depths and must use single dot '.' for dot-notation (like 'a.b'). + +Parameters +---------- +s : str + Comma-separated input string used to exclude entity properties + +Returns +------- +bool + True if valid or False otherwise +""" +def validate_comma_separated_exclude_str(s: str): + # No empty string + if not s: + return False + + # Split by commas + items = s.split(',') + + # No empty items allowed (prevents ',,' or trailing comma) + if any(not item.strip() for item in items): + return False + + def is_valid_item(item: str): + return ( + all(c.islower() or c in '._' for c in item) + and any(c.isalpha() for c in item) + and item.count('.') <= 1 + and not ((item.startswith('.') or item.endswith('.'))) + ) + + return all(is_valid_item(item.strip()) for item in items) """ @@ -425,7 +458,7 @@ def get_all_fields_to_exclude_from_query_string(request): list A list mixing strings and grouped dicts, like ['x', {'a': ['b', 'c']}] """ -def group_dot_notation_fields(flat_list): +def group_dot_notation_props(flat_list): output_list = [] grouped_dict = {} @@ -466,16 +499,16 @@ def group_dot_notation_fields(flat_list): Three lists - one for triggered properties and one for Neo4j node properties Example for Dataset: - - triggered_top_properties_to_skip: ['direct_ancestors.files', 'direct_ancestors.ingest_metadata', 'upload.title'] - - neo4j_top_properties_to_skip: ['data_access_level'] - - neo4j_nested_properties_to_skip: ['status_history.status'] + - triggered_top_props_to_skip: ['direct_ancestors.files', 'direct_ancestors.ingest_metadata', 'upload.title'] + - neo4j_top_props_to_skip: ['data_access_level'] + - neo4j_nested_props_to_skip: ['status_history.status'] """ -def determine_property_exclusion_type(normalized_entity_type, flat_list): +def get_exclusion_types(normalized_entity_type, flat_list): global _schema - triggered_top_properties_to_skip = [] - neo4j_top_properties_to_skip = [] - neo4j_nested_properties_to_skip =[] + triggered_top_props_to_skip = [] + neo4j_top_props_to_skip = [] + neo4j_nested_props_to_skip =[] top_level_list = [] second_level_list = [] properties = _schema['ENTITIES'][normalized_entity_type]['properties'] @@ -491,27 +524,27 @@ def determine_property_exclusion_type(normalized_entity_type, flat_list): for item in top_level_list: if item in properties: if TriggerTypeEnum.ON_READ in properties[item]: - triggered_top_properties_to_skip.append(item) + triggered_top_props_to_skip.append(item) else: - neo4j_top_properties_to_skip.append(item) + neo4j_top_props_to_skip.append(item) - # Nested second-level properties, such as `direct_ancestors.files`, belong to `triggered_top_properties_to_skip` - # `ingest_metadata.dag_provenance_list` belongs to `neo4j_nested_properties_to_skip` + # Nested second-level properties, such as `direct_ancestors.files`, belong to `triggered_top_props_to_skip` + # `ingest_metadata.dag_provenance_list` belongs to `neo4j_nested_props_to_skip` for item in second_level_list: prefix = item.split('.')[0] if prefix in properties: if TriggerTypeEnum.ON_READ in properties[prefix]: - triggered_top_properties_to_skip.append(item) + triggered_top_props_to_skip.append(item) else: - neo4j_nested_properties_to_skip.append(item) + neo4j_nested_props_to_skip.append(item) - logger.info(f"Determined property exclusion type - triggered_top_properties_to_skip: {triggered_top_properties_to_skip}") - logger.info(f"Determined property exclusion type - neo4j_top_properties_to_skip: {neo4j_top_properties_to_skip}") - logger.info(f"Determined property exclusion type - neo4j_nested_properties_to_skip: {neo4j_nested_properties_to_skip}") + logger.info(f"Determined property exclusion type - triggered_top_props_to_skip: {triggered_top_props_to_skip}") + logger.info(f"Determined property exclusion type - neo4j_top_props_to_skip: {neo4j_top_props_to_skip}") + logger.info(f"Determined property exclusion type - neo4j_nested_props_to_skip: {neo4j_nested_props_to_skip}") - # NOTE: Will need to convert the `neo4j_nested_properties_to_skip` to a format that can be used by + # NOTE: Will need to convert the `neo4j_nested_props_to_skip` to a format that can be used by # `exclude_properties_from_response()` - Zhou 10/1/2025 - return triggered_top_properties_to_skip, neo4j_top_properties_to_skip, neo4j_nested_properties_to_skip + return triggered_top_props_to_skip, neo4j_top_props_to_skip, neo4j_nested_props_to_skip """ diff --git a/src/schema/schema_triggers.py b/src/schema/schema_triggers.py index 87211de4..91d8a112 100644 --- a/src/schema/schema_triggers.py +++ b/src/schema/schema_triggers.py @@ -709,13 +709,9 @@ def get_collection_datasets(property_key, normalized_type, request, user_token, logger.info(f"Executing 'get_collection_datasets()' trigger method on uuid: {existing_data_dict['uuid']}") - # Get all the user specified fields either top-level or nested from the original query string in request URL - # Find the specific sub list, depth is limited to 2 - # We only care about the Neo4j node properties as we don't run nested triggers inside a trigger method - # For example, direct_ancestors.files is supported, but direct_ancestors.metadata.acquisition_id is not - Zhou 10/1/2025 - neo4j_properties_to_exclude = _get_neo4j_properties_to_exclude(property_key, request) + neo4j_props_to_exclude = _get_excluded_neo4j_props(property_key, request) - datasets_list = schema_neo4j_queries.get_collection_datasets(schema_manager.get_neo4j_driver_instance(), existing_data_dict['uuid'], properties_to_exclude = neo4j_properties_to_exclude) + datasets_list = schema_neo4j_queries.get_collection_datasets(schema_manager.get_neo4j_driver_instance(), existing_data_dict['uuid'], properties_to_exclude = neo4j_props_to_exclude) # Get rid of the entity node properties that are not defined in the yaml schema # as well as the ones defined as `exposed: false` in the yaml schema @@ -787,13 +783,9 @@ def get_dataset_collections(property_key, normalized_type, request, user_token, logger.info(f"Executing 'get_dataset_collections()' trigger method on uuid: {existing_data_dict['uuid']}") - # Get all the user specified fields either top-level or nested from the original query string in request URL - # Find the specific sub list, depth is limited to 2 - # We only care about the Neo4j node properties as we don't run nested triggers inside a trigger method - # For example, direct_ancestors.files is supported, but direct_ancestors.metadata.acquisition_id is not - Zhou 10/1/2025 - neo4j_properties_to_exclude = _get_neo4j_properties_to_exclude(property_key, request) + neo4j_props_to_exclude = _get_excluded_neo4j_props(property_key, request) - collections_list = schema_neo4j_queries.get_dataset_collections(schema_manager.get_neo4j_driver_instance(), existing_data_dict['uuid'], property_key = None, properties_to_exclude = neo4j_properties_to_exclude) + collections_list = schema_neo4j_queries.get_dataset_collections(schema_manager.get_neo4j_driver_instance(), existing_data_dict['uuid'], property_key = None, properties_to_exclude = neo4j_props_to_exclude) # Get rid of the entity node properties that are not defined in the yaml schema # as well as the ones defined as `exposed: false` in the yaml schema @@ -871,13 +863,9 @@ def get_dataset_upload(property_key, normalized_type, request, user_token, exist logger.info(f"Executing 'get_dataset_upload()' trigger method on uuid: {existing_data_dict['uuid']}") - # Get all the user specified fields either top-level or nested from the original query string in request URL - # Find the specific sub list, depth is limited to 2 - # We only care about the Neo4j node properties as we don't run nested triggers inside a trigger method - # For example, direct_ancestors.files is supported, but direct_ancestors.metadata.acquisition_id is not - Zhou 10/1/2025 - neo4j_properties_to_exclude = _get_neo4j_properties_to_exclude(property_key, request) + neo4j_props_to_exclude = _get_excluded_neo4j_props(property_key, request) - upload_dict = schema_neo4j_queries.get_dataset_upload(schema_manager.get_neo4j_driver_instance(), existing_data_dict['uuid'], properties_to_exclude = neo4j_properties_to_exclude) + upload_dict = schema_neo4j_queries.get_dataset_upload(schema_manager.get_neo4j_driver_instance(), existing_data_dict['uuid'], properties_to_exclude = neo4j_props_to_exclude) # Get rid of the entity node properties that are not defined in the yaml schema # as well as the ones defined as `exposed: false` in the yaml schema @@ -1005,13 +993,9 @@ def get_dataset_direct_ancestors(property_key, normalized_type, request, user_to logger.info(f"Executing 'get_dataset_direct_ancestors()' trigger method on uuid: {existing_data_dict['uuid']}") - # Get all the user specified fields either top-level or nested from the original query string in request URL - # Find the specific sub list, depth is limited to 2 - # We only care about the Neo4j node properties as we don't run nested triggers inside a trigger method - # For example, direct_ancestors.files is supported, but direct_ancestors.metadata.acquisition_id is not - Zhou 10/1/2025 - neo4j_properties_to_exclude = _get_neo4j_properties_to_exclude(property_key, request) + neo4j_props_to_exclude = _get_excluded_neo4j_props(property_key, request) - direct_ancestors_list = schema_neo4j_queries.get_dataset_direct_ancestors(schema_manager.get_neo4j_driver_instance(), existing_data_dict['uuid'], property_key = None, properties_to_exclude = neo4j_properties_to_exclude) + direct_ancestors_list = schema_neo4j_queries.get_dataset_direct_ancestors(schema_manager.get_neo4j_driver_instance(), existing_data_dict['uuid'], property_key = None, properties_to_exclude = neo4j_props_to_exclude) # Get rid of the entity node properties that are not defined in the yaml schema # as well as the ones defined as `exposed: false` in the yaml schema @@ -2027,13 +2011,9 @@ def get_sample_direct_ancestor(property_key, normalized_type, request, user_toke logger.info(f"Executing 'get_sample_direct_ancestor()' trigger method on uuid: {existing_data_dict['uuid']}") - # Get all the user specified fields either top-level or nested from the original query string in request URL - # Find the specific sub list, depth is limited to 2 - # We only care about the Neo4j node properties as we don't run nested triggers inside a trigger method - # For example, direct_ancestors.files is supported, but direct_ancestors.metadata.acquisition_id is not - Zhou 10/1/2025 - neo4j_properties_to_exclude = _get_neo4j_properties_to_exclude(property_key, request) + neo4j_props_to_exclude = _get_excluded_neo4j_props(property_key, request) - direct_ancestor_dict = schema_neo4j_queries.get_sample_direct_ancestor(schema_manager.get_neo4j_driver_instance(), existing_data_dict['uuid'], property_key = None, properties_to_exclude = neo4j_properties_to_exclude) + direct_ancestor_dict = schema_neo4j_queries.get_sample_direct_ancestor(schema_manager.get_neo4j_driver_instance(), existing_data_dict['uuid'], property_key = None, properties_to_exclude = neo4j_props_to_exclude) # Get rid of the entity node properties that are not defined in the yaml schema # as well as the ones defined as `exposed: false` in the yaml schema @@ -2300,13 +2280,9 @@ def get_upload_datasets(property_key, normalized_type, request, user_token, exis logger.info(f"Executing 'get_upload_datasets()' trigger method on uuid: {existing_data_dict['uuid']}") - # Get all the user specified fields either top-level or nested from the original query string in request URL - # Find the specific sub list, depth is limited to 2 - # We only care about the Neo4j node properties as we don't run nested triggers inside a trigger method - # For example, direct_ancestors.files is supported, but direct_ancestors.metadata.acquisition_id is not - Zhou 10/1/2025 - neo4j_properties_to_exclude = _get_neo4j_properties_to_exclude(property_key, request) + neo4j_props_to_exclude = _get_excluded_neo4j_props(property_key, request) - datasets_list = schema_neo4j_queries.get_upload_datasets(schema_manager.get_neo4j_driver_instance(), existing_data_dict['uuid'], property_key = None, properties_to_exclude = neo4j_properties_to_exclude) + datasets_list = schema_neo4j_queries.get_upload_datasets(schema_manager.get_neo4j_driver_instance(), existing_data_dict['uuid'], property_key = None, properties_to_exclude = neo4j_props_to_exclude) # Get rid of the entity node properties that are not defined in the yaml schema # as well as the ones defined as `exposed: false` in the yaml schema @@ -2719,12 +2695,12 @@ def _get_age_age_units_race_sex_phrase(age:str=None, age_units:str='units', race ------- list: A list containing Neo4j node properties to exclude """ -def _get_neo4j_properties_to_exclude(property_key, request): - neo4j_properties_to_exclude = [] +def _get_excluded_neo4j_props(property_key, request): + neo4j_props_to_exclude = [] # Get all the user specified fields either top-level or nested from the original query string in request URL try: - all_properties_to_exclude = schema_manager.get_all_fields_to_exclude_from_query_string(request) + all_props_to_exclude = schema_manager.get_excluded_query_props(request) except ValueError as e: raise ValueError(e) except Exception as e: @@ -2733,7 +2709,7 @@ def _get_neo4j_properties_to_exclude(property_key, request): # Find the specific sub list, depth is limited to 2 # We only care about the Neo4j node properties as we don't run nested triggers inside a trigger method # For example, direct_ancestors.files is supported, but direct_ancestors.metadata.acquisition_id is not - Zhou 10/1/2025 - grouped_fields = schema_manager.group_dot_notation_fields(all_properties_to_exclude) + grouped_fields = schema_manager.group_dot_notation_props(all_props_to_exclude) for item in grouped_fields: # Find the depth 2 properties (top-level to this triggered entity) @@ -2742,13 +2718,13 @@ def _get_neo4j_properties_to_exclude(property_key, request): if not isinstance(field, str): item[property_key].pop(field) - neo4j_properties_to_exclude = item[property_key] + neo4j_props_to_exclude = item[property_key] - logger.info(f"User specified neo4j properties to exclude in request URL: {neo4j_properties_to_exclude}") + logger.info(f"User specified neo4j properties to exclude in request URL: {neo4j_props_to_exclude}") # Stop after finding the first match break - return neo4j_properties_to_exclude + return neo4j_props_to_exclude