Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
2.6.0
2.6.1
112 changes: 84 additions & 28 deletions src/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -774,11 +774,77 @@ def get_entity_by_id(id):
# Otherwise query against uuid-api and neo4j to get the entity dict if the id exists
entity_dict = query_target_entity(id, token)
normalized_entity_type = entity_dict['entity_type']

# These are the top-level fields and nested fields defined in the schema yaml
fields_to_exclude = schema_manager.get_fields_to_exclude(normalized_entity_type)

# Only support defined query string parameters for filtering purposes
# 'property' was initially introduced to return one of the single fields ['data_access_level', 'status']
# 'exclude' is newly added to reduce the large payload caused by certain fields (`direct_ancestors.files` for instance)
# When both 'property' and 'exclude' are specified in the URL, 'property' dominates
# since the final result is a single field value - Zhou 10/1/2025
supported_qs_params = ['property', 'exclude']

# There are three types of properties that can be excluded from the GET response
# - top-level properties generated by trigger methods
# - top-level properties returned as part of Neo4j node properties
# - second-level properties returned by Neo4j but nested and can't be skipped in Cypher query
triggered_top_properties_to_skip = []
neo4j_top_properties_to_skip = []
neo4j_nested_properties_to_skip = []

if bool(request.args):
# First make sure the user provided query string params are valid
for param in request.args:
if param not in supported_qs_params:
bad_request_error(f"Only the following URL query string parameters (case-sensitive) are supported: {COMMA_SEPARATOR.join(supported_qs_params)}")

# Return a single property key and value using ?property=<property_key>
if 'property' in request.args:
single_property_key = request.args.get('property')

# Single property key that is immediately available in Neo4j without running any triggers
# The `data_access_level` property is available in all entities Donor/Sample/Dataset
# and this filter is being used by gateway to check the data_access_level for file assets
# The `status` property is only available in Dataset and being used by search-api for revision
supported_property_keys = ['data_access_level', 'status']

# Validate the target property
if single_property_key not in supported_property_keys:
bad_request_error(f"Only the following property keys are supported in the query string: {COMMA_SEPARATOR.join(supported_property_keys)}")

if single_property_key == 'status' and \
not schema_manager.entity_type_instanceof(normalized_entity_type, 'Dataset'):
bad_request_error(f"Only Dataset or Publication supports 'status' property key in the query string")

# Response with the property value directly
# Don't use jsonify() on string value
return entity_dict[single_property_key]

# Exclude fields—either top-level or nested—specified by the user via the URL query string,
# using the format `?exclude=a.b,a.c,x`, where:
# - `x` is a top-level property of the target entity
# - `a.b` and `a.c` are nested fields in a dot-notated form (b and c could be from a different entity type)
#
# Note: This is not the most efficient approach, as exclusion is performed after the Neo4j query
# rather than within it. However, it leverages the existing `exclude_properties_from_response()`
# function for simplicity and maintainability. - Zhou 10/1/2025
try:
all_properties_to_exclude = schema_manager.get_all_fields_to_exclude_from_query_string(request)

# Determine which top-level properties to exclude from triggers and which to exclude directly from the resulting Neo4j `entity_dict`
# Also get nested properties that are directly returned from Neo4j, which will be handled differently
triggered_top_properties_to_skip, neo4j_top_properties_to_skip, neo4j_nested_properties_to_skip = schema_manager.determine_property_exclusion_type(normalized_entity_type, all_properties_to_exclude)
except ValueError as e:
bad_request_error(e)
except Exception as e:
internal_server_error(e)

# Get the generated complete entity result from cache if exists
# Otherwise re-generate on the fly
complete_dict = schema_manager.get_complete_entity_result(request, token, entity_dict)
# NOTE: top-level properties in `triggered_top_properties_to_skip` will skip the trigger methods
# Nested properties like `direct_ancestors.files` will be handled by the trigger method - Zhou 10/1/2025
complete_dict = schema_manager.get_complete_entity_result(request, token, entity_dict, triggered_top_properties_to_skip)

# Determine if the entity is publicly visible base on its data, only.
# To verify if a Collection is public, it is necessary to have its Datasets, which
Expand Down Expand Up @@ -813,37 +879,27 @@ def get_entity_by_id(id):
forbidden_error(f"The requested {normalized_entity_type} has non-public data."
f" A Globus token with access permission is required.")

# Remove the top-level properties that are directly available in the resulting Neo4j `entity_dict`
# Due to the use of entity cache from `query_target_entity()`, we don't want to exclude the `neo4j_top_properties_to_skip`
# from actual Neo4j query. And it's not a performance concern either. - Zhou 10/1/2025
for item in neo4j_top_properties_to_skip:
complete_dict.pop(item)

# Also normalize the result based on schema
final_result = schema_manager.normalize_entity_result_for_response(complete_dict)

# Result filtering based on query string
# The `data_access_level` property is available in all entities Donor/Sample/Dataset
# and this filter is being used by gateway to check the data_access_level for file assets
# The `status` property is only available in Dataset and being used by search-api for revision
result_filtering_accepted_property_keys = ['data_access_level', 'status']

if bool(request.args):
property_key = request.args.get('property')

if property_key is not None:
# Validate the target property
if property_key not in result_filtering_accepted_property_keys:
bad_request_error(f"Only the following property keys are supported in the query string: {COMMA_SEPARATOR.join(result_filtering_accepted_property_keys)}")

if property_key == 'status' and \
not schema_manager.entity_type_instanceof(normalized_entity_type, 'Dataset'):
bad_request_error(f"Only Dataset or Publication supports 'status' property key in the query string")
# In addition, there may be nested fields like `ingest_metadata.dag_provenance_list` (for Dataset)
# where `ingest_metadata` is an actual Neo4j node string property containing `dag_provenance_list`
# For such cases, we can't handle via simple Neo4j query. Instead, exclude at Python app level.
# NOTE: need to convert the `neo4j_nested_properties_to_skip` to a format that can be used by
# `exclude_properties_from_response()` - Zhou 10/1/2025
final_result = schema_manager.exclude_properties_from_response(schema_manager.group_dot_notation_fields(neo4j_nested_properties_to_skip), final_result)

# Response with the property value directly
# Don't use jsonify() on string value
return complete_dict[property_key]
else:
bad_request_error("The specified query string is not supported. Use '?property=<key>' to filter the result")
else:
# Response with the dict
if public_entity and not user_in_hubmap_read_group(request):
final_result = schema_manager.exclude_properties_from_response(fields_to_exclude, final_result)
return jsonify(final_result)
# Response with the dict
if public_entity and not user_in_hubmap_read_group(request):
final_result = schema_manager.exclude_properties_from_response(fields_to_exclude, final_result)

return jsonify(final_result)


"""
Expand Down
2 changes: 1 addition & 1 deletion src/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ PyYAML==5.4.1
# Use the branch name of commons from github for testing new changes made in commons from different branch
# Default is main branch specified in docker-compose.development.yml if not set
# git+https://github.com/hubmapconsortium/commons.git@${COMMONS_BRANCH}#egg=hubmap-commons
hubmap-commons==2.1.19
hubmap-commons==2.1.21

# For unit test
nose2==0.10.0
170 changes: 168 additions & 2 deletions src/schema/schema_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -305,7 +305,7 @@ def get_fields_to_exclude(normalized_class=None):
Parameters
----------
excluded_fields : list
A list of the fields to be excluded
A JSON list of the fields to be excluded, may have nested fields
output_dict : dictionary
A dictionary representing the data to be modified

Expand Down Expand Up @@ -349,6 +349,171 @@ def delete_nested_field(data, nested_path):
return output_dict


"""
Use the Flask request.args MultiDict to see if 'exclude' is a URL parameter passed in with the
request and parse the comma-separated properties to be excluded from final response

For now, only support one dot for nested fields (depth 2)

Parameters
----------
request: Flask request object
The instance of Flask request passed in from application request

Returns
-------
list
A flat list of strings containing top-level and/or nested dot-notated properties
Example: ['a.b', 'a.c', 'x']
"""
def get_all_fields_to_exclude_from_query_string(request):
all_properties_to_exclude = []

if 'exclude' in request.args:
# The query string values are case-sensitive as the property keys in schema yaml are case-sensitive
properties_to_exclude_str = request.args.get('exclude')

if properties_to_exclude_str:
# Must all lowercase values
has_upper = any(c.isupper() for c in properties_to_exclude_str)

if has_upper:
raise ValueError("All the properties specified in 'exclude' query string in URL must be lowercase.")

all_properties_to_exclude = [item.strip() for item in properties_to_exclude_str.split(",")]

logger.info(f"User specified properties to exclude in request URL: {all_properties_to_exclude}")
else:
raise ValueError("The value of the 'exclude' query string parameter can not be empty and must be similar to the form of 'a, b, c, d.e, ...' (case-sensitive).")

# A bit more validation to limit to depth 2
for item in all_properties_to_exclude:
if '.' in item:
if len(item.split('.')) > 2:
raise ValueError("Only single dot-separated keys are allowed in 'exclude' (e.g., a.b). Keys with multiple dots like a.b.c are not supported.")

# More validation - ensure prohibited properties are not accepted
# This two properties are required internally by `normalize_entity_result_for_response()`
prohibited_properties = ['uuid', 'entity_type']
second_level_list = []

for item in all_properties_to_exclude:
if item in prohibited_properties or item.split('.')[1] in prohibited_properties:
raise ValueError(f"Entity property '{item}' is not allowed in the 'exclude' query parameter.")

return all_properties_to_exclude


"""
Transform a flat list of dot-notated strings into a hybrid list that:
- keeps plain strings as-is
- converts entries with dot-notation (like 'direct_ancestors.files') into a dictionary, grouping by the prefix

Example: ['a.b', 'a.c', 'x'] -> ['x', {'a': ['b', 'c']}]

Used by `GET /entities/<id>?exclude=a.b, a.c, x` to build a JSON list
that can be futher processed by `exclude_properties_from_response()`.

Parameters
----------
flat_list : list
A flat list of strings, dot-notated strings are optional and can be used to indicate nested fields
Example: ['a.b', 'a.c', 'x']

Returns
-------
list
A list mixing strings and grouped dicts, like ['x', {'a': ['b', 'c']}]
"""
def group_dot_notation_fields(flat_list):
output_list = []
grouped_dict = {}

for item in flat_list:
# For now, only support one dot for nested fields (depth 2)
if '.' in item:
prefix, field = item.split('.', 1)
grouped_dict.setdefault(prefix, []).append(field)
else:
output_list.append(item)

# Add grouped items as dictionaries
for prefix, fields in grouped_dict.items():
output_list.append({prefix: fields})

return output_list


"""
Group properties by exclusion type

Example: ['a.b', 'a.c', 'x', 'y'] where
- x and y are top-level properties
- x is Neo4j node property, and y is generated via trigger method
- a.b and a.c are nested properties while a is a top-level property of either type

Parameters
----------
normalized_entity_type : str
One of the normalized entity types: Dataset, Collection, Sample, Donor, Upload, Publication
flat_list : list
A flat list of strings, dot-notated strings are optional and can be used to indicate nested fields
Example: ['a.b', 'a.c', 'x']

Returns
-------
list
Three lists - one for triggered properties and one for Neo4j node properties

Example for Dataset:
- triggered_top_properties_to_skip: ['direct_ancestors.files', 'direct_ancestors.ingest_metadata', 'upload.title']
- neo4j_top_properties_to_skip: ['data_access_level']
- neo4j_nested_properties_to_skip: ['status_history.status']
"""
def determine_property_exclusion_type(normalized_entity_type, flat_list):
global _schema

triggered_top_properties_to_skip = []
neo4j_top_properties_to_skip = []
neo4j_nested_properties_to_skip =[]
top_level_list = []
second_level_list = []
properties = _schema['ENTITIES'][normalized_entity_type]['properties']

# First find the top-level properties without using dot-notation
for item in flat_list:
if '.' not in item:
top_level_list.append(item)
else:
second_level_list.append(item)

# Only care about the properties defined in schema yaml
for item in top_level_list:
if item in properties:
if 'on_read_trigger' in properties[item]:
triggered_top_properties_to_skip.append(item)
else:
neo4j_top_properties_to_skip.append(item)

# Nested second-level properties, such as `direct_ancestors.files`, belong to `triggered_top_properties_to_skip`
# `ingest_metadata.dag_provenance_list` belongs to `neo4j_nested_properties_to_skip`
for item in second_level_list:
prefix = item.split('.')[0]
if prefix in properties:
if 'on_read_trigger' in properties[prefix]:
triggered_top_properties_to_skip.append(item)
else:
neo4j_nested_properties_to_skip.append(item)

logger.info(f"Determined property exclusion type - triggered_top_properties_to_skip: {triggered_top_properties_to_skip}")
logger.info(f"Determined property exclusion type - neo4j_top_properties_to_skip: {neo4j_top_properties_to_skip}")
logger.info(f"Determined property exclusion type - neo4j_nested_properties_to_skip: {neo4j_nested_properties_to_skip}")

# NOTE: Will need to convert the `neo4j_nested_properties_to_skip` to a format that can be used by
# `exclude_properties_from_response()` - Zhou 10/1/2025
return triggered_top_properties_to_skip, neo4j_top_properties_to_skip, neo4j_nested_properties_to_skip


"""
Generating triggered data based on the target events and methods

Expand Down Expand Up @@ -396,6 +561,8 @@ def generate_triggered_data(trigger_type: TriggerTypeEnum, normalized_class, req
# decides the ordering of which trigger method gets to run first
properties = schema_section[normalized_class]['properties']

logger.info(f"Skipping triggered data generation for the following properties: {properties_to_skip}")

# Set each property value and put all resulting data into a dictionary for:
# before_create_trigger|before_update_trigger|on_read_trigger
# No property value to be set for: after_create_trigger|after_update_trigger
Expand Down Expand Up @@ -2001,7 +2168,6 @@ def convert_str_literal(data_str):
data = ast.literal_eval(data_str)

if isinstance(data, (list, dict)):
logger.info(f"The input string literal has been converted to {type(data)} successfully")
return data
else:
logger.info(f"The input string literal is not list or dict after evaluation, return the original string input")
Expand Down
Loading