Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
2.6.0
2.6.1
112 changes: 84 additions & 28 deletions src/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -774,11 +774,77 @@ def get_entity_by_id(id):
# Otherwise query against uuid-api and neo4j to get the entity dict if the id exists
entity_dict = query_target_entity(id, token)
normalized_entity_type = entity_dict['entity_type']

# These are the top-level fields and nested fields defined in the schema yaml
fields_to_exclude = schema_manager.get_fields_to_exclude(normalized_entity_type)

# Only support defined query string parameters for filtering purposes
# 'property' was initially introduced to return one of the single fields ['data_access_level', 'status']
# 'exclude' is newly added to reduce the large payload caused by certain fields (`direct_ancestors.files` for instance)
# When both 'property' and 'exclude' are specified in the URL, 'property' dominates
# since the final result is a single field value - Zhou 10/1/2025
supported_qs_params = ['property', 'exclude']

# There are three types of properties that can be excluded from the GET response
# - top-level properties generated by trigger methods
# - top-level properties returned as part of Neo4j node properties
# - second-level properties returned by Neo4j but nested and can't be skipped in Cypher query
triggered_top_properties_to_skip = []
neo4j_top_properties_to_skip = []
neo4j_nested_properties_to_skip = []

if bool(request.args):
# First make sure the user provided query string params are valid
for param in request.args:
if param not in supported_qs_params:
bad_request_error(f"Only the following URL query string parameters (case-sensitive) are supported: {COMMA_SEPARATOR.join(supported_qs_params)}")

# Return a single property key and value using ?property=<property_key>
if 'property' in request.args:
single_property_key = request.args.get('property')

# Single property key that is immediately available in Neo4j without running any triggers
# The `data_access_level` property is available in all entities Donor/Sample/Dataset
# and this filter is being used by gateway to check the data_access_level for file assets
# The `status` property is only available in Dataset and being used by search-api for revision
supported_property_keys = ['data_access_level', 'status']

# Validate the target property
if single_property_key not in supported_property_keys:
bad_request_error(f"Only the following property keys are supported in the query string: {COMMA_SEPARATOR.join(supported_property_keys)}")

if single_property_key == 'status' and \
not schema_manager.entity_type_instanceof(normalized_entity_type, 'Dataset'):
bad_request_error(f"Only Dataset or Publication supports 'status' property key in the query string")

# Response with the property value directly
# Don't use jsonify() on string value
return entity_dict[single_property_key]

# Exclude fields—either top-level or nested—specified by the user via the URL query string,
# using the format `?exclude=a.b,a.c,x`, where:
# - `x` is a top-level property of the target entity
# - `a.b` and `a.c` are nested fields in a dot-notated form (b and c could be from a different entity type)
#
# Note: This is not the most efficient approach, as exclusion is performed after the Neo4j query
# rather than within it. However, it leverages the existing `exclude_properties_from_response()`
# function for simplicity and maintainability. - Zhou 10/1/2025
try:
all_properties_to_exclude = schema_manager.get_all_fields_to_exclude_from_query_string(request)

# Determine which top-level properties to exclude from triggers and which to exclude directly from the resulting Neo4j `entity_dict`
# Also get nested properties that are directly returned from Neo4j, which will be handled differently
triggered_top_properties_to_skip, neo4j_top_properties_to_skip, neo4j_nested_properties_to_skip = schema_manager.determine_property_exclusion_type(normalized_entity_type, all_properties_to_exclude)
except ValueError as e:
bad_request_error(e)
except Exception as e:
internal_server_error(e)

# Get the generated complete entity result from cache if exists
# Otherwise re-generate on the fly
complete_dict = schema_manager.get_complete_entity_result(request, token, entity_dict)
# NOTE: top-level properties in `triggered_top_properties_to_skip` will skip the trigger methods
# Nested properties like `direct_ancestors.files` will be handled by the trigger method - Zhou 10/1/2025
complete_dict = schema_manager.get_complete_entity_result(request, token, entity_dict, triggered_top_properties_to_skip)

# Determine if the entity is publicly visible base on its data, only.
# To verify if a Collection is public, it is necessary to have its Datasets, which
Expand Down Expand Up @@ -813,37 +879,27 @@ def get_entity_by_id(id):
forbidden_error(f"The requested {normalized_entity_type} has non-public data."
f" A Globus token with access permission is required.")

# Remove the top-level properties that are directly available in the resulting Neo4j `entity_dict`
# Due to the use of entity cache from `query_target_entity()`, we don't want to exclude the `neo4j_top_properties_to_skip`
# from actual Neo4j query. And it's not a performance concern either. - Zhou 10/1/2025
for item in neo4j_top_properties_to_skip:
complete_dict.pop(item)

# Also normalize the result based on schema
final_result = schema_manager.normalize_entity_result_for_response(complete_dict)

# Result filtering based on query string
# The `data_access_level` property is available in all entities Donor/Sample/Dataset
# and this filter is being used by gateway to check the data_access_level for file assets
# The `status` property is only available in Dataset and being used by search-api for revision
result_filtering_accepted_property_keys = ['data_access_level', 'status']

if bool(request.args):
property_key = request.args.get('property')

if property_key is not None:
# Validate the target property
if property_key not in result_filtering_accepted_property_keys:
bad_request_error(f"Only the following property keys are supported in the query string: {COMMA_SEPARATOR.join(result_filtering_accepted_property_keys)}")

if property_key == 'status' and \
not schema_manager.entity_type_instanceof(normalized_entity_type, 'Dataset'):
bad_request_error(f"Only Dataset or Publication supports 'status' property key in the query string")
# In addition, there may be nested fields like `ingest_metadata.dag_provenance_list` (for Dataset)
# where `ingest_metadata` is an actual Neo4j node string property containing `dag_provenance_list`
# For such cases, we can't handle via simple Neo4j query. Instead, exclude at Python app level.
# NOTE: need to convert the `neo4j_nested_properties_to_skip` to a format that can be used by
# `exclude_properties_from_response()` - Zhou 10/1/2025
final_result = schema_manager.exclude_properties_from_response(schema_manager.group_dot_notation_fields(neo4j_nested_properties_to_skip), final_result)

# Response with the property value directly
# Don't use jsonify() on string value
return complete_dict[property_key]
else:
bad_request_error("The specified query string is not supported. Use '?property=<key>' to filter the result")
else:
# Response with the dict
if public_entity and not user_in_hubmap_read_group(request):
final_result = schema_manager.exclude_properties_from_response(fields_to_exclude, final_result)
return jsonify(final_result)
# Response with the dict
if public_entity and not user_in_hubmap_read_group(request):
final_result = schema_manager.exclude_properties_from_response(fields_to_exclude, final_result)

return jsonify(final_result)


"""
Expand Down
2 changes: 1 addition & 1 deletion src/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ PyYAML==5.4.1
# Use the branch name of commons from github for testing new changes made in commons from different branch
# Default is main branch specified in docker-compose.development.yml if not set
# git+https://github.com/hubmapconsortium/commons.git@${COMMONS_BRANCH}#egg=hubmap-commons
hubmap-commons==2.1.19
hubmap-commons==2.1.21

# For unit test
nose2==0.10.0
170 changes: 168 additions & 2 deletions src/schema/schema_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -305,7 +305,7 @@ def get_fields_to_exclude(normalized_class=None):
Parameters
----------
excluded_fields : list
A list of the fields to be excluded
A JSON list of the fields to be excluded, may have nested fields
output_dict : dictionary
A dictionary representing the data to be modified

Expand Down Expand Up @@ -349,6 +349,171 @@ def delete_nested_field(data, nested_path):
return output_dict


"""
Use the Flask request.args MultiDict to see if 'exclude' is a URL parameter passed in with the
request and parse the comma-separated properties to be excluded from final response

For now, only support one dot for nested fields (depth 2)

Parameters
----------
request: Flask request object
The instance of Flask request passed in from application request

Returns
-------
list
A flat list of strings containing top-level and/or nested dot-notated properties
Example: ['a.b', 'a.c', 'x']
"""
def get_all_fields_to_exclude_from_query_string(request):
all_properties_to_exclude = []

if 'exclude' in request.args:
# The query string values are case-sensitive as the property keys in schema yaml are case-sensitive
properties_to_exclude_str = request.args.get('exclude')

if properties_to_exclude_str:
# Must all lowercase values
has_upper = any(c.isupper() for c in properties_to_exclude_str)

if has_upper:
raise ValueError("All the properties specified in 'exclude' query string in URL must be lowercase.")

all_properties_to_exclude = [item.strip() for item in properties_to_exclude_str.split(",")]

logger.info(f"User specified properties to exclude in request URL: {all_properties_to_exclude}")
else:
raise ValueError("The value of the 'exclude' query string parameter can not be empty and must be similar to the form of 'a, b, c, d.e, ...' (case-sensitive).")

# A bit more validation to limit to depth 2
for item in all_properties_to_exclude:
if '.' in item:
if len(item.split('.')) > 2:
raise ValueError("Only single dot-separated keys are allowed in 'exclude' (e.g., a.b). Keys with multiple dots like a.b.c are not supported.")

# More validation - ensure prohibited properties are not accepted
# This two properties are required internally by `normalize_entity_result_for_response()`
prohibited_properties = ['uuid', 'entity_type']
second_level_list = []

for item in all_properties_to_exclude:
if item in prohibited_properties or item.split('.')[1] in prohibited_properties:
raise ValueError(f"Entity property '{item}' is not allowed in the 'exclude' query parameter.")

return all_properties_to_exclude


"""
Transform a flat list of dot-notated strings into a hybrid list that:
- keeps plain strings as-is
- converts entries with dot-notation (like 'direct_ancestors.files') into a dictionary, grouping by the prefix

Example: ['a.b', 'a.c', 'x'] -> ['x', {'a': ['b', 'c']}]

Used by `GET /entities/<id>?exclude=a.b, a.c, x` to build a JSON list
that can be futher processed by `exclude_properties_from_response()`.

Parameters
----------
flat_list : list
A flat list of strings, dot-notated strings are optional and can be used to indicate nested fields
Example: ['a.b', 'a.c', 'x']

Returns
-------
list
A list mixing strings and grouped dicts, like ['x', {'a': ['b', 'c']}]
"""
def group_dot_notation_fields(flat_list):
output_list = []
grouped_dict = {}

for item in flat_list:
# For now, only support one dot for nested fields (depth 2)
if '.' in item:
prefix, field = item.split('.', 1)
grouped_dict.setdefault(prefix, []).append(field)
else:
output_list.append(item)

# Add grouped items as dictionaries
for prefix, fields in grouped_dict.items():
output_list.append({prefix: fields})

return output_list


"""
Group properties by exclusion type

Example: ['a.b', 'a.c', 'x', 'y'] where
- x and y are top-level properties
- x is Neo4j node property, and y is generated via trigger method
- a.b and a.c are nested properties while a is a top-level property of either type

Parameters
----------
normalized_entity_type : str
One of the normalized entity types: Dataset, Collection, Sample, Donor, Upload, Publication
flat_list : list
A flat list of strings, dot-notated strings are optional and can be used to indicate nested fields
Example: ['a.b', 'a.c', 'x']

Returns
-------
list
Three lists - one for triggered properties and one for Neo4j node properties

Example for Dataset:
- triggered_top_properties_to_skip: ['direct_ancestors.files', 'direct_ancestors.ingest_metadata', 'upload.title']
- neo4j_top_properties_to_skip: ['data_access_level']
- neo4j_nested_properties_to_skip: ['status_history.status']
"""
def determine_property_exclusion_type(normalized_entity_type, flat_list):
global _schema

triggered_top_properties_to_skip = []
neo4j_top_properties_to_skip = []
neo4j_nested_properties_to_skip =[]
top_level_list = []
second_level_list = []
properties = _schema['ENTITIES'][normalized_entity_type]['properties']

# First find the top-level properties without using dot-notation
for item in flat_list:
if '.' not in item:
top_level_list.append(item)
else:
second_level_list.append(item)

# Only care about the properties defined in schema yaml
for item in top_level_list:
if item in properties:
if 'on_read_trigger' in properties[item]:
triggered_top_properties_to_skip.append(item)
else:
neo4j_top_properties_to_skip.append(item)

# Nested second-level properties, such as `direct_ancestors.files`, belong to `triggered_top_properties_to_skip`
# `ingest_metadata.dag_provenance_list` belongs to `neo4j_nested_properties_to_skip`
for item in second_level_list:
prefix = item.split('.')[0]
if prefix in properties:
if 'on_read_trigger' in properties[prefix]:
triggered_top_properties_to_skip.append(item)
else:
neo4j_nested_properties_to_skip.append(item)

logger.info(f"Determined property exclusion type - triggered_top_properties_to_skip: {triggered_top_properties_to_skip}")
logger.info(f"Determined property exclusion type - neo4j_top_properties_to_skip: {neo4j_top_properties_to_skip}")
logger.info(f"Determined property exclusion type - neo4j_nested_properties_to_skip: {neo4j_nested_properties_to_skip}")

# NOTE: Will need to convert the `neo4j_nested_properties_to_skip` to a format that can be used by
# `exclude_properties_from_response()` - Zhou 10/1/2025
return triggered_top_properties_to_skip, neo4j_top_properties_to_skip, neo4j_nested_properties_to_skip


"""
Generating triggered data based on the target events and methods

Expand Down Expand Up @@ -396,6 +561,8 @@ def generate_triggered_data(trigger_type: TriggerTypeEnum, normalized_class, req
# decides the ordering of which trigger method gets to run first
properties = schema_section[normalized_class]['properties']

logger.info(f"Skipping triggered data generation for the following properties: {properties_to_skip}")

# Set each property value and put all resulting data into a dictionary for:
# before_create_trigger|before_update_trigger|on_read_trigger
# No property value to be set for: after_create_trigger|after_update_trigger
Expand Down Expand Up @@ -2001,7 +2168,6 @@ def convert_str_literal(data_str):
data = ast.literal_eval(data_str)

if isinstance(data, (list, dict)):
logger.info(f"The input string literal has been converted to {type(data)} successfully")
return data
else:
logger.info(f"The input string literal is not list or dict after evaluation, return the original string input")
Expand Down
Loading