Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 17 additions & 17 deletions src/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -783,21 +783,21 @@ def get_entity_by_id(id):
# 'exclude' is newly added to reduce the large payload caused by certain fields (`direct_ancestors.files` for instance)
# When both 'property' and 'exclude' are specified in the URL, 'property' dominates
# since the final result is a single field value - Zhou 10/1/2025
supported_qs_params = ['property', 'exclude']
supported_query_params = ['property', 'exclude']

# There are three types of properties that can be excluded from the GET response
# - top-level properties generated by trigger methods
# - top-level properties returned as part of Neo4j node properties
# - second-level properties returned by Neo4j but nested and can't be skipped in Cypher query
triggered_top_properties_to_skip = []
neo4j_top_properties_to_skip = []
neo4j_nested_properties_to_skip = []
triggered_top_props_to_skip = []
neo4j_top_props_to_skip = []
neo4j_nested_props_to_skip = []

if bool(request.args):
# First make sure the user provided query string params are valid
# First make sure the user provided query params are valid
for param in request.args:
if param not in supported_qs_params:
bad_request_error(f"Only the following URL query string parameters (case-sensitive) are supported: {COMMA_SEPARATOR.join(supported_qs_params)}")
if param not in supported_query_params:
bad_request_error(f"Only the following URL query parameters (case-sensitive) are supported: {COMMA_SEPARATOR.join(supported_query_params)}")

# Return a single property key and value using ?property=<property_key>
if 'property' in request.args:
Expand All @@ -811,11 +811,11 @@ def get_entity_by_id(id):

# Validate the target property
if single_property_key not in supported_property_keys:
bad_request_error(f"Only the following property keys are supported in the query string: {COMMA_SEPARATOR.join(supported_property_keys)}")
bad_request_error(f"Only the following property keys are supported in the query parameter: {COMMA_SEPARATOR.join(supported_property_keys)}")

if single_property_key == 'status' and \
not schema_manager.entity_type_instanceof(normalized_entity_type, 'Dataset'):
bad_request_error(f"Only Dataset or Publication supports 'status' property key in the query string")
bad_request_error(f"Only Dataset or Publication supports 'status' property key in the query parameter")

# Response with the property value directly
# Don't use jsonify() on string value
Expand All @@ -830,21 +830,21 @@ def get_entity_by_id(id):
# rather than within it. However, it leverages the existing `exclude_properties_from_response()`
# function for simplicity and maintainability. - Zhou 10/1/2025
try:
all_properties_to_exclude = schema_manager.get_all_fields_to_exclude_from_query_string(request)
all_props_to_exclude = schema_manager.get_excluded_query_props(request)

# Determine which top-level properties to exclude from triggers and which to exclude directly from the resulting Neo4j `entity_dict`
# Also get nested properties that are directly returned from Neo4j, which will be handled differently
triggered_top_properties_to_skip, neo4j_top_properties_to_skip, neo4j_nested_properties_to_skip = schema_manager.determine_property_exclusion_type(normalized_entity_type, all_properties_to_exclude)
triggered_top_props_to_skip, neo4j_top_props_to_skip, neo4j_nested_props_to_skip = schema_manager.get_exclusion_types(normalized_entity_type, all_props_to_exclude)
except ValueError as e:
bad_request_error(e)
except Exception as e:
internal_server_error(e)

# Get the generated complete entity result from cache if exists
# Otherwise re-generate on the fly
# NOTE: top-level properties in `triggered_top_properties_to_skip` will skip the trigger methods
# NOTE: top-level properties in `triggered_top_props_to_skip` will skip the trigger methods
# Nested properties like `direct_ancestors.files` will be handled by the trigger method - Zhou 10/1/2025
complete_dict = schema_manager.get_complete_entity_result(request, token, entity_dict, triggered_top_properties_to_skip)
complete_dict = schema_manager.get_complete_entity_result(request, token, entity_dict, triggered_top_props_to_skip)

# Determine if the entity is publicly visible based on its data only.
# To verify if a Collection is public, it is necessary to have its Datasets, which
Expand Down Expand Up @@ -880,9 +880,9 @@ def get_entity_by_id(id):
f" A Globus token with access permission is required.")

# Remove the top-level properties that are directly available in the resulting Neo4j `entity_dict`
# Due to the use of entity cache from `query_target_entity()`, we don't want to exclude the `neo4j_top_properties_to_skip`
# Due to the use of entity cache from `query_target_entity()`, we don't want to exclude the `neo4j_top_props_to_skip`
# from actual Neo4j query. And it's not a performance concern either. - Zhou 10/1/2025
for item in neo4j_top_properties_to_skip:
for item in neo4j_top_props_to_skip:
complete_dict.pop(item)

# Also normalize the result based on schema
Expand All @@ -891,9 +891,9 @@ def get_entity_by_id(id):
# In addition, there may be nested fields like `ingest_metadata.dag_provenance_list` (for Dataset)
# where `ingest_metadata` is an actual Neo4j node string property containing `dag_provenance_list`
# For such cases, we can't handle via simple Neo4j query. Instead, exclude at Python app level.
# NOTE: need to convert the `neo4j_nested_properties_to_skip` to a format that can be used by
# NOTE: need to convert the `neo4j_nested_props_to_skip` to a format that can be used by
# `exclude_properties_from_response()` - Zhou 10/1/2025
final_result = schema_manager.exclude_properties_from_response(schema_manager.group_dot_notation_fields(neo4j_nested_properties_to_skip), final_result)
final_result = schema_manager.exclude_properties_from_response(schema_manager.group_dot_notation_props(neo4j_nested_props_to_skip), final_result)

# Response with the dict
if public_entity and not user_in_hubmap_read_group(request):
Expand Down
117 changes: 75 additions & 42 deletions src/schema/schema_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -331,7 +331,7 @@ def delete_nested_field(data, nested_path):
for item in data[key]:
if nested_field in item:
del item[nested_field]
elif nested_field in data[key]:
elif isinstance(data[key], dict) and nested_field in data[key]:
del data[key][nested_field]
elif isinstance(value, dict):
delete_nested_field(data[key], value)
Expand Down Expand Up @@ -366,42 +366,75 @@ def delete_nested_field(data, nested_path):
A flat list of strings containing top-level and/or nested dot-notated properties
Example: ['a.b', 'a.c', 'x']
"""
def get_all_fields_to_exclude_from_query_string(request):
all_properties_to_exclude = []
def get_excluded_query_props(request):
all_props_to_exclude = []

if 'exclude' in request.args:
# The query string values are case-sensitive as the property keys in schema yaml are case-sensitive
properties_to_exclude_str = request.args.get('exclude')

if properties_to_exclude_str:
# Must all lowercase values
has_upper = any(c.isupper() for c in properties_to_exclude_str)

if has_upper:
raise ValueError("All the properties specified in 'exclude' query string in URL must be lowercase.")
props_to_exclude_str = request.args.get('exclude')

all_properties_to_exclude = [item.strip() for item in properties_to_exclude_str.split(",")]
if not validate_comma_separated_exclude_str(props_to_exclude_str):
raise ValueError(
"The 'exclude' query parameter must be a comma-separated list of properties that follow these rules: "
"[1] Each property must include at least one letter; "
"[2] Only lowercase letters and underscores '_' are allowed; "
"[3] Nested property is limited to 2 depths and must use single dot '.' for dot-notation (like 'a.b')."
)

logger.info(f"User specified properties to exclude in request URL: {all_properties_to_exclude}")
else:
raise ValueError("The value of the 'exclude' query string parameter can not be empty and must be similar to the form of 'a, b, c, d.e, ...' (case-sensitive).")
all_props_to_exclude = [item.strip() for item in props_to_exclude_str.split(",")]

logger.info(f"User specified properties to exclude in request URL: {all_props_to_exclude}")

# A bit more validation to limit to depth 2
for item in all_properties_to_exclude:
if '.' in item:
if len(item.split('.')) > 2:
raise ValueError("Only single dot-separated keys are allowed in 'exclude' (e.g., a.b). Keys with multiple dots like a.b.c are not supported.")

# More validation - ensure prohibited properties are not accepted
# These two properties are required internally by `normalize_entity_result_for_response()`
prohibited_properties = ['uuid', 'entity_type']
second_level_list = []

for item in all_properties_to_exclude:
for item in all_props_to_exclude:
if item in prohibited_properties or ('.' in item and item.split('.')[1] in prohibited_properties):
raise ValueError(f"Entity property '{item}' is not allowed in the 'exclude' query parameter.")

return all_properties_to_exclude
return all_props_to_exclude


"""
Check whether the raw value of the 'exclude' query parameter is well-formed.

The value is a comma-separated list of property names. Whitespace surrounding
each name is ignored. Every name must satisfy all of the following:

[1] It contains at least one letter;
[2] It is made up solely of lowercase letters, underscores '_' and dots '.';
[3] It has at most one dot, and that dot is neither the first nor the last
    character (so 'a.b' is accepted while '.a', 'a.' and 'a.b.c' are rejected).

Parameters
----------
s : str
    The raw comma-separated string taken from the 'exclude' query parameter

Returns
-------
bool
    True when every listed name is well-formed, False otherwise
    (including when the string is empty or contains a blank item)
"""
def validate_comma_separated_exclude_str(s: str):
    # Nothing provided means nothing valid
    if not s:
        return False

    for raw_name in s.split(','):
        name = raw_name.strip()

        # A blank entry comes from input such as 'a,,b' or a trailing comma
        if not name:
            return False

        # Every character must be a lowercase letter or one of the two permitted symbols
        for ch in name:
            if not ch.islower() and ch not in '._':
                return False

        # Names built only from '_' and '.' carry no letter and are rejected
        if not any(ch.isalpha() for ch in name):
            return False

        # Dot-notation is limited to a single, interior dot (depth of 2)
        if name.count('.') > 1 or name.startswith('.') or name.endswith('.'):
            return False

    return True


"""
Expand All @@ -425,7 +458,7 @@ def get_all_fields_to_exclude_from_query_string(request):
list
A list mixing strings and grouped dicts, like ['x', {'a': ['b', 'c']}]
"""
def group_dot_notation_fields(flat_list):
def group_dot_notation_props(flat_list):
output_list = []
grouped_dict = {}

Expand Down Expand Up @@ -466,16 +499,16 @@ def group_dot_notation_fields(flat_list):
Three lists - one for triggered properties and one for Neo4j node properties

Example for Dataset:
- triggered_top_properties_to_skip: ['direct_ancestors.files', 'direct_ancestors.ingest_metadata', 'upload.title']
- neo4j_top_properties_to_skip: ['data_access_level']
- neo4j_nested_properties_to_skip: ['status_history.status']
- triggered_top_props_to_skip: ['direct_ancestors.files', 'direct_ancestors.ingest_metadata', 'upload.title']
- neo4j_top_props_to_skip: ['data_access_level']
- neo4j_nested_props_to_skip: ['status_history.status']
"""
def determine_property_exclusion_type(normalized_entity_type, flat_list):
def get_exclusion_types(normalized_entity_type, flat_list):
global _schema

triggered_top_properties_to_skip = []
neo4j_top_properties_to_skip = []
neo4j_nested_properties_to_skip =[]
triggered_top_props_to_skip = []
neo4j_top_props_to_skip = []
neo4j_nested_props_to_skip =[]
top_level_list = []
second_level_list = []
properties = _schema['ENTITIES'][normalized_entity_type]['properties']
Expand All @@ -491,27 +524,27 @@ def determine_property_exclusion_type(normalized_entity_type, flat_list):
for item in top_level_list:
if item in properties:
if TriggerTypeEnum.ON_READ in properties[item]:
triggered_top_properties_to_skip.append(item)
triggered_top_props_to_skip.append(item)
else:
neo4j_top_properties_to_skip.append(item)
neo4j_top_props_to_skip.append(item)

# Nested second-level properties, such as `direct_ancestors.files`, belong to `triggered_top_properties_to_skip`
# `ingest_metadata.dag_provenance_list` belongs to `neo4j_nested_properties_to_skip`
# Nested second-level properties, such as `direct_ancestors.files`, belong to `triggered_top_props_to_skip`
# `ingest_metadata.dag_provenance_list` belongs to `neo4j_nested_props_to_skip`
for item in second_level_list:
prefix = item.split('.')[0]
if prefix in properties:
if TriggerTypeEnum.ON_READ in properties[prefix]:
triggered_top_properties_to_skip.append(item)
triggered_top_props_to_skip.append(item)
else:
neo4j_nested_properties_to_skip.append(item)
neo4j_nested_props_to_skip.append(item)

logger.info(f"Determined property exclusion type - triggered_top_properties_to_skip: {triggered_top_properties_to_skip}")
logger.info(f"Determined property exclusion type - neo4j_top_properties_to_skip: {neo4j_top_properties_to_skip}")
logger.info(f"Determined property exclusion type - neo4j_nested_properties_to_skip: {neo4j_nested_properties_to_skip}")
logger.info(f"Determined property exclusion type - triggered_top_props_to_skip: {triggered_top_props_to_skip}")
logger.info(f"Determined property exclusion type - neo4j_top_props_to_skip: {neo4j_top_props_to_skip}")
logger.info(f"Determined property exclusion type - neo4j_nested_props_to_skip: {neo4j_nested_props_to_skip}")

# NOTE: Will need to convert the `neo4j_nested_properties_to_skip` to a format that can be used by
# NOTE: Will need to convert the `neo4j_nested_props_to_skip` to a format that can be used by
# `exclude_properties_from_response()` - Zhou 10/1/2025
return triggered_top_properties_to_skip, neo4j_top_properties_to_skip, neo4j_nested_properties_to_skip
return triggered_top_props_to_skip, neo4j_top_props_to_skip, neo4j_nested_props_to_skip


"""
Expand Down
Loading