diff --git a/VERSION b/VERSION
index e70b4523..6a6a3d8e 100644
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-2.6.0
+2.6.1
diff --git a/src/app.py b/src/app.py
index f118f000..22013627 100644
--- a/src/app.py
+++ b/src/app.py
@@ -774,11 +774,77 @@ def get_entity_by_id(id):
     # Otherwise query against uuid-api and neo4j to get the entity dict if the id exists
     entity_dict = query_target_entity(id, token)
     normalized_entity_type = entity_dict['entity_type']
+
+    # These are the top-level fields and nested fields defined in the schema yaml
     fields_to_exclude = schema_manager.get_fields_to_exclude(normalized_entity_type)

+    # Only support defined query string parameters for filtering purposes
+    # 'property' was initially introduced to return one of the single fields ['data_access_level', 'status']
+    # 'exclude' is newly added to reduce the large payload caused by certain fields (`direct_ancestors.files` for instance)
+    # When both 'property' and 'exclude' are specified in the URL, 'property' dominates
+    # since the final result is a single field value - Zhou 10/1/2025
+    supported_qs_params = ['property', 'exclude']
+
+    # There are three types of properties that can be excluded from the GET response
+    # - top-level properties generated by trigger methods
+    # - top-level properties returned as part of Neo4j node properties
+    # - second-level properties returned by Neo4j but nested and can't be skipped in Cypher query
+    triggered_top_properties_to_skip = []
+    neo4j_top_properties_to_skip = []
+    neo4j_nested_properties_to_skip = []
+
+    if bool(request.args):
+        # First make sure the user-provided query string params are valid
+        for param in request.args:
+            if param not in supported_qs_params:
+                bad_request_error(f"Only the following URL query string parameters (case-sensitive) are supported: {COMMA_SEPARATOR.join(supported_qs_params)}")
+
+        # Return a single property key and value using ?property=
+        if 'property' in request.args:
+            single_property_key = request.args.get('property')
+
+            # Single property key that is immediately available in Neo4j without running any triggers
+            # The `data_access_level` property is available in all entities Donor/Sample/Dataset
+            # and this filter is being used by gateway to check the data_access_level for file assets
+            # The `status` property is only available in Dataset and being used by search-api for revision
+            supported_property_keys = ['data_access_level', 'status']
+
+            # Validate the target property
+            if single_property_key not in supported_property_keys:
+                bad_request_error(f"Only the following property keys are supported in the query string: {COMMA_SEPARATOR.join(supported_property_keys)}")
+
+            if single_property_key == 'status' and \
+                not schema_manager.entity_type_instanceof(normalized_entity_type, 'Dataset'):
+                bad_request_error("Only Dataset or Publication supports 'status' property key in the query string")
+
+            # Respond with the property value directly
+            # Don't use jsonify() on string value
+            return entity_dict[single_property_key]
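Reviewer note: a quick client-side sketch of the two supported query string parameters. The base URL and uuid below are placeholders, not values from this PR.

    import requests  # illustrative client sketch, not part of this diff

    BASE = "https://entity-api.example.org"  # assumed deployment URL
    uuid = "<entity-uuid>"  # placeholder

    # ?property= returns a bare string value, not JSON
    status = requests.get(f"{BASE}/entities/{uuid}", params={"property": "status"}).text

    # ?exclude= trims top-level and depth-2 nested fields from the JSON response
    slim = requests.get(
        f"{BASE}/entities/{uuid}",
        params={"exclude": "direct_ancestors.files,ingest_metadata.dag_provenance_list"},
    ).json()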
+        # Exclude fields (either top-level or nested) specified by the user via the URL query string,
+        # using the format `?exclude=a.b,a.c,x`, where:
+        # - `x` is a top-level property of the target entity
+        # - `a.b` and `a.c` are nested fields in a dot-notated form (b and c could be from a different entity type)
+        #
+        # Note: This is not the most efficient approach, as exclusion is performed after the Neo4j query
+        # rather than within it. However, it leverages the existing `exclude_properties_from_response()`
+        # function for simplicity and maintainability. - Zhou 10/1/2025
+        try:
+            all_properties_to_exclude = schema_manager.get_all_fields_to_exclude_from_query_string(request)
+
+            # Determine which top-level properties to exclude from triggers and which to exclude directly from the resulting Neo4j `entity_dict`
+            # Also get nested properties that are directly returned from Neo4j, which will be handled differently
+            triggered_top_properties_to_skip, neo4j_top_properties_to_skip, neo4j_nested_properties_to_skip = schema_manager.determine_property_exclusion_type(normalized_entity_type, all_properties_to_exclude)
+        except ValueError as e:
+            bad_request_error(e)
+        except Exception as e:
+            internal_server_error(e)
+
     # Get the generated complete entity result from cache if exists
     # Otherwise re-generate on the fly
-    complete_dict = schema_manager.get_complete_entity_result(request, token, entity_dict)
+    # NOTE: top-level properties in `triggered_top_properties_to_skip` will skip the trigger methods
+    # Nested properties like `direct_ancestors.files` will be handled by the trigger method - Zhou 10/1/2025
+    complete_dict = schema_manager.get_complete_entity_result(request, token, entity_dict, triggered_top_properties_to_skip)

     # Determine if the entity is publicly visible based on its data, only.
     # To verify if a Collection is public, it is necessary to have its Datasets, which
@@ -813,37 +879,27 @@ def get_entity_by_id(id):
         forbidden_error(f"The requested {normalized_entity_type} has non-public data."
                         f" A Globus token with access permission is required.")

+    # Remove the top-level properties that are directly available in the resulting Neo4j `entity_dict`
+    # Due to the use of entity cache from `query_target_entity()`, we don't want to exclude the `neo4j_top_properties_to_skip`
+    # from the actual Neo4j query. And it's not a performance concern either. - Zhou 10/1/2025
+    for item in neo4j_top_properties_to_skip:
+        complete_dict.pop(item, None)
+
     # Also normalize the result based on schema
     final_result = schema_manager.normalize_entity_result_for_response(complete_dict)

-    # Result filtering based on query string
-    # The `data_access_level` property is available in all entities Donor/Sample/Dataset
-    # and this filter is being used by gateway to check the data_access_level for file assets
-    # The `status` property is only available in Dataset and being used by search-api for revision
-    result_filtering_accepted_property_keys = ['data_access_level', 'status']
-
-    if bool(request.args):
-        property_key = request.args.get('property')
-
-        if property_key is not None:
-            # Validate the target property
-            if property_key not in result_filtering_accepted_property_keys:
-                bad_request_error(f"Only the following property keys are supported in the query string: {COMMA_SEPARATOR.join(result_filtering_accepted_property_keys)}")
-
-            if property_key == 'status' and \
-                not schema_manager.entity_type_instanceof(normalized_entity_type, 'Dataset'):
-                bad_request_error(f"Only Dataset or Publication supports 'status' property key in the query string")
+    # In addition, there may be nested fields like `ingest_metadata.dag_provenance_list` (for Dataset)
+    # where `ingest_metadata` is an actual Neo4j node string property containing `dag_provenance_list`
+    # Such cases can't be handled via a simple Neo4j query, so they are excluded at the Python app level.
+    # NOTE: need to convert the `neo4j_nested_properties_to_skip` to a format that can be used by
+    # `exclude_properties_from_response()` - Zhou 10/1/2025
+    final_result = schema_manager.exclude_properties_from_response(schema_manager.group_dot_notation_fields(neo4j_nested_properties_to_skip), final_result)

-            # Response with the property value directly
-            # Don't use jsonify() on string value
-            return complete_dict[property_key]
-        else:
-            bad_request_error("The specified query string is not supported. Use '?property=' to filter the result")
-    else:
-        # Response with the dict
-        if public_entity and not user_in_hubmap_read_group(request):
-            final_result = schema_manager.exclude_properties_from_response(fields_to_exclude, final_result)
-        return jsonify(final_result)
+    # Respond with the dict
+    if public_entity and not user_in_hubmap_read_group(request):
+        final_result = schema_manager.exclude_properties_from_response(fields_to_exclude, final_result)
+
+    return jsonify(final_result)

 """
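Reviewer note: for readers unfamiliar with the hybrid exclusion-list format, this standalone sketch mirrors the contract that `exclude_properties_from_response()` is expected to honor; the real implementation lives in schema_manager.py and may differ in details. Names here are illustrative.

    import copy

    def _exclude_sketch(excluded_fields, output_dict):
        result = copy.deepcopy(output_dict)
        for field in excluded_fields:
            if isinstance(field, dict):
                # {'a': ['b', 'c']} -> remove b and c from the dict (or list of dicts) under 'a'
                for prefix, nested_keys in field.items():
                    nested = result.get(prefix)
                    if isinstance(nested, dict):
                        for key in nested_keys:
                            nested.pop(key, None)
                    elif isinstance(nested, list):
                        for entry in nested:
                            if isinstance(entry, dict):
                                for key in nested_keys:
                                    entry.pop(key, None)
            else:
                # Plain string -> remove the top-level key
                result.pop(field, None)
        return result

    print(_exclude_sketch(['x', {'a': ['b']}], {'x': 1, 'a': {'b': 2, 'c': 3}}))
    # -> {'a': {'c': 3}}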
diff --git a/src/requirements.txt b/src/requirements.txt
index 6026e84d..ae3fa02e 100644
--- a/src/requirements.txt
+++ b/src/requirements.txt
@@ -18,7 +18,7 @@ PyYAML==5.4.1
 # Use the branch name of commons from github for testing new changes made in commons from different branch
 # Default is main branch specified in docker-compose.development.yml if not set
 # git+https://github.com/hubmapconsortium/commons.git@${COMMONS_BRANCH}#egg=hubmap-commons
-hubmap-commons==2.1.19
+hubmap-commons==2.1.21

 # For unit test
 nose2==0.10.0
diff --git a/src/schema/schema_manager.py b/src/schema/schema_manager.py
index 7be00fb3..0f50201c 100644
--- a/src/schema/schema_manager.py
+++ b/src/schema/schema_manager.py
@@ -305,7 +305,7 @@ def get_fields_to_exclude(normalized_class=None):
 Parameters
 ----------
 excluded_fields : list
-    A list of the fields to be excluded
+    A list of the fields to be excluded; may include nested fields
 output_dict : dictionary
     A dictionary representing the data to be modified

@@ -349,6 +349,171 @@ def delete_nested_field(data, nested_path):

     return output_dict


+"""
+Use the Flask request.args MultiDict to see if 'exclude' is a URL parameter passed in with the
+request and parse the comma-separated properties to be excluded from the final response
+
+For now, only one dot is supported for nested fields (depth 2)
+
+Parameters
+----------
+request : Flask request object
+    The instance of Flask request passed in from application request
+
+Returns
+-------
+list
+    A flat list of strings containing top-level and/or nested dot-notated properties
+    Example: ['a.b', 'a.c', 'x']
+"""
+def get_all_fields_to_exclude_from_query_string(request):
+    all_properties_to_exclude = []
+
+    if 'exclude' in request.args:
+        # The query string values are case-sensitive as the property keys in schema yaml are case-sensitive
+        properties_to_exclude_str = request.args.get('exclude')
+
+        if properties_to_exclude_str:
+            # All values must be lowercase
+            has_upper = any(c.isupper() for c in properties_to_exclude_str)
+
+            if has_upper:
+                raise ValueError("All the properties specified in the 'exclude' query string in URL must be lowercase.")
+
+            all_properties_to_exclude = [item.strip() for item in properties_to_exclude_str.split(",")]
+
+            logger.info(f"User specified properties to exclude in request URL: {all_properties_to_exclude}")
+        else:
+            raise ValueError("The value of the 'exclude' query string parameter cannot be empty and must be similar to the form of 'a, b, c, d.e, ...' (case-sensitive).")
+
+    # A bit more validation to limit to depth 2
+    for item in all_properties_to_exclude:
+        if '.' in item:
+            if len(item.split('.')) > 2:
+                raise ValueError("Only single dot-separated keys are allowed in 'exclude' (e.g., a.b). Keys with multiple dots like a.b.c are not supported.")
+
+    # More validation - ensure prohibited properties are not accepted
+    # These two properties are required internally by `normalize_entity_result_for_response()`
+    prohibited_properties = ['uuid', 'entity_type']
+
+    for item in all_properties_to_exclude:
+        if item in prohibited_properties or ('.' in item and item.split('.')[1] in prohibited_properties):
+            raise ValueError(f"Entity property '{item}' is not allowed in the 'exclude' query parameter.")
+
+    return all_properties_to_exclude
+
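Reviewer note: the parser only relies on `in` and `.get()` from `request.args`, so a plain dict can stand in for Flask's MultiDict in a quick sanity check (illustrative, assuming the function above is importable):

    from types import SimpleNamespace

    fake_request = SimpleNamespace(args={'exclude': 'direct_ancestors.files, status'})
    print(get_all_fields_to_exclude_from_query_string(fake_request))
    # -> ['direct_ancestors.files', 'status']

    # Prohibited, too-deep, and uppercase keys are all rejected with ValueError
    for bad in ('uuid', 'direct_ancestors.uuid', 'a.b.c', 'Status'):
        try:
            get_all_fields_to_exclude_from_query_string(SimpleNamespace(args={'exclude': bad}))
        except ValueError as e:
            print(f"{bad!r} rejected: {e}")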
+
+"""
+Transform a flat list of dot-notated strings into a hybrid list that:
+- keeps plain strings as-is
+- converts entries with dot-notation (like 'direct_ancestors.files') into a dictionary, grouping by the prefix
+
+Example: ['a.b', 'a.c', 'x'] -> ['x', {'a': ['b', 'c']}]
+
+Used by `GET /entities/<id>?exclude=a.b, a.c, x` to build a list
+that can be further processed by `exclude_properties_from_response()`.
+
+Parameters
+----------
+flat_list : list
+    A flat list of strings; dot-notated strings are optional and can be used to indicate nested fields
+    Example: ['a.b', 'a.c', 'x']
+
+Returns
+-------
+list
+    A list mixing strings and grouped dicts, like ['x', {'a': ['b', 'c']}]
+"""
+def group_dot_notation_fields(flat_list):
+    output_list = []
+    grouped_dict = {}
+
+    for item in flat_list:
+        # For now, only support one dot for nested fields (depth 2)
+        if '.' in item:
+            prefix, field = item.split('.', 1)
+            grouped_dict.setdefault(prefix, []).append(field)
+        else:
+            output_list.append(item)
+
+    # Add grouped items as dictionaries
+    for prefix, fields in grouped_dict.items():
+        output_list.append({prefix: fields})
+
+    return output_list
+
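Reviewer note: concrete input/output pairs for the grouping helper above, runnable as-is:

    print(group_dot_notation_fields(['a.b', 'a.c', 'x']))
    # -> ['x', {'a': ['b', 'c']}]
    print(group_dot_notation_fields(['ingest_metadata.dag_provenance_list']))
    # -> [{'ingest_metadata': ['dag_provenance_list']}]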
+
+"""
+Group properties by exclusion type
+
+Example: ['a.b', 'a.c', 'x', 'y'] where
+- x and y are top-level properties
+- x is a Neo4j node property, and y is generated via trigger method
+- a.b and a.c are nested properties while a is a top-level property of either type
+
+Parameters
+----------
+normalized_entity_type : str
+    One of the normalized entity types: Dataset, Collection, Sample, Donor, Upload, Publication
+flat_list : list
+    A flat list of strings; dot-notated strings are optional and can be used to indicate nested fields
+    Example: ['a.b', 'a.c', 'x']
+
+Returns
+-------
+tuple
+    Three lists - one for triggered top-level properties, one for top-level Neo4j node properties,
+    and one for nested Neo4j properties
+
+    Example for Dataset:
+    - triggered_top_properties_to_skip: ['direct_ancestors.files', 'direct_ancestors.ingest_metadata', 'upload.title']
+    - neo4j_top_properties_to_skip: ['data_access_level']
+    - neo4j_nested_properties_to_skip: ['status_history.status']
+"""
+def determine_property_exclusion_type(normalized_entity_type, flat_list):
+    global _schema
+
+    triggered_top_properties_to_skip = []
+    neo4j_top_properties_to_skip = []
+    neo4j_nested_properties_to_skip = []
+    top_level_list = []
+    second_level_list = []
+    properties = _schema['ENTITIES'][normalized_entity_type]['properties']
+
+    # First separate the top-level properties from the dot-notated nested ones
+    for item in flat_list:
+        if '.' not in item:
+            top_level_list.append(item)
+        else:
+            second_level_list.append(item)
+
+    # Only care about the properties defined in schema yaml
+    for item in top_level_list:
+        if item in properties:
+            if 'on_read_trigger' in properties[item]:
+                triggered_top_properties_to_skip.append(item)
+            else:
+                neo4j_top_properties_to_skip.append(item)
+
+    # Nested second-level properties, such as `direct_ancestors.files`, belong to `triggered_top_properties_to_skip`
+    # while `ingest_metadata.dag_provenance_list` belongs to `neo4j_nested_properties_to_skip`
+    for item in second_level_list:
+        prefix = item.split('.')[0]
+        if prefix in properties:
+            if 'on_read_trigger' in properties[prefix]:
+                triggered_top_properties_to_skip.append(item)
+            else:
+                neo4j_nested_properties_to_skip.append(item)
+
+    logger.info(f"Determined property exclusion type - triggered_top_properties_to_skip: {triggered_top_properties_to_skip}")
+    logger.info(f"Determined property exclusion type - neo4j_top_properties_to_skip: {neo4j_top_properties_to_skip}")
+    logger.info(f"Determined property exclusion type - neo4j_nested_properties_to_skip: {neo4j_nested_properties_to_skip}")
+
+    # NOTE: Will need to convert the `neo4j_nested_properties_to_skip` to a format that can be used by
+    # `exclude_properties_from_response()` - Zhou 10/1/2025
+    return triggered_top_properties_to_skip, neo4j_top_properties_to_skip, neo4j_nested_properties_to_skip
+
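Reviewer note: an illustrative call for a Dataset, assuming the schema yaml marks `direct_ancestors` with an `on_read_trigger` while `data_access_level` and `ingest_metadata` are plain node properties (consistent with the examples in this PR; the yaml remains the source of truth):

    triggered, neo4j_top, neo4j_nested = determine_property_exclusion_type(
        'Dataset',
        ['direct_ancestors.files', 'data_access_level', 'ingest_metadata.dag_provenance_list'],
    )
    # triggered    -> ['direct_ancestors.files']
    # neo4j_top    -> ['data_access_level']
    # neo4j_nested -> ['ingest_metadata.dag_provenance_list']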
+
 """
 Generating triggered data based on the target events and methods

@@ -396,6 +561,8 @@ def generate_triggered_data(trigger_type: TriggerTypeEnum, normalized_class, req
     # decides the ordering of which trigger method gets to run first
     properties = schema_section[normalized_class]['properties']

+    logger.info(f"Skipping triggered data generation for the following properties: {properties_to_skip}")
+
     # Set each property value and put all resulting data into a dictionary for:
     # before_create_trigger|before_update_trigger|on_read_trigger
     # No property value to be set for: after_create_trigger|after_update_trigger
@@ -2001,7 +2168,6 @@ def convert_str_literal(data_str):
         data = ast.literal_eval(data_str)

         if isinstance(data, (list, dict)):
-            logger.info(f"The input string literal has been converted to {type(data)} successfully")
             return data
         else:
             logger.info(f"The input string literal is not list or dict after evaluation, return the original string input")
diff --git a/src/schema/schema_neo4j_queries.py b/src/schema/schema_neo4j_queries.py
index 738ce81d..5add3874 100644
--- a/src/schema/schema_neo4j_queries.py
+++ b/src/schema/schema_neo4j_queries.py
@@ -566,13 +566,15 @@ def get_uploads(neo4j_driver, uuid, property_key = None):
     The uuid of target entity
 property_key : str
     A target property key for result filtering
+properties_to_exclude : list
+    A list of node properties to exclude from the result

 Returns
 -------
 list
     A unique list of uuids of source entities
 """
-def get_dataset_direct_ancestors(neo4j_driver, uuid, property_key = None):
+def get_dataset_direct_ancestors(neo4j_driver, uuid, property_key = None, properties_to_exclude = None):
     results = []

     if property_key:
@@ -580,9 +582,15 @@ def get_dataset_direct_ancestors(neo4j_driver, uuid, property_key = None):
                  f"WHERE t.uuid = '{uuid}' "
                  f"RETURN apoc.coll.toSet(COLLECT(s.{property_key})) AS {record_field_name}")
     else:
-        query = (f"MATCH (s:Entity)-[:ACTIVITY_INPUT]->(a:Activity)-[:ACTIVITY_OUTPUT]->(t:Dataset) "
-                 f"WHERE t.uuid = '{uuid}' "
-                 f"RETURN apoc.coll.toSet(COLLECT(s)) AS {record_field_name}")
+        if properties_to_exclude:
+            query = (f"MATCH (s:Entity)-[:ACTIVITY_INPUT]->(a:Activity)-[:ACTIVITY_OUTPUT]->(t:Dataset) "
+                     f"WHERE t.uuid = '{uuid}' "
+                     f"WITH apoc.coll.toSet(COLLECT(s)) AS uniqueDirectAncestors "
+                     f"RETURN [a IN uniqueDirectAncestors | apoc.create.vNode(labels(a), apoc.map.removeKeys(properties(a), {properties_to_exclude}))] AS {record_field_name}")
+        else:
+            query = (f"MATCH (s:Entity)-[:ACTIVITY_INPUT]->(a:Activity)-[:ACTIVITY_OUTPUT]->(t:Dataset) "
+                     f"WHERE t.uuid = '{uuid}' "
+                     f"RETURN apoc.coll.toSet(COLLECT(s)) AS {record_field_name}")

     logger.info("======get_dataset_direct_ancestors() query======")
     logger.debug(query)
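Reviewer note: the f-string interpolation works because the repr of a Python list of strings happens to be a valid Cypher list literal, which is what apoc.map.removeKeys() takes as its key list. A quick illustration:

    properties_to_exclude = ['files', 'thumbnail_file']
    print(f"apoc.map.removeKeys(properties(a), {properties_to_exclude})")
    # -> apoc.map.removeKeys(properties(a), ['files', 'thumbnail_file'])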
@@ -1075,13 +1083,15 @@ def get_collection_associated_datasets(neo4j_driver, uuid, property_key = None):
     The uuid of dataset or publication
 property_key : str
     A target property key for result filtering
+properties_to_exclude : list
+    A list of node properties to exclude from the result

 Returns
 -------
 list
     A list of collection uuids
 """
-def get_dataset_collections(neo4j_driver, uuid, property_key = None):
+def get_dataset_collections(neo4j_driver, uuid, property_key = None, properties_to_exclude = None):
     results = []

     if property_key:
@@ -1089,9 +1099,15 @@ def get_dataset_collections(neo4j_driver, uuid, property_key = None):
                  f"WHERE e.uuid = '{uuid}' "
                  f"RETURN apoc.coll.toSet(COLLECT(c.{property_key})) AS {record_field_name}")
     else:
-        query = (f"MATCH (e:Entity)-[:IN_COLLECTION]->(c:Collection) "
-                 f"WHERE e.uuid = '{uuid}' "
-                 f"RETURN apoc.coll.toSet(COLLECT(c)) AS {record_field_name}")
+        if properties_to_exclude:
+            query = (f"MATCH (e:Entity)-[:IN_COLLECTION]->(c:Collection) "
+                     f"WHERE e.uuid = '{uuid}' "
+                     f"WITH apoc.coll.toSet(COLLECT(c)) AS uniqueCollections "
+                     f"RETURN [c IN uniqueCollections | apoc.create.vNode(labels(c), apoc.map.removeKeys(properties(c), {properties_to_exclude}))] AS {record_field_name}")
+        else:
+            query = (f"MATCH (e:Entity)-[:IN_COLLECTION]->(c:Collection) "
+                     f"WHERE e.uuid = '{uuid}' "
+                     f"RETURN apoc.coll.toSet(COLLECT(c)) AS {record_field_name}")

     logger.info("======get_dataset_collections() query======")
     logger.debug(query)
@@ -1156,18 +1172,26 @@ def get_publication_associated_collection(neo4j_driver, uuid):
     The neo4j database connection pool
 uuid : str
     The uuid of dataset
+properties_to_exclude : list
+    A list of node properties to exclude from the result

 Returns
 -------
 dict
     An Upload dict
 """
-def get_dataset_upload(neo4j_driver, uuid):
+def get_dataset_upload(neo4j_driver, uuid, properties_to_exclude = None):
     result = {}

-    query = (f"MATCH (e:Entity)-[:IN_UPLOAD]->(s:Upload) "
-             f"WHERE e.uuid = '{uuid}' "
-             f"RETURN s AS {record_field_name}")
+    if properties_to_exclude:
+        query = (f"MATCH (e:Entity)-[:IN_UPLOAD]->(s:Upload) "
+                 f"WHERE e.uuid = '{uuid}' "
+                 f"WITH s AS up "
+                 f"RETURN apoc.create.vNode(labels(up), apoc.map.removeKeys(properties(up), {properties_to_exclude})) AS {record_field_name}")
+    else:
+        query = (f"MATCH (e:Entity)-[:IN_UPLOAD]->(s:Upload) "
+                 f"WHERE e.uuid = '{uuid}' "
+                 f"RETURN s AS {record_field_name}")

     logger.info("======get_dataset_upload() query======")
     logger.debug(query)
@@ -1191,20 +1215,32 @@ def get_dataset_upload(neo4j_driver, uuid):
     The neo4j database connection pool
 uuid : str
     The uuid of collection
+properties_to_exclude : list
+    A list of node properties to exclude from the result

 Returns
 -------
 list
     The list containing associated dataset dicts
 """
-def get_collection_datasets(neo4j_driver, uuid):
+def get_collection_datasets(neo4j_driver, uuid, properties_to_exclude = None):
     results = []
     fields_to_omit = SchemaConstants.OMITTED_FIELDS

-    query = (f"MATCH (e:Dataset)-[:IN_COLLECTION]->(c:Collection) "
-             f"WHERE c.uuid = '{uuid}' "
-             f"WITH COLLECT(DISTINCT e) AS uniqueDataset "
-             f"RETURN [a IN uniqueDataset | apoc.create.vNode(labels(a), apoc.map.removeKeys(properties(a), {fields_to_omit}))] AS {record_field_name}")
+    if properties_to_exclude:
+        merged_list = properties_to_exclude + fields_to_omit
+
+        query = (f"MATCH (e:Dataset)-[:IN_COLLECTION]->(c:Collection) "
+                 f"WHERE c.uuid = '{uuid}' "
+                 f"WITH COLLECT(DISTINCT e) AS uniqueDataset "
+                 f"RETURN [a IN uniqueDataset | apoc.create.vNode(labels(a), apoc.map.removeKeys(properties(a), {merged_list}))] AS {record_field_name}")
+    else:
+        query = (f"MATCH (e:Dataset)-[:IN_COLLECTION]->(c:Collection) "
+                 f"WHERE c.uuid = '{uuid}' "
+                 f"WITH COLLECT(DISTINCT e) AS uniqueDataset "
+                 f"RETURN [a IN uniqueDataset | apoc.create.vNode(labels(a), apoc.map.removeKeys(properties(a), {fields_to_omit}))] AS {record_field_name}")

     logger.info("======get_collection_datasets() query======")
     logger.debug(query)
@@ -1400,13 +1436,15 @@ def unlink_datasets_from_upload(neo4j_driver, upload_uuid, dataset_uuids_list):
     The uuid of Upload
 property_key : str
     A target property key for result filtering
+properties_to_exclude : list
+    A list of node properties to exclude from the result

 Returns
 -------
 list
     The list containing associated dataset dicts
 """
-def get_upload_datasets(neo4j_driver, uuid, property_key = None):
+def get_upload_datasets(neo4j_driver, uuid, property_key = None, properties_to_exclude = None):
     results = []
     fields_to_omit = SchemaConstants.OMITTED_FIELDS
     if property_key:
@@ -1416,10 +1454,18 @@ def get_upload_datasets(neo4j_driver, uuid, property_key = None):
                  # apoc.coll.toSet() returns a set containing unique nodes
                  f"RETURN apoc.coll.toSet(COLLECT(e.{property_key})) AS {record_field_name}")
     else:
-        query = (f"MATCH (e:Dataset)-[:IN_UPLOAD]->(s:Upload) "
-                 f"WHERE s.uuid = '{uuid}' "
-                 f"WITH COLLECT(DISTINCT e) AS uniqueUploads "
-                 f"RETURN [a IN uniqueUploads | apoc.create.vNode(labels(a), apoc.map.removeKeys(properties(a), {fields_to_omit}))] AS {record_field_name}")
+        if properties_to_exclude:
+            merged_list = properties_to_exclude + fields_to_omit
+
+            query = (f"MATCH (e:Dataset)-[:IN_UPLOAD]->(s:Upload) "
+                     f"WHERE s.uuid = '{uuid}' "
+                     f"WITH COLLECT(DISTINCT e) AS uniqueUploads "
+                     f"RETURN [a IN uniqueUploads | apoc.create.vNode(labels(a), apoc.map.removeKeys(properties(a), {merged_list}))] AS {record_field_name}")
+        else:
+            query = (f"MATCH (e:Dataset)-[:IN_UPLOAD]->(s:Upload) "
+                     f"WHERE s.uuid = '{uuid}' "
+                     f"WITH COLLECT(DISTINCT e) AS uniqueUploads "
+                     f"RETURN [a IN uniqueUploads | apoc.create.vNode(labels(a), apoc.map.removeKeys(properties(a), {fields_to_omit}))] AS {record_field_name}")

     logger.info("======get_upload_datasets() query======")
     logger.debug(query)
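Reviewer note: the user's exclusions are concatenated with the always-omitted fields without dedup, which should be fine since apoc.map.removeKeys() treats duplicate or absent keys as no-ops. The OMITTED_FIELDS value below is a made-up stand-in:

    fields_to_omit = ['ingest_metadata', 'files']   # e.g., SchemaConstants.OMITTED_FIELDS
    properties_to_exclude = ['thumbnail_file', 'files']
    print(properties_to_exclude + fields_to_omit)
    # -> ['thumbnail_file', 'files', 'ingest_metadata', 'files']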
@@ -1551,28 +1597,33 @@ def count_attached_published_datasets(neo4j_driver, entity_type, uuid):
     The uuid of target entity
 property_key : str
     A target property key for result filtering
+properties_to_exclude : list
+    A list of node properties to exclude from the result

 Returns
 -------
 dict
     The parent dict, can either be a Sample or Donor
 """
-def get_sample_direct_ancestor(neo4j_driver, uuid, property_key = None):
+def get_sample_direct_ancestor(neo4j_driver, uuid, property_key = None, properties_to_exclude = None):
     result = {}

     if property_key:
         query = (f"MATCH (s:Sample)<-[:ACTIVITY_OUTPUT]-(:Activity)<-[:ACTIVITY_INPUT]-(parent:Entity) "
                  # Filter out the Lab entity if it's the ancestor
                  f"WHERE s.uuid='{uuid}' AND parent.entity_type <> 'Lab' "
-                 # COLLECT() returns a list
-                 # apoc.coll.toSet() reruns a set containing unique nodes
                  f"RETURN parent.{property_key} AS {record_field_name}")
     else:
-        query = (f"MATCH (s:Sample)<-[:ACTIVITY_OUTPUT]-(:Activity)<-[:ACTIVITY_INPUT]-(parent:Entity) "
+        if properties_to_exclude:
+            query = (f"MATCH (s:Sample)<-[:ACTIVITY_OUTPUT]-(:Activity)<-[:ACTIVITY_INPUT]-(parent:Entity) "
+                     # Filter out the Lab entity if it's the ancestor
+                     f"WHERE s.uuid='{uuid}' AND parent.entity_type <> 'Lab' "
+                     f"WITH parent AS p "
+                     f"RETURN apoc.create.vNode(labels(p), apoc.map.removeKeys(properties(p), {properties_to_exclude})) AS {record_field_name}")
+        else:
+            query = (f"MATCH (s:Sample)<-[:ACTIVITY_OUTPUT]-(:Activity)<-[:ACTIVITY_INPUT]-(parent:Entity) "
                      # Filter out the Lab entity if it's the ancestor
                      f"WHERE s.uuid='{uuid}' AND parent.entity_type <> 'Lab' "
-                 # COLLECT() returns a list
-                 # apoc.coll.toSet() reruns a set containing unique nodes
                      f"RETURN parent AS {record_field_name}")

     logger.info("======get_sample_direct_ancestor() query======")
diff --git a/src/schema/schema_triggers.py b/src/schema/schema_triggers.py
index 4445718e..87211de4 100644
--- a/src/schema/schema_triggers.py
+++ b/src/schema/schema_triggers.py
@@ -708,8 +708,14 @@ def get_collection_datasets(property_key, normalized_type, request, user_token,
         raise KeyError("Missing 'uuid' key in 'existing_data_dict' during calling 'get_collection_datasets()' trigger method.")

     logger.info(f"Executing 'get_collection_datasets()' trigger method on uuid: {existing_data_dict['uuid']}")
+
+    # Get all the user-specified fields, either top-level or nested, from the original query string in the request URL
+    # Find the specific sub-list; depth is limited to 2
+    # We only care about the Neo4j node properties as we don't run nested triggers inside a trigger method
+    # For example, direct_ancestors.files is supported, but direct_ancestors.metadata.acquisition_id is not - Zhou 10/1/2025
+    neo4j_properties_to_exclude = _get_neo4j_properties_to_exclude(property_key, request)

-    datasets_list = schema_neo4j_queries.get_collection_datasets(schema_manager.get_neo4j_driver_instance(), existing_data_dict['uuid'])
+    datasets_list = schema_neo4j_queries.get_collection_datasets(schema_manager.get_neo4j_driver_instance(), existing_data_dict['uuid'], properties_to_exclude = neo4j_properties_to_exclude)

     # Get rid of the entity node properties that are not defined in the yaml schema
     # as well as the ones defined as `exposed: false` in the yaml schema
@@ -781,7 +787,13 @@ def get_dataset_collections(property_key, normalized_type, request, user_token,

     logger.info(f"Executing 'get_dataset_collections()' trigger method on uuid: {existing_data_dict['uuid']}")

-    collections_list = schema_neo4j_queries.get_dataset_collections(schema_manager.get_neo4j_driver_instance(), existing_data_dict['uuid'])
+    # Get all the user-specified fields, either top-level or nested, from the original query string in the request URL
+    # Find the specific sub-list; depth is limited to 2
+    # We only care about the Neo4j node properties as we don't run nested triggers inside a trigger method
+    # For example, direct_ancestors.files is supported, but direct_ancestors.metadata.acquisition_id is not - Zhou 10/1/2025
+    neo4j_properties_to_exclude = _get_neo4j_properties_to_exclude(property_key, request)
+
+    collections_list = schema_neo4j_queries.get_dataset_collections(schema_manager.get_neo4j_driver_instance(), existing_data_dict['uuid'], property_key = None, properties_to_exclude = neo4j_properties_to_exclude)

     # Get rid of the entity node properties that are not defined in the yaml schema
     # as well as the ones defined as `exposed: false` in the yaml schema
@@ -859,7 +871,13 @@ def get_dataset_upload(property_key, normalized_type, request, user_token, exist

     logger.info(f"Executing 'get_dataset_upload()' trigger method on uuid: {existing_data_dict['uuid']}")

-    upload_dict = schema_neo4j_queries.get_dataset_upload(schema_manager.get_neo4j_driver_instance(), existing_data_dict['uuid'])
+    # Get all the user-specified fields, either top-level or nested, from the original query string in the request URL
+    # Find the specific sub-list; depth is limited to 2
+    # We only care about the Neo4j node properties as we don't run nested triggers inside a trigger method
+    # For example, direct_ancestors.files is supported, but direct_ancestors.metadata.acquisition_id is not - Zhou 10/1/2025
+    neo4j_properties_to_exclude = _get_neo4j_properties_to_exclude(property_key, request)
+
+    upload_dict = schema_neo4j_queries.get_dataset_upload(schema_manager.get_neo4j_driver_instance(), existing_data_dict['uuid'], properties_to_exclude = neo4j_properties_to_exclude)

     # Get rid of the entity node properties that are not defined in the yaml schema
     # as well as the ones defined as `exposed: false` in the yaml schema
@@ -987,7 +1005,13 @@ def get_dataset_direct_ancestors(property_key, normalized_type, request, user_to

     logger.info(f"Executing 'get_dataset_direct_ancestors()' trigger method on uuid: {existing_data_dict['uuid']}")

-    direct_ancestors_list = schema_neo4j_queries.get_dataset_direct_ancestors(schema_manager.get_neo4j_driver_instance(), existing_data_dict['uuid'])
+    # Get all the user-specified fields, either top-level or nested, from the original query string in the request URL
+    # Find the specific sub-list; depth is limited to 2
+    # We only care about the Neo4j node properties as we don't run nested triggers inside a trigger method
+    # For example, direct_ancestors.files is supported, but direct_ancestors.metadata.acquisition_id is not - Zhou 10/1/2025
+    neo4j_properties_to_exclude = _get_neo4j_properties_to_exclude(property_key, request)
+
+    direct_ancestors_list = schema_neo4j_queries.get_dataset_direct_ancestors(schema_manager.get_neo4j_driver_instance(), existing_data_dict['uuid'], property_key = None, properties_to_exclude = neo4j_properties_to_exclude)

     # Get rid of the entity node properties that are not defined in the yaml schema
     # as well as the ones defined as `exposed: false` in the yaml schema
@@ -2003,7 +2027,13 @@ def get_sample_direct_ancestor(property_key, normalized_type, request, user_toke

     logger.info(f"Executing 'get_sample_direct_ancestor()' trigger method on uuid: {existing_data_dict['uuid']}")

-    direct_ancestor_dict = schema_neo4j_queries.get_sample_direct_ancestor(schema_manager.get_neo4j_driver_instance(), existing_data_dict['uuid'])
+    # Get all the user-specified fields, either top-level or nested, from the original query string in the request URL
+    # Find the specific sub-list; depth is limited to 2
+    # We only care about the Neo4j node properties as we don't run nested triggers inside a trigger method
+    # For example, direct_ancestors.files is supported, but direct_ancestors.metadata.acquisition_id is not - Zhou 10/1/2025
+    neo4j_properties_to_exclude = _get_neo4j_properties_to_exclude(property_key, request)
+
+    direct_ancestor_dict = schema_neo4j_queries.get_sample_direct_ancestor(schema_manager.get_neo4j_driver_instance(), existing_data_dict['uuid'], property_key = None, properties_to_exclude = neo4j_properties_to_exclude)

     # Get rid of the entity node properties that are not defined in the yaml schema
     # as well as the ones defined as `exposed: false` in the yaml schema
@@ -2270,7 +2300,13 @@ def get_upload_datasets(property_key, normalized_type, request, user_token, exis

     logger.info(f"Executing 'get_upload_datasets()' trigger method on uuid: {existing_data_dict['uuid']}")

-    datasets_list = schema_neo4j_queries.get_upload_datasets(schema_manager.get_neo4j_driver_instance(), existing_data_dict['uuid'])
+    # Get all the user-specified fields, either top-level or nested, from the original query string in the request URL
+    # Find the specific sub-list; depth is limited to 2
+    # We only care about the Neo4j node properties as we don't run nested triggers inside a trigger method
+    # For example, direct_ancestors.files is supported, but direct_ancestors.metadata.acquisition_id is not - Zhou 10/1/2025
+    neo4j_properties_to_exclude = _get_neo4j_properties_to_exclude(property_key, request)
+
+    datasets_list = schema_neo4j_queries.get_upload_datasets(schema_manager.get_neo4j_driver_instance(), existing_data_dict['uuid'], property_key = None, properties_to_exclude = neo4j_properties_to_exclude)

     # Get rid of the entity node properties that are not defined in the yaml schema
     # as well as the ones defined as `exposed: false` in the yaml schema
@@ -2669,3 +2705,50 @@ def _get_age_age_units_race_sex_phrase(age:str=None, age_units:str='units', race
     return f"{age}-{age_units}-old {race} {sex}"


+"""
+Parse the original user request to determine the Neo4j properties to exclude from trigger-generated data
+
+Parameters
+----------
+property_key : str
+    The target property key of the value to be generated
+request : Flask request object
+    The instance of Flask request passed in from application request
+
+Returns
+-------
+list
+    A list containing Neo4j node properties to exclude
+"""
+def _get_neo4j_properties_to_exclude(property_key, request):
+    neo4j_properties_to_exclude = []
+
+    # Get all the user-specified fields, either top-level or nested, from the original query string in the request URL
+    # Any ValueError raised by the parser propagates to the caller as-is
+    all_properties_to_exclude = schema_manager.get_all_fields_to_exclude_from_query_string(request)
+
+    # Find the specific sub-list; depth is limited to 2
+    # We only care about the Neo4j node properties as we don't run nested triggers inside a trigger method
+    # For example, direct_ancestors.files is supported, but direct_ancestors.metadata.acquisition_id is not - Zhou 10/1/2025
+    grouped_fields = schema_manager.group_dot_notation_fields(all_properties_to_exclude)
+
+    for item in grouped_fields:
+        # Find the depth-2 properties nested under the property this trigger generates
+        if isinstance(item, dict) and property_key in item:
+            # Defensively keep only string field names
+            neo4j_properties_to_exclude = [field for field in item[property_key] if isinstance(field, str)]
+
+            logger.info(f"User specified neo4j properties to exclude in request URL: {neo4j_properties_to_exclude}")
+
+            # Stop after finding the first match
+            break
+
+    return neo4j_properties_to_exclude
+
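Reviewer note: an end-to-end trace of one exclusion request through the helpers added in this PR, as exercised by the `direct_ancestors` trigger. The import path and field names are illustrative, not asserted by the diff.

    from types import SimpleNamespace
    from schema import schema_manager  # assumed module path based on this repo's src/schema layout

    fake_request = SimpleNamespace(args={'exclude': 'direct_ancestors.files,direct_ancestors.thumbnail_file,status'})

    flat = schema_manager.get_all_fields_to_exclude_from_query_string(fake_request)
    # -> ['direct_ancestors.files', 'direct_ancestors.thumbnail_file', 'status']

    grouped = schema_manager.group_dot_notation_fields(flat)
    # -> ['status', {'direct_ancestors': ['files', 'thumbnail_file']}]

    # _get_neo4j_properties_to_exclude('direct_ancestors', fake_request) would then pick out
    # ['files', 'thumbnail_file'] and pass them to apoc.map.removeKeys() on the Cypher side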