2 changes: 1 addition & 1 deletion VERSION
@@ -1 +1 @@
-2.4.2
+2.4.3
7 changes: 0 additions & 7 deletions src/app.py
@@ -797,13 +797,6 @@ def get_entity_by_id(id):
     # Response with the dict
     if public_entity and not user_in_hubmap_read_group(request):
         final_result = schema_manager.exclude_properties_from_response(fields_to_exclude, final_result)
-    if normalized_entity_type == 'Collection':
-        for i, dataset in enumerate(final_result.get('datasets', [])):
-            if _get_entity_visibility(normalized_entity_type='Dataset', entity_dict=dataset) != DataVisibilityEnum.PUBLIC or user_in_hubmap_read_group(request):
-                # If the dataset is non-public, or if the user has read-group access, there is no need to remove fields, continue to the next dataset
-                continue
-            dataset_excluded_fields = schema_manager.get_fields_to_exclude('Dataset')
-            final_result.get('datasets')[i] = schema_manager.exclude_properties_from_response(dataset_excluded_fields, dataset)
     return jsonify(final_result)

 """
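The deleted block above is the imperative version of the Collection dataset pruning; this PR moves that logic into the nested `excluded_properties_from_public_response` entries in `provenance_schema.yaml` (see below), so the single `exclude_properties_from_response()` call that remains also reaches the nested datasets. A minimal, self-contained sketch of the effect, assuming a spec shaped like the new Collection YAML block; `prune_public_collection` is a hypothetical flattened stand-in, not the actual helper:

```python
# Hypothetical flattened stand-in for schema_manager.exclude_properties_from_response()
def prune_public_collection(collection, spec):
    # spec mirrors the new Collection rules in provenance_schema.yaml:
    # [{'datasets': ['lab_dataset_id']}, {'metadata': ['lab_id']}]
    for rule in spec:
        for key, fields in rule.items():
            targets = collection.get(key, [])
            # 'datasets' is a list of dicts; 'metadata' is a single dict
            targets = targets if isinstance(targets, list) else [targets]
            for item in targets:
                for field in fields:
                    item.pop(field, None)
    return collection

collection = {'datasets': [{'lab_dataset_id': 'X-1', 'title': 'kept'}],
              'metadata': {'lab_id': 'L-1', 'funding': 'kept'}}
spec = [{'datasets': ['lab_dataset_id']}, {'metadata': ['lab_id']}]
print(prune_public_collection(collection, spec))
# {'datasets': [{'title': 'kept'}], 'metadata': {'funding': 'kept'}}
```

Note the per-dataset visibility check from the old loop is not reproduced here; the sketch only shows the field-removal shape, while the endpoint-level `public_entity and not user_in_hubmap_read_group(request)` guard continues to decide whether pruning happens at all.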
9 changes: 0 additions & 9 deletions src/dev_entity_worker.py
@@ -329,15 +329,6 @@ def _get_entity_by_id_for_auth_level(self, entity_id:Annotated[str, 32], valid_u
         #if public_entity and not user_in_hubmap_read_group(request):
         if public_entity and not user_authorized:
             final_result = self.schemaMgr.exclude_properties_from_response(fields_to_exclude, final_result)
-        if normalized_entity_type == 'Collection':
-            for i, dataset in enumerate(final_result.get('datasets', [])):
-                if self._get_entity_visibility(entity_dict=dataset) != DataVisibilityEnum.PUBLIC \
-                        or user_authorized:  # or user_in_hubmap_read_group(request):
-                    # If the dataset is non-public, or if the user has read-group access, there is
-                    # no need to remove fields, continue to the next dataset
-                    continue
-                dataset_excluded_fields = self.schemaMgr.get_fields_to_exclude('Dataset')
-                final_result.get('datasets')[i] = self.schemaMgr.exclude_properties_from_response(dataset_excluded_fields, dataset)
         return final_result

 '''
30 changes: 26 additions & 4 deletions src/schema/provenance_schema.yaml
@@ -192,6 +192,11 @@ shared_entity_properties: &shared_entity_properties
 ENTITIES:
 ############################################# Collection #############################################
     Collection:
+        excluded_properties_from_public_response:
+            - datasets:
+                - lab_dataset_id
+            - metadata:
+                - lab_id
         # Collection can not be derivation source or target
         derivation:
             source: false
@@ -304,6 +309,15 @@ ENTITIES:
             - lab_dataset_id
             - metadata:
                 - lab_id
+            - direct_ancestors:
+                # Sample ancestors of a Dataset must have these fields removed
+                - lab_tissue_sample_id
+                - submission_id
+                # Dataset ancestors of a Dataset must have these fields removed
+                - lab_dataset_id
+                # Both Sample and Dataset ancestors of a Dataset must have these fields removed
+                - metadata:
+                    - lab_id
         derivation:
             source: true
             target: true
@@ -667,10 +681,6 @@ ENTITIES:
                 description: "The activity that was performed."
         dataset_type:
             before_create_trigger: set_publication_dataset_type
-            before_property_create_validators:
-                - validate_recognized_dataset_type
-            before_property_update_validators:
-                - validate_recognized_dataset_type
             type: string
             generated: true
             immutable: true
@@ -874,6 +884,18 @@ ENTITIES:
         excluded_properties_from_public_response:
             - lab_tissue_sample_id
             - submission_id
+            - metadata:
+                - lab_id
+            - direct_ancestor:
+                # Donor ancestors of a Sample must have these fields removed
+                - lab_donor_id
+                - label
+                # Sample ancestors of a Sample must have these fields removed
+                - lab_tissue_sample_id
+                - metadata:
+                    - lab_id
+                # Both Sample and Donor ancestors of a Sample must have these fields removed
+                - submission_id
         properties:
             <<: *shared_properties
             <<: *shared_entity_properties
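Read with PyYAML, each `excluded_properties_from_public_response` entry is either a bare field name or a one-key mapping that descends into a nested object (or into every element when that object is a list, as with a Dataset's `direct_ancestors`). A small sketch of the structure the Sample block above yields; the loader code is illustrative, not part of the PR:

```python
import yaml  # PyYAML

snippet = """
excluded_properties_from_public_response:
    - lab_tissue_sample_id
    - submission_id
    - metadata:
        - lab_id
    - direct_ancestor:
        - lab_donor_id
        - label
        - lab_tissue_sample_id
        - metadata:
            - lab_id
        - submission_id
"""

spec = yaml.safe_load(snippet)['excluded_properties_from_public_response']
print(spec[0])  # 'lab_tissue_sample_id'        -> top-level field to drop
print(spec[2])  # {'metadata': ['lab_id']}      -> drop lab_id inside metadata
print(spec[3])  # {'direct_ancestor': [..., {'metadata': ['lab_id']}, ...]}
```

Note the asymmetry the spec encodes: a Dataset carries a `direct_ancestors` list while a Sample carries a single `direct_ancestor` object; the updated `delete_nested_field()` in `schema_manager.py` (below) handles both shapes.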
2 changes: 1 addition & 1 deletion src/schema/schema_constants.py
@@ -22,7 +22,7 @@ class SchemaConstants(object):
 
     DOI_BASE_URL = 'https://doi.org/'
 
-    DATASETS_OMITTED_FIELDS = ['ingest_metadata', 'metadata', 'files']
+    OMITTED_FIELDS = ['ingest_metadata', 'files']
 
     # Define an enumeration to classify an entity's visibility, which can be combined with
     # authorization info when verifying operations on a request.
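`metadata` drops out of the blanket list because the new per-entity YAML rules above prune metadata sub-fields (such as `metadata.lab_id`) selectively instead of discarding all metadata. The constant is interpolated straight into Cypher text, which works because Python's list repr is also a valid Cypher list literal; a tiny illustration, not code from the PR:

```python
fields_to_omit = ['ingest_metadata', 'files']  # SchemaConstants.OMITTED_FIELDS
query = f"RETURN apoc.map.removeKeys(properties(e), {fields_to_omit}) AS e"
print(query)
# RETURN apoc.map.removeKeys(properties(e), ['ingest_metadata', 'files']) AS e
```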
17 changes: 15 additions & 2 deletions src/schema/schema_manager.py
@@ -306,13 +306,26 @@ def delete_nested_field(data, nested_path):
                     if isinstance(value, list):
                         for nested_field in value:
                             if isinstance(nested_field, dict):
-                                delete_nested_field(data[key], nested_field)
+                                if isinstance(data[key], list):
+                                    for item in data[key]:
+                                        delete_nested_field(item, nested_field)
+                                else:
+                                    delete_nested_field(data[key], nested_field)
+                            elif isinstance(data[key], list):
+                                for item in data[key]:
+                                    if nested_field in item:
+                                        del item[nested_field]
                             elif nested_field in data[key]:
                                 del data[key][nested_field]
                     elif isinstance(value, dict):
                         delete_nested_field(data[key], value)
         elif nested_path in data:
-            del data[nested_path]
+            if isinstance(data[nested_path], list):
+                for item in data[nested_path]:
+                    if nested_path in item:
+                        del item[nested_path]
+            else:
+                del data[nested_path]
 
     for field in excluded_fields:
         delete_nested_field(output_dict, field)
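Since the hunk only shows the changed branches, here is a self-contained reconstruction of the updated helper that can be run against a sample payload. The `exclude_properties_from_response()` scaffolding around it (deep copy, top-level loop) is paraphrased from context, so treat this as a sketch rather than the exact source:

```python
import copy

def exclude_properties_from_response(excluded_fields, entity_dict):
    """Return a copy of entity_dict with the excluded fields pruned, following
    the nested spec format used in provenance_schema.yaml."""
    def delete_nested_field(data, nested_path):
        if isinstance(nested_path, dict):
            for key, value in nested_path.items():
                if key not in data:
                    continue
                if isinstance(value, list):
                    for nested_field in value:
                        if isinstance(nested_field, dict):
                            # The spec descends again, e.g. {'metadata': ['lab_id']}
                            if isinstance(data[key], list):
                                for item in data[key]:
                                    delete_nested_field(item, nested_field)
                            else:
                                delete_nested_field(data[key], nested_field)
                        elif isinstance(data[key], list):
                            # e.g. a Dataset's 'direct_ancestors' list of dicts
                            for item in data[key]:
                                if nested_field in item:
                                    del item[nested_field]
                        elif nested_field in data[key]:
                            del data[key][nested_field]
                elif isinstance(value, dict):
                    delete_nested_field(data[key], value)
        elif nested_path in data:
            if isinstance(data[nested_path], list):
                for item in data[nested_path]:
                    if nested_path in item:
                        del item[nested_path]
            else:
                del data[nested_path]

    output_dict = copy.deepcopy(entity_dict)
    for field in excluded_fields:
        delete_nested_field(output_dict, field)
    return output_dict

# Worked example using the new Sample rules added above
sample = {'submission_id': 'HBM123.ABCD',
          'direct_ancestor': {'lab_donor_id': 'D-44',
                              'metadata': {'lab_id': 'L-9', 'age': '40'}}}
spec = ['submission_id',
        {'direct_ancestor': ['lab_donor_id', {'metadata': ['lab_id']}]}]
print(exclude_properties_from_response(spec, sample))
# -> {'direct_ancestor': {'metadata': {'age': '40'}}}
```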
24 changes: 12 additions & 12 deletions src/schema/schema_neo4j_queries.py
@@ -164,7 +164,7 @@ def filter_ancestors_by_type(neo4j_driver, direct_ancestor_uuids, entity_type):
 """
 def get_children(neo4j_driver, uuid, property_key = None):
     results = []
-
+    fields_to_omit = SchemaConstants.OMITTED_FIELDS
     if property_key:
         query = (f"MATCH (e:Entity)-[:ACTIVITY_INPUT]->(:Activity)-[:ACTIVITY_OUTPUT]->(child:Entity) "
                  # The target entity can't be a Lab
@@ -178,7 +178,7 @@ def get_children(neo4j_driver, uuid, property_key = None):
                  f"WHERE e.uuid='{uuid}' AND e.entity_type <> 'Lab' "
                  # COLLECT() returns a list
                  # apoc.coll.toSet() returns a set containing unique nodes
-                 f"RETURN apoc.coll.toSet(COLLECT(child)) AS {record_field_name}")
+                 f"RETURN apoc.coll.toSet(COLLECT(apoc.create.vNode(labels(child), apoc.map.removeKeys(properties(child), {fields_to_omit})))) AS {record_field_name}")
 
     logger.info("======get_children() query======")
     logger.info(query)
@@ -193,7 +193,7 @@ def get_children(neo4j_driver, uuid, property_key = None):
     else:
         # Convert the list of nodes to a list of dicts
         results = nodes_to_dicts(record[record_field_name])
-
     return results


@@ -216,7 +216,7 @@ def get_children(neo4j_driver, uuid, property_key = None):
 """
 def get_parents(neo4j_driver, uuid, property_key = None):
     results = []
-
+    fields_to_omit = SchemaConstants.OMITTED_FIELDS
     if property_key:
         query = (f"MATCH (e:Entity)<-[:ACTIVITY_OUTPUT]-(:Activity)<-[:ACTIVITY_INPUT]-(parent:Entity) "
                  # Filter out the Lab entities
@@ -230,7 +230,7 @@ def get_parents(neo4j_driver, uuid, property_key = None):
                  f"WHERE e.uuid='{uuid}' AND parent.entity_type <> 'Lab' "
                  # COLLECT() returns a list
                  # apoc.coll.toSet() returns a set containing unique nodes
-                 f"RETURN apoc.coll.toSet(COLLECT(parent)) AS {record_field_name}")
+                 f"RETURN apoc.coll.toSet(COLLECT(apoc.create.vNode(labels(parent), apoc.map.removeKeys(properties(parent), {fields_to_omit})))) AS {record_field_name}")
 
     logger.info("======get_parents() query======")
     logger.info(query)
@@ -380,7 +380,7 @@ def get_tuplets(neo4j_driver, uuid, property_key=None):
 """
 def get_ancestors(neo4j_driver, uuid, property_key = None):
     results = []
-
+    fields_to_omit = SchemaConstants.OMITTED_FIELDS
     if property_key:
         query = (f"MATCH (e:Entity)<-[:ACTIVITY_INPUT|ACTIVITY_OUTPUT*]-(ancestor:Entity) "
                  # Filter out the Lab entities
@@ -394,7 +394,7 @@ def get_ancestors(neo4j_driver, uuid, property_key = None):
                  f"WHERE e.uuid='{uuid}' AND ancestor.entity_type <> 'Lab' "
                  # COLLECT() returns a list
                  # apoc.coll.toSet() returns a set containing unique nodes
-                 f"RETURN apoc.coll.toSet(COLLECT(ancestor)) AS {record_field_name}")
+                 f"RETURN apoc.coll.toSet(COLLECT(apoc.create.vNode(labels(ancestor), apoc.map.removeKeys(properties(ancestor), {fields_to_omit})))) AS {record_field_name}")
 
     logger.info("======get_ancestors() query======")
     logger.info(query)
@@ -431,7 +431,7 @@ def get_ancestors(neo4j_driver, uuid, property_key = None):
 """
 def get_descendants(neo4j_driver, uuid, property_key = None):
     results = []
-
+    fields_to_omit = SchemaConstants.OMITTED_FIELDS
     if property_key:
         query = (f"MATCH (e:Entity)-[:ACTIVITY_INPUT|ACTIVITY_OUTPUT*]->(descendant:Entity) "
                  # The target entity can't be a Lab
@@ -445,7 +445,7 @@ def get_descendants(neo4j_driver, uuid, property_key = None):
                  f"WHERE e.uuid='{uuid}' AND e.entity_type <> 'Lab' "
                  # COLLECT() returns a list
                  # apoc.coll.toSet() returns a set containing unique nodes
-                 f"RETURN apoc.coll.toSet(COLLECT(descendant)) AS {record_field_name}")
+                 f"RETURN apoc.coll.toSet(COLLECT(apoc.create.vNode(labels(descendant), apoc.map.removeKeys(properties(descendant), {fields_to_omit})))) AS {record_field_name}")
 
     logger.info("======get_descendants() query======")
     logger.info(query)
@@ -460,7 +460,7 @@ def get_descendants(neo4j_driver, uuid, property_key = None):
     else:
         # Convert the list of nodes to a list of dicts
        results = nodes_to_dicts(record[record_field_name])
-
     return results


@@ -1185,7 +1185,7 @@ def get_dataset_upload(neo4j_driver, uuid):
 def get_collection_datasets(neo4j_driver, uuid):
     results = []
 
-    fields_to_omit = SchemaConstants.DATASETS_OMITTED_FIELDS
+    fields_to_omit = SchemaConstants.OMITTED_FIELDS
     query = (f"MATCH (e:Dataset)-[:IN_COLLECTION]->(c:Collection) "
              f"WHERE c.uuid = '{uuid}' "
              f"RETURN COLLECT(apoc.create.vNode(labels(e), apoc.map.removeKeys(properties(e), {fields_to_omit}))) AS {record_field_name}")
@@ -1391,7 +1391,7 @@ def unlink_datasets_from_upload(neo4j_driver, upload_uuid, dataset_uuids_list):
 """
 def get_upload_datasets(neo4j_driver, uuid, property_key = None):
     results = []
-    fields_to_omit = SchemaConstants.DATASETS_OMITTED_FIELDS
+    fields_to_omit = SchemaConstants.OMITTED_FIELDS
     if property_key:
         query = (f"MATCH (e:Dataset)-[:IN_UPLOAD]->(s:Upload) "
                  f"WHERE s.uuid = '{uuid}' "
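Every hunk in this file applies the same pattern: rather than returning stored nodes, each traversal returns virtual nodes (`apoc.create.vNode`) whose properties have already been stripped with `apoc.map.removeKeys`, so the omitted fields never cross the wire. Both are standard APOC functions. A standalone sketch of the shape, with the driver plumbing simplified and the query parameterized here for illustration (the PR itself interpolates the uuid into the f-string):

```python
from neo4j import GraphDatabase  # standard Neo4j Python driver; APOC required server-side

fields_to_omit = ['ingest_metadata', 'files']  # SchemaConstants.OMITTED_FIELDS

# Python's list repr doubles as a Cypher list literal when formatted in.
query = (f"MATCH (e:Entity)-[:ACTIVITY_INPUT]->(:Activity)-[:ACTIVITY_OUTPUT]->(child:Entity) "
         f"WHERE e.uuid = $uuid AND e.entity_type <> 'Lab' "
         f"RETURN apoc.coll.toSet(COLLECT(apoc.create.vNode(labels(child), "
         f"apoc.map.removeKeys(properties(child), {fields_to_omit})))) AS children")

driver = GraphDatabase.driver("bolt://localhost:7687", auth=("neo4j", "password"))
with driver.session() as session:
    record = session.run(query, uuid="some-entity-uuid").single()
    # Virtual nodes come back as ordinary Node objects, minus the omitted keys
    children = [dict(node.items()) for node in record["children"]]
print(children)
```

Trimming inside Cypher replaces the old arrangement where `DATASETS_OMITTED_FIELDS` applied only to collection and upload datasets; with the rename to `OMITTED_FIELDS`, the same trimming now covers children, parents, ancestors, and descendants as well.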
3 changes: 2 additions & 1 deletion src/schema/schema_validators.py
@@ -68,7 +68,8 @@ def validate_recognized_dataset_type(property_key, normalized_entity_type, reque
     # those square brackets are acceptable at the end of the string. Simply validate the start.
     proposed_dataset_type_prefix = re.sub(pattern='(\S)\s\[.*\]$', repl=r'\1', string=new_data_dict['dataset_type'])
     target_list = schema_manager.get_dataset_type_valueset_list()
-
+    # TODO: This is a temporary bypass because the UBKG does not support Publication as a dataset_type yet. Remove once it's added.
+    target_list.append("Publication")
     if proposed_dataset_type_prefix not in target_list:
         raise ValueError(f"Proposed Dataset dataset_type '{proposed_dataset_type_prefix}'"
                          f" is not recognized in the existing ontology."
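To make the bypass concrete, a hedged sketch of the check with a stand-in valueset (the real list comes from `schema_manager.get_dataset_type_valueset_list()`):

```python
import re

def is_recognized_dataset_type(proposed, target_list):
    # Strip a trailing bracketed qualifier, e.g. "RNASeq [Salmon]" -> "RNASeq"
    prefix = re.sub(pattern=r'(\S)\s\[.*\]$', repl=r'\1', string=proposed)
    return prefix in target_list

target_list = ['RNASeq', 'ATACseq']  # hypothetical UBKG valueset contents
target_list.append('Publication')    # the temporary bypass added above
print(is_recognized_dataset_type('Publication', target_list))      # True
print(is_recognized_dataset_type('RNASeq [Salmon]', target_list))  # True
print(is_recognized_dataset_type('Unknown Assay', target_list))    # False
```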