diff --git a/lambdas/service/app.py b/lambdas/service/app.py index e25fcdfe06..9e9c353965 100644 --- a/lambdas/service/app.py +++ b/lambdas/service/app.py @@ -121,7 +121,7 @@ # changes and reset the minor version to zero. Otherwise, increment only # the minor version for backwards compatible changes. A backwards # compatible change is one that does not require updates to clients. - 'version': '15.1', + 'version': '16.0', 'description': fd(f''' # Overview diff --git a/lambdas/service/openapi.json b/lambdas/service/openapi.json index 916bb892af..8d2420fe8b 100644 --- a/lambdas/service/openapi.json +++ b/lambdas/service/openapi.json @@ -2,7 +2,7 @@ "openapi": "3.0.1", "info": { "title": "azul-service-dev", - "version": "15.1", + "version": "16.0", "description": "\n# Overview\n\nAzul is a REST web service for querying metadata associated with\nboth experimental and analysis data from a data repository. In order\nto deliver response times that make it suitable for interactive use\ncases, the set of metadata properties that it exposes for sorting,\nfiltering, and aggregation is limited. Azul provides a uniform view\nof the metadata over a range of diverse schemas, effectively\nshielding clients from changes in the schemas as they occur over\ntime. It does so, however, at the expense of detail in the set of\nmetadata properties it exposes and in the accuracy with which it\naggregates them.\n\nAzul denormalizes and aggregates metadata into several different\nindices for selected entity types. Metadata entities can be queried\nusing the [Index](#operations-tag-Index) endpoints.\n\nA set of indices forms a catalog. There is a default catalog called\n`dcp2` which will be used unless a\ndifferent catalog name is specified using the `catalog` query\nparameter. Metadata from different catalogs is completely\nindependent: a response obtained by querying one catalog does not\nnecessarily correlate to a response obtained by querying another\none. Two catalogs can contain metadata from the same sources or\ndifferent sources. It is only guaranteed that the body of a\nresponse by any given endpoint adheres to one schema,\nindependently of which catalog was specified in the request.\n\nAzul provides the ability to download data and metadata via the\n[Manifests](#operations-tag-Manifests) endpoints. The\n`curl` format manifests can be used to\ndownload data files. Other formats provide various views of the\nmetadata. Manifests can be generated for a selection of files using\nfilters. These filters are interchangeable with the filters used by\nthe [Index](#operations-tag-Index) endpoints.\n\nAzul also provides a [summary](#operations-Index-get_index_summary)\nview of indexed data.\n\n## Data model\n\nAny index, when queried, returns a JSON array of hits. Each hit\nrepresents a metadata entity. Nested in each hit is a summary of the\nproperties of entities associated with the hit. An entity is\nassociated either by a direct edge in the original metadata graph,\nor indirectly as a series of edges. The nested properties are\ngrouped by the type of the associated entity. The properties of all\ndata files associated with a particular sample, for example, are\nlisted under `hits[*].files` in a `/index/samples` response. It is\nimportant to note that while each _hit_ represents a discrete\nentity, the properties nested within that hit are the result of an\naggregation over potentially many associated entities.\n\nTo illustrate this, consider a data file that is part of two\nprojects (a project is a group of related experiments, typically by\none laboratory, institution or consortium). Querying the `files`\nindex for this file yields a hit looking something like:\n\n```\n{\n \"projects\": [\n {\n \"projectTitle\": \"Project One\"\n \"laboratory\": ...,\n ...\n },\n {\n \"projectTitle\": \"Project Two\"\n \"laboratory\": ...,\n ...\n }\n ],\n \"files\": [\n {\n \"format\": \"pdf\",\n \"name\": \"Team description.pdf\",\n ...\n }\n ]\n}\n```\n\nThis example hit contains two kinds of nested entities (a hit in an\nactual Azul response will contain more): There are the two projects\nentities, and the file itself. These nested entities contain\nselected metadata properties extracted in a consistent way. This\nmakes filtering and sorting simple.\n\nAlso notice that there is only one file. When querying a particular\nindex, the corresponding entity will always be a singleton like\nthis.\n\n\n## Contact us\n\nFor technical support please file an issue at\n[GitHub](https://github.com/DataBiosphere/azul/issues) or email\n`azul-group@ucsc.edu`. To report a security concern or misconduct please email\n`azul-group@ucsc.edu`.\n" }, "tags": [ @@ -2319,22 +2319,6 @@ ], "additionalProperties": false }, - "sampleId": { - "type": "object", - "properties": { - "is": { - "type": "array", - "items": { - "type": "string", - "nullable": true - } - } - }, - "required": [ - "is" - ], - "additionalProperties": false - }, "selectedCellType": { "type": "object", "properties": { @@ -2549,7 +2533,7 @@ } } }, - "description": "\nCriteria to filter entities from the search results.\n\nEach filter consists of a field name, a relation (relational operator),\nand an array of field values. The available relations are \"is\",\n\"within\", \"contains\", and \"intersects\". Multiple filters are combined\nusing \"and\" logic. An entity must match all filters to be included in\nthe response. How multiple field values within a single filter are\ncombined depends on the relation.\n\nFor the \"is\" relation, multiple values are combined using \"or\" logic.\nFor example, `{\"fileFormat\": {\"is\": [\"fastq\", \"fastq.gz\"]}}` selects\nentities where the file format is either \"fastq\" or \"fastq.gz\". For the\n\"within\", \"intersects\", and \"contains\" relations, the field values must\ncome in nested pairs specifying upper and lower bounds, and multiple\npairs are combined using \"and\" logic. For example, `{\"donorCount\":\n{\"within\": [[1,5], [5,10]]}}` selects entities whose donor organism\ncount falls within both ranges, i.e., is exactly 5.\n\nThe accessions field supports filtering for a specific accession and/or\nnamespace within a project. For example, `{\"accessions\": {\"is\": [\n{\"namespace\":\"array_express\"}]}}` will filter for projects that have an\n`array_express` accession. Similarly, `{\"accessions\": {\"is\": [\n{\"accession\":\"ERP112843\"}]}}` will filter for projects that have the\naccession `ERP112843` while `{\"accessions\": {\"is\": [\n{\"namespace\":\"array_express\", \"accession\": \"E-AAAA-00\"}]}}` will filter\nfor projects that match both values.\n\nThe organismAge field is special in that it contains two property keys:\nvalue and unit. For example, `{\"organismAge\": {\"is\": [{\"value\": \"20\",\n\"unit\": \"year\"}]}}`. Both keys are required. `{\"organismAge\": {\"is\":\n[null]}}` selects entities that have no organism age.\n\nSupported field names are: accessions, aggregateLastModifiedDate, aggregateSubmissionDate, aggregateUpdateDate, assayType, biologicalSex, bionetworkName, bundleUuid, bundleVersion, cellCount, cellLineType, contactName, contentDescription, dataUseRestriction, developmentStage, donorCount, donorDisease, duosId, effectiveCellCount, effectiveOrgan, entryId, fileFormat, fileId, fileName, fileSize, fileSource, fileVersion, genusSpecies, institution, instrumentManufacturerModel, isIntermediate, isTissueAtlasProject, laboratory, lastModifiedDate, libraryConstructionApproach, matrixCellCount, modelOrgan, modelOrganPart, nucleicAcidSource, organ, organPart, organismAge, organismAgeRange, pairedEnd, preservationMethod, project, projectDescription, projectEstimatedCellCount, projectId, projectTitle, publicationTitle, sampleDisease, sampleEntityType, sampleId, selectedCellType, sha256, sourceId, sourceSpec, specimenDisease, specimenOrgan, specimenOrganPart, submissionDate, tissueAtlas, updateDate, workflow, accessible\n" + "description": "\nCriteria to filter entities from the search results.\n\nEach filter consists of a field name, a relation (relational operator),\nand an array of field values. The available relations are \"is\",\n\"within\", \"contains\", and \"intersects\". Multiple filters are combined\nusing \"and\" logic. An entity must match all filters to be included in\nthe response. How multiple field values within a single filter are\ncombined depends on the relation.\n\nFor the \"is\" relation, multiple values are combined using \"or\" logic.\nFor example, `{\"fileFormat\": {\"is\": [\"fastq\", \"fastq.gz\"]}}` selects\nentities where the file format is either \"fastq\" or \"fastq.gz\". For the\n\"within\", \"intersects\", and \"contains\" relations, the field values must\ncome in nested pairs specifying upper and lower bounds, and multiple\npairs are combined using \"and\" logic. For example, `{\"donorCount\":\n{\"within\": [[1,5], [5,10]]}}` selects entities whose donor organism\ncount falls within both ranges, i.e., is exactly 5.\n\nThe accessions field supports filtering for a specific accession and/or\nnamespace within a project. For example, `{\"accessions\": {\"is\": [\n{\"namespace\":\"array_express\"}]}}` will filter for projects that have an\n`array_express` accession. Similarly, `{\"accessions\": {\"is\": [\n{\"accession\":\"ERP112843\"}]}}` will filter for projects that have the\naccession `ERP112843` while `{\"accessions\": {\"is\": [\n{\"namespace\":\"array_express\", \"accession\": \"E-AAAA-00\"}]}}` will filter\nfor projects that match both values.\n\nThe organismAge field is special in that it contains two property keys:\nvalue and unit. For example, `{\"organismAge\": {\"is\": [{\"value\": \"20\",\n\"unit\": \"year\"}]}}`. Both keys are required. `{\"organismAge\": {\"is\":\n[null]}}` selects entities that have no organism age.\n\nSupported field names are: accessions, aggregateLastModifiedDate, aggregateSubmissionDate, aggregateUpdateDate, assayType, biologicalSex, bionetworkName, bundleUuid, bundleVersion, cellCount, cellLineType, contactName, contentDescription, dataUseRestriction, developmentStage, donorCount, donorDisease, duosId, effectiveCellCount, effectiveOrgan, entryId, fileFormat, fileId, fileName, fileSize, fileSource, fileVersion, genusSpecies, institution, instrumentManufacturerModel, isIntermediate, isTissueAtlasProject, laboratory, lastModifiedDate, libraryConstructionApproach, matrixCellCount, modelOrgan, modelOrganPart, nucleicAcidSource, organ, organPart, organismAge, organismAgeRange, pairedEnd, preservationMethod, project, projectDescription, projectEstimatedCellCount, projectId, projectTitle, publicationTitle, sampleDisease, sampleEntityType, selectedCellType, sha256, sourceId, sourceSpec, specimenDisease, specimenOrgan, specimenOrganPart, submissionDate, tissueAtlas, updateDate, workflow, accessible\n" }, { "name": "entity_type", @@ -2638,7 +2622,6 @@ "publicationTitle", "sampleDisease", "sampleEntityType", - "sampleId", "selectedCellType", "sha256", "sourceId", @@ -3974,22 +3957,6 @@ ], "additionalProperties": false }, - "sampleId": { - "type": "object", - "properties": { - "is": { - "type": "array", - "items": { - "type": "string", - "nullable": true - } - } - }, - "required": [ - "is" - ], - "additionalProperties": false - }, "selectedCellType": { "type": "object", "properties": { @@ -4204,7 +4171,7 @@ } } }, - "description": "\nCriteria to filter entities from the search results.\n\nEach filter consists of a field name, a relation (relational operator),\nand an array of field values. The available relations are \"is\",\n\"within\", \"contains\", and \"intersects\". Multiple filters are combined\nusing \"and\" logic. An entity must match all filters to be included in\nthe response. How multiple field values within a single filter are\ncombined depends on the relation.\n\nFor the \"is\" relation, multiple values are combined using \"or\" logic.\nFor example, `{\"fileFormat\": {\"is\": [\"fastq\", \"fastq.gz\"]}}` selects\nentities where the file format is either \"fastq\" or \"fastq.gz\". For the\n\"within\", \"intersects\", and \"contains\" relations, the field values must\ncome in nested pairs specifying upper and lower bounds, and multiple\npairs are combined using \"and\" logic. For example, `{\"donorCount\":\n{\"within\": [[1,5], [5,10]]}}` selects entities whose donor organism\ncount falls within both ranges, i.e., is exactly 5.\n\nThe accessions field supports filtering for a specific accession and/or\nnamespace within a project. For example, `{\"accessions\": {\"is\": [\n{\"namespace\":\"array_express\"}]}}` will filter for projects that have an\n`array_express` accession. Similarly, `{\"accessions\": {\"is\": [\n{\"accession\":\"ERP112843\"}]}}` will filter for projects that have the\naccession `ERP112843` while `{\"accessions\": {\"is\": [\n{\"namespace\":\"array_express\", \"accession\": \"E-AAAA-00\"}]}}` will filter\nfor projects that match both values.\n\nThe organismAge field is special in that it contains two property keys:\nvalue and unit. For example, `{\"organismAge\": {\"is\": [{\"value\": \"20\",\n\"unit\": \"year\"}]}}`. Both keys are required. `{\"organismAge\": {\"is\":\n[null]}}` selects entities that have no organism age.\n\nSupported field names are: accessions, aggregateLastModifiedDate, aggregateSubmissionDate, aggregateUpdateDate, assayType, biologicalSex, bionetworkName, bundleUuid, bundleVersion, cellCount, cellLineType, contactName, contentDescription, dataUseRestriction, developmentStage, donorCount, donorDisease, duosId, effectiveCellCount, effectiveOrgan, entryId, fileFormat, fileId, fileName, fileSize, fileSource, fileVersion, genusSpecies, institution, instrumentManufacturerModel, isIntermediate, isTissueAtlasProject, laboratory, lastModifiedDate, libraryConstructionApproach, matrixCellCount, modelOrgan, modelOrganPart, nucleicAcidSource, organ, organPart, organismAge, organismAgeRange, pairedEnd, preservationMethod, project, projectDescription, projectEstimatedCellCount, projectId, projectTitle, publicationTitle, sampleDisease, sampleEntityType, sampleId, selectedCellType, sha256, sourceId, sourceSpec, specimenDisease, specimenOrgan, specimenOrganPart, submissionDate, tissueAtlas, updateDate, workflow, accessible\n" + "description": "\nCriteria to filter entities from the search results.\n\nEach filter consists of a field name, a relation (relational operator),\nand an array of field values. The available relations are \"is\",\n\"within\", \"contains\", and \"intersects\". Multiple filters are combined\nusing \"and\" logic. An entity must match all filters to be included in\nthe response. How multiple field values within a single filter are\ncombined depends on the relation.\n\nFor the \"is\" relation, multiple values are combined using \"or\" logic.\nFor example, `{\"fileFormat\": {\"is\": [\"fastq\", \"fastq.gz\"]}}` selects\nentities where the file format is either \"fastq\" or \"fastq.gz\". For the\n\"within\", \"intersects\", and \"contains\" relations, the field values must\ncome in nested pairs specifying upper and lower bounds, and multiple\npairs are combined using \"and\" logic. For example, `{\"donorCount\":\n{\"within\": [[1,5], [5,10]]}}` selects entities whose donor organism\ncount falls within both ranges, i.e., is exactly 5.\n\nThe accessions field supports filtering for a specific accession and/or\nnamespace within a project. For example, `{\"accessions\": {\"is\": [\n{\"namespace\":\"array_express\"}]}}` will filter for projects that have an\n`array_express` accession. Similarly, `{\"accessions\": {\"is\": [\n{\"accession\":\"ERP112843\"}]}}` will filter for projects that have the\naccession `ERP112843` while `{\"accessions\": {\"is\": [\n{\"namespace\":\"array_express\", \"accession\": \"E-AAAA-00\"}]}}` will filter\nfor projects that match both values.\n\nThe organismAge field is special in that it contains two property keys:\nvalue and unit. For example, `{\"organismAge\": {\"is\": [{\"value\": \"20\",\n\"unit\": \"year\"}]}}`. Both keys are required. `{\"organismAge\": {\"is\":\n[null]}}` selects entities that have no organism age.\n\nSupported field names are: accessions, aggregateLastModifiedDate, aggregateSubmissionDate, aggregateUpdateDate, assayType, biologicalSex, bionetworkName, bundleUuid, bundleVersion, cellCount, cellLineType, contactName, contentDescription, dataUseRestriction, developmentStage, donorCount, donorDisease, duosId, effectiveCellCount, effectiveOrgan, entryId, fileFormat, fileId, fileName, fileSize, fileSource, fileVersion, genusSpecies, institution, instrumentManufacturerModel, isIntermediate, isTissueAtlasProject, laboratory, lastModifiedDate, libraryConstructionApproach, matrixCellCount, modelOrgan, modelOrganPart, nucleicAcidSource, organ, organPart, organismAge, organismAgeRange, pairedEnd, preservationMethod, project, projectDescription, projectEstimatedCellCount, projectId, projectTitle, publicationTitle, sampleDisease, sampleEntityType, selectedCellType, sha256, sourceId, sourceSpec, specimenDisease, specimenOrgan, specimenOrganPart, submissionDate, tissueAtlas, updateDate, workflow, accessible\n" }, { "name": "entity_type", @@ -4293,7 +4260,6 @@ "publicationTitle", "sampleDisease", "sampleEntityType", - "sampleId", "selectedCellType", "sha256", "sourceId", @@ -5629,22 +5595,6 @@ ], "additionalProperties": false }, - "sampleId": { - "type": "object", - "properties": { - "is": { - "type": "array", - "items": { - "type": "string", - "nullable": true - } - } - }, - "required": [ - "is" - ], - "additionalProperties": false - }, "selectedCellType": { "type": "object", "properties": { @@ -5859,7 +5809,7 @@ } } }, - "description": "\nCriteria to filter entities from the search results.\n\nEach filter consists of a field name, a relation (relational operator),\nand an array of field values. The available relations are \"is\",\n\"within\", \"contains\", and \"intersects\". Multiple filters are combined\nusing \"and\" logic. An entity must match all filters to be included in\nthe response. How multiple field values within a single filter are\ncombined depends on the relation.\n\nFor the \"is\" relation, multiple values are combined using \"or\" logic.\nFor example, `{\"fileFormat\": {\"is\": [\"fastq\", \"fastq.gz\"]}}` selects\nentities where the file format is either \"fastq\" or \"fastq.gz\". For the\n\"within\", \"intersects\", and \"contains\" relations, the field values must\ncome in nested pairs specifying upper and lower bounds, and multiple\npairs are combined using \"and\" logic. For example, `{\"donorCount\":\n{\"within\": [[1,5], [5,10]]}}` selects entities whose donor organism\ncount falls within both ranges, i.e., is exactly 5.\n\nThe accessions field supports filtering for a specific accession and/or\nnamespace within a project. For example, `{\"accessions\": {\"is\": [\n{\"namespace\":\"array_express\"}]}}` will filter for projects that have an\n`array_express` accession. Similarly, `{\"accessions\": {\"is\": [\n{\"accession\":\"ERP112843\"}]}}` will filter for projects that have the\naccession `ERP112843` while `{\"accessions\": {\"is\": [\n{\"namespace\":\"array_express\", \"accession\": \"E-AAAA-00\"}]}}` will filter\nfor projects that match both values.\n\nThe organismAge field is special in that it contains two property keys:\nvalue and unit. For example, `{\"organismAge\": {\"is\": [{\"value\": \"20\",\n\"unit\": \"year\"}]}}`. Both keys are required. `{\"organismAge\": {\"is\":\n[null]}}` selects entities that have no organism age.\n\nSupported field names are: accessions, aggregateLastModifiedDate, aggregateSubmissionDate, aggregateUpdateDate, assayType, biologicalSex, bionetworkName, bundleUuid, bundleVersion, cellCount, cellLineType, contactName, contentDescription, dataUseRestriction, developmentStage, donorCount, donorDisease, duosId, effectiveCellCount, effectiveOrgan, entryId, fileFormat, fileId, fileName, fileSize, fileSource, fileVersion, genusSpecies, institution, instrumentManufacturerModel, isIntermediate, isTissueAtlasProject, laboratory, lastModifiedDate, libraryConstructionApproach, matrixCellCount, modelOrgan, modelOrganPart, nucleicAcidSource, organ, organPart, organismAge, organismAgeRange, pairedEnd, preservationMethod, project, projectDescription, projectEstimatedCellCount, projectId, projectTitle, publicationTitle, sampleDisease, sampleEntityType, sampleId, selectedCellType, sha256, sourceId, sourceSpec, specimenDisease, specimenOrgan, specimenOrganPart, submissionDate, tissueAtlas, updateDate, workflow, accessible\n" + "description": "\nCriteria to filter entities from the search results.\n\nEach filter consists of a field name, a relation (relational operator),\nand an array of field values. The available relations are \"is\",\n\"within\", \"contains\", and \"intersects\". Multiple filters are combined\nusing \"and\" logic. An entity must match all filters to be included in\nthe response. How multiple field values within a single filter are\ncombined depends on the relation.\n\nFor the \"is\" relation, multiple values are combined using \"or\" logic.\nFor example, `{\"fileFormat\": {\"is\": [\"fastq\", \"fastq.gz\"]}}` selects\nentities where the file format is either \"fastq\" or \"fastq.gz\". For the\n\"within\", \"intersects\", and \"contains\" relations, the field values must\ncome in nested pairs specifying upper and lower bounds, and multiple\npairs are combined using \"and\" logic. For example, `{\"donorCount\":\n{\"within\": [[1,5], [5,10]]}}` selects entities whose donor organism\ncount falls within both ranges, i.e., is exactly 5.\n\nThe accessions field supports filtering for a specific accession and/or\nnamespace within a project. For example, `{\"accessions\": {\"is\": [\n{\"namespace\":\"array_express\"}]}}` will filter for projects that have an\n`array_express` accession. Similarly, `{\"accessions\": {\"is\": [\n{\"accession\":\"ERP112843\"}]}}` will filter for projects that have the\naccession `ERP112843` while `{\"accessions\": {\"is\": [\n{\"namespace\":\"array_express\", \"accession\": \"E-AAAA-00\"}]}}` will filter\nfor projects that match both values.\n\nThe organismAge field is special in that it contains two property keys:\nvalue and unit. For example, `{\"organismAge\": {\"is\": [{\"value\": \"20\",\n\"unit\": \"year\"}]}}`. Both keys are required. `{\"organismAge\": {\"is\":\n[null]}}` selects entities that have no organism age.\n\nSupported field names are: accessions, aggregateLastModifiedDate, aggregateSubmissionDate, aggregateUpdateDate, assayType, biologicalSex, bionetworkName, bundleUuid, bundleVersion, cellCount, cellLineType, contactName, contentDescription, dataUseRestriction, developmentStage, donorCount, donorDisease, duosId, effectiveCellCount, effectiveOrgan, entryId, fileFormat, fileId, fileName, fileSize, fileSource, fileVersion, genusSpecies, institution, instrumentManufacturerModel, isIntermediate, isTissueAtlasProject, laboratory, lastModifiedDate, libraryConstructionApproach, matrixCellCount, modelOrgan, modelOrganPart, nucleicAcidSource, organ, organPart, organismAge, organismAgeRange, pairedEnd, preservationMethod, project, projectDescription, projectEstimatedCellCount, projectId, projectTitle, publicationTitle, sampleDisease, sampleEntityType, selectedCellType, sha256, sourceId, sourceSpec, specimenDisease, specimenOrgan, specimenOrganPart, submissionDate, tissueAtlas, updateDate, workflow, accessible\n" }, { "name": "entity_type", @@ -5948,7 +5898,6 @@ "publicationTitle", "sampleDisease", "sampleEntityType", - "sampleId", "selectedCellType", "sha256", "sourceId", @@ -7174,22 +7123,6 @@ ], "additionalProperties": false }, - "sampleId": { - "type": "object", - "properties": { - "is": { - "type": "array", - "items": { - "type": "string", - "nullable": true - } - } - }, - "required": [ - "is" - ], - "additionalProperties": false - }, "selectedCellType": { "type": "object", "properties": { @@ -7404,7 +7337,7 @@ } } }, - "description": "\nCriteria to filter entities from the search results.\n\nEach filter consists of a field name, a relation (relational operator),\nand an array of field values. The available relations are \"is\",\n\"within\", \"contains\", and \"intersects\". Multiple filters are combined\nusing \"and\" logic. An entity must match all filters to be included in\nthe response. How multiple field values within a single filter are\ncombined depends on the relation.\n\nFor the \"is\" relation, multiple values are combined using \"or\" logic.\nFor example, `{\"fileFormat\": {\"is\": [\"fastq\", \"fastq.gz\"]}}` selects\nentities where the file format is either \"fastq\" or \"fastq.gz\". For the\n\"within\", \"intersects\", and \"contains\" relations, the field values must\ncome in nested pairs specifying upper and lower bounds, and multiple\npairs are combined using \"and\" logic. For example, `{\"donorCount\":\n{\"within\": [[1,5], [5,10]]}}` selects entities whose donor organism\ncount falls within both ranges, i.e., is exactly 5.\n\nThe accessions field supports filtering for a specific accession and/or\nnamespace within a project. For example, `{\"accessions\": {\"is\": [\n{\"namespace\":\"array_express\"}]}}` will filter for projects that have an\n`array_express` accession. Similarly, `{\"accessions\": {\"is\": [\n{\"accession\":\"ERP112843\"}]}}` will filter for projects that have the\naccession `ERP112843` while `{\"accessions\": {\"is\": [\n{\"namespace\":\"array_express\", \"accession\": \"E-AAAA-00\"}]}}` will filter\nfor projects that match both values.\n\nThe organismAge field is special in that it contains two property keys:\nvalue and unit. For example, `{\"organismAge\": {\"is\": [{\"value\": \"20\",\n\"unit\": \"year\"}]}}`. Both keys are required. `{\"organismAge\": {\"is\":\n[null]}}` selects entities that have no organism age.\n\nSupported field names are: accessions, aggregateLastModifiedDate, aggregateSubmissionDate, aggregateUpdateDate, assayType, biologicalSex, bionetworkName, bundleUuid, bundleVersion, cellCount, cellLineType, contactName, contentDescription, dataUseRestriction, developmentStage, donorCount, donorDisease, duosId, effectiveCellCount, effectiveOrgan, entryId, fileFormat, fileId, fileName, fileSize, fileSource, fileVersion, genusSpecies, institution, instrumentManufacturerModel, isIntermediate, isTissueAtlasProject, laboratory, lastModifiedDate, libraryConstructionApproach, matrixCellCount, modelOrgan, modelOrganPart, nucleicAcidSource, organ, organPart, organismAge, organismAgeRange, pairedEnd, preservationMethod, project, projectDescription, projectEstimatedCellCount, projectId, projectTitle, publicationTitle, sampleDisease, sampleEntityType, sampleId, selectedCellType, sha256, sourceId, sourceSpec, specimenDisease, specimenOrgan, specimenOrganPart, submissionDate, tissueAtlas, updateDate, workflow, accessible\n" + "description": "\nCriteria to filter entities from the search results.\n\nEach filter consists of a field name, a relation (relational operator),\nand an array of field values. The available relations are \"is\",\n\"within\", \"contains\", and \"intersects\". Multiple filters are combined\nusing \"and\" logic. An entity must match all filters to be included in\nthe response. How multiple field values within a single filter are\ncombined depends on the relation.\n\nFor the \"is\" relation, multiple values are combined using \"or\" logic.\nFor example, `{\"fileFormat\": {\"is\": [\"fastq\", \"fastq.gz\"]}}` selects\nentities where the file format is either \"fastq\" or \"fastq.gz\". For the\n\"within\", \"intersects\", and \"contains\" relations, the field values must\ncome in nested pairs specifying upper and lower bounds, and multiple\npairs are combined using \"and\" logic. For example, `{\"donorCount\":\n{\"within\": [[1,5], [5,10]]}}` selects entities whose donor organism\ncount falls within both ranges, i.e., is exactly 5.\n\nThe accessions field supports filtering for a specific accession and/or\nnamespace within a project. For example, `{\"accessions\": {\"is\": [\n{\"namespace\":\"array_express\"}]}}` will filter for projects that have an\n`array_express` accession. Similarly, `{\"accessions\": {\"is\": [\n{\"accession\":\"ERP112843\"}]}}` will filter for projects that have the\naccession `ERP112843` while `{\"accessions\": {\"is\": [\n{\"namespace\":\"array_express\", \"accession\": \"E-AAAA-00\"}]}}` will filter\nfor projects that match both values.\n\nThe organismAge field is special in that it contains two property keys:\nvalue and unit. For example, `{\"organismAge\": {\"is\": [{\"value\": \"20\",\n\"unit\": \"year\"}]}}`. Both keys are required. `{\"organismAge\": {\"is\":\n[null]}}` selects entities that have no organism age.\n\nSupported field names are: accessions, aggregateLastModifiedDate, aggregateSubmissionDate, aggregateUpdateDate, assayType, biologicalSex, bionetworkName, bundleUuid, bundleVersion, cellCount, cellLineType, contactName, contentDescription, dataUseRestriction, developmentStage, donorCount, donorDisease, duosId, effectiveCellCount, effectiveOrgan, entryId, fileFormat, fileId, fileName, fileSize, fileSource, fileVersion, genusSpecies, institution, instrumentManufacturerModel, isIntermediate, isTissueAtlasProject, laboratory, lastModifiedDate, libraryConstructionApproach, matrixCellCount, modelOrgan, modelOrganPart, nucleicAcidSource, organ, organPart, organismAge, organismAgeRange, pairedEnd, preservationMethod, project, projectDescription, projectEstimatedCellCount, projectId, projectTitle, publicationTitle, sampleDisease, sampleEntityType, selectedCellType, sha256, sourceId, sourceSpec, specimenDisease, specimenOrgan, specimenOrganPart, submissionDate, tissueAtlas, updateDate, workflow, accessible\n" } ] }, @@ -8633,22 +8566,6 @@ ], "additionalProperties": false }, - "sampleId": { - "type": "object", - "properties": { - "is": { - "type": "array", - "items": { - "type": "string", - "nullable": true - } - } - }, - "required": [ - "is" - ], - "additionalProperties": false - }, "selectedCellType": { "type": "object", "properties": { @@ -8863,7 +8780,7 @@ } } }, - "description": "\nCriteria to filter entities from the search results.\n\nEach filter consists of a field name, a relation (relational operator),\nand an array of field values. The available relations are \"is\",\n\"within\", \"contains\", and \"intersects\". Multiple filters are combined\nusing \"and\" logic. An entity must match all filters to be included in\nthe response. How multiple field values within a single filter are\ncombined depends on the relation.\n\nFor the \"is\" relation, multiple values are combined using \"or\" logic.\nFor example, `{\"fileFormat\": {\"is\": [\"fastq\", \"fastq.gz\"]}}` selects\nentities where the file format is either \"fastq\" or \"fastq.gz\". For the\n\"within\", \"intersects\", and \"contains\" relations, the field values must\ncome in nested pairs specifying upper and lower bounds, and multiple\npairs are combined using \"and\" logic. For example, `{\"donorCount\":\n{\"within\": [[1,5], [5,10]]}}` selects entities whose donor organism\ncount falls within both ranges, i.e., is exactly 5.\n\nThe accessions field supports filtering for a specific accession and/or\nnamespace within a project. For example, `{\"accessions\": {\"is\": [\n{\"namespace\":\"array_express\"}]}}` will filter for projects that have an\n`array_express` accession. Similarly, `{\"accessions\": {\"is\": [\n{\"accession\":\"ERP112843\"}]}}` will filter for projects that have the\naccession `ERP112843` while `{\"accessions\": {\"is\": [\n{\"namespace\":\"array_express\", \"accession\": \"E-AAAA-00\"}]}}` will filter\nfor projects that match both values.\n\nThe organismAge field is special in that it contains two property keys:\nvalue and unit. For example, `{\"organismAge\": {\"is\": [{\"value\": \"20\",\n\"unit\": \"year\"}]}}`. Both keys are required. `{\"organismAge\": {\"is\":\n[null]}}` selects entities that have no organism age.\n\nSupported field names are: accessions, aggregateLastModifiedDate, aggregateSubmissionDate, aggregateUpdateDate, assayType, biologicalSex, bionetworkName, bundleUuid, bundleVersion, cellCount, cellLineType, contactName, contentDescription, dataUseRestriction, developmentStage, donorCount, donorDisease, duosId, effectiveCellCount, effectiveOrgan, entryId, fileFormat, fileId, fileName, fileSize, fileSource, fileVersion, genusSpecies, institution, instrumentManufacturerModel, isIntermediate, isTissueAtlasProject, laboratory, lastModifiedDate, libraryConstructionApproach, matrixCellCount, modelOrgan, modelOrganPart, nucleicAcidSource, organ, organPart, organismAge, organismAgeRange, pairedEnd, preservationMethod, project, projectDescription, projectEstimatedCellCount, projectId, projectTitle, publicationTitle, sampleDisease, sampleEntityType, sampleId, selectedCellType, sha256, sourceId, sourceSpec, specimenDisease, specimenOrgan, specimenOrganPart, submissionDate, tissueAtlas, updateDate, workflow, accessible\n" + "description": "\nCriteria to filter entities from the search results.\n\nEach filter consists of a field name, a relation (relational operator),\nand an array of field values. The available relations are \"is\",\n\"within\", \"contains\", and \"intersects\". Multiple filters are combined\nusing \"and\" logic. An entity must match all filters to be included in\nthe response. How multiple field values within a single filter are\ncombined depends on the relation.\n\nFor the \"is\" relation, multiple values are combined using \"or\" logic.\nFor example, `{\"fileFormat\": {\"is\": [\"fastq\", \"fastq.gz\"]}}` selects\nentities where the file format is either \"fastq\" or \"fastq.gz\". For the\n\"within\", \"intersects\", and \"contains\" relations, the field values must\ncome in nested pairs specifying upper and lower bounds, and multiple\npairs are combined using \"and\" logic. For example, `{\"donorCount\":\n{\"within\": [[1,5], [5,10]]}}` selects entities whose donor organism\ncount falls within both ranges, i.e., is exactly 5.\n\nThe accessions field supports filtering for a specific accession and/or\nnamespace within a project. For example, `{\"accessions\": {\"is\": [\n{\"namespace\":\"array_express\"}]}}` will filter for projects that have an\n`array_express` accession. Similarly, `{\"accessions\": {\"is\": [\n{\"accession\":\"ERP112843\"}]}}` will filter for projects that have the\naccession `ERP112843` while `{\"accessions\": {\"is\": [\n{\"namespace\":\"array_express\", \"accession\": \"E-AAAA-00\"}]}}` will filter\nfor projects that match both values.\n\nThe organismAge field is special in that it contains two property keys:\nvalue and unit. For example, `{\"organismAge\": {\"is\": [{\"value\": \"20\",\n\"unit\": \"year\"}]}}`. Both keys are required. `{\"organismAge\": {\"is\":\n[null]}}` selects entities that have no organism age.\n\nSupported field names are: accessions, aggregateLastModifiedDate, aggregateSubmissionDate, aggregateUpdateDate, assayType, biologicalSex, bionetworkName, bundleUuid, bundleVersion, cellCount, cellLineType, contactName, contentDescription, dataUseRestriction, developmentStage, donorCount, donorDisease, duosId, effectiveCellCount, effectiveOrgan, entryId, fileFormat, fileId, fileName, fileSize, fileSource, fileVersion, genusSpecies, institution, instrumentManufacturerModel, isIntermediate, isTissueAtlasProject, laboratory, lastModifiedDate, libraryConstructionApproach, matrixCellCount, modelOrgan, modelOrganPart, nucleicAcidSource, organ, organPart, organismAge, organismAgeRange, pairedEnd, preservationMethod, project, projectDescription, projectEstimatedCellCount, projectId, projectTitle, publicationTitle, sampleDisease, sampleEntityType, selectedCellType, sha256, sourceId, sourceSpec, specimenDisease, specimenOrgan, specimenOrganPart, submissionDate, tissueAtlas, updateDate, workflow, accessible\n" } ] } @@ -10050,22 +9967,6 @@ ], "additionalProperties": false }, - "sampleId": { - "type": "object", - "properties": { - "is": { - "type": "array", - "items": { - "type": "string", - "nullable": true - } - } - }, - "required": [ - "is" - ], - "additionalProperties": false - }, "selectedCellType": { "type": "object", "properties": { @@ -10280,7 +10181,7 @@ } } }, - "description": "\nCriteria to filter entities from the search results.\n\nEach filter consists of a field name, a relation (relational operator),\nand an array of field values. The available relations are \"is\",\n\"within\", \"contains\", and \"intersects\". Multiple filters are combined\nusing \"and\" logic. An entity must match all filters to be included in\nthe response. How multiple field values within a single filter are\ncombined depends on the relation.\n\nFor the \"is\" relation, multiple values are combined using \"or\" logic.\nFor example, `{\"fileFormat\": {\"is\": [\"fastq\", \"fastq.gz\"]}}` selects\nentities where the file format is either \"fastq\" or \"fastq.gz\". For the\n\"within\", \"intersects\", and \"contains\" relations, the field values must\ncome in nested pairs specifying upper and lower bounds, and multiple\npairs are combined using \"and\" logic. For example, `{\"donorCount\":\n{\"within\": [[1,5], [5,10]]}}` selects entities whose donor organism\ncount falls within both ranges, i.e., is exactly 5.\n\nThe accessions field supports filtering for a specific accession and/or\nnamespace within a project. For example, `{\"accessions\": {\"is\": [\n{\"namespace\":\"array_express\"}]}}` will filter for projects that have an\n`array_express` accession. Similarly, `{\"accessions\": {\"is\": [\n{\"accession\":\"ERP112843\"}]}}` will filter for projects that have the\naccession `ERP112843` while `{\"accessions\": {\"is\": [\n{\"namespace\":\"array_express\", \"accession\": \"E-AAAA-00\"}]}}` will filter\nfor projects that match both values.\n\nThe organismAge field is special in that it contains two property keys:\nvalue and unit. For example, `{\"organismAge\": {\"is\": [{\"value\": \"20\",\n\"unit\": \"year\"}]}}`. Both keys are required. `{\"organismAge\": {\"is\":\n[null]}}` selects entities that have no organism age.\n\nSupported field names are: accessions, aggregateLastModifiedDate, aggregateSubmissionDate, aggregateUpdateDate, assayType, biologicalSex, bionetworkName, bundleUuid, bundleVersion, cellCount, cellLineType, contactName, contentDescription, dataUseRestriction, developmentStage, donorCount, donorDisease, duosId, effectiveCellCount, effectiveOrgan, entryId, fileFormat, fileId, fileName, fileSize, fileSource, fileVersion, genusSpecies, institution, instrumentManufacturerModel, isIntermediate, isTissueAtlasProject, laboratory, lastModifiedDate, libraryConstructionApproach, matrixCellCount, modelOrgan, modelOrganPart, nucleicAcidSource, organ, organPart, organismAge, organismAgeRange, pairedEnd, preservationMethod, project, projectDescription, projectEstimatedCellCount, projectId, projectTitle, publicationTitle, sampleDisease, sampleEntityType, sampleId, selectedCellType, sha256, sourceId, sourceSpec, specimenDisease, specimenOrgan, specimenOrganPart, submissionDate, tissueAtlas, updateDate, workflow, accessible\n" + "description": "\nCriteria to filter entities from the search results.\n\nEach filter consists of a field name, a relation (relational operator),\nand an array of field values. The available relations are \"is\",\n\"within\", \"contains\", and \"intersects\". Multiple filters are combined\nusing \"and\" logic. An entity must match all filters to be included in\nthe response. How multiple field values within a single filter are\ncombined depends on the relation.\n\nFor the \"is\" relation, multiple values are combined using \"or\" logic.\nFor example, `{\"fileFormat\": {\"is\": [\"fastq\", \"fastq.gz\"]}}` selects\nentities where the file format is either \"fastq\" or \"fastq.gz\". For the\n\"within\", \"intersects\", and \"contains\" relations, the field values must\ncome in nested pairs specifying upper and lower bounds, and multiple\npairs are combined using \"and\" logic. For example, `{\"donorCount\":\n{\"within\": [[1,5], [5,10]]}}` selects entities whose donor organism\ncount falls within both ranges, i.e., is exactly 5.\n\nThe accessions field supports filtering for a specific accession and/or\nnamespace within a project. For example, `{\"accessions\": {\"is\": [\n{\"namespace\":\"array_express\"}]}}` will filter for projects that have an\n`array_express` accession. Similarly, `{\"accessions\": {\"is\": [\n{\"accession\":\"ERP112843\"}]}}` will filter for projects that have the\naccession `ERP112843` while `{\"accessions\": {\"is\": [\n{\"namespace\":\"array_express\", \"accession\": \"E-AAAA-00\"}]}}` will filter\nfor projects that match both values.\n\nThe organismAge field is special in that it contains two property keys:\nvalue and unit. For example, `{\"organismAge\": {\"is\": [{\"value\": \"20\",\n\"unit\": \"year\"}]}}`. Both keys are required. `{\"organismAge\": {\"is\":\n[null]}}` selects entities that have no organism age.\n\nSupported field names are: accessions, aggregateLastModifiedDate, aggregateSubmissionDate, aggregateUpdateDate, assayType, biologicalSex, bionetworkName, bundleUuid, bundleVersion, cellCount, cellLineType, contactName, contentDescription, dataUseRestriction, developmentStage, donorCount, donorDisease, duosId, effectiveCellCount, effectiveOrgan, entryId, fileFormat, fileId, fileName, fileSize, fileSource, fileVersion, genusSpecies, institution, instrumentManufacturerModel, isIntermediate, isTissueAtlasProject, laboratory, lastModifiedDate, libraryConstructionApproach, matrixCellCount, modelOrgan, modelOrganPart, nucleicAcidSource, organ, organPart, organismAge, organismAgeRange, pairedEnd, preservationMethod, project, projectDescription, projectEstimatedCellCount, projectId, projectTitle, publicationTitle, sampleDisease, sampleEntityType, selectedCellType, sha256, sourceId, sourceSpec, specimenDisease, specimenOrgan, specimenOrganPart, submissionDate, tissueAtlas, updateDate, workflow, accessible\n" }, { "name": "format", @@ -11572,22 +11473,6 @@ ], "additionalProperties": false }, - "sampleId": { - "type": "object", - "properties": { - "is": { - "type": "array", - "items": { - "type": "string", - "nullable": true - } - } - }, - "required": [ - "is" - ], - "additionalProperties": false - }, "selectedCellType": { "type": "object", "properties": { @@ -11802,7 +11687,7 @@ } } }, - "description": "\nCriteria to filter entities from the search results.\n\nEach filter consists of a field name, a relation (relational operator),\nand an array of field values. The available relations are \"is\",\n\"within\", \"contains\", and \"intersects\". Multiple filters are combined\nusing \"and\" logic. An entity must match all filters to be included in\nthe response. How multiple field values within a single filter are\ncombined depends on the relation.\n\nFor the \"is\" relation, multiple values are combined using \"or\" logic.\nFor example, `{\"fileFormat\": {\"is\": [\"fastq\", \"fastq.gz\"]}}` selects\nentities where the file format is either \"fastq\" or \"fastq.gz\". For the\n\"within\", \"intersects\", and \"contains\" relations, the field values must\ncome in nested pairs specifying upper and lower bounds, and multiple\npairs are combined using \"and\" logic. For example, `{\"donorCount\":\n{\"within\": [[1,5], [5,10]]}}` selects entities whose donor organism\ncount falls within both ranges, i.e., is exactly 5.\n\nThe accessions field supports filtering for a specific accession and/or\nnamespace within a project. For example, `{\"accessions\": {\"is\": [\n{\"namespace\":\"array_express\"}]}}` will filter for projects that have an\n`array_express` accession. Similarly, `{\"accessions\": {\"is\": [\n{\"accession\":\"ERP112843\"}]}}` will filter for projects that have the\naccession `ERP112843` while `{\"accessions\": {\"is\": [\n{\"namespace\":\"array_express\", \"accession\": \"E-AAAA-00\"}]}}` will filter\nfor projects that match both values.\n\nThe organismAge field is special in that it contains two property keys:\nvalue and unit. For example, `{\"organismAge\": {\"is\": [{\"value\": \"20\",\n\"unit\": \"year\"}]}}`. Both keys are required. `{\"organismAge\": {\"is\":\n[null]}}` selects entities that have no organism age.\n\nSupported field names are: accessions, aggregateLastModifiedDate, aggregateSubmissionDate, aggregateUpdateDate, assayType, biologicalSex, bionetworkName, bundleUuid, bundleVersion, cellCount, cellLineType, contactName, contentDescription, dataUseRestriction, developmentStage, donorCount, donorDisease, duosId, effectiveCellCount, effectiveOrgan, entryId, fileFormat, fileId, fileName, fileSize, fileSource, fileVersion, genusSpecies, institution, instrumentManufacturerModel, isIntermediate, isTissueAtlasProject, laboratory, lastModifiedDate, libraryConstructionApproach, matrixCellCount, modelOrgan, modelOrganPart, nucleicAcidSource, organ, organPart, organismAge, organismAgeRange, pairedEnd, preservationMethod, project, projectDescription, projectEstimatedCellCount, projectId, projectTitle, publicationTitle, sampleDisease, sampleEntityType, sampleId, selectedCellType, sha256, sourceId, sourceSpec, specimenDisease, specimenOrgan, specimenOrganPart, submissionDate, tissueAtlas, updateDate, workflow, accessible\n" + "description": "\nCriteria to filter entities from the search results.\n\nEach filter consists of a field name, a relation (relational operator),\nand an array of field values. The available relations are \"is\",\n\"within\", \"contains\", and \"intersects\". Multiple filters are combined\nusing \"and\" logic. An entity must match all filters to be included in\nthe response. How multiple field values within a single filter are\ncombined depends on the relation.\n\nFor the \"is\" relation, multiple values are combined using \"or\" logic.\nFor example, `{\"fileFormat\": {\"is\": [\"fastq\", \"fastq.gz\"]}}` selects\nentities where the file format is either \"fastq\" or \"fastq.gz\". For the\n\"within\", \"intersects\", and \"contains\" relations, the field values must\ncome in nested pairs specifying upper and lower bounds, and multiple\npairs are combined using \"and\" logic. For example, `{\"donorCount\":\n{\"within\": [[1,5], [5,10]]}}` selects entities whose donor organism\ncount falls within both ranges, i.e., is exactly 5.\n\nThe accessions field supports filtering for a specific accession and/or\nnamespace within a project. For example, `{\"accessions\": {\"is\": [\n{\"namespace\":\"array_express\"}]}}` will filter for projects that have an\n`array_express` accession. Similarly, `{\"accessions\": {\"is\": [\n{\"accession\":\"ERP112843\"}]}}` will filter for projects that have the\naccession `ERP112843` while `{\"accessions\": {\"is\": [\n{\"namespace\":\"array_express\", \"accession\": \"E-AAAA-00\"}]}}` will filter\nfor projects that match both values.\n\nThe organismAge field is special in that it contains two property keys:\nvalue and unit. For example, `{\"organismAge\": {\"is\": [{\"value\": \"20\",\n\"unit\": \"year\"}]}}`. Both keys are required. `{\"organismAge\": {\"is\":\n[null]}}` selects entities that have no organism age.\n\nSupported field names are: accessions, aggregateLastModifiedDate, aggregateSubmissionDate, aggregateUpdateDate, assayType, biologicalSex, bionetworkName, bundleUuid, bundleVersion, cellCount, cellLineType, contactName, contentDescription, dataUseRestriction, developmentStage, donorCount, donorDisease, duosId, effectiveCellCount, effectiveOrgan, entryId, fileFormat, fileId, fileName, fileSize, fileSource, fileVersion, genusSpecies, institution, instrumentManufacturerModel, isIntermediate, isTissueAtlasProject, laboratory, lastModifiedDate, libraryConstructionApproach, matrixCellCount, modelOrgan, modelOrganPart, nucleicAcidSource, organ, organPart, organismAge, organismAgeRange, pairedEnd, preservationMethod, project, projectDescription, projectEstimatedCellCount, projectId, projectTitle, publicationTitle, sampleDisease, sampleEntityType, selectedCellType, sha256, sourceId, sourceSpec, specimenDisease, specimenOrgan, specimenOrganPart, submissionDate, tissueAtlas, updateDate, workflow, accessible\n" }, { "name": "format", diff --git a/src/azul/plugins/metadata/anvil/indexer/aggregate.py b/src/azul/plugins/metadata/anvil/indexer/aggregate.py index bb40b75bf7..051b452ffd 100644 --- a/src/azul/plugins/metadata/anvil/indexer/aggregate.py +++ b/src/azul/plugins/metadata/anvil/indexer/aggregate.py @@ -13,6 +13,7 @@ Accumulator, DistinctAccumulator, GroupingAggregator, + SetAccumulator, SetOfDictAccumulator, SimpleAggregator, SumAccumulator, @@ -23,13 +24,28 @@ class ActivityAggregator(SimpleAggregator): - pass + + def _accumulator(self, field: str) -> Accumulator | None: + if field in ('activity_id', 'document_id', 'source_datarepo_row_ids'): + # Added to the files aggregate in order to be included in manifests + if self.outer_entity_type == 'files': + return super()._accumulator(field) + else: + return None + else: + return super()._accumulator(field) class BiosampleAggregator(SimpleAggregator): def _accumulator(self, field: str) -> Accumulator | None: - if field == 'donor_age_at_collection': + if field in ('biosample_id', 'document_id', 'source_datarepo_row_ids'): + # Added to the files aggregate in order to be included in manifests + if self.outer_entity_type == 'files': + return super()._accumulator(field) + else: + return None + elif field == 'donor_age_at_collection': return SetOfDictAccumulator(max_size=100, key=compose_keys(none_safe_tuple_key(none_last=True), itemgetter('lte', 'gte'))) @@ -38,13 +54,33 @@ def _accumulator(self, field: str) -> Accumulator | None: class DatasetAggregator(SimpleAggregator): - pass + + def _accumulator(self, field: str) -> Accumulator | None: + # dataset.document_id aggregation is required for creating of manifests + if field == 'document_id': + return super()._accumulator(field) + elif field == 'source_datarepo_row_ids': + # Added to the files aggregate in order to be included in manifests + if self.outer_entity_type == 'files': + return super()._accumulator(field) + else: + return None + else: + return super()._accumulator(field) class DiagnosisAggregator(SimpleAggregator): def _accumulator(self, field: str) -> Accumulator | None: - if field in ('diagnosis_age', 'onset_age'): + if field in ('diagnosis_id', 'document_id', 'source_datarepo_row_ids'): + # Added to the files aggregate in order to be included in manifests + if self.outer_entity_type == 'files': + return super()._accumulator(field) + else: + return None + elif field == 'disease': + return SetAccumulator(max_size=14100) + elif field in ('diagnosis_age', 'onset_age'): return SetOfDictAccumulator(max_size=100, key=compose_keys(none_safe_tuple_key(none_last=True), itemgetter('lte', 'gte'))) @@ -53,7 +89,16 @@ def _accumulator(self, field: str) -> Accumulator | None: class DonorAggregator(SimpleAggregator): - pass + + def _accumulator(self, field: str) -> Accumulator | None: + if field in ('document_id', 'donor_id', 'source_datarepo_row_ids'): + # Added to the files aggregate in order to be included in manifests + if self.outer_entity_type == 'files': + return super()._accumulator(field) + else: + return None + else: + return super()._accumulator(field) class FileAggregator(GroupingAggregator): @@ -72,7 +117,17 @@ def _group_keys(self, entity) -> tuple[Any, ...]: return entity['file_format'], def _accumulator(self, field: str) -> Accumulator | None: - if field in ('count', 'file_size'): + if field in ( + 'document_id', + 'drs_uri', + 'file_id', + 'file_md5sum', + 'file_name', + 'source_datarepo_row_ids', + 'version', + ): + return None + elif field in ('count', 'file_size'): return DistinctAccumulator(SumAccumulator()) else: return super()._accumulator(field) diff --git a/src/azul/plugins/metadata/hca/__init__.py b/src/azul/plugins/metadata/hca/__init__.py index 1110f46ecd..742721785e 100644 --- a/src/azul/plugins/metadata/hca/__init__.py +++ b/src/azul/plugins/metadata/hca/__init__.py @@ -183,7 +183,7 @@ def exposed_indices(self) -> dict[EntityType, Sorting]: files=Sorting(field_name='fileName'), projects=Sorting(field_name='projectTitle', max_page_size=75), - samples=Sorting(field_name='sampleId') + samples=Sorting(field_name='entryId') ) @property @@ -276,7 +276,6 @@ def _field_mapping(self) -> InverseFieldMapping: 'donor_count': 'donorCount' }, 'samples': { - 'biomaterial_id': 'sampleId', 'entity_type': 'sampleEntityType', 'organ': 'organ', 'organ_part': 'organPart', diff --git a/src/azul/plugins/metadata/hca/indexer/aggregate.py b/src/azul/plugins/metadata/hca/indexer/aggregate.py index 818ff53457..b4a5918fb8 100644 --- a/src/azul/plugins/metadata/hca/indexer/aggregate.py +++ b/src/azul/plugins/metadata/hca/indexer/aggregate.py @@ -116,11 +116,39 @@ def _default_accumulator(self) -> Accumulator | None: class SampleAggregator(SimpleAggregator): - pass + + def _accumulator(self, field) -> Accumulator | None: + if field in ('biomaterial_id', 'document_id'): + # Added to the files aggregate in order to be included in manifests + if self.outer_entity_type == 'files': + return super()._accumulator(field) + else: + return None + else: + return super()._accumulator(field) class SpecimenAggregator(SimpleAggregator): - pass + + def _accumulator(self, field) -> Accumulator | None: + if field == 'biomaterial_id': + # Added to the files aggregate in order to be included in manifests + if self.outer_entity_type == 'files': + return super()._accumulator(field) + else: + return None + elif field == 'document_id': + # Added to the files aggregate in order to be included in manifests. + # It is also added to the samples aggregate for the calculation of + # the summary response field `specimenCount`, which is okay since + # there should only be one specimen inner entity in any samples + # outer entity. + if self.outer_entity_type in ('samples', 'files'): + return super()._accumulator(field) + else: + return None + else: + return super()._accumulator(field) class CellSuspensionAggregator(GroupingAggregator): @@ -143,14 +171,29 @@ def _group_keys(self, entity) -> tuple[Any, ...]: return frozenset(entity['organ']), def _accumulator(self, field) -> Accumulator | None: - if field in self.cell_count_fields: + if field in ('biomaterial_id', 'document_id'): + # Added to the files aggregate in order to be included in manifests + if self.outer_entity_type == 'files': + return super()._accumulator(field) + else: + return None + elif field in self.cell_count_fields: return DistinctAccumulator(SumAccumulator()) else: return super()._accumulator(field) class CellLineAggregator(SimpleAggregator): - pass + + def _accumulator(self, field) -> Accumulator | None: + if field in ('biomaterial_id', 'document_id'): + # Added to the files aggregate in order to be included in manifests + if self.outer_entity_type == 'files': + return super()._accumulator(field) + else: + return None + else: + return super()._accumulator(field) class DonorOrganismAggregator(SimpleAggregator): @@ -162,14 +205,12 @@ def _transform_entity(self, entity: JSON) -> JSON: } def _accumulator(self, field) -> Accumulator | None: - if field == 'organism_age_range': - return SetAccumulator(max_size=100) - elif field == 'organism_age': - return SetOfDictAccumulator(max_size=100, - key=compose_keys(none_safe_tuple_key(none_last=True), - none_safe_itemgetter('value', 'unit'))) - elif field == 'donor_count': - return UniqueValueCountAccumulator() + if field == 'biomaterial_id': + # Added to the files aggregate in order to be included in manifests + if self.outer_entity_type == 'files': + return super()._accumulator(field) + else: + return None elif field == 'document_id': # If any donor IDs are missing from the aggregate, those donors will # be omitted during the verbatim handover. Donors are a "hot" entity @@ -179,17 +220,38 @@ def _accumulator(self, field) -> Accumulator | None: # FIXME: Enforce that hot entity types are completely aggregated # https://github.com/DataBiosphere/azul/issues/6793 return SetAccumulator(max_size=100) + elif field == 'development_stage': + return SetAccumulator(max_size=200) + elif field == 'organism_age_range': + return SetAccumulator(max_size=200) + elif field == 'organism_age': + return SetOfDictAccumulator(max_size=200, + key=compose_keys(none_safe_tuple_key(none_last=True), + none_safe_itemgetter('value', 'unit'))) + elif field == 'donor_count': + return UniqueValueCountAccumulator() else: return super()._accumulator(field) class OrganoidAggregator(SimpleAggregator): - pass + + def _accumulator(self, field) -> Accumulator | None: + if field in ('biomaterial_id', 'document_id'): + # Added to the files aggregate in order to be included in manifests + if self.outer_entity_type == 'files': + return super()._accumulator(field) + else: + return None + else: + return super()._accumulator(field) class ProjectAggregator(SimpleAggregator): def _accumulator(self, field) -> Accumulator | None: + # Aggregation of `document_id` is required to allow filters using + # the `projectId` field on non-project endpoints. if field == 'document_id': return SetAccumulator(max_size=100) elif field in ('project_description', @@ -212,9 +274,7 @@ def _accumulator(self, field) -> Accumulator | None: class ProtocolAggregator(SimpleAggregator): def _accumulator(self, field) -> Accumulator | None: - if field == 'assay_type': - return FrequencySetAccumulator(max_size=100) - elif field == 'document_id': + if field == 'document_id': # If any protocol IDs are missing from the aggregate, those # protocols may be omitted during the verbatim handover. Some # protocols are "hot" entity types, and we can't track their hubs in @@ -223,6 +283,8 @@ def _accumulator(self, field) -> Accumulator | None: # FIXME: Enforce that hot entity types are completely aggregated # https://github.com/DataBiosphere/azul/issues/6793 return SetAccumulator(max_size=100) + elif field == 'assay_type': + return FrequencySetAccumulator(max_size=100) else: return super()._accumulator(field) @@ -231,11 +293,30 @@ def _default_accumulator(self) -> Accumulator | None: class SequencingInputAggregator(SimpleAggregator): - pass + + def _accumulator(self, field) -> Accumulator | None: + if field in ('biomaterial_id', 'document_id'): + # Added to the files aggregate in order to be included in manifests + if self.outer_entity_type == 'files': + return super()._accumulator(field) + else: + return None + else: + return super()._accumulator(field) class SequencingProcessAggregator(SimpleAggregator): + def _accumulator(self, field) -> Accumulator | None: + if field == 'document_id': + # Added to the files aggregate in order to be included in manifests + if self.outer_entity_type == 'files': + return super()._accumulator(field) + else: + return None + else: + return super()._accumulator(field) + def _default_accumulator(self) -> Accumulator | None: return SetAccumulator(max_size=10) @@ -246,7 +327,7 @@ def _accumulator(self, field) -> Accumulator | None: if field == 'document_id': return None elif field == 'file': - return DictAccumulator(max_size=100, key=itemgetter('uuid')) + return DictAccumulator(max_size=600, key=itemgetter('uuid')) else: return SetAccumulator() diff --git a/src/azul/plugins/metadata/hca/service/response.py b/src/azul/plugins/metadata/hca/service/response.py index 8cf051f716..74bb7af9d1 100644 --- a/src/azul/plugins/metadata/hca/service/response.py +++ b/src/azul/plugins/metadata/hca/service/response.py @@ -411,7 +411,6 @@ def make_file(self, file: JSON) -> JSON: 'size': file.get('size'), 'fileSource': file.get('file_source'), self.plugin.special_fields.file_uuid.name_in_hit: file.get('uuid'), - 'version': file.get('version'), 'matrixCellCount': file.get('matrix_cell_count'), 'drs_uri': file.get('drs_uri'), 'azul_url': self._file_url(uuid=json_str(file['uuid']), @@ -422,7 +421,6 @@ def make_file(self, file: JSON) -> JSON: def make_specimen(self, specimen) -> MutableJSON: return { - 'id': specimen['biomaterial_id'], 'organ': specimen.get('organ', None), 'organPart': specimen.get('organ_part', None), 'disease': specimen.get('disease', None), @@ -452,7 +450,6 @@ def make_cell_suspensions(self, entry) -> MutableJSONs: def make_cell_line(self, cell_line) -> MutableJSON: return { - 'id': cell_line['biomaterial_id'], 'cellLineType': cell_line.get('cell_line_type', None), 'modelOrgan': cell_line.get('model_organ', None), } @@ -462,7 +459,6 @@ def make_cell_lines(self, entry) -> MutableJSONs: def make_donor(self, donor) -> MutableJSON: return { - 'id': donor['biomaterial_id'], 'donorCount': donor.get('donor_count', None), 'developmentStage': donor.get('development_stage', None), 'genusSpecies': donor.get('genus_species', None), @@ -477,7 +473,6 @@ def make_donors(self, entry) -> MutableJSONs: def make_organoid(self, organoid) -> MutableJSON: return { - 'id': organoid['biomaterial_id'], 'modelOrgan': organoid.get('model_organ', None), 'modelOrganPart': organoid.get('model_organ_part', None) } @@ -486,11 +481,12 @@ def make_organoids(self, entry) -> MutableJSONs: return [self.make_organoid(organoid) for organoid in entry['contents']['organoids']] def make_sample(self, sample, entity_dict, entity_type) -> MutableJSON: - is_aggregate = isinstance(sample['document_id'], list) organ_prop = 'organ' if entity_type == 'specimens' else 'model_organ' + effective_organ = sample[organ_prop] + is_aggregate = isinstance(effective_organ, list) return { 'sampleEntityType': [entity_type] if is_aggregate else entity_type, - 'effectiveOrgan': sample[organ_prop], + 'effectiveOrgan': effective_organ, **entity_dict } diff --git a/test/indexer/data/826dea02-e274-affe-aabc-eb3db63ad068.results.json b/test/indexer/data/826dea02-e274-affe-aabc-eb3db63ad068.results.json index 30b2d92fed..47c6227a9a 100644 --- a/test/indexer/data/826dea02-e274-affe-aabc-eb3db63ad068.results.json +++ b/test/indexer/data/826dea02-e274-affe-aabc-eb3db63ad068.results.json @@ -28,15 +28,6 @@ ], "biosamples": [ { - "document_id": [ - "826dea02-e274-4ffe-aabc-eb3db63ad068" - ], - "source_datarepo_row_ids": [ - "sample:98048c3b-2525-4090-94fd-477de31f2608" - ], - "biosample_id": [ - "f9d40cf6-37b8-22f3-ce35-0dc614d2452b" - ], "anatomical_site": [ "~null" ], @@ -66,9 +57,6 @@ "document_id": [ "2370f948-2783-4eb6-afea-e022897f4dcf" ], - "source_datarepo_row_ids": [ - "workspace_attributes:7a22b629-9d81-4e4d-9297-f9e44ed760bc" - ], "dataset_id": [ "52ee7665-7033-63f2-a8d9-ce8e32666739" ], @@ -97,17 +85,6 @@ ], "diagnoses": [ { - "document_id": [ - "15d85d30-ad4a-4f50-87a8-a27f59dd1b5f", - "939a4bd3-86ed-4a8a-81f4-fbe0ee673461" - ], - "source_datarepo_row_ids": [ - "subject:c23887a0-20c1-44e4-a09e-1c5dfdc2d0ef" - ], - "diagnosis_id": [ - "25ff8d32-18c9-fc3e-020a-5de20d35d906", - "5ebe9bc4-a1be-0ddf-7277-b1e88276d0f6" - ], "disease": [ "redacted-A61iJlLx", "redacted-g50ublm/" @@ -140,15 +117,6 @@ ], "donors": [ { - "document_id": [ - "bfd991f2-2797-4083-972a-da7c6d7f1b2e" - ], - "source_datarepo_row_ids": [ - "subject:c23887a0-20c1-44e4-a09e-1c5dfdc2d0ef" - ], - "donor_id": [ - "1e2bd7e5-f45e-a391-daea-7c060be76acd" - ], "organism_type": [ "redacted-ACw+6ecI" ], @@ -165,15 +133,6 @@ ], "files": [ { - "document_id": [ - "15b76f9c-6b46-433f-851d-34e89f1b9ba6" - ], - "source_datarepo_row_ids": [ - "file_inventory:81d16471-97ac-48fe-99a0-73d9ec62c2c0" - ], - "file_id": [ - "1e269f04-4347-4188-b060-1dcc69e71d67" - ], "data_modality": [ "~null" ], @@ -182,24 +141,12 @@ ], "file_size": 213021639, "file_size_": 213021639, - "file_md5sum": [ - "beec606ee0aa299fdf913f4259316622" - ], "reference_assembly": [ "~null" ], - "file_name": [ - "307500.merged.matefixed.sorted.markeddups.recal.g.vcf.gz" - ], "is_supplementary": [ 0 ], - "version": [ - "2022-06-01T00:00:00.000000Z" - ], - "drs_uri": [ - "drs://mock_tdr.lan/v1_6c87f0e1-509d-46a4-b845-7584df39263b_1e269f04-4347-4188-b060-1dcc69e71d67" - ], "count": 1 } ] @@ -923,18 +870,6 @@ "contents": { "activities": [ { - "document_id": [ - "1509ef40-d1ba-440d-b298-16b7c173dcd4", - "816e364e-1193-4e5b-a91a-14e4b009157c" - ], - "source_datarepo_row_ids": [ - "sequencing:a6c663c7-6f26-4ed2-af9d-48e9c709a22b", - "sequencing:d4f6c0c4-1e11-438e-8218-cfea63b8b051" - ], - "activity_id": [ - "18b3be87-e26b-4376-0d8d-c1e370e90e07", - "a60c5138-3749-f7cb-8714-52d389ad5231" - ], "activity_table": [ "anvil_sequencingactivity" ], @@ -954,15 +889,6 @@ ], "biosamples": [ { - "document_id": [ - "826dea02-e274-4ffe-aabc-eb3db63ad068" - ], - "source_datarepo_row_ids": [ - "sample:98048c3b-2525-4090-94fd-477de31f2608" - ], - "biosample_id": [ - "f9d40cf6-37b8-22f3-ce35-0dc614d2452b" - ], "anatomical_site": [ "~null" ], @@ -1017,17 +943,6 @@ ], "diagnoses": [ { - "document_id": [ - "15d85d30-ad4a-4f50-87a8-a27f59dd1b5f", - "939a4bd3-86ed-4a8a-81f4-fbe0ee673461" - ], - "source_datarepo_row_ids": [ - "subject:c23887a0-20c1-44e4-a09e-1c5dfdc2d0ef" - ], - "diagnosis_id": [ - "25ff8d32-18c9-fc3e-020a-5de20d35d906", - "5ebe9bc4-a1be-0ddf-7277-b1e88276d0f6" - ], "disease": [ "redacted-A61iJlLx", "redacted-g50ublm/" @@ -1060,15 +975,6 @@ ], "donors": [ { - "document_id": [ - "bfd991f2-2797-4083-972a-da7c6d7f1b2e" - ], - "source_datarepo_row_ids": [ - "subject:c23887a0-20c1-44e4-a09e-1c5dfdc2d0ef" - ], - "donor_id": [ - "1e2bd7e5-f45e-a391-daea-7c060be76acd" - ], "organism_type": [ "redacted-ACw+6ecI" ], @@ -1085,15 +991,6 @@ ], "files": [ { - "document_id": [ - "15b76f9c-6b46-433f-851d-34e89f1b9ba6" - ], - "source_datarepo_row_ids": [ - "file_inventory:81d16471-97ac-48fe-99a0-73d9ec62c2c0" - ], - "file_id": [ - "1e269f04-4347-4188-b060-1dcc69e71d67" - ], "data_modality": [ "~null" ], @@ -1102,36 +999,15 @@ ], "file_size": 213021639, "file_size_": 213021639, - "file_md5sum": [ - "beec606ee0aa299fdf913f4259316622" - ], "reference_assembly": [ "~null" ], - "file_name": [ - "307500.merged.matefixed.sorted.markeddups.recal.g.vcf.gz" - ], "is_supplementary": [ 0 ], - "version": [ - "2022-06-01T00:00:00.000000Z" - ], - "drs_uri": [ - "drs://mock_tdr.lan/v1_6c87f0e1-509d-46a4-b845-7584df39263b_1e269f04-4347-4188-b060-1dcc69e71d67" - ], "count": 1 }, { - "document_id": [ - "3b17377b-16b1-431c-9967-e5d01fc5923f" - ], - "source_datarepo_row_ids": [ - "file_inventory:9658d94a-511d-4b49-82c3-d0cb07e0cff2" - ], - "file_id": [ - "8b722e88-8103-49c1-b351-e64fa7c6ab37" - ], "data_modality": [ "~null" ], @@ -1140,24 +1016,12 @@ ], "file_size": 3306845592, "file_size_": 3306845592, - "file_md5sum": [ - "7cd9fd7b54a8bf380e44e93706f1fa2d" - ], "reference_assembly": [ "~null" ], - "file_name": [ - "307500.merged.matefixed.sorted.markeddups.recal.bam" - ], "is_supplementary": [ 0 ], - "version": [ - "2022-06-01T00:00:00.000000Z" - ], - "drs_uri": [ - "drs://mock_tdr.lan/v1_6c87f0e1-509d-46a4-b845-7584df39263b_8b722e88-8103-49c1-b351-e64fa7c6ab37" - ], "count": 1 } ] @@ -1903,15 +1767,6 @@ ], "biosamples": [ { - "document_id": [ - "826dea02-e274-4ffe-aabc-eb3db63ad068" - ], - "source_datarepo_row_ids": [ - "sample:98048c3b-2525-4090-94fd-477de31f2608" - ], - "biosample_id": [ - "f9d40cf6-37b8-22f3-ce35-0dc614d2452b" - ], "anatomical_site": [ "~null" ], @@ -1941,9 +1796,6 @@ "document_id": [ "2370f948-2783-4eb6-afea-e022897f4dcf" ], - "source_datarepo_row_ids": [ - "workspace_attributes:7a22b629-9d81-4e4d-9297-f9e44ed760bc" - ], "dataset_id": [ "52ee7665-7033-63f2-a8d9-ce8e32666739" ], @@ -1972,17 +1824,6 @@ ], "diagnoses": [ { - "document_id": [ - "15d85d30-ad4a-4f50-87a8-a27f59dd1b5f", - "939a4bd3-86ed-4a8a-81f4-fbe0ee673461" - ], - "source_datarepo_row_ids": [ - "subject:c23887a0-20c1-44e4-a09e-1c5dfdc2d0ef" - ], - "diagnosis_id": [ - "25ff8d32-18c9-fc3e-020a-5de20d35d906", - "5ebe9bc4-a1be-0ddf-7277-b1e88276d0f6" - ], "disease": [ "redacted-A61iJlLx", "redacted-g50ublm/" @@ -2015,15 +1856,6 @@ ], "donors": [ { - "document_id": [ - "bfd991f2-2797-4083-972a-da7c6d7f1b2e" - ], - "source_datarepo_row_ids": [ - "subject:c23887a0-20c1-44e4-a09e-1c5dfdc2d0ef" - ], - "donor_id": [ - "1e2bd7e5-f45e-a391-daea-7c060be76acd" - ], "organism_type": [ "redacted-ACw+6ecI" ], @@ -2040,15 +1872,6 @@ ], "files": [ { - "document_id": [ - "3b17377b-16b1-431c-9967-e5d01fc5923f" - ], - "source_datarepo_row_ids": [ - "file_inventory:9658d94a-511d-4b49-82c3-d0cb07e0cff2" - ], - "file_id": [ - "8b722e88-8103-49c1-b351-e64fa7c6ab37" - ], "data_modality": [ "~null" ], @@ -2057,24 +1880,12 @@ ], "file_size": 3306845592, "file_size_": 3306845592, - "file_md5sum": [ - "7cd9fd7b54a8bf380e44e93706f1fa2d" - ], "reference_assembly": [ "~null" ], - "file_name": [ - "307500.merged.matefixed.sorted.markeddups.recal.bam" - ], "is_supplementary": [ 0 ], - "version": [ - "2022-06-01T00:00:00.000000Z" - ], - "drs_uri": [ - "drs://mock_tdr.lan/v1_6c87f0e1-509d-46a4-b845-7584df39263b_8b722e88-8103-49c1-b351-e64fa7c6ab37" - ], "count": 1 } ] @@ -2324,18 +2135,6 @@ "contents": { "activities": [ { - "document_id": [ - "1509ef40-d1ba-440d-b298-16b7c173dcd4", - "816e364e-1193-4e5b-a91a-14e4b009157c" - ], - "source_datarepo_row_ids": [ - "sequencing:a6c663c7-6f26-4ed2-af9d-48e9c709a22b", - "sequencing:d4f6c0c4-1e11-438e-8218-cfea63b8b051" - ], - "activity_id": [ - "18b3be87-e26b-4376-0d8d-c1e370e90e07", - "a60c5138-3749-f7cb-8714-52d389ad5231" - ], "activity_table": [ "anvil_sequencingactivity" ], @@ -2379,9 +2178,6 @@ "document_id": [ "2370f948-2783-4eb6-afea-e022897f4dcf" ], - "source_datarepo_row_ids": [ - "workspace_attributes:7a22b629-9d81-4e4d-9297-f9e44ed760bc" - ], "dataset_id": [ "52ee7665-7033-63f2-a8d9-ce8e32666739" ], @@ -2410,17 +2206,6 @@ ], "diagnoses": [ { - "document_id": [ - "15d85d30-ad4a-4f50-87a8-a27f59dd1b5f", - "939a4bd3-86ed-4a8a-81f4-fbe0ee673461" - ], - "source_datarepo_row_ids": [ - "subject:c23887a0-20c1-44e4-a09e-1c5dfdc2d0ef" - ], - "diagnosis_id": [ - "25ff8d32-18c9-fc3e-020a-5de20d35d906", - "5ebe9bc4-a1be-0ddf-7277-b1e88276d0f6" - ], "disease": [ "redacted-A61iJlLx", "redacted-g50ublm/" @@ -2453,15 +2238,6 @@ ], "donors": [ { - "document_id": [ - "bfd991f2-2797-4083-972a-da7c6d7f1b2e" - ], - "source_datarepo_row_ids": [ - "subject:c23887a0-20c1-44e4-a09e-1c5dfdc2d0ef" - ], - "donor_id": [ - "1e2bd7e5-f45e-a391-daea-7c060be76acd" - ], "organism_type": [ "redacted-ACw+6ecI" ], @@ -2478,15 +2254,6 @@ ], "files": [ { - "document_id": [ - "15b76f9c-6b46-433f-851d-34e89f1b9ba6" - ], - "source_datarepo_row_ids": [ - "file_inventory:81d16471-97ac-48fe-99a0-73d9ec62c2c0" - ], - "file_id": [ - "1e269f04-4347-4188-b060-1dcc69e71d67" - ], "data_modality": [ "~null" ], @@ -2495,36 +2262,15 @@ ], "file_size": 213021639, "file_size_": 213021639, - "file_md5sum": [ - "beec606ee0aa299fdf913f4259316622" - ], "reference_assembly": [ "~null" ], - "file_name": [ - "307500.merged.matefixed.sorted.markeddups.recal.g.vcf.gz" - ], "is_supplementary": [ 0 ], - "version": [ - "2022-06-01T00:00:00.000000Z" - ], - "drs_uri": [ - "drs://mock_tdr.lan/v1_6c87f0e1-509d-46a4-b845-7584df39263b_1e269f04-4347-4188-b060-1dcc69e71d67" - ], "count": 1 }, { - "document_id": [ - "3b17377b-16b1-431c-9967-e5d01fc5923f" - ], - "source_datarepo_row_ids": [ - "file_inventory:9658d94a-511d-4b49-82c3-d0cb07e0cff2" - ], - "file_id": [ - "8b722e88-8103-49c1-b351-e64fa7c6ab37" - ], "data_modality": [ "~null" ], @@ -2533,25 +2279,12 @@ ], "file_size": 3306845592, "file_size_": 3306845592, - "file_md5sum": [ - "7cd9fd7b54a8bf380e44e93706f1fa2d" - ], "reference_assembly": [ "~null" ], - "file_name": [ - "307500.merged.matefixed.sorted.markeddups.recal.bam" - ], "is_supplementary": [ 0 ], - "version": [ - "2022-06-01T00:00:00.000000Z" - ], - - "drs_uri": [ - "drs://mock_tdr.lan/v1_6c87f0e1-509d-46a4-b845-7584df39263b_8b722e88-8103-49c1-b351-e64fa7c6ab37" - ], "count": 1 } ] @@ -2848,18 +2581,6 @@ "contents": { "activities": [ { - "document_id": [ - "1509ef40-d1ba-440d-b298-16b7c173dcd4", - "816e364e-1193-4e5b-a91a-14e4b009157c" - ], - "source_datarepo_row_ids": [ - "sequencing:a6c663c7-6f26-4ed2-af9d-48e9c709a22b", - "sequencing:d4f6c0c4-1e11-438e-8218-cfea63b8b051" - ], - "activity_id": [ - "18b3be87-e26b-4376-0d8d-c1e370e90e07", - "a60c5138-3749-f7cb-8714-52d389ad5231" - ], "activity_table": [ "anvil_sequencingactivity" ], @@ -2879,15 +2600,6 @@ ], "biosamples": [ { - "document_id": [ - "826dea02-e274-4ffe-aabc-eb3db63ad068" - ], - "source_datarepo_row_ids": [ - "sample:98048c3b-2525-4090-94fd-477de31f2608" - ], - "biosample_id": [ - "f9d40cf6-37b8-22f3-ce35-0dc614d2452b" - ], "anatomical_site": [ "~null" ], @@ -2917,9 +2629,6 @@ "document_id": [ "2370f948-2783-4eb6-afea-e022897f4dcf" ], - "source_datarepo_row_ids": [ - "workspace_attributes:7a22b629-9d81-4e4d-9297-f9e44ed760bc" - ], "dataset_id": [ "52ee7665-7033-63f2-a8d9-ce8e32666739" ], @@ -2948,17 +2657,6 @@ ], "diagnoses": [ { - "document_id": [ - "15d85d30-ad4a-4f50-87a8-a27f59dd1b5f", - "939a4bd3-86ed-4a8a-81f4-fbe0ee673461" - ], - "source_datarepo_row_ids": [ - "subject:c23887a0-20c1-44e4-a09e-1c5dfdc2d0ef" - ], - "diagnosis_id": [ - "25ff8d32-18c9-fc3e-020a-5de20d35d906", - "5ebe9bc4-a1be-0ddf-7277-b1e88276d0f6" - ], "disease": [ "redacted-A61iJlLx", "redacted-g50ublm/" @@ -2991,15 +2689,6 @@ ], "donors": [ { - "document_id": [ - "bfd991f2-2797-4083-972a-da7c6d7f1b2e" - ], - "source_datarepo_row_ids": [ - "subject:c23887a0-20c1-44e4-a09e-1c5dfdc2d0ef" - ], - "donor_id": [ - "1e2bd7e5-f45e-a391-daea-7c060be76acd" - ], "organism_type": [ "redacted-ACw+6ecI" ], @@ -3016,15 +2705,6 @@ ], "files": [ { - "document_id": [ - "15b76f9c-6b46-433f-851d-34e89f1b9ba6" - ], - "source_datarepo_row_ids": [ - "file_inventory:81d16471-97ac-48fe-99a0-73d9ec62c2c0" - ], - "file_id": [ - "1e269f04-4347-4188-b060-1dcc69e71d67" - ], "data_modality": [ "~null" ], @@ -3033,36 +2713,15 @@ ], "file_size": 213021639, "file_size_": 213021639, - "file_md5sum": [ - "beec606ee0aa299fdf913f4259316622" - ], "reference_assembly": [ "~null" ], - "file_name": [ - "307500.merged.matefixed.sorted.markeddups.recal.g.vcf.gz" - ], "is_supplementary": [ 0 ], - "version": [ - "2022-06-01T00:00:00.000000Z" - ], - "drs_uri": [ - "drs://mock_tdr.lan/v1_6c87f0e1-509d-46a4-b845-7584df39263b_1e269f04-4347-4188-b060-1dcc69e71d67" - ], "count": 1 }, { - "document_id": [ - "3b17377b-16b1-431c-9967-e5d01fc5923f" - ], - "source_datarepo_row_ids": [ - "file_inventory:9658d94a-511d-4b49-82c3-d0cb07e0cff2" - ], - "file_id": [ - "8b722e88-8103-49c1-b351-e64fa7c6ab37" - ], "data_modality": [ "~null" ], @@ -3071,24 +2730,12 @@ ], "file_size": 3306845592, "file_size_": 3306845592, - "file_md5sum": [ - "7cd9fd7b54a8bf380e44e93706f1fa2d" - ], "reference_assembly": [ "~null" ], - "file_name": [ - "307500.merged.matefixed.sorted.markeddups.recal.bam" - ], "is_supplementary": [ 0 ], - "version": [ - "2022-06-01T00:00:00.000000Z" - ], - "drs_uri": [ - "drs://mock_tdr.lan/v1_6c87f0e1-509d-46a4-b845-7584df39263b_8b722e88-8103-49c1-b351-e64fa7c6ab37" - ], "count": 1 } ] @@ -3383,18 +3030,6 @@ "contents": { "activities": [ { - "document_id": [ - "1509ef40-d1ba-440d-b298-16b7c173dcd4", - "816e364e-1193-4e5b-a91a-14e4b009157c" - ], - "source_datarepo_row_ids": [ - "sequencing:a6c663c7-6f26-4ed2-af9d-48e9c709a22b", - "sequencing:d4f6c0c4-1e11-438e-8218-cfea63b8b051" - ], - "activity_id": [ - "18b3be87-e26b-4376-0d8d-c1e370e90e07", - "a60c5138-3749-f7cb-8714-52d389ad5231" - ], "activity_table": [ "anvil_sequencingactivity" ], @@ -3414,15 +3049,6 @@ ], "biosamples": [ { - "document_id": [ - "826dea02-e274-4ffe-aabc-eb3db63ad068" - ], - "source_datarepo_row_ids": [ - "sample:98048c3b-2525-4090-94fd-477de31f2608" - ], - "biosample_id": [ - "f9d40cf6-37b8-22f3-ce35-0dc614d2452b" - ], "anatomical_site": [ "~null" ], @@ -3452,9 +3078,6 @@ "document_id": [ "2370f948-2783-4eb6-afea-e022897f4dcf" ], - "source_datarepo_row_ids": [ - "workspace_attributes:7a22b629-9d81-4e4d-9297-f9e44ed760bc" - ], "dataset_id": [ "52ee7665-7033-63f2-a8d9-ce8e32666739" ], @@ -3483,17 +3106,6 @@ ], "diagnoses": [ { - "document_id": [ - "15d85d30-ad4a-4f50-87a8-a27f59dd1b5f", - "939a4bd3-86ed-4a8a-81f4-fbe0ee673461" - ], - "source_datarepo_row_ids": [ - "subject:c23887a0-20c1-44e4-a09e-1c5dfdc2d0ef" - ], - "diagnosis_id": [ - "25ff8d32-18c9-fc3e-020a-5de20d35d906", - "5ebe9bc4-a1be-0ddf-7277-b1e88276d0f6" - ], "disease": [ "redacted-A61iJlLx", "redacted-g50ublm/" @@ -3543,15 +3155,6 @@ ], "files": [ { - "document_id": [ - "15b76f9c-6b46-433f-851d-34e89f1b9ba6" - ], - "source_datarepo_row_ids": [ - "file_inventory:81d16471-97ac-48fe-99a0-73d9ec62c2c0" - ], - "file_id": [ - "1e269f04-4347-4188-b060-1dcc69e71d67" - ], "data_modality": [ "~null" ], @@ -3560,36 +3163,15 @@ ], "file_size": 213021639, "file_size_": 213021639, - "file_md5sum": [ - "beec606ee0aa299fdf913f4259316622" - ], "reference_assembly": [ "~null" ], - "file_name": [ - "307500.merged.matefixed.sorted.markeddups.recal.g.vcf.gz" - ], "is_supplementary": [ 0 ], - "version": [ - "2022-06-01T00:00:00.000000Z" - ], - "drs_uri": [ - "drs://mock_tdr.lan/v1_6c87f0e1-509d-46a4-b845-7584df39263b_1e269f04-4347-4188-b060-1dcc69e71d67" - ], "count": 1 }, { - "document_id": [ - "3b17377b-16b1-431c-9967-e5d01fc5923f" - ], - "source_datarepo_row_ids": [ - "file_inventory:9658d94a-511d-4b49-82c3-d0cb07e0cff2" - ], - "file_id": [ - "8b722e88-8103-49c1-b351-e64fa7c6ab37" - ], "data_modality": [ "~null" ], @@ -3598,24 +3180,12 @@ ], "file_size": 3306845592, "file_size_": 3306845592, - "file_md5sum": [ - "7cd9fd7b54a8bf380e44e93706f1fa2d" - ], "reference_assembly": [ "~null" ], - "file_name": [ - "307500.merged.matefixed.sorted.markeddups.recal.bam" - ], "is_supplementary": [ 0 ], - "version": [ - "2022-06-01T00:00:00.000000Z" - ], - "drs_uri": [ - "drs://mock_tdr.lan/v1_6c87f0e1-509d-46a4-b845-7584df39263b_8b722e88-8103-49c1-b351-e64fa7c6ab37" - ], "count": 1 } ] diff --git a/test/indexer/data/aaa96233-bf27-44c7-82df-b4dc15ad4d9d.2018-11-02T11:33:44.698028Z.results.json b/test/indexer/data/aaa96233-bf27-44c7-82df-b4dc15ad4d9d.2018-11-02T11:33:44.698028Z.results.json index a77d93573b..00970feaa9 100644 --- a/test/indexer/data/aaa96233-bf27-44c7-82df-b4dc15ad4d9d.2018-11-02T11:33:44.698028Z.results.json +++ b/test/indexer/data/aaa96233-bf27-44c7-82df-b4dc15ad4d9d.2018-11-02T11:33:44.698028Z.results.json @@ -1402,12 +1402,6 @@ "contents": { "samples": [ { - "document_id": [ - "a21dc760-a500-4236-bcff-da34a0e873d2" - ], - "biomaterial_id": [ - "DID_scRSq06_pancreas" - ], "entity_type": [ "specimens" ], @@ -1436,12 +1430,6 @@ "_source": [ "specimen_from_organism" ], - "document_id": [ - "a21dc760-a500-4236-bcff-da34a0e873d2" - ], - "biomaterial_id": [ - "DID_scRSq06_pancreas" - ], "disease": [ "normal" ], @@ -1464,23 +1452,13 @@ ], "sequencing_inputs": [ { - "biomaterial_id": [ - "GSM2172585 1" - ], - "document_id": [ - "412898c5-5b9b-4907-b07c-e9b89666e204" - ], "sequencing_input_type": [ "cell_suspension" ] } ], "sequencing_processes": [ - { - "document_id": [ - "771ddaf6-3a4f-4314-97fe-6294ff8e25a4" - ] - } + {} ], "specimens": [ { @@ -1490,12 +1468,6 @@ "_source": [ "specimen_from_organism" ], - "document_id": [ - "a21dc760-a500-4236-bcff-da34a0e873d2" - ], - "biomaterial_id": [ - "DID_scRSq06_pancreas" - ], "disease": [ "normal" ], @@ -1518,12 +1490,6 @@ ], "cell_suspensions": [ { - "document_id": [ - "412898c5-5b9b-4907-b07c-e9b89666e204" - ], - "biomaterial_id": [ - "GSM2172585 1" - ], "total_estimated_cells_redundant": 0, "total_estimated_cells_redundant_": 0, "total_estimated_cells": 1, @@ -1545,9 +1511,6 @@ "document_id": [ "7b07b9d0-cc0e-4098-9f64-f4a569f7d746" ], - "biomaterial_id": [ - "DID_scRSq06" - ], "donor_count": 1, "donor_count_": 1, "biological_sex": [ @@ -2376,23 +2339,13 @@ ], "sequencing_inputs": [ { - "biomaterial_id": [ - "GSM2172585 1" - ], - "document_id": [ - "412898c5-5b9b-4907-b07c-e9b89666e204" - ], "sequencing_input_type": [ "cell_suspension" ] } ], "sequencing_processes": [ - { - "document_id": [ - "771ddaf6-3a4f-4314-97fe-6294ff8e25a4" - ] - } + {} ], "specimens": [ { @@ -2405,9 +2358,6 @@ "document_id": [ "a21dc760-a500-4236-bcff-da34a0e873d2" ], - "biomaterial_id": [ - "DID_scRSq06_pancreas" - ], "disease": [ "normal" ], @@ -2435,9 +2385,6 @@ "document_id": [ "7b07b9d0-cc0e-4098-9f64-f4a569f7d746" ], - "biomaterial_id": [ - "DID_scRSq06" - ], "donor_count": 1, "donor_count_": 1, "genus_species": [ @@ -2465,12 +2412,6 @@ ], "cell_suspensions": [ { - "document_id": [ - "412898c5-5b9b-4907-b07c-e9b89666e204" - ], - "biomaterial_id": [ - "GSM2172585 1" - ], "selected_cell_type": [ "~null" ], @@ -2609,12 +2550,6 @@ "contents": { "samples": [ { - "document_id": [ - "a21dc760-a500-4236-bcff-da34a0e873d2" - ], - "biomaterial_id": [ - "DID_scRSq06_pancreas" - ], "entity_type": [ "specimens" ], @@ -2643,12 +2578,6 @@ "_source": [ "specimen_from_organism" ], - "document_id": [ - "a21dc760-a500-4236-bcff-da34a0e873d2" - ], - "biomaterial_id": [ - "DID_scRSq06_pancreas" - ], "disease": [ "normal" ], @@ -2671,23 +2600,13 @@ ], "sequencing_inputs": [ { - "biomaterial_id": [ - "GSM2172585 1" - ], - "document_id": [ - "412898c5-5b9b-4907-b07c-e9b89666e204" - ], "sequencing_input_type": [ "cell_suspension" ] } ], "sequencing_processes": [ - { - "document_id": [ - "771ddaf6-3a4f-4314-97fe-6294ff8e25a4" - ] - } + {} ], "specimens": [ { @@ -2697,12 +2616,6 @@ "_source": [ "specimen_from_organism" ], - "document_id": [ - "a21dc760-a500-4236-bcff-da34a0e873d2" - ], - "biomaterial_id": [ - "DID_scRSq06_pancreas" - ], "disease": [ "normal" ], @@ -2730,9 +2643,6 @@ "document_id": [ "7b07b9d0-cc0e-4098-9f64-f4a569f7d746" ], - "biomaterial_id": [ - "DID_scRSq06" - ], "donor_count": 1, "donor_count_": 1, "genus_species": [ @@ -2760,12 +2670,6 @@ ], "cell_suspensions": [ { - "document_id": [ - "412898c5-5b9b-4907-b07c-e9b89666e204" - ], - "biomaterial_id": [ - "GSM2172585 1" - ], "selected_cell_type": [ "~null" ], @@ -3208,12 +3112,6 @@ "contents": { "samples": [ { - "document_id": [ - "a21dc760-a500-4236-bcff-da34a0e873d2" - ], - "biomaterial_id": [ - "DID_scRSq06_pancreas" - ], "entity_type": [ "specimens" ], @@ -3242,12 +3140,6 @@ "_source": [ "specimen_from_organism" ], - "document_id": [ - "a21dc760-a500-4236-bcff-da34a0e873d2" - ], - "biomaterial_id": [ - "DID_scRSq06_pancreas" - ], "disease": [ "normal" ], @@ -3270,23 +3162,13 @@ ], "sequencing_inputs": [ { - "biomaterial_id": [ - "GSM2172585 1" - ], - "document_id": [ - "412898c5-5b9b-4907-b07c-e9b89666e204" - ], "sequencing_input_type": [ "cell_suspension" ] } ], "sequencing_processes": [ - { - "document_id": [ - "771ddaf6-3a4f-4314-97fe-6294ff8e25a4" - ] - } + {} ], "specimens": [ { @@ -3296,12 +3178,6 @@ "_source": [ "specimen_from_organism" ], - "document_id": [ - "a21dc760-a500-4236-bcff-da34a0e873d2" - ], - "biomaterial_id": [ - "DID_scRSq06_pancreas" - ], "disease": [ "normal" ], @@ -3347,9 +3223,6 @@ "document_id": [ "7b07b9d0-cc0e-4098-9f64-f4a569f7d746" ], - "biomaterial_id": [ - "DID_scRSq06" - ], "donor_count": 1, "donor_count_": 1, "biological_sex": [ diff --git a/test/indexer/test_indexer.py b/test/indexer/test_indexer.py index dc36763591..199da4bbe0 100644 --- a/test/indexer/test_indexer.py +++ b/test/indexer/test_indexer.py @@ -1930,15 +1930,13 @@ def test_cell_line_sample(self): if qualifier == 'samples': sample = one(contents['samples']) sample_entity_type = sample['entity_type'] - if aggregate: - document_ids = one(contents[sample_entity_type])['document_id'] - elif contribution: + if contribution: document_ids = [d['document_id'] for d in contents[sample_entity_type]] + self.assertIn(sample['document_id'], document_ids) entity = one(d for d in contents[sample_entity_type] if d['document_id'] == sample['document_id']) self.assertEqual(sample['biomaterial_id'], entity['biomaterial_id']) else: - assert False, doc_type - self.assertIn(sample['document_id'], document_ids) + assert aggregate, doc_type self.assertEqual(one(contents['specimens'])['organ'], ['blood'] if aggregate else 'blood') self.assertEqual(one(contents['specimens'])['organ_part'], ['venous blood']) self.assertEqual(len(contents['cell_lines']), 1 if aggregate else 2) @@ -2025,6 +2023,7 @@ def test_sample_with_no_donor(self): k: (v if isinstance(v, list) else [v]) + ([] if k == 'organism_age_range' or True else [None]) for k, v in donor.items() + if k != 'biomaterial_id' } } hits = self._get_all_hits() diff --git a/test/service/test_app_logging.py b/test/service/test_app_logging.py index a2c7ae31f9..95c19226e0 100644 --- a/test/service/test_app_logging.py +++ b/test/service/test_app_logging.py @@ -153,7 +153,7 @@ def filter_body(organ: str) -> JSON: elif debug == 1: expected_log = f'… with a response body starting in {body[:prefix_len]}' elif debug > 1: - expected_log = f'… with a response body of length 9137 being {body}' + expected_log = f'… with a response body of length 9050 being {body}' else: assert False self.assertEqual(expected_log, body_log_message) diff --git a/test/service/test_response.py b/test/service/test_response.py index 131be8f280..be3775f6e8 100644 --- a/test/service/test_response.py +++ b/test/service/test_response.py @@ -222,7 +222,6 @@ def test_response_stage_files(self): 'disease': ['normal'], 'developmentStage': [None], 'genusSpecies': ['Australopithecus'], - 'id': ['DID_scRSq06'], 'donorCount': 1, 'organismAge': [{'value': '38', 'unit': 'year'}], 'organismAgeRange': [[1198368000.0, 1198368000.0]], @@ -244,8 +243,7 @@ def test_response_stage_files(self): f'?catalog=test&version=2018-11-02T11%3A33%3A44.698028Z', 'drs_uri': f'drs://{self._drs_domain_name}/' f'7b07f99e-4a8a-4ad0-bd4f-db0d7a00c7bb?version=2018-11-02T11%3A33%3A44.698028Z', - 'uuid': '7b07f99e-4a8a-4ad0-bd4f-db0d7a00c7bb', - 'version': '2018-11-02T11:33:44.698028Z' + 'uuid': '7b07f99e-4a8a-4ad0-bd4f-db0d7a00c7bb' } ], 'organoids': [ @@ -280,7 +278,6 @@ def test_response_stage_files(self): 'sampleEntityType': ['specimens'], 'effectiveOrgan': ['pancreas'], 'disease': ['normal'], - 'id': ['DID_scRSq06_pancreas'], 'organ': ['pancreas'], 'organPart': ['islet of Langerhans'], 'preservationMethod': [None], @@ -297,7 +294,6 @@ def test_response_stage_files(self): 'specimens': [ { 'disease': ['normal'], - 'id': ['DID_scRSq06_pancreas'], 'organ': ['pancreas'], 'organPart': ['islet of Langerhans'], 'preservationMethod': [None], @@ -500,7 +496,6 @@ def test_response_stage_projects(self): 'disease': ['normal'], 'developmentStage': [None], 'genusSpecies': ['Australopithecus'], - 'id': ['DID_scRSq06'], 'donorCount': 1, 'organismAge': [{'value': '38', 'unit': 'year'}], 'organismAgeRange': [[1198368000.0, 1198368000.0]], @@ -619,7 +614,6 @@ def test_response_stage_projects(self): 'sampleEntityType': ['specimens'], 'effectiveOrgan': ['pancreas'], 'disease': ['normal'], - 'id': ['DID_scRSq06_pancreas'], 'organ': ['pancreas'], 'organPart': ['islet of Langerhans'], 'preservationMethod': [None], @@ -636,7 +630,6 @@ def test_response_stage_projects(self): 'specimens': [ { 'disease': ['normal'], - 'id': ['DID_scRSq06_pancreas'], 'organ': ['pancreas'], 'organPart': ['islet of Langerhans'], 'preservationMethod': [None], @@ -734,7 +727,6 @@ def test_response_stage_projects_accessions(self): 'disease': ['H syndrome'], 'developmentStage': ['human adult stage'], 'genusSpecies': ['Homo sapiens'], - 'id': ['donor_ID_1'], 'donorCount': 1, 'organismAge': [{'value': '20', 'unit': 'year'}], 'organismAgeRange': [[630720000.0, 630720000.0]], @@ -889,7 +881,6 @@ def test_response_stage_projects_accessions(self): 'sampleEntityType': ['specimens'], 'effectiveOrgan': ['brain'], 'disease': ['H syndrome'], - 'id': ['specimen_ID_1'], 'organ': ['brain'], 'organPart': ['amygdala'], 'preservationMethod': [None], @@ -906,7 +897,6 @@ def test_response_stage_projects_accessions(self): 'specimens': [ { 'disease': ['H syndrome'], - 'id': ['specimen_ID_1'], 'organ': ['brain'], 'organPart': ['amygdala'], 'preservationMethod': [None], @@ -954,7 +944,6 @@ def test_response_stage_projects_cell_line(self): file_url_func=self.file_url_func) response = stage.process_response((hits, self.paginations[0], {})) expected_cell_lines = { - 'id': ['cell_line_Day7_hiPSC-CM_BioRep2', 'cell_line_GM18517'], 'cellLineType': ['primary', 'stem cell-derived'], 'modelOrgan': ['blood (parent_cell_line)', 'blood (child_cell_line)'], } @@ -964,7 +953,6 @@ def test_response_stage_projects_cell_line(self): expected_samples = { 'sampleEntityType': ['cellLines'], 'effectiveOrgan': ['blood (child_cell_line)'], - 'id': ['cell_line_Day7_hiPSC-CM_BioRep2'], 'cellLineType': ['stem cell-derived'], 'modelOrgan': ['blood (child_cell_line)'], } @@ -997,8 +985,7 @@ def test_response_stage_files_file(self): f'?catalog=test&version=2019-10-09T17%3A22%3A51.560099Z', 'drs_uri': f'drs://{self._drs_domain_name}/' f'a8b8479d-cfa9-4f74-909f-49552439e698?version=2019-10-09T17%3A22%3A51.560099Z', - 'uuid': 'a8b8479d-cfa9-4f74-909f-49552439e698', - 'version': '2019-10-09T17:22:51.560099Z' + 'uuid': 'a8b8479d-cfa9-4f74-909f-49552439e698' } file = one(one(response['hits'])['files']) self.assertElasticEqual(file, expected_file) @@ -1198,12 +1185,6 @@ def test_ranged_values(self): 'genusSpecies': [ 'Homo sapiens' ], - 'id': [ - 'HPSI0314i-hoik', - 'HPSI0214i-wibj', - 'HPSI0314i-sojd', - 'HPSI0214i-kucg' - ], 'donorCount': 4, 'organismAge': [ {'value': '45-49', 'unit': 'year'}, @@ -1226,12 +1207,6 @@ def test_ranged_values(self): 'genusSpecies': [ 'Homo sapiens' ], - 'id': [ - 'HPSI0314i-hoik', - 'HPSI0214i-wibj', - 'HPSI0314i-sojd', - 'HPSI0214i-kucg' - ], 'donorCount': 4, 'organismAge': [ {'value': '40-44', 'unit': 'year'}, @@ -2082,7 +2057,7 @@ def test_bad_search_after_search_before(self): """ Test that invalid JSON for search_after or search_before raise a 400 """ - query_params = self._params(size=1, sort='sampleId', order='asc') + query_params = self._params(size=1, sort='entryId', order='asc') url = self.base_url.set(path='/index/samples', args=query_params) # Get page 1 response = requests.get(str(url)) @@ -2386,14 +2361,12 @@ def test_inner_entity_samples(self): { 'sampleEntityType': ['cellLines'], 'effectiveOrgan': ['immune system'], - 'id': ['Cell_line_2'], 'cellLineType': ['primary'], 'modelOrgan': ['immune system'], }, { 'sampleEntityType': ['specimens'], 'effectiveOrgan': ['embryo'], - 'id': ['Specimen1'], 'organ': ['embryo'], 'organPart': ['skin epidermis'], 'disease': ['normal'], @@ -2408,12 +2381,6 @@ def test_inner_entity_samples(self): { 'sampleEntityType': ['organoids'], 'effectiveOrgan': ['Brain'], - 'id': [ - 'Org_HPSI0214i-kucg_2_2', - 'Org_HPSI0214i-wibj_2_2', - 'Org_HPSI0314i-hoik_1_2', - 'Org_HPSI0314i-sojd_3_2', - ], 'modelOrgan': ['Brain'], 'modelOrganPart': [None], } @@ -2426,14 +2393,12 @@ def test_inner_entity_samples(self): { 'sampleEntityType': ['cellLines'], 'effectiveOrgan': ['immune system'], - 'id': ['Cell_line_2'], 'cellLineType': ['primary'], 'modelOrgan': ['immune system'], }, { 'sampleEntityType': ['specimens'], 'effectiveOrgan': ['embryo'], - 'id': ['Specimen1'], 'organ': ['embryo'], 'organPart': ['skin epidermis'], 'disease': ['normal'], @@ -2445,7 +2410,6 @@ def test_inner_entity_samples(self): { 'sampleEntityType': ['specimens'], 'effectiveOrgan': ['pancreas'], - 'id': ['DID_scRSq06_pancreas'], 'organ': ['pancreas'], 'organPart': ['islet of Langerhans'], 'disease': ['normal'], @@ -2964,7 +2928,6 @@ def test_matrices_tree(self): 'fileSource': 'DCP/2 Analysis', 'matrixCellCount': None, 'uuid': 'bd98f428-881e-501a-ac16-24f27a68ce2f', - 'version': '2021-02-11T23:11:45.000000Z', 'contentDescription': ['Count Matrix'], 'format': 'loom', 'isIntermediate': False, @@ -2996,7 +2959,6 @@ def test_matrices_tree(self): 'fileSource': 'DCP/1 Matrix Service', 'matrixCellCount': None, 'uuid': '538faa28-3235-5e4b-a998-5672e2d964e8', - 'version': '2020-12-03T10:39:17.144517Z', 'contentDescription': ['Matrix'], 'format': 'zip', 'isIntermediate': False, @@ -3018,7 +2980,6 @@ def test_matrices_tree(self): 'fileSource': 'DCP/1 Matrix Service', 'matrixCellCount': None, 'uuid': '6c142250-567c-5b63-bd4f-0d78499863f8', - 'version': '2020-12-03T10:39:17.144517Z', 'contentDescription': ['Matrix'], 'format': 'zip', 'isIntermediate': False, @@ -3040,7 +3001,6 @@ def test_matrices_tree(self): 'fileSource': 'DCP/1 Matrix Service', 'matrixCellCount': None, 'uuid': '8d2ba1c1-bc9f-5c2a-a74d-fe5e09bdfb18', - 'version': '2020-12-03T10:39:17.144517Z', 'contentDescription': ['Matrix'], 'format': 'loom', 'isIntermediate': False, @@ -3081,7 +3041,6 @@ def test_matrices_tree(self): 'fileSource': 'ArrayExpress', 'matrixCellCount': None, 'uuid': '87f31102-ebbc-5875-abdf-4fa5cea48e8d', - 'version': '2021-02-10T16:56:40.419579Z', 'contentDescription': ['Matrix'], 'format': 'zip', 'isIntermediate': False, @@ -3103,7 +3062,6 @@ def test_matrices_tree(self): 'fileSource': 'ArrayExpress', 'matrixCellCount': None, 'uuid': '733318e0-19c2-51e8-9ad6-d94ad562dd46', - 'version': '2021-02-10T16:56:40.419579Z', 'contentDescription': ['Matrix'], 'format': 'zip', 'isIntermediate': False, @@ -3125,7 +3083,6 @@ def test_matrices_tree(self): 'fileSource': 'ArrayExpress', 'matrixCellCount': None, 'uuid': 'c59e2de5-01fe-56eb-be56-679ed14161bf', - 'version': '2021-02-10T16:56:40.419579Z', 'contentDescription': ['Matrix'], 'format': 'zip', 'isIntermediate': False, @@ -3147,7 +3104,6 @@ def test_matrices_tree(self): 'fileSource': 'ArrayExpress', 'matrixCellCount': None, 'uuid': '68bda896-3b3e-5f2a-9212-f4030a0f37e2', - 'version': '2021-02-10T16:56:40.419579Z', 'contentDescription': ['Matrix'], 'format': 'zip', 'isIntermediate': False, @@ -3169,7 +3125,6 @@ def test_matrices_tree(self): 'fileSource': 'ArrayExpress', 'matrixCellCount': None, 'uuid': '0c5ab869-da2d-5c11-b4ae-f978a052899f', - 'version': '2021-02-10T16:56:40.419579Z', 'contentDescription': ['Matrix'], 'format': 'zip', 'isIntermediate': False, @@ -3191,7 +3146,6 @@ def test_matrices_tree(self): 'fileSource': 'ArrayExpress', 'matrixCellCount': None, 'uuid': 'cade4593-bfba-56ed-80ab-080d0de7d5a4', - 'version': '2021-02-10T16:56:40.419579Z', 'contentDescription': ['Matrix'], 'format': 'zip', 'isIntermediate': False, @@ -3213,7 +3167,6 @@ def test_matrices_tree(self): 'fileSource': 'ArrayExpress', 'matrixCellCount': None, 'uuid': '5b465aad-0981-5152-b468-e615e20f5884', - 'version': '2021-02-10T16:56:40.419579Z', 'contentDescription': ['Matrix'], 'format': 'zip', 'isIntermediate': False, @@ -3235,7 +3188,6 @@ def test_matrices_tree(self): 'fileSource': 'HCA Release', 'matrixCellCount': None, 'uuid': 'b905c8be-2e2d-592c-8481-3eb7a87c6484', - 'version': '2021-02-10T16:56:40.419579Z', 'contentDescription': ['Matrix'], 'format': 'csv', 'isIntermediate': False, @@ -3682,7 +3634,7 @@ def test(self): 'default_order': 'asc' }, 'samples': { - 'default_sort': 'sampleId', + 'default_sort': 'entryId', 'default_order': 'asc' } } @@ -3785,7 +3737,6 @@ def test_contributed_analyses_matrix(self): 'size': 107958959, 'fileSource': None, 'uuid': '780846a0-dbc5-4bdc-ab3a-0da14b3ed551', - 'version': '2022-07-26T00:16:47.748000Z', 'matrixCellCount': None, 'drs_uri': 'drs://data.terra.bio/v1' '_541cc0bb-c54f-4a7e-8cdd-1a70cbd2f20c'