Expansion of the documentation (#592)

jonbrenas · leehart · alimanfoo · web-flow · commit e213d1bff980 · 2024-12-04T22:26:58.000Z
* Starting to expand the documentation.

* Expanded the doc for some basic functions.

* Adding details to genome_features

* Correction of a typo

* Taking a breather before tackling sample_metadata()

* Taking a breather after sample_metadata()

* Sample metadata should be done.

* SNP calls done.

* Dealt with site_annotations, thanks to Eric for the help

* Dealt with biallelic_snp_calls

* Dealt with haplotypes.

* Adding AIMs

* Started with cnv_hmm. Not sure what I wrote is correct.

* Started on cnv_coverage_calls. I have no idea what some of the variables are.

* Started work on discordant read calls. A few holes yet.

* Dealt with gene_cnv. Some unknowns remain.

* Learning how to count to 5

* Started on snp_allele_frequencies.

* Dealt with snp_allele_frequencies and gene_cnv_frequencies

* Missed anopheles.py ... again

* Dealt with pca

* Dealt with njt

* Dealt with roh_hmm

* Dealt with diversity stats

* Done (for now)

* Update malariagen_data/anoph/cnv_data.py

Co-authored-by: Alistair Miles <alimanfoo@googlemail.com>

* Update malariagen_data/anoph/cnv_data.py

Co-authored-by: Alistair Miles <alimanfoo@googlemail.com>

* Update malariagen_data/anoph/genome_features.py

Co-authored-by: Alistair Miles <alimanfoo@googlemail.com>

* Update malariagen_data/anoph/sample_metadata.py

Co-authored-by: Alistair Miles <alimanfoo@googlemail.com>

* Update malariagen_data/anopheles.py

Co-authored-by: Alistair Miles <alimanfoo@googlemail.com>

* Replaced ** in the docs

* Missed two files

---------

Co-authored-by: Lee <4256466+leehart@users.noreply.github.com>
Co-authored-by: Alistair Miles <alimanfoo@googlemail.com>
diff --git a/malariagen_data/anoph/aim_data.py b/malariagen_data/anoph/aim_data.py
@@ -65,7 +65,11 @@ def _prep_aims_param(self, *, aims: aim_params.aims) -> str:
     @check_types
     @doc(
         summary="Access ancestry informative marker variants.",
-        returns="A dataset containing AIM positions and discriminating alleles.",
+        returns="""
+        A dataset with 2 dimensions: `variants` the number of AIM sites, and `alleles` which will always be 2, each representing one of the species. It contains 2 coordinates:
+        `variant_contig` has `variants` values and contains the chromosome arm of each AIM, and `variant_position` has `variants` values and contains the position of each AIM. It contains 1 data variable:
+        `variant_allele` has (`variants`, `alleles`) values and contains the discriminating alleles for each AIM.
+        """,
     )
     def aim_variants(self, aims: aim_params.aims) -> xr.Dataset:
         self._require_aim_analysis()
@@ -113,7 +117,16 @@ def _aim_calls_dataset(self, *, aims, sample_set):
             calls.
         """,
         returns="""
-            A dataset containing AIM SNP sites, alleles and genotype calls.
+        A dataset with 4 dimensions:
+        `variants` the number of AIM sites,
+        `samples` the number of samples,
+        `ploidy` the ploidy (2),
+        and `alleles` which will always be 2, each representing one of the species. It contains 3 coordinates:
+        `sample_id` has `samples` values and contains the identifier of each sample,
+        `variant_contig` has `variants` values and contains the chromosome arm of each AIM,
+        and `variant_position` has `variants` values and contains the position of each AIM. It contains 2 data variables:
+        `call_genotype` has (`variants`, `samples`, `ploidy`) values and contains both calls for each sample and each AIM,
+        and `variant_allele` has (`variants`, `alleles`) values and contains the discriminating alleles for each AIM.
         """,
     )
     def aim_calls(
diff --git a/malariagen_data/anoph/base.py b/malariagen_data/anoph/base.py
@@ -397,7 +397,16 @@ def _read_sample_sets(self, *, single_release: str):
     @check_types
     @doc(
         summary="Access a dataframe of sample sets",
-        returns="A dataframe of sample sets, one row per sample set.",
+        returns="""A dataframe of sample sets, one row per sample set. It contains eight columns:
+         `sample_set` is the name of the sample set,
+         `sample_count` is the number of samples the sample set contains,
+         `study_id` is the identifier for the study that generated the sample set,
+         `study_url` is the URL of the study on the MalariaGEN website,
+         `terms_of_use_expiry_date` is the date when the terms of use expire,
+         `terms_of_use_url` is the URL of the terms of use,
+         `release` is the identifier of the release containing the sample set,
+         `unrestricted_use` indicates whether the sample set can be used without restriction (e.g., if the terms of use have expired).
+            """,
     )
     def sample_sets(
         self,
@@ -441,6 +450,7 @@ def sample_sets(
     @check_types
     @doc(
         summary="Find which release a sample set was included in.",
+        returns="The release the sample set is part of.",
     )
     def lookup_release(self, sample_set: base_params.sample_set) -> str:
         if self._cache_sample_set_to_release is None:
@@ -455,6 +465,7 @@ def lookup_release(self, sample_set: base_params.sample_set) -> str:
     @check_types
     @doc(
         summary="Find which study a sample set belongs to.",
+        returns="The study the sample set belongs to.",
     )
     def lookup_study(self, sample_set: base_params.sample_set) -> str:
         if self._cache_sample_set_to_study is None:
@@ -468,6 +479,7 @@ def lookup_study(self, sample_set: base_params.sample_set) -> str:
     @check_types
     @doc(
         summary="Find the study info for a sample set.",
+        returns="The info for the study the sample set belongs to.",
     )
     def lookup_study_info(self, sample_set: base_params.sample_set) -> dict:
         if self._cache_sample_set_to_study_info is None:
@@ -483,6 +495,7 @@ def lookup_study_info(self, sample_set: base_params.sample_set) -> dict:
     @check_types
     @doc(
         summary="Find the terms-of-use info for a sample set.",
+        returns="The terms-of-use info for the sample set.",
     )
     def lookup_terms_of_use_info(self, sample_set: base_params.sample_set) -> dict:
         if self._cache_sample_set_to_terms_of_use_info is None:
diff --git a/malariagen_data/anoph/cnv_data.py b/malariagen_data/anoph/cnv_data.py
@@ -170,7 +170,19 @@ def _cnv_hmm_dataset(self, *, contig, sample_set, inline_array, chunks):
     @check_types
     @doc(
         summary="Access CNV HMM data from CNV calling.",
-        returns="An xarray dataset of CNV HMM calls and associated data.",
+        returns="""A dataset with 2 dimensions:
+        `variants` the number of CNV regions in the selected region,
+        `samples` the number of samples. There are 4 coordinates:
+        `variant_position` has `variants` values and contains the initial position of each CNV region,
+        `variant_end` has `variants` values and contains the final position of each CNV region,
+        `variant_contig` has `variants` values and contains the contig of each CNV region,
+        `sample_id` has `samples` values and contains the identifier of each sample. It contains 5 data variables:
+        `call_CN`, it has (`variants`, `samples`) values and contains the number of copies for each sample and each CNV region,
+        `call_RawCov`, it has (`variants`, `samples`) values and contains the raw coverage for each sample and each CNV region,
+        `call_NormCov`, it has (`variants`, `samples`) values and contains the normalized coverage for each sample and each CNV region,
+        `sample_coverage_variance`, it has `samples` values and contains the variance of the coverage for each sample,
+        `sample_is_high_variance`, it has `samples` values and contains whether each sample has a high variance.
+        """,
     )
     def cnv_hmm(
         self,
@@ -377,7 +389,19 @@ def _cnv_coverage_calls_dataset(
     @check_types
     @doc(
         summary="Access CNV HMM data from genome-wide CNV discovery and filtering.",
-        returns="An xarray dataset of CNV alleles and genotypes.",
+        returns="""A dataset with 2 dimensions:
+        `variants` the number of CNV regions in the selected region,
+        `samples` the number of samples. There are 5 coordinates:
+        `variant_position` has `variants` values and contains the initial position of each CNV region,
+        `variant_end` has `variants` values and contains the final position of each CNV region,
+        `variant_contig` has `variants` values and contains the contig of each CNV region,
+        `variant_id` has `variants` values and contains the identifier for each CNV region,
+        `sample_id` has `samples` values and contains the identifier of each sample. It contains 4 data variables:
+        `variant_CIPOS`, it has `variants` values and contains the confidence interval for the start position for each CNV region,
+        `variant_CIEND`, it has `variants` values and contains the confidence interval for the end position for each CNV region,
+        `variant_filter_pass`, it has `variants` values and is True for each CNV region that passes quality filters,
+        `call_genotype`, it has (`variants`, `samples`) values and contains the coverage call for each sample and each CNV region.
+        """,
     )
     def cnv_coverage_calls(
         self,
@@ -533,7 +557,21 @@ def _cnv_discordant_read_calls_dataset(
     @check_types
     @doc(
         summary="Access CNV discordant read calls data.",
-        returns="An xarray dataset of CNV alleles and genotypes.",
+        returns="""A dataset with 2 dimensions:
+        `variants` the number of discordant read calls in the selected region,
+        `samples` the number of samples. There are 5 coordinates:
+        `variant_position` has `variants` values and contains the initial position of each discordant read call,
+        `variant_end` has `variants` values and contains the final position of each discordant read call,
+        `variant_id` has `variants` values and contains the identifier of each discordant read call,
+        `variant_contig` has `variants` values and contains the contig of each discordant read call,
+        `sample_id` has `samples` values and contains the identifier of each sample. It contains 6 data variables:
+        `variant_Region`, it has `variants` values and contains the identifier of the region covered by each discordant read call,
+        `variant_StartBreakpointMethod`, it has `variants` values and specifies how the start breakpoint was determined for each discordant read call,
+        `variant_EndBreakpointMethod`, it has `variants` values and specifies how the end breakpoint was determined for each discordant read call,
+        `call_genotype`, it has (`variants`, `samples`) values and contains the number of copies of each discordant read call for each sample,
+        `sample_coverage_variance`, it has `samples` values and contains the variance of the coverage for each sample,
+        `sample_is_high_variance`, it has `samples` values and contains whether each sample has a high variance.
+        """,
     )
     def cnv_discordant_read_calls(
         self,
diff --git a/malariagen_data/anoph/distance.py b/malariagen_data/anoph/distance.py
@@ -86,6 +86,7 @@ def __init__(self, **kwargs):
         summary="""
             Compute pairwise distances between samples using biallelic SNP genotypes.
         """,
+        returns=("dist", "samples", "n_snps_used"),
     )
     def biallelic_diplotype_pairwise_distances(
         self,
@@ -107,7 +108,9 @@ def biallelic_diplotype_pairwise_distances(
         random_seed: base_params.random_seed = 42,
         inline_array: base_params.inline_array = base_params.inline_array_default,
         chunks: base_params.chunks = base_params.native_chunks,
-    ) -> Tuple[np.ndarray, np.ndarray, int]:
+    ) -> Tuple[
+        distance_params.dist, distance_params.samples, distance_params.n_snps_used
+    ]:
         # Change this name if you ever change the behaviour of this function, to
         # invalidate any previously cached data.
         name = "biallelic_diplotype_pairwise_distances"
@@ -234,6 +237,7 @@ def _biallelic_diplotype_pairwise_distances(
         summary="""
             Construct a neighbour-joining tree between samples using biallelic SNP genotypes.
         """,
+        returns=("Z", "samples", "n_snps_used"),
     )
     def njt(
         self,
@@ -260,7 +264,7 @@ def njt(
         random_seed: base_params.random_seed = 42,
         inline_array: base_params.inline_array = base_params.inline_array_default,
         chunks: base_params.chunks = base_params.native_chunks,
-    ) -> Tuple[np.ndarray, np.ndarray, int]:
+    ) -> Tuple[distance_params.Z, distance_params.samples, distance_params.n_snps_used]:
         # Change this name if you ever change the behaviour of this function, to
         # invalidate any previously cached data.
         name = "njt_v1"
diff --git a/malariagen_data/anoph/distance_params.py b/malariagen_data/anoph/distance_params.py
@@ -2,6 +2,8 @@
 
 from typing_extensions import Annotated, TypeAlias
 
+import numpy as np
+
 distance_metric: TypeAlias = Annotated[
     Literal[
         "cityblock",
@@ -20,6 +22,32 @@
 
 default_nj_algorithm: nj_algorithm = "dynamic"
 
+dist: TypeAlias = Annotated[
+    np.ndarray,
+    """
+    A numpy array containing the distance between each pair of samples.
+    """,
+]
+
+Z: TypeAlias = Annotated[
+    np.ndarray,
+    """
+    A neighbour-joining tree encoded as a numpy array. Each row in the
+    array contains data for one internal node in the tree, in the order
+    in which they were created by the neighbour-joining algorithm.
+    Within each row there are five values: left child node identifier,
+    right child node identifier, distance to left child, distance to
+    right child, total number of leaves. This data structure is similar
+    to that returned by scipy's hierarchical clustering functions,
+    except that here we have two distance values for each internal node
+    rather than one because distances to the children may be different.
+    """,
+]
+
+samples: TypeAlias = Annotated[np.ndarray, "The list of the sample identifiers"]
+
+n_snps_used: TypeAlias = Annotated[int, "The number of SNPs used"]
+
 center_x: TypeAlias = Annotated[int | float, "X coordinate where plotting is centered."]
 
 center_y: TypeAlias = Annotated[int | float, "Y coordinate where plotting is centered."]
diff --git a/malariagen_data/anoph/fst_params.py b/malariagen_data/anoph/fst_params.py
@@ -26,7 +26,11 @@
 df_pairwise_fst: TypeAlias = Annotated[
     pd.DataFrame,
     """
-    A dataframe of pairwise Fst and standard error values.
+    A dataframe of pairwise Fst and standard error values. It has
+    4 columns:
+    `cohort1` and `cohort2` are the two cohorts,
+    `fst` is the value of the Fst between the two cohorts,
+    `se` is the standard error.
     """,
 ]
 
diff --git a/malariagen_data/anoph/genome_features.py b/malariagen_data/anoph/genome_features.py
@@ -119,7 +119,7 @@ def _prep_gff_attributes(
     @check_types
     @doc(
         summary="Access genome feature annotations.",
-        returns="A dataframe of genome annotations, one row per feature.",
+        returns="A dataframe of genome annotations, one row per feature. The dataframe follows the GFF3 format (https://github.com/The-Sequence-Ontology/Specifications/blob/master/gff3.md), including extra attributes `ID`, `Parent`, `Name` and `description` depending on the dataset.",
     )
     def genome_features(
         self,
diff --git a/malariagen_data/anoph/hap_data.py b/malariagen_data/anoph/hap_data.py
@@ -321,7 +321,17 @@ def _haplotypes_for_contig(
     @check_types
     @doc(
         summary="Access haplotype data.",
-        returns="A dataset of haplotypes and associated data.",
+        returns="""A dataset with 4 dimensions:
+        `variants` the number of sites in the selected region,
+        `alleles` the number of alleles (2),
+        `samples` the number of samples,
+        and `ploidy` the ploidy (2). There are 3 coordinates:
+        `variant_position` has `variants` values and contains the position of each site,
+        `variant_contig` has `variants` values and contains the contig of each site,
+        `sample_id` has `samples` values and contains the identifier of each sample. The data variables are:
+        `variant_allele`, it has (`variants`, `alleles`) values and contains the reference followed by the alternate allele for each site,
+        `call_genotype`, it has (`variants`, `samples`, `ploidy`) values and contains both calls for each site and each sample.
+        """,
     )
     def haplotypes(
         self,
diff --git a/malariagen_data/anoph/het_params.py b/malariagen_data/anoph/het_params.py
@@ -47,7 +47,13 @@
     pd.DataFrame,
     """
     A DataFrame where each row provides data about a single run of
-    homozygosity.
+    homozygosity. The columns are:
+    `sample_id` containing the identifier of the sample,
+    `contig` containing the contig,
+    `roh_start` containing the start of the run of homozygosity,
+    `roh_stop` containing the end of the run of homozygosity,
+    `roh_length` containing the length of the run of homozygosity,
+    `roh_is_marginal` containing whether the run of homozygosity is marginal.
     """,
 ]
 
diff --git a/malariagen_data/anoph/pca_params.py b/malariagen_data/anoph/pca_params.py
@@ -15,8 +15,66 @@
 df_pca: TypeAlias = Annotated[
     pd.DataFrame,
     """
-    A dataframe of sample metadata, with columns "PC1", "PC2", "PC3",
-    etc., added.
+    A dataframe of projections along principal components, one row per sample. The columns are:
+        `sample_id` is the identifier of the sample,
+        `partner_sample_id` is the identifier of the sample used by the partners who contributed it,
+        `contributor` is the partner who contributed the sample,
+        `country` is the country the sample was collected in,
+        `location` is the location the sample was collected in,
+        `year` is the year the sample was collected,
+        `month` is the month the sample was collected,
+        `latitude` is the latitude of the location the sample was collected in,
+        `longitude` is the longitude of the location the sample was collected in,
+        `sex_call` is the sex of the sample,
+        `sample_set` is the sample set containing the sample,
+        `release` is the release containing the sample,
+        `quarter` is the quarter of the year the sample was collected,
+        `study_id` is the identifier of the study the sample set containing the sample came from,
+        `study_url` is the URL of the study the sample set containing the sample came from,
+        `terms_of_use_expiry_date` is the date the terms of use for the sample expire,
+        `terms_of_use_url` is the URL of the terms of use for the sample,
+        `unrestricted_use` indicates whether the sample can be used without restrictions (e.g., if the terms of use have expired),
+        `mean_cov` is the mean value of the coverage,
+        `median_cov` is the median value of the coverage,
+        `modal_cov` is the mode of the coverage,
+        `mean_cov_2L` is the mean value of the coverage on 2L,
+        `median_cov_2L` is the median value of the coverage on 2L,
+        `mode_cov_2L` is the mode of the coverage on 2L,
+        `mean_cov_2R` is the mean value of the coverage on 2R,
+        `median_cov_2R` is the median value of the coverage on 2R,
+        `mode_cov_2R` is the mode of the coverage on 2R,
+        `mean_cov_3L` is the mean value of the coverage on 3L,
+        `median_cov_3L` is the median value of the coverage on 3L,
+        `mode_cov_3L` is the mode of the coverage on 3L,
+        `mean_cov_3R` is the mean value of the coverage on 3R,
+        `median_cov_3R` is the median value of the coverage on 3R,
+        `mode_cov_3R` is the mode of the coverage on 3R,
+        `mean_cov_X` is the mean value of the coverage on X,
+        `median_cov_X` is the median value of the coverage on X,
+        `mode_cov_X` is the mode of the coverage on X,
+        `frac_gen_cov` is the fraction of the genome covered,
+        `divergence` is the divergence,
+        `contam_pct` is the percentage of contamination,
+        `contam_LLR` is the log-likelihood ratio of contamination,
+        `aim_species_fraction_arab` is the fraction of the gambcolu vs. arabiensis AIMs that indicated arabiensis (this column is only present for *Ag3*),
+        `aim_species_fraction_colu` is the fraction of the gambiae vs. coluzzii AIMs that indicated coluzzii (this column is only present for *Ag3*),
+        `aim_species_fraction_colu_no2l` is the fraction of the gambiae vs. coluzzii AIMs that indicated coluzzii, not including the chromosome arm 2L which contains an introgression (this column is only present for *Ag3*),
+        `aim_species_gambcolu_arabiensis` is the taxonomic group assigned by the gambcolu vs. arabiensis AIMs (this column is only present for *Ag3*),
+        `aim_species_gambiae_coluzzii` is the taxonomic group assigned by the gambiae vs. coluzzii AIMs (this column is only present for *Ag3*),
+        `aim_species` is the taxonomic group assigned by the combination of both AIMs analyses (this column is only present for *Ag3*),
+        `country_iso` is the ISO code of the country the sample was collected in,
+        `admin1_name` is the name of the first administrative level the sample was collected in,
+        `admin1_iso` is the ISO code of the first administrative level the sample was collected in,
+        `admin2_name` is the name of the second administrative level the sample was collected in,
+        `taxon` is the taxon assigned to the sample by the combination of the AIMs analysis and the cohort analysis,
+        `cohort_admin1_year` is the cohort the sample belongs to when samples are grouped by first administrative level and year,
+        `cohort_admin1_month` is the cohort the sample belongs to when samples are grouped by first administrative level and month,
+        `cohort_admin1_quarter` is the cohort the sample belongs to when samples are grouped by first administrative level and quarter,
+        `cohort_admin2_year` is the cohort the sample belongs to when samples are grouped by second administrative level and year,
+        `cohort_admin2_month` is the cohort the sample belongs to when samples are grouped by second administrative level and month,
+        `cohort_admin2_quarter` is the cohort the sample belongs to when samples are grouped by second administrative level and quarter,
+        `PC?` is the projection along principal component ? (? being an integer between 1 and the number of components). There are as many such columns as components,
+        `pca_fit` is whether this sample was used for fitting.
     """,
 ]
 
diff --git a/malariagen_data/anoph/sample_metadata.py b/malariagen_data/anoph/sample_metadata.py
diff --git a/malariagen_data/anoph/snp_data.py b/malariagen_data/anoph/snp_data.py
diff --git a/malariagen_data/anoph/snp_frq.py b/malariagen_data/anoph/snp_frq.py
diff --git a/malariagen_data/anopheles.py b/malariagen_data/anopheles.py

Original file line number	Diff line number	Diff line change
`@@ -119,7 +119,7 @@ def _prep_gff_attributes(`
`119`	`119`	`@check_types`
`120`	`120`	`@doc(`
`121`	`121`	`summary="Access genome feature annotations.",`
`122`		`- returns="A dataframe of genome annotations, one row per feature.",`
	`122`	+ returns="A dataframe of genome annotations, one row per feature. The dataframe follows the GFF3 format (https://github.com/The-Sequence-Ontology/Specifications/blob/master/gff3.md), including extra attributes `ID`, `Parent`, `Name` and `description` depending on the dataset.",
`123`	`123`	`)`
`124`	`124`	`def genome_features(`
`125`	`125`	`self,`