Improve documentation

jrm5100 · jrm5100 · commit 968af39b40af · 2021-06-23T16:47:03.000-04:00
diff --git a/docs/api.rst b/docs/api.rst
@@ -14,10 +14,8 @@ API Reference
 
 ----
 
-.. autoclass:: pandas_genomics.GenotypeDtype
-.. autoclass:: pandas_genomics.GenotypeArray
+.. automodule:: pandas_genomics.arrays
 
 ----
 
-.. autoclass:: pandas_genomics.GenotypeDataframeAccessor
-.. autoclass:: pandas_genomics.GenotypeSeriesAccessor
+.. automodule:: pandas_genomics.accessors
diff --git a/docs/index.rst b/docs/index.rst
@@ -88,7 +88,7 @@ If you are looking for information on a specific function, class or
 method, this part of the documentation is for you.
 
 .. toctree::
-   :maxdepth: 2
+   :maxdepth: 3
 
    api
 
@@ -100,4 +100,5 @@ Release History, etc
 .. toctree::
    :maxdepth: 2
 
+   notes
    release-history
diff --git a/docs/notes.rst b/docs/notes.rst
@@ -0,0 +1,10 @@
+=====
+Notes
+=====
+
+* The ``genomics`` DataFrame accessor
+
+  * Will only work if the entire DataFrame consists of GenotypeArray Series
+  * Requires that all variant IDs are unique.  Variants get a random unique (UUID4) ID if one is not specified.
+
+* The Series (or DataFrame column) name should not be confused with the variant ID.  There is no reason to assume they match.
diff --git a/pandas_genomics/__init__.py b/pandas_genomics/__init__.py
@@ -10,12 +10,12 @@
 __version__ = importlib_metadata.version(__name__)
 
 __all__ = [
-    __version__,
-    GenotypeSeriesAccessor,
-    GenotypeDataframeAccessor,
-    GenotypeDtype,
-    GenotypeArray,
-    io,
-    scalars,
-    sim,
+    "__version__",
+    "GenotypeSeriesAccessor",
+    "GenotypeDataframeAccessor",
+    "GenotypeDtype",
+    "GenotypeArray",
+    "io",
+    "scalars",
+    "sim",
 ]
diff --git a/pandas_genomics/accessors/__init__.py b/pandas_genomics/accessors/__init__.py
@@ -1,2 +1,18 @@
+"""
+Accessors
+---------
+
+This module contains 'genomics' accessors for DataFrames and Series
+
+  .. autosummary::
+     :toctree: accessors
+
+     GenotypeSeriesAccessor
+     GenotypeDataframeAccessor
+
+"""
+
 from .series_accessor import GenotypeSeriesAccessor
 from .dataframe_accessor import GenotypeDataframeAccessor
+
+__all__ = ["GenotypeSeriesAccessor", "GenotypeDataframeAccessor"]
diff --git a/pandas_genomics/accessors/dataframe_accessor.py b/pandas_genomics/accessors/dataframe_accessor.py
@@ -12,6 +12,11 @@
 class GenotypeDataframeAccessor:
     """
     DataFrame accessor for GenotypeArray methods
+
+    .. code-block:: python
+
+        df.genomics.variant_info
+        df.genomics.encode_additive()
     """
 
     def __init__(self, pandas_obj):
@@ -118,19 +123,19 @@ def encode_codominant(self) -> pd.DataFrame:
         )
 
     def encode_weighted(self, encoding_info: pd.DataFrame) -> pd.DataFrame:
-        """Weighted (edge) encoding of genotypes.
+        """Weighted (EDGE) encoding of genotypes.
 
         See :meth:`GenotypeArray.encode_weighted`
 
         Parameters
         ----------
         encoding_info: pd.DataFrame
             columns:
-                Variant ID - used to match variants
-                Alpha Value - used for heterozygous genotypes
-                Ref Allele - which allele is considered reference
-                Alt Allele - which allele is considered alternate
-                Minor Allele Frequency - MAF of data used during calculation of alpha values
+                - Variant ID - used to match variants
+                - Alpha Value - used for heterozygous genotypes
+                - Ref Allele - which allele is considered reference
+                - Alt Allele - which allele is considered alternate
+                - Minor Allele Frequency - MAF of data used during calculation of alpha values
 
         Returns
         -------
diff --git a/pandas_genomics/accessors/series_accessor.py b/pandas_genomics/accessors/series_accessor.py
@@ -10,6 +10,12 @@
 class GenotypeSeriesAccessor:
     """
     Series accessor for GenotypeArray methods
+
+
+    .. code-block:: python
+
+        s.genomics.variant_info
+        s.genomics.encode_additive()
     """
 
     def __init__(self, obj):
diff --git a/pandas_genomics/arrays/__init__.py b/pandas_genomics/arrays/__init__.py
@@ -1,4 +1,6 @@
 """
+.. currentmodule:: pandas_genomics.arrays
+
 Arrays
 ------
 
@@ -10,6 +12,14 @@
      GenotypeDtype
      GenotypeArray
 
+Specialized methods are added to the GenotypeArray using Mixins:
+
+  .. autosummary::
+     :toctree: arrays
+
+     encoding_mixin.EncodingMixin
+     info_mixin.InfoMixin
+
 """
 
 from .genotype_array import GenotypeDtype, GenotypeArray
diff --git a/pandas_genomics/arrays/encoding_mixin.py b/pandas_genomics/arrays/encoding_mixin.py
@@ -13,12 +13,14 @@ class EncodingMixin:
 
     def encode_additive(self) -> pd.arrays.IntegerArray:
         """
+        Additive Encoding
+
+        - Number of copies of non-reference allele
+        - pd.NA when any alleles are missing
+
         Returns
         -------
         pd.arrays.IntegerArray
-            Number of copies of non-reference allele
-            pd.NA when any alleles are missing
-            Raises ValueError if there is more than 1 alternate allele
         """
         allele_sum = (self.allele_idxs != 0).sum(axis=1).astype("float")
         allele_sum[(self.allele_idxs == MISSING_IDX).any(axis=1)] = np.nan
@@ -27,12 +29,15 @@ def encode_additive(self) -> pd.arrays.IntegerArray:
 
     def encode_dominant(self) -> pd.arrays.IntegerArray:
         """
+        Dominant Encoding
+
+        - 0 for Homozygous Reference
+        - 1 for any other case
+        - pd.NA when any alleles are missing
+
         Returns
         -------
         pd.arrays.IntegerArray
-            0 for Homozygous Reference
-            1 for any other case
-            pd.NA when any alleles are missing
         """
         has_minor = (self.allele_idxs != 0).any(axis=1).astype("float")
         has_minor[(self.allele_idxs == MISSING_IDX).any(axis=1)] = np.nan
@@ -41,12 +46,15 @@ def encode_dominant(self) -> pd.arrays.IntegerArray:
 
     def encode_recessive(self) -> pd.arrays.IntegerArray:
         """
+        Recessive Encoding
+
+        - 1 for Homozygous Non-reference
+        - 0 for anything else
+        - pd.NA when any alleles are missing
+
         Returns
         -------
         pd.arrays.IntegerArray
-            1 for Homozygous Non-reference
-            0 for anything else
-            pd.NA when any alleles are missing
         """
         all_minor = (self.allele_idxs != 0).all(axis=1).astype("float")
         all_minor[(self.allele_idxs == MISSING_IDX).any(axis=1)] = np.nan
@@ -58,14 +66,15 @@ def encode_codominant(self) -> pd.arrays.Categorical:
         This encodes the genotype into three categories.  When utilized in regression, this results in two variables
         due to dummy encoding- "Het" as 0 or 1 and "Hom" as 0 or 1.  0 in both indicates "Ref".
 
+        - 'Ref' for Homozygous Reference
+        - 'Het' for Heterozygous
+        - 'Hom' for Homozygous Non-Reference
+        - pd.NA for missing
+        - Raises ValueError if ploidy is not 2
+
         Returns
         -------
         pd.arrays.Categorical
-            'Ref' for Homozygous Reference
-            'Het' for Heterozygous
-            'Hom' for Homozygous Non-Reference
-            pd.NA for missing
-            Raises an error if ploidy is not 2
         """
         if self.dtype.variant.ploidy != 2:
             raise ValueError(
diff --git a/pandas_genomics/io/plink/to_plink.py b/pandas_genomics/io/plink/to_plink.py
@@ -35,8 +35,7 @@ def to_plink(
 
     Notes
     -----
-    If the data index has the required columns (FID, IID, IID_father, IID_mother, sex, phenotype) the fam file will
-      be created based on the index.
+    If the data index has the required columns (FID, IID, IID_father, IID_mother, sex, phenotype) the fam file will be created based on the index.
     If a phenotype name is provided, this will override any phenotype information in the index.
     If the data has a single index column this will be used (with the prefix) for FID and IID.  Defaults will be used for other .fam data
 
diff --git a/pandas_genomics/sim/__init__.py b/pandas_genomics/sim/__init__.py
@@ -1,2 +1,18 @@
-from .BAMS import BAMS, SNPEffectEncodings, PenetranceTables
+"""
+Simulation
+----------
+The ``sim`` module provides classes and functions for generating simulated genotypes.
+
+    .. autosummary::
+        :toctree: sim
+
+        BAMS
+        SNPEffectEncodings
+        PenetranceTables
+        generate_random_gt
+
+"""
+
+
+from .biallelic_model_simulator import BAMS, SNPEffectEncodings, PenetranceTables
 from .random_gt import generate_random_gt
diff --git a/pandas_genomics/sim/biallelic_model_simulator.py b/pandas_genomics/sim/biallelic_model_simulator.py
@@ -11,7 +11,7 @@
 
 
 class SNPEffectEncodings(Enum):
-    """Normalized SNP Effects encoded as 3-length tuples"""
+    """Enum: Normalized SNP Effects encoded as 3-length tuples"""
 
     DOMINANT = (0, 1, 1)
     SUPER_ADDITIVE = (0, 0.75, 1)
@@ -22,7 +22,7 @@ class SNPEffectEncodings(Enum):
 
 
 class PenetranceTables(Enum):
-    """Penetrance Tables for Simple Models"""
+    """Enum: Penetrance Tables for Simple Models"""
 
     HR_HR = [1, 0, 0, 0, 0, 0, 0, 0, 0]  # Homozygous Referent X Homozygous Referent
     HR_HET = [0, 1, 0, 0, 0, 0, 0, 0, 0]  # Homozygous Referent X Heterozygous
@@ -38,6 +38,8 @@ class PenetranceTables(Enum):
 class BAMS:
     """
     Biallelic Model Simulator.  Used to simulate two SNPs with phenotype data based on a penetrance table.
+
+    It can be initialized using the PenetranceTables enum or using ``from_model`` with values from the SNPEffectEncodings enum.
     """
 
     def __init__(