Skip to content

Commit 968af39

Browse files
committed
Improve documentation
1 parent c5091db commit 968af39

File tree

12 files changed

+110
-38
lines changed

12 files changed

+110
-38
lines changed

docs/api.rst

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -14,10 +14,8 @@ API Reference
1414

1515
----
1616

17-
.. autoclass:: pandas_genomics.GenotypeDtype
18-
.. autoclass:: pandas_genomics.GenotypeArray
17+
.. automodule:: pandas_genomics.arrays
1918

2019
----
2120

22-
.. autoclass:: pandas_genomics.GenotypeDataframeAccessor
23-
.. autoclass:: pandas_genomics.GenotypeSeriesAccessor
21+
.. automodule:: pandas_genomics.accessors

docs/index.rst

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -88,7 +88,7 @@ If you are looking for information on a specific function, class or
8888
method, this part of the documentation is for you.
8989

9090
.. toctree::
91-
:maxdepth: 2
91+
:maxdepth: 3
9292

9393
api
9494

@@ -100,4 +100,5 @@ Release History, etc
100100
.. toctree::
101101
:maxdepth: 2
102102

103+
notes
103104
release-history

docs/notes.rst

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
=====
2+
Notes
3+
=====
4+
5+
* The ``genomics`` DataFrame accessor
6+
7+
* Will only work if the entire DataFrame consists of GenotypeArray Series
8+
* Requires that all variant IDs are unique. Variants get a random unique (UUID4) ID if one is not specified.
9+
10+
* The Series (or DataFrame column) name should not be confused with the variant ID. There is no reason to assume they match.

pandas_genomics/__init__.py

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -10,12 +10,12 @@
1010
__version__ = importlib_metadata.version(__name__)
1111

1212
__all__ = [
13-
__version__,
14-
GenotypeSeriesAccessor,
15-
GenotypeDataframeAccessor,
16-
GenotypeDtype,
17-
GenotypeArray,
18-
io,
19-
scalars,
20-
sim,
13+
"__version__",
14+
"GenotypeSeriesAccessor",
15+
"GenotypeDataframeAccessor",
16+
"GenotypeDtype",
17+
"GenotypeArray",
18+
"io",
19+
"scalars",
20+
"sim",
2121
]
Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,18 @@
1+
"""
2+
Accessors
3+
---------
4+
5+
This module contains 'genomics' accessors for DataFrames and Series.
6+
7+
.. autosummary::
8+
:toctree: accessors
9+
10+
GenotypeSeriesAccessor
11+
GenotypeDataframeAccessor
12+
13+
"""
14+
115
from .series_accessor import GenotypeSeriesAccessor
216
from .dataframe_accessor import GenotypeDataframeAccessor
17+
18+
__all__ = ["GenotypeSeriesAccessor", "GenotypeDataframeAccessor"]

pandas_genomics/accessors/dataframe_accessor.py

Lines changed: 11 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,11 @@
1212
class GenotypeDataframeAccessor:
1313
"""
1414
DataFrame accessor for GenotypeArray methods
15+
16+
.. code-block:: python
17+
18+
df.genomics.variant_info
19+
df.genomics.encode_additive()
1520
"""
1621

1722
def __init__(self, pandas_obj):
@@ -118,19 +123,19 @@ def encode_codominant(self) -> pd.DataFrame:
118123
)
119124

120125
def encode_weighted(self, encoding_info: pd.DataFrame) -> pd.DataFrame:
121-
"""Weighted (edge) encoding of genotypes.
126+
"""Weighted (EDGE) encoding of genotypes.
122127
123128
See :meth:`GenotypeArray.encode_weighted`
124129
125130
Parameters
126131
----------
127132
encoding_info: pd.DataFrame
128133
columns:
129-
Variant ID - used to match variants
130-
Alpha Value - used for heterozygous genotypes
131-
Ref Allele - which allele is considered reference
132-
Alt Allele - which allele is considered alternate
133-
Minor Allele Frequency - MAF of data used during calculation of alpha values
134+
- Variant ID - used to match variants
135+
- Alpha Value - used for heterozygous genotypes
136+
- Ref Allele - which allele is considered reference
137+
- Alt Allele - which allele is considered alternate
138+
- Minor Allele Frequency - MAF of data used during calculation of alpha values
134139
135140
Returns
136141
-------

pandas_genomics/accessors/series_accessor.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,12 @@
1010
class GenotypeSeriesAccessor:
1111
"""
1212
Series accessor for GenotypeArray methods
13+
14+
15+
.. code-block:: python
16+
17+
s.genomics.variant_info
18+
s.genomics.encode_additive()
1319
"""
1420

1521
def __init__(self, obj):

pandas_genomics/arrays/__init__.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,6 @@
11
"""
2+
.. currentmodule:: pandas_genomics.arrays
3+
24
Arrays
35
------
46
@@ -10,6 +12,14 @@
1012
GenotypeDtype
1113
GenotypeArray
1214
15+
Specialized methods are added to the GenotypeArray using Mixins:
16+
17+
.. autosummary::
18+
: toctree: arrays
19+
20+
encoding_mixin.EncodingMixin
21+
info_mixin.InfoMixin
22+
1323
"""
1424

1525
from .genotype_array import GenotypeDtype, GenotypeArray

pandas_genomics/arrays/encoding_mixin.py

Lines changed: 23 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -13,12 +13,14 @@ class EncodingMixin:
1313

1414
def encode_additive(self) -> pd.arrays.IntegerArray:
1515
"""
16+
Additive Encoding
17+
18+
- Number of copies of non-reference allele
19+
- pd.NA when any alleles are missing
20+
1621
Returns
1722
-------
1823
pd.arrays.IntegerArray
19-
Number of copies of non-reference allele
20-
pd.NA when any alleles are missing
21-
Raises ValueError if there is more than 1 alternate allele
2224
"""
2325
allele_sum = (self.allele_idxs != 0).sum(axis=1).astype("float")
2426
allele_sum[(self.allele_idxs == MISSING_IDX).any(axis=1)] = np.nan
@@ -27,12 +29,15 @@ def encode_additive(self) -> pd.arrays.IntegerArray:
2729

2830
def encode_dominant(self) -> pd.arrays.IntegerArray:
2931
"""
32+
Dominant Encoding
33+
34+
- 0 for Homozygous Reference
35+
- 1 for any other case
36+
- pd.NA when any alleles are missing
37+
3038
Returns
3139
-------
3240
pd.arrays.IntegerArray
33-
0 for Homozygous Reference
34-
1 for any other case
35-
pd.NA when any alleles are missing
3641
"""
3742
has_minor = (self.allele_idxs != 0).any(axis=1).astype("float")
3843
has_minor[(self.allele_idxs == MISSING_IDX).any(axis=1)] = np.nan
@@ -41,12 +46,15 @@ def encode_dominant(self) -> pd.arrays.IntegerArray:
4146

4247
def encode_recessive(self) -> pd.arrays.IntegerArray:
4348
"""
49+
Recessive Encoding
50+
51+
- 1 for Homozygous Non-reference
52+
- 0 for anything else
53+
- pd.NA when any alleles are missing
54+
4455
Returns
4556
-------
4657
pd.arrays.IntegerArray
47-
1 for Homozygous Non-reference
48-
0 for anything else
49-
pd.NA when any alleles are missing
5058
"""
5159
all_minor = (self.allele_idxs != 0).all(axis=1).astype("float")
5260
all_minor[(self.allele_idxs == MISSING_IDX).any(axis=1)] = np.nan
@@ -58,14 +66,15 @@ def encode_codominant(self) -> pd.arrays.Categorical:
5866
This encodes the genotype into three categories. When utilized in regression, this results in two variables
5967
due to dummy encoding: "Het" as 0 or 1 and "Hom" as 0 or 1. 0 in both indicates "Ref".
6068
69+
- 'Ref' for Homozygous Reference
70+
- 'Het' for Heterozygous
71+
- 'Hom' for Homozygous Non-Reference
72+
- pd.NA for missing
73+
- Raises an error if ploidy is not 2
74+
6175
Returns
6276
-------
6377
pd.arrays.Categorical
64-
'Ref' for Homozygous Reference
65-
'Het' for Heterozygous
66-
'Hom' for Homozygous Non-Reference
67-
pd.NA for missing
68-
Raises an error if ploidy is not 2
6978
"""
7079
if self.dtype.variant.ploidy != 2:
7180
raise ValueError(

pandas_genomics/io/plink/to_plink.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -35,8 +35,7 @@ def to_plink(
3535
3636
Notes
3737
-----
38-
If the data index has the required columns (FID, IID, IID_father, IID_mother, sex, phenotype) the fam file will
39-
be created based on the index.
38+
If the data index has the required columns (FID, IID, IID_father, IID_mother, sex, phenotype) the fam file will be created based on the index.
4039
If a phenotype name is provided, this will override any phenotype information in the index.
4140
If the data has a single index column this will be used (with the prefix) for FID and IID. Defaults will be used for other .fam data.
4241

0 commit comments

Comments
 (0)