@@ -13,12 +13,14 @@ class EncodingMixin:
1313
1414 def encode_additive (self ) -> pd .arrays .IntegerArray :
1515 """
16+ Additive Encoding
17+
18+ - Number of copies of non-reference allele
19+ - pd.NA when any alleles are missing
20+
1621 Returns
1722 -------
1823 pd.arrays.IntegerArray
19- Number of copies of non-reference allele
20- pd.NA when any alleles are missing
21- Raises ValueError if there is more than 1 alternate allele
2224 """
2325 allele_sum = (self .allele_idxs != 0 ).sum (axis = 1 ).astype ("float" )
2426 allele_sum [(self .allele_idxs == MISSING_IDX ).any (axis = 1 )] = np .nan
@@ -27,12 +29,15 @@ def encode_additive(self) -> pd.arrays.IntegerArray:
2729
2830 def encode_dominant (self ) -> pd .arrays .IntegerArray :
2931 """
32+ Dominant Encoding
33+
34+ - 0 for Homozygous Reference
35+ - 1 for any other case
36+ - pd.NA when any alleles are missing
37+
3038 Returns
3139 -------
3240 pd.arrays.IntegerArray
33- 0 for Homozygous Reference
34- 1 for any other case
35- pd.NA when any alleles are missing
3641 """
3742 has_minor = (self .allele_idxs != 0 ).any (axis = 1 ).astype ("float" )
3843 has_minor [(self .allele_idxs == MISSING_IDX ).any (axis = 1 )] = np .nan
@@ -41,12 +46,15 @@ def encode_dominant(self) -> pd.arrays.IntegerArray:
4146
4247 def encode_recessive (self ) -> pd .arrays .IntegerArray :
4348 """
49+ Recessive Encoding
50+
51+ - 1 for Homozygous Non-reference
52+ - 0 for anything else
53+ - pd.NA when any alleles are missing
54+
4455 Returns
4556 -------
4657 pd.arrays.IntegerArray
47- 1 for Homozygous Non-reference
48- 0 for anything else
49- pd.NA when any alleles are missing
5058 """
5159 all_minor = (self .allele_idxs != 0 ).all (axis = 1 ).astype ("float" )
5260 all_minor [(self .allele_idxs == MISSING_IDX ).any (axis = 1 )] = np .nan
@@ -58,14 +66,15 @@ def encode_codominant(self) -> pd.arrays.Categorical:
5866 This encodes the genotype into three categories. When used in regression, this results in two variables
5967 due to dummy encoding: "Het" as 0 or 1 and "Hom" as 0 or 1. A value of 0 in both indicates "Ref".
6068
69+ - 'Ref' for Homozygous Reference
70+ - 'Het' for Heterozygous
71+ - 'Hom' for Homozygous Non-Reference
72+ - pd.NA for missing
73+ - Raises an error if ploidy is not 2
74+
6175 Returns
6276 -------
6377 pd.arrays.Categorical
64- 'Ref' for Homozygous Reference
65- 'Het' for Heterozygous
66- 'Hom' for Homozygous Non-Reference
67- pd.NA for missing
68- Raises an error if ploidy is not 2
6978 """
7079 if self .dtype .variant .ploidy != 2 :
7180 raise ValueError (
0 commit comments