@@ -20,15 +20,18 @@ class GenotypeDataframeAccessor:
2020 """
2121
2222 def __init__ (self , pandas_obj ):
23- if not pandas_obj .dtypes .apply (lambda dt : GenotypeDtype .is_dtype (dt )).all ():
24- incorrect = pandas_obj .dtypes [
25- ~ pandas_obj .dtypes .apply (lambda dt : GenotypeDtype .is_dtype (dt ))
26- ]
23+ if not pandas_obj .dtypes .apply (lambda dt : GenotypeDtype .is_dtype (dt )).any ():
2724 raise AttributeError (
28- f "Incompatible datatypes: all columns must be a GenotypeDtype: { incorrect } "
25+ "Incompatible datatypes: at least one column must be a GenotypeDtype. "
2926 )
30- id_counts = Counter ([s .genomics .variant .id for _ , s in pandas_obj .iteritems ()])
31- if len (id_counts ) < len (pandas_obj .columns ):
27+ id_counts = Counter (
28+ [
29+ s .genomics .variant .id
30+ for _ , s in pandas_obj .iteritems ()
31+ if GenotypeDtype .is_dtype (s )
32+ ]
33+ )
34+ if len (id_counts ) < len (pandas_obj .select_dtypes ([GenotypeDtype ]).columns ):
3235 duplicates = [(k , v ) for k , v in id_counts .items () if v >= 2 ]
3336 raise AttributeError (
3437 f"Duplicate Variant IDs. Column names may differ from variant IDs, but variant IDs must be unique.\n \t Duplicates: "
@@ -39,37 +42,41 @@ def __init__(self, pandas_obj):
3942 ######################
4043 # Variant Properties #
4144 ######################
45+ # These methods generally only return a result for each GenotypeArray column, ignoring other columns
46+
@property
def variant_info(self) -> pd.DataFrame:
    """Return a DataFrame with variant info indexed by the column name.

    Only columns selected by ``select_dtypes([GenotypeDtype])`` contribute a
    row; every other column of the wrapped DataFrame is ignored.

    Returns
    -------
    pd.DataFrame
        One row per genotype column, keyed by that column's name. The row
        contents are whatever the Series-level ``genomics.variant_info``
        yields (presumably a mapping of variant fields -- confirm against
        the Series accessor).
    """
    genotypes = self._obj.select_dtypes([GenotypeDtype])
    # ``DataFrame.iteritems`` was deprecated in pandas 1.5 and removed in 2.0;
    # ``items`` yields the same (column name, Series) pairs on all versions.
    info_by_column = {
        colname: series.genomics.variant_info
        for colname, series in genotypes.items()
    }
    # orient="index" turns each dict key (column name) into a row label.
    return pd.DataFrame.from_dict(info_by_column, orient="index")
5258
53- #########################
54- # Calculated Properties #
55- #########################
@property
def maf(self):
    """Return the minor allele frequency of each variant

    Only genotype columns are evaluated; other columns are left out of the
    result.

    See :py:attr:`GenotypeArray.maf`"""

    def _column_maf(column):
        # Delegate to the Series-level accessor for a single variant.
        return column.genomics.maf

    genotype_columns = self._obj.select_dtypes([GenotypeDtype])
    return genotype_columns.apply(_column_maf)
6266
@property
def hwe_pval(self):
    """Return the probability that the samples are in HWE

    Only genotype columns are evaluated; other columns are left out of the
    result.

    See :py:attr:`GenotypeArray.hwe_pval`"""

    def _column_hwe_pval(column):
        # Delegate to the Series-level accessor for a single variant.
        return column.genomics.hwe_pval

    genotype_columns = self._obj.select_dtypes([GenotypeDtype])
    return genotype_columns.apply(_column_hwe_pval)
6974
7075 ############
7176 # Encoding #
7277 ############
78+ # These methods generally return encoded values for any GenotypeArray columns without modifying other columns
79+
7380 def encode_additive (self ) -> pd .DataFrame :
7481 """Additive encoding of genotypes.
7582
@@ -80,7 +87,11 @@ def encode_additive(self) -> pd.DataFrame:
8087 pd.DataFrame
8188 """
8289 return pd .concat (
83- [s .genomics .encode_additive () for _ , s in self ._obj .iteritems ()], axis = 1
90+ [
91+ s .genomics .encode_additive () if GenotypeDtype .is_dtype (s ) else s
92+ for _ , s in self ._obj .iteritems ()
93+ ],
94+ axis = 1 ,
8495 )
8596
8697 def encode_dominant (self ) -> pd .DataFrame :
@@ -93,7 +104,11 @@ def encode_dominant(self) -> pd.DataFrame:
93104 pd.DataFrame
94105 """
95106 return pd .concat (
96- [s .genomics .encode_dominant () for _ , s in self ._obj .iteritems ()], axis = 1
107+ [
108+ s .genomics .encode_dominant () if GenotypeDtype .is_dtype (s ) else s
109+ for _ , s in self ._obj .iteritems ()
110+ ],
111+ axis = 1 ,
97112 )
98113
99114 def encode_recessive (self ) -> pd .DataFrame :
@@ -106,7 +121,11 @@ def encode_recessive(self) -> pd.DataFrame:
106121 pd.DataFrame
107122 """
108123 return pd .concat (
109- [s .genomics .encode_recessive () for _ , s in self ._obj .iteritems ()], axis = 1
124+ [
125+ s .genomics .encode_recessive () if GenotypeDtype .is_dtype (s ) else s
126+ for _ , s in self ._obj .iteritems ()
127+ ],
128+ axis = 1 ,
110129 )
111130
112131 def encode_codominant (self ) -> pd .DataFrame :
@@ -119,7 +138,11 @@ def encode_codominant(self) -> pd.DataFrame:
119138 pd.DataFrame
120139 """
121140 return pd .concat (
122- [s .genomics .encode_codominant () for _ , s in self ._obj .iteritems ()], axis = 1
141+ [
142+ s .genomics .encode_codominant () if GenotypeDtype .is_dtype (s ) else s
143+ for _ , s in self ._obj .iteritems ()
144+ ],
145+ axis = 1 ,
123146 )
124147
125148 def encode_weighted (self , encoding_info : pd .DataFrame ) -> pd .DataFrame :
@@ -181,6 +204,9 @@ def encode_weighted(self, encoding_info: pd.DataFrame) -> pd.DataFrame:
181204 # Process each variant
182205 results = []
183206 for _ , s in self ._obj .iteritems ():
207+ if not GenotypeDtype .is_dtype (s ):
208+ results .append (s )
209+ continue
184210 info = encoding_info .get (s .array .variant .id , None )
185211 if info is None :
186212 warnings [
@@ -244,7 +270,7 @@ def generate_weighted_encodings(
244270 PLoS genetics 17.6 (2021): e1009534.
245271 """
246272 return generate_weighted_encodings (
247- genotypes = self ._obj ,
273+ genotypes = self ._obj . select_dtypes ([ GenotypeDtype ]) ,
248274 data = data ,
249275 outcome_variable = outcome_variable ,
250276 covariates = covariates ,
@@ -253,19 +279,24 @@ def generate_weighted_encodings(
253279 ###########
254280 # Filters #
255281 ###########
282+ # These methods drop genotypes that fail the filter, ignoring other columns
283+
def filter_variants_maf(self, keep_min_freq: float = 0.01) -> pd.DataFrame:
    """
    Drop variants with a MAF less than the specified value (0.01 by default)

    Non-genotype columns are never dropped; they are returned untouched
    alongside the genotype columns that pass the filter.
    """
    genotype_columns = self._obj.select_dtypes([GenotypeDtype])
    # Boolean mask over genotype columns: True where the variant is too rare.
    below_threshold = genotype_columns.genomics.maf < keep_min_freq
    failing_names = genotype_columns.loc[:, below_threshold].columns
    return self._obj.drop(columns=failing_names)
261291
def filter_variants_hwe(self, cutoff: float = 0.05) -> pd.DataFrame:
    """
    Drop variants with a probability of HWE less than the specified value (0.05 by default).
    Keep np.nan results, which occur for non-diploid variants and insufficient sample sizes

    Non-genotype columns are never dropped; they are returned untouched
    alongside the genotype columns that pass the filter.
    """
    genotype_columns = self._obj.select_dtypes([GenotypeDtype])
    pvals = genotype_columns.genomics.hwe_pval
    # A variant fails only when it has a real (non-NaN) p-value below the cutoff.
    below_cutoff = pvals < cutoff
    has_value = ~np.isnan(pvals)
    failing_names = genotype_columns.loc[:, below_cutoff & has_value].columns
    return self._obj.drop(columns=failing_names)
0 commit comments