@@ -20,15 +20,18 @@ class GenotypeDataframeAccessor:
2020 """
2121
2222 def __init__ (self , pandas_obj ):
23- if not pandas_obj .dtypes .apply (lambda dt : GenotypeDtype .is_dtype (dt )).all ():
24- incorrect = pandas_obj .dtypes [
25- ~ pandas_obj .dtypes .apply (lambda dt : GenotypeDtype .is_dtype (dt ))
26- ]
23+ if not pandas_obj .dtypes .apply (lambda dt : GenotypeDtype .is_dtype (dt )).any ():
2724 raise AttributeError (
28- f "Incompatible datatypes: all columns must be a GenotypeDtype: { incorrect } "
25+ "Incompatible datatypes: at least one column must be a GenotypeDtype. "
2926 )
30- id_counts = Counter ([s .genomics .variant .id for _ , s in pandas_obj .iteritems ()])
31- if len (id_counts ) < len (pandas_obj .columns ):
27+ id_counts = Counter (
28+ [
29+ s .genomics .variant .id
30+ for _ , s in pandas_obj .iteritems ()
31+ if GenotypeDtype .is_dtype (s )
32+ ]
33+ )
34+ if len (id_counts ) < len (pandas_obj .select_dtypes ([GenotypeDtype ]).columns ):
3235 duplicates = [(k , v ) for k , v in id_counts .items () if v >= 2 ]
3336 raise AttributeError (
3437 f"Duplicate Variant IDs. Column names may differ from variant IDs, but variant IDs must be unique.\n \t Duplicates: "
@@ -41,10 +44,12 @@ def __init__(self, pandas_obj):
4144 ######################
4245 @property
4346 def variant_info (self ) -> pd .DataFrame :
44- """Return a DataFrame with variant info indexed by the column name"""
47+ """Return a DataFrame with variant info indexed by the column name (one row per GenotypeArray) """
4548 return pd .DataFrame .from_dict (
4649 {
4750 colname : series .genomics .variant_info
51+ if GenotypeDtype .is_dtype (series .dtype )
52+ else dict ()
4853 for colname , series in self ._obj .iteritems ()
4954 },
5055 orient = "index" ,
@@ -58,14 +63,20 @@ def maf(self):
5863 """Return the minor allele frequency
5964
6065 See :py:attr:`GenotypeArray.maf`"""
61- return self ._obj .apply (lambda col : col .genomics .maf )
66+ return self ._obj .apply (
67+ lambda col : col .genomics .maf if GenotypeDtype .is_dtype (col .dtype ) else pd .NA
68+ )
6269
6370 @property
6471 def hwe_pval (self ):
6572 """Return the probability that the samples are in HWE
6673
6774 See :py:attr:`GenotypeArray.hwe_pval`"""
68- return self ._obj .apply (lambda col : col .genomics .hwe_pval )
75+ return self ._obj .apply (
76+ lambda col : col .genomics .hwe_pval
77+ if GenotypeDtype .is_dtype (col .dtype )
78+ else pd .NA
79+ )
6980
7081 ############
7182 # Encoding #
@@ -80,7 +91,11 @@ def encode_additive(self) -> pd.DataFrame:
8091 pd.DataFrame
8192 """
8293 return pd .concat (
83- [s .genomics .encode_additive () for _ , s in self ._obj .iteritems ()], axis = 1
94+ [
95+ s .genomics .encode_additive () if GenotypeDtype .is_dtype (s ) else s
96+ for _ , s in self ._obj .iteritems ()
97+ ],
98+ axis = 1 ,
8499 )
85100
86101 def encode_dominant (self ) -> pd .DataFrame :
@@ -93,7 +108,11 @@ def encode_dominant(self) -> pd.DataFrame:
93108 pd.DataFrame
94109 """
95110 return pd .concat (
96- [s .genomics .encode_dominant () for _ , s in self ._obj .iteritems ()], axis = 1
111+ [
112+ s .genomics .encode_dominant () if GenotypeDtype .is_dtype (s ) else s
113+ for _ , s in self ._obj .iteritems ()
114+ ],
115+ axis = 1 ,
97116 )
98117
99118 def encode_recessive (self ) -> pd .DataFrame :
@@ -106,7 +125,11 @@ def encode_recessive(self) -> pd.DataFrame:
106125 pd.DataFrame
107126 """
108127 return pd .concat (
109- [s .genomics .encode_recessive () for _ , s in self ._obj .iteritems ()], axis = 1
128+ [
129+ s .genomics .encode_recessive () if GenotypeDtype .is_dtype (s ) else s
130+ for _ , s in self ._obj .iteritems ()
131+ ],
132+ axis = 1 ,
110133 )
111134
112135 def encode_codominant (self ) -> pd .DataFrame :
@@ -119,7 +142,11 @@ def encode_codominant(self) -> pd.DataFrame:
119142 pd.DataFrame
120143 """
121144 return pd .concat (
122- [s .genomics .encode_codominant () for _ , s in self ._obj .iteritems ()], axis = 1
145+ [
146+ s .genomics .encode_codominant () if GenotypeDtype .is_dtype (s ) else s
147+ for _ , s in self ._obj .iteritems ()
148+ ],
149+ axis = 1 ,
123150 )
124151
125152 def encode_weighted (self , encoding_info : pd .DataFrame ) -> pd .DataFrame :
@@ -181,6 +208,9 @@ def encode_weighted(self, encoding_info: pd.DataFrame) -> pd.DataFrame:
181208 # Process each variant
182209 results = []
183210 for _ , s in self ._obj .iteritems ():
211+ if not GenotypeDtype .is_dtype (s ):
212+ results .append (s )
213+ continue
184214 info = encoding_info .get (s .array .variant .id , None )
185215 if info is None :
186216 warnings [
@@ -244,7 +274,7 @@ def generate_weighted_encodings(
244274 PLoS genetics 17.6 (2021): e1009534.
245275 """
246276 return generate_weighted_encodings (
247- genotypes = self ._obj ,
277+ genotypes = self ._obj . select_dtypes ([ GenotypeDtype ]) ,
248278 data = data ,
249279 outcome_variable = outcome_variable ,
250280 covariates = covariates ,
@@ -257,15 +287,18 @@ def filter_variants_maf(self, keep_min_freq: float = 0.01) -> pd.DataFrame:
257287 """
258288 Drop variants with a MAF less than the specified value (0.01 by default)
259289 """
260- return self ._obj .loc [:, self ._obj .genomics .maf >= keep_min_freq ]
290+ genotypes = self ._obj .select_dtypes ([GenotypeDtype ])
291+ removed = genotypes .loc [:, genotypes .genomics .maf < keep_min_freq ].columns
292+ return self ._obj .drop (columns = removed )
261293
262294 def filter_variants_hwe (self , cutoff : float = 0.05 ) -> pd .DataFrame :
263295 """
264296 Drop variants with a probability of HWE less than the specified value (0.05 by default).
265297 Keep np.nan results, which occur for non-diploid variants and insufficient sample sizes
266298 """
267- return self ._obj .loc [
268- :,
269- (self ._obj .genomics .hwe_pval >= cutoff )
270- | (np .isnan (self ._obj .genomics .hwe_pval )),
271- ]
299+ genotypes = self ._obj .select_dtypes ([GenotypeDtype ])
300+ genotype_hwe_pval = genotypes .genomics .hwe_pval
301+ removed = genotypes .loc [
302+ :, (genotype_hwe_pval < cutoff ) & ~ np .isnan (genotype_hwe_pval )
303+ ].columns
304+ return self ._obj .drop (columns = removed )
0 commit comments