Skip to content

Commit ae0e589

Browse files
committed
Update DataFrameAccessor to work when not all columns are GenotypeDtypes
1 parent 56de679 commit ae0e589

File tree

4 files changed

+113
-39
lines changed

4 files changed

+113
-39
lines changed

pandas_genomics/accessors/dataframe_accessor.py

Lines changed: 54 additions & 21 deletions
Original file line number | Diff line number | Diff line change
@@ -20,15 +20,18 @@ class GenotypeDataframeAccessor:
2020
"""
2121

2222
def __init__(self, pandas_obj):
23-
if not pandas_obj.dtypes.apply(lambda dt: GenotypeDtype.is_dtype(dt)).all():
24-
incorrect = pandas_obj.dtypes[
25-
~pandas_obj.dtypes.apply(lambda dt: GenotypeDtype.is_dtype(dt))
26-
]
23+
if not pandas_obj.dtypes.apply(lambda dt: GenotypeDtype.is_dtype(dt)).any():
2724
raise AttributeError(
28-
f"Incompatible datatypes: all columns must be a GenotypeDtype: {incorrect}"
25+
"Incompatible datatypes: at least one column must be a GenotypeDtype."
2926
)
30-
id_counts = Counter([s.genomics.variant.id for _, s in pandas_obj.iteritems()])
31-
if len(id_counts) < len(pandas_obj.columns):
27+
id_counts = Counter(
28+
[
29+
s.genomics.variant.id
30+
for _, s in pandas_obj.iteritems()
31+
if GenotypeDtype.is_dtype(s)
32+
]
33+
)
34+
if len(id_counts) < len(pandas_obj.select_dtypes([GenotypeDtype]).columns):
3235
duplicates = [(k, v) for k, v in id_counts.items() if v >= 2]
3336
raise AttributeError(
3437
f"Duplicate Variant IDs. Column names may differ from variant IDs, but variant IDs must be unique.\n\tDuplicates: "
@@ -41,10 +44,12 @@ def __init__(self, pandas_obj):
4144
######################
4245
@property
4346
def variant_info(self) -> pd.DataFrame:
44-
"""Return a DataFrame with variant info indexed by the column name"""
47+
"""Return a DataFrame with variant info indexed by the column name (one row per GenotypeArray)"""
4548
return pd.DataFrame.from_dict(
4649
{
4750
colname: series.genomics.variant_info
51+
if GenotypeDtype.is_dtype(series.dtype)
52+
else dict()
4853
for colname, series in self._obj.iteritems()
4954
},
5055
orient="index",
@@ -58,14 +63,20 @@ def maf(self):
5863
"""Return the minor allele frequency
5964
6065
See :py:attr:`GenotypeArray.maf`"""
61-
return self._obj.apply(lambda col: col.genomics.maf)
66+
return self._obj.apply(
67+
lambda col: col.genomics.maf if GenotypeDtype.is_dtype(col.dtype) else pd.NA
68+
)
6269

6370
@property
6471
def hwe_pval(self):
6572
"""Return the probability that the samples are in HWE
6673
6774
See :py:attr:`GenotypeArray.hwe_pval`"""
68-
return self._obj.apply(lambda col: col.genomics.hwe_pval)
75+
return self._obj.apply(
76+
lambda col: col.genomics.hwe_pval
77+
if GenotypeDtype.is_dtype(col.dtype)
78+
else pd.NA
79+
)
6980

7081
############
7182
# Encoding #
@@ -80,7 +91,11 @@ def encode_additive(self) -> pd.DataFrame:
8091
pd.DataFrame
8192
"""
8293
return pd.concat(
83-
[s.genomics.encode_additive() for _, s in self._obj.iteritems()], axis=1
94+
[
95+
s.genomics.encode_additive() if GenotypeDtype.is_dtype(s) else s
96+
for _, s in self._obj.iteritems()
97+
],
98+
axis=1,
8499
)
85100

86101
def encode_dominant(self) -> pd.DataFrame:
@@ -93,7 +108,11 @@ def encode_dominant(self) -> pd.DataFrame:
93108
pd.DataFrame
94109
"""
95110
return pd.concat(
96-
[s.genomics.encode_dominant() for _, s in self._obj.iteritems()], axis=1
111+
[
112+
s.genomics.encode_dominant() if GenotypeDtype.is_dtype(s) else s
113+
for _, s in self._obj.iteritems()
114+
],
115+
axis=1,
97116
)
98117

99118
def encode_recessive(self) -> pd.DataFrame:
@@ -106,7 +125,11 @@ def encode_recessive(self) -> pd.DataFrame:
106125
pd.DataFrame
107126
"""
108127
return pd.concat(
109-
[s.genomics.encode_recessive() for _, s in self._obj.iteritems()], axis=1
128+
[
129+
s.genomics.encode_recessive() if GenotypeDtype.is_dtype(s) else s
130+
for _, s in self._obj.iteritems()
131+
],
132+
axis=1,
110133
)
111134

112135
def encode_codominant(self) -> pd.DataFrame:
@@ -119,7 +142,11 @@ def encode_codominant(self) -> pd.DataFrame:
119142
pd.DataFrame
120143
"""
121144
return pd.concat(
122-
[s.genomics.encode_codominant() for _, s in self._obj.iteritems()], axis=1
145+
[
146+
s.genomics.encode_codominant() if GenotypeDtype.is_dtype(s) else s
147+
for _, s in self._obj.iteritems()
148+
],
149+
axis=1,
123150
)
124151

125152
def encode_weighted(self, encoding_info: pd.DataFrame) -> pd.DataFrame:
@@ -181,6 +208,9 @@ def encode_weighted(self, encoding_info: pd.DataFrame) -> pd.DataFrame:
181208
# Process each variant
182209
results = []
183210
for _, s in self._obj.iteritems():
211+
if not GenotypeDtype.is_dtype(s):
212+
results.append(s)
213+
continue
184214
info = encoding_info.get(s.array.variant.id, None)
185215
if info is None:
186216
warnings[
@@ -244,7 +274,7 @@ def generate_weighted_encodings(
244274
PLoS genetics 17.6 (2021): e1009534.
245275
"""
246276
return generate_weighted_encodings(
247-
genotypes=self._obj,
277+
genotypes=self._obj.select_dtypes([GenotypeDtype]),
248278
data=data,
249279
outcome_variable=outcome_variable,
250280
covariates=covariates,
@@ -257,15 +287,18 @@ def filter_variants_maf(self, keep_min_freq: float = 0.01) -> pd.DataFrame:
257287
"""
258288
Drop variants with a MAF less than the specified value (0.01 by default)
259289
"""
260-
return self._obj.loc[:, self._obj.genomics.maf >= keep_min_freq]
290+
genotypes = self._obj.select_dtypes([GenotypeDtype])
291+
removed = genotypes.loc[:, genotypes.genomics.maf < keep_min_freq].columns
292+
return self._obj.drop(columns=removed)
261293

262294
def filter_variants_hwe(self, cutoff: float = 0.05) -> pd.DataFrame:
263295
"""
264296
Drop variants with a probability of HWE less than the specified value (0.05 by default).
265297
Keep np.nan results, which occur for non-diploid variants and insufficient sample sizes
266298
"""
267-
return self._obj.loc[
268-
:,
269-
(self._obj.genomics.hwe_pval >= cutoff)
270-
| (np.isnan(self._obj.genomics.hwe_pval)),
271-
]
299+
genotypes = self._obj.select_dtypes([GenotypeDtype])
300+
genotype_hwe_pval = genotypes.genomics.hwe_pval
301+
removed = genotypes.loc[
302+
:, (genotype_hwe_pval < cutoff) & ~np.isnan(genotype_hwe_pval)
303+
].columns
304+
return self._obj.drop(columns=removed)

tests/genotype_array/conftest.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -222,7 +222,10 @@ def __get_data_for_encoding():
222222
def genotypearray_df():
223223
DATA_DIR = Path(__file__).parent.parent / "data" / "plink"
224224
input = DATA_DIR / "plink_test_small"
225-
return io.from_plink(input, max_variants=20, swap_alleles=True)
225+
df = io.from_plink(input, max_variants=20, swap_alleles=True)
226+
df["num"] = [1.0 for n in range(len(df))]
227+
df["bool"] = [True if n % 3 == 0 else False for n in range(len(df))]
228+
return df
226229

227230

228231
@pytest.fixture

tests/genotype_array/test_GenotypeArrayAccessors.py

Lines changed: 10 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
"""
22
Test GenotypeArray Accessors
33
"""
4-
4+
import numpy as np
55
import pandas as pd
66
import pytest
77
from pandas._testing import (
@@ -28,7 +28,8 @@ def test_maf(data):
2828
)
2929
for colname in "ABC":
3030
df[colname].genomics.variant.id = colname
31-
expected = pd.Series({n: data.maf for n in "ABC"})
31+
df["D"] = np.ones(len(data))
32+
expected = pd.Series({"A": data.maf, "B": data.maf, "C": data.maf, "D": pd.NA})
3233
assert_series_equal(df.genomics.maf, expected)
3334

3435

@@ -37,23 +38,24 @@ def test_hwe(data):
3738

3839

3940
@pytest.mark.parametrize(
40-
"filter_value, num_vars_left", [(None, 15), (0.05, 1), (0.10, 0)]
41+
"filter_value, num_cols_left", [(None, 17), (0.05, 3), (0.10, 2)]
4142
)
42-
def test_filter_maf(genotypearray_df, filter_value, num_vars_left):
43+
def test_filter_maf(genotypearray_df, filter_value, num_cols_left):
4344
if filter_value is None:
4445
result = genotypearray_df.genomics.filter_variants_maf()
4546
else:
4647
result = genotypearray_df.genomics.filter_variants_maf(filter_value)
47-
assert len(result.columns) == num_vars_left
48+
assert len(result.columns) == num_cols_left
4849

4950

5051
@pytest.mark.parametrize(
51-
"filter_value, num_vars_left", [(None, 1), (0.05, 1), (1e-300, 2)]
52+
"filter_value, num_cols_left", [(None, 1), (0.05, 1), (1e-300, 2)]
5253
)
53-
def test_filter_hwe(ga_inhwe, ga_nothwe, filter_value, num_vars_left):
54+
def test_filter_hwe(ga_inhwe, ga_nothwe, filter_value, num_cols_left):
5455
data = pd.DataFrame({"yes": ga_inhwe, "no": ga_nothwe})
56+
data["num"] = [n for n in range(len(data))]
5557
if filter_value is None:
5658
result = data.genomics.filter_variants_hwe()
5759
else:
5860
result = data.genomics.filter_variants_hwe(filter_value)
59-
assert len(result.columns) == num_vars_left
61+
assert len(result.columns) == num_cols_left + 1

tests/genotype_array/test_GenotypeArrayEncoding.py

Lines changed: 45 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -56,11 +56,19 @@ def test_encoding_additive(data_for_encoding):
5656
expected = pd.Series(result)
5757
result_series = pd.Series(data_for_encoding()).genomics.encode_additive()
5858
assert_series_equal(result_series, expected)
59-
# Test using DataFrame accessor
59+
# Test using DataFrame accessor with extra col
6060
df = pd.DataFrame.from_dict(
6161
{n: data_for_encoding() for n in "ABC"}, orient="columns"
6262
)
63-
expected = pd.DataFrame.from_dict({n: result_series for n in "ABC"})
63+
df["float"] = np.ones(len(df))
64+
expected = pd.DataFrame.from_dict(
65+
{
66+
"A": result_series,
67+
"B": result_series,
68+
"C": result_series,
69+
"float": df["float"],
70+
}
71+
)
6472
result_df = df.genomics.encode_additive()
6573
assert_frame_equal(result_df, expected)
6674

@@ -74,11 +82,19 @@ def test_encoding_dominant(data_for_encoding):
7482
expected = pd.Series(result)
7583
result_series = pd.Series(data_for_encoding()).genomics.encode_dominant()
7684
assert_series_equal(result_series, expected)
77-
# Test using DataFrame accessor
85+
# Test using DataFrame accessor with extra col
7886
df = pd.DataFrame.from_dict(
7987
{n: data_for_encoding() for n in "ABC"}, orient="columns"
8088
)
81-
expected = pd.DataFrame.from_dict({n: result_series for n in "ABC"})
89+
df["float"] = np.ones(len(df))
90+
expected = pd.DataFrame.from_dict(
91+
{
92+
"A": result_series,
93+
"B": result_series,
94+
"C": result_series,
95+
"float": df["float"],
96+
}
97+
)
8298
result_df = df.genomics.encode_dominant()
8399
assert_frame_equal(result_df, expected)
84100

@@ -92,11 +108,19 @@ def test_encoding_recessive(data_for_encoding):
92108
expected = pd.Series(result)
93109
result_series = pd.Series(data_for_encoding()).genomics.encode_recessive()
94110
assert_series_equal(result_series, expected)
95-
# Test using DataFrame accessor
111+
# Test using DataFrame accessor with extra col
96112
df = pd.DataFrame.from_dict(
97113
{n: data_for_encoding() for n in "ABC"}, orient="columns"
98114
)
99-
expected = pd.DataFrame.from_dict({n: result_series for n in "ABC"})
115+
df["float"] = np.ones(len(df))
116+
expected = pd.DataFrame.from_dict(
117+
{
118+
"A": result_series,
119+
"B": result_series,
120+
"C": result_series,
121+
"float": df["float"],
122+
}
123+
)
100124
result_df = df.genomics.encode_recessive()
101125
assert_frame_equal(result_df, expected)
102126

@@ -114,11 +138,19 @@ def test_encoding_codominant(data_for_encoding):
114138
expected = pd.Series(result)
115139
result_series = pd.Series(data_for_encoding()).genomics.encode_codominant()
116140
assert_series_equal(result_series, expected)
117-
# Test using DataFrame accessor
141+
# Test using DataFrame accessor with extra col
118142
df = pd.DataFrame.from_dict(
119143
{n: data_for_encoding() for n in "ABC"}, orient="columns"
120144
)
121-
expected = pd.DataFrame.from_dict({n: result_series for n in "ABC"})
145+
df["float"] = np.ones(len(df))
146+
expected = pd.DataFrame.from_dict(
147+
{
148+
"A": result_series,
149+
"B": result_series,
150+
"C": result_series,
151+
"float": df["float"],
152+
}
153+
)
122154
result_df = df.genomics.encode_codominant()
123155
assert_frame_equal(result_df, expected)
124156

@@ -171,6 +203,7 @@ def test_encoding_weighted(
171203
"var2": [0.0, 0.3, 1.0, None, None],
172204
"var3": [0.0, 0.4, 1.0, None, None],
173205
"var4": [0.0, 0.5, 1.0, None, None],
206+
"num": [1.0, 1.0, 1.0, 1.0, 1.0],
174207
},
175208
dtype="Float64",
176209
),
@@ -191,14 +224,17 @@ def test_encoding_weighted(
191224
"var0": [0.0, 0.1, 1.0, None, None],
192225
"var1": [0.0, 0.2, 1.0, None, None],
193226
"var4": [1.0, 0.5, 0.0, None, None],
227+
"num": [1.0, 1.0, 1.0, 1.0, 1.0],
194228
},
195229
dtype="Float64",
196230
),
197231
),
198232
],
199233
)
200234
def test_encoding_weighted_df(encoding_df, encoding_info, expected):
201-
result = encoding_df.genomics.encode_weighted(encoding_info)
235+
df = encoding_df.copy()
236+
df["num"] = pd.Series(np.ones(len(df))).astype("Float64")
237+
result = df.genomics.encode_weighted(encoding_info)
202238
assert_frame_equal(expected, result)
203239

204240

0 commit comments

Comments
 (0)