Skip to content

Commit d63e63d

Browse files
authored
Merge pull request #22 from jrm5100/master
DataFrame Accessor doesn't require all-Genotype columns
2 parents 56de679 + 4889960 commit d63e63d

File tree

6 files changed

+130
-45
lines changed

6 files changed

+130
-45
lines changed

docs/release-history.rst

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,19 @@
22
Release History
33
===============
44

5+
v0.6.0 (2021-06-25)
6+
-------------------
7+
8+
Enhancements
9+
^^^^^^^^^^^^
10+
11+
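* The *genomics* DataFrame Accessor no longer requires that all columns in the DataFrame are backed by a GenotypeArray. At least one GenotypeArray column is still required; other columns are skipped by variant properties and left unmodified by encoding and filtering methods.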
12+
13+
v0.5.2 (2021-06-24)
14+
-------------------
15+
16+
* Update numpy version requirement
17+
518
v0.5.1 (2021-06-23)
619
-------------------
720

pandas_genomics/accessors/dataframe_accessor.py

Lines changed: 57 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -20,15 +20,18 @@ class GenotypeDataframeAccessor:
2020
"""
2121

2222
def __init__(self, pandas_obj):
23-
if not pandas_obj.dtypes.apply(lambda dt: GenotypeDtype.is_dtype(dt)).all():
24-
incorrect = pandas_obj.dtypes[
25-
~pandas_obj.dtypes.apply(lambda dt: GenotypeDtype.is_dtype(dt))
26-
]
23+
if not pandas_obj.dtypes.apply(lambda dt: GenotypeDtype.is_dtype(dt)).any():
2724
raise AttributeError(
28-
f"Incompatible datatypes: all columns must be a GenotypeDtype: {incorrect}"
25+
"Incompatible datatypes: at least one column must be a GenotypeDtype."
2926
)
30-
id_counts = Counter([s.genomics.variant.id for _, s in pandas_obj.iteritems()])
31-
if len(id_counts) < len(pandas_obj.columns):
27+
id_counts = Counter(
28+
[
29+
s.genomics.variant.id
30+
for _, s in pandas_obj.iteritems()
31+
if GenotypeDtype.is_dtype(s)
32+
]
33+
)
34+
if len(id_counts) < len(pandas_obj.select_dtypes([GenotypeDtype]).columns):
3235
duplicates = [(k, v) for k, v in id_counts.items() if v >= 2]
3336
raise AttributeError(
3437
f"Duplicate Variant IDs. Column names may differ from variant IDs, but variant IDs must be unique.\n\tDuplicates: "
@@ -39,37 +42,41 @@ def __init__(self, pandas_obj):
3942
######################
4043
# Variant Properties #
4144
######################
45+
# These methods generally only return a result for each GenotypeArray column, ignoring other columns
46+
4247
@property
4348
def variant_info(self) -> pd.DataFrame:
44-
"""Return a DataFrame with variant info indexed by the column name"""
49+
"""Return a DataFrame with variant info indexed by the column name (one row per GenotypeArray)"""
50+
genotypes = self._obj.select_dtypes([GenotypeDtype])
4551
return pd.DataFrame.from_dict(
4652
{
4753
colname: series.genomics.variant_info
48-
for colname, series in self._obj.iteritems()
54+
for colname, series in genotypes.iteritems()
4955
},
5056
orient="index",
5157
)
5258

53-
#########################
54-
# Calculated Properties #
55-
#########################
5659
@property
5760
def maf(self):
58-
"""Return the minor allele frequency
61+
"""Return the minor allele frequency of each variant
5962
6063
See :py:attr:`GenotypeArray.maf`"""
61-
return self._obj.apply(lambda col: col.genomics.maf)
64+
genotypes = self._obj.select_dtypes([GenotypeDtype])
65+
return genotypes.apply(lambda col: col.genomics.maf)
6266

6367
@property
6468
def hwe_pval(self):
6569
"""Return the probability that the samples are in HWE
6670
6771
See :py:attr:`GenotypeArray.hwe_pval`"""
68-
return self._obj.apply(lambda col: col.genomics.hwe_pval)
72+
genotypes = self._obj.select_dtypes([GenotypeDtype])
73+
return genotypes.apply(lambda col: col.genomics.hwe_pval)
6974

7075
############
7176
# Encoding #
7277
############
78+
# These methods generally return encoded values for any GenotypeArray columns without modifying other columns
79+
7380
def encode_additive(self) -> pd.DataFrame:
7481
"""Additive encoding of genotypes.
7582
@@ -80,7 +87,11 @@ def encode_additive(self) -> pd.DataFrame:
8087
pd.DataFrame
8188
"""
8289
return pd.concat(
83-
[s.genomics.encode_additive() for _, s in self._obj.iteritems()], axis=1
90+
[
91+
s.genomics.encode_additive() if GenotypeDtype.is_dtype(s) else s
92+
for _, s in self._obj.iteritems()
93+
],
94+
axis=1,
8495
)
8596

8697
def encode_dominant(self) -> pd.DataFrame:
@@ -93,7 +104,11 @@ def encode_dominant(self) -> pd.DataFrame:
93104
pd.DataFrame
94105
"""
95106
return pd.concat(
96-
[s.genomics.encode_dominant() for _, s in self._obj.iteritems()], axis=1
107+
[
108+
s.genomics.encode_dominant() if GenotypeDtype.is_dtype(s) else s
109+
for _, s in self._obj.iteritems()
110+
],
111+
axis=1,
97112
)
98113

99114
def encode_recessive(self) -> pd.DataFrame:
@@ -106,7 +121,11 @@ def encode_recessive(self) -> pd.DataFrame:
106121
pd.DataFrame
107122
"""
108123
return pd.concat(
109-
[s.genomics.encode_recessive() for _, s in self._obj.iteritems()], axis=1
124+
[
125+
s.genomics.encode_recessive() if GenotypeDtype.is_dtype(s) else s
126+
for _, s in self._obj.iteritems()
127+
],
128+
axis=1,
110129
)
111130

112131
def encode_codominant(self) -> pd.DataFrame:
@@ -119,7 +138,11 @@ def encode_codominant(self) -> pd.DataFrame:
119138
pd.DataFrame
120139
"""
121140
return pd.concat(
122-
[s.genomics.encode_codominant() for _, s in self._obj.iteritems()], axis=1
141+
[
142+
s.genomics.encode_codominant() if GenotypeDtype.is_dtype(s) else s
143+
for _, s in self._obj.iteritems()
144+
],
145+
axis=1,
123146
)
124147

125148
def encode_weighted(self, encoding_info: pd.DataFrame) -> pd.DataFrame:
@@ -181,6 +204,9 @@ def encode_weighted(self, encoding_info: pd.DataFrame) -> pd.DataFrame:
181204
# Process each variant
182205
results = []
183206
for _, s in self._obj.iteritems():
207+
if not GenotypeDtype.is_dtype(s):
208+
results.append(s)
209+
continue
184210
info = encoding_info.get(s.array.variant.id, None)
185211
if info is None:
186212
warnings[
@@ -244,7 +270,7 @@ def generate_weighted_encodings(
244270
PLoS genetics 17.6 (2021): e1009534.
245271
"""
246272
return generate_weighted_encodings(
247-
genotypes=self._obj,
273+
genotypes=self._obj.select_dtypes([GenotypeDtype]),
248274
data=data,
249275
outcome_variable=outcome_variable,
250276
covariates=covariates,
@@ -253,19 +279,24 @@ def generate_weighted_encodings(
253279
###########
254280
# Filters #
255281
###########
282+
# These methods drop genotype columns that fail the filter; non-genotype columns are never evaluated and are always kept
283+
256284
def filter_variants_maf(self, keep_min_freq: float = 0.01) -> pd.DataFrame:
257285
"""
258286
Drop variants with a MAF less than the specified value (0.01 by default)
259287
"""
260-
return self._obj.loc[:, self._obj.genomics.maf >= keep_min_freq]
288+
genotypes = self._obj.select_dtypes([GenotypeDtype])
289+
removed = genotypes.loc[:, genotypes.genomics.maf < keep_min_freq].columns
290+
return self._obj.drop(columns=removed)
261291

262292
def filter_variants_hwe(self, cutoff: float = 0.05) -> pd.DataFrame:
263293
"""
264294
Drop variants with a probability of HWE less than the specified value (0.05 by default).
265295
Keep np.nan results, which occur for non-diploid variants and insufficient sample sizes
266296
"""
267-
return self._obj.loc[
268-
:,
269-
(self._obj.genomics.hwe_pval >= cutoff)
270-
| (np.isnan(self._obj.genomics.hwe_pval)),
271-
]
297+
genotypes = self._obj.select_dtypes([GenotypeDtype])
298+
genotype_hwe_pval = genotypes.genomics.hwe_pval
299+
removed = genotypes.loc[
300+
:, (genotype_hwe_pval < cutoff) & ~np.isnan(genotype_hwe_pval)
301+
].columns
302+
return self._obj.drop(columns=removed)

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[tool.poetry]
22
name = "pandas-genomics"
3-
version = "0.5.2"
3+
version = "0.6.0"
44
description = "Pandas ExtensionDtypes and ExtensionArray for working with genomics data"
55
license = "BSD-3-Clause"
66
authors = ["John McGuigan <[email protected]>"]

tests/genotype_array/conftest.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -222,7 +222,10 @@ def __get_data_for_encoding():
222222
def genotypearray_df():
223223
DATA_DIR = Path(__file__).parent.parent / "data" / "plink"
224224
input = DATA_DIR / "plink_test_small"
225-
return io.from_plink(input, max_variants=20, swap_alleles=True)
225+
df = io.from_plink(input, max_variants=20, swap_alleles=True)
226+
df["num"] = [1.0 for n in range(len(df))]
227+
df["bool"] = [True if n % 3 == 0 else False for n in range(len(df))]
228+
return df
226229

227230

228231
@pytest.fixture

tests/genotype_array/test_GenotypeArrayAccessors.py

Lines changed: 10 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
"""
22
Test GenotypeArray Accessors
33
"""
4-
4+
import numpy as np
55
import pandas as pd
66
import pytest
77
from pandas._testing import (
@@ -28,7 +28,8 @@ def test_maf(data):
2828
)
2929
for colname in "ABC":
3030
df[colname].genomics.variant.id = colname
31-
expected = pd.Series({n: data.maf for n in "ABC"})
31+
df["D"] = np.ones(len(data))
32+
expected = pd.Series({"A": data.maf, "B": data.maf, "C": data.maf})
3233
assert_series_equal(df.genomics.maf, expected)
3334

3435

@@ -37,23 +38,24 @@ def test_hwe(data):
3738

3839

3940
@pytest.mark.parametrize(
40-
"filter_value, num_vars_left", [(None, 15), (0.05, 1), (0.10, 0)]
41+
"filter_value, num_cols_left", [(None, 17), (0.05, 3), (0.10, 2)]
4142
)
42-
def test_filter_maf(genotypearray_df, filter_value, num_vars_left):
43+
def test_filter_maf(genotypearray_df, filter_value, num_cols_left):
4344
if filter_value is None:
4445
result = genotypearray_df.genomics.filter_variants_maf()
4546
else:
4647
result = genotypearray_df.genomics.filter_variants_maf(filter_value)
47-
assert len(result.columns) == num_vars_left
48+
assert len(result.columns) == num_cols_left
4849

4950

5051
@pytest.mark.parametrize(
51-
"filter_value, num_vars_left", [(None, 1), (0.05, 1), (1e-300, 2)]
52+
"filter_value, num_cols_left", [(None, 1), (0.05, 1), (1e-300, 2)]
5253
)
53-
def test_filter_hwe(ga_inhwe, ga_nothwe, filter_value, num_vars_left):
54+
def test_filter_hwe(ga_inhwe, ga_nothwe, filter_value, num_cols_left):
5455
data = pd.DataFrame({"yes": ga_inhwe, "no": ga_nothwe})
56+
data["num"] = [n for n in range(len(data))]
5557
if filter_value is None:
5658
result = data.genomics.filter_variants_hwe()
5759
else:
5860
result = data.genomics.filter_variants_hwe(filter_value)
59-
assert len(result.columns) == num_vars_left
61+
assert len(result.columns) == num_cols_left + 1

tests/genotype_array/test_GenotypeArrayEncoding.py

Lines changed: 45 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -56,11 +56,19 @@ def test_encoding_additive(data_for_encoding):
5656
expected = pd.Series(result)
5757
result_series = pd.Series(data_for_encoding()).genomics.encode_additive()
5858
assert_series_equal(result_series, expected)
59-
# Test using DataFrame accessor
59+
# Test using DataFrame accessor with extra col
6060
df = pd.DataFrame.from_dict(
6161
{n: data_for_encoding() for n in "ABC"}, orient="columns"
6262
)
63-
expected = pd.DataFrame.from_dict({n: result_series for n in "ABC"})
63+
df["float"] = np.ones(len(df))
64+
expected = pd.DataFrame.from_dict(
65+
{
66+
"A": result_series,
67+
"B": result_series,
68+
"C": result_series,
69+
"float": df["float"],
70+
}
71+
)
6472
result_df = df.genomics.encode_additive()
6573
assert_frame_equal(result_df, expected)
6674

@@ -74,11 +82,19 @@ def test_encoding_dominant(data_for_encoding):
7482
expected = pd.Series(result)
7583
result_series = pd.Series(data_for_encoding()).genomics.encode_dominant()
7684
assert_series_equal(result_series, expected)
77-
# Test using DataFrame accessor
85+
# Test using DataFrame accessor with extra col
7886
df = pd.DataFrame.from_dict(
7987
{n: data_for_encoding() for n in "ABC"}, orient="columns"
8088
)
81-
expected = pd.DataFrame.from_dict({n: result_series for n in "ABC"})
89+
df["float"] = np.ones(len(df))
90+
expected = pd.DataFrame.from_dict(
91+
{
92+
"A": result_series,
93+
"B": result_series,
94+
"C": result_series,
95+
"float": df["float"],
96+
}
97+
)
8298
result_df = df.genomics.encode_dominant()
8399
assert_frame_equal(result_df, expected)
84100

@@ -92,11 +108,19 @@ def test_encoding_recessive(data_for_encoding):
92108
expected = pd.Series(result)
93109
result_series = pd.Series(data_for_encoding()).genomics.encode_recessive()
94110
assert_series_equal(result_series, expected)
95-
# Test using DataFrame accessor
111+
# Test using DataFrame accessor with extra col
96112
df = pd.DataFrame.from_dict(
97113
{n: data_for_encoding() for n in "ABC"}, orient="columns"
98114
)
99-
expected = pd.DataFrame.from_dict({n: result_series for n in "ABC"})
115+
df["float"] = np.ones(len(df))
116+
expected = pd.DataFrame.from_dict(
117+
{
118+
"A": result_series,
119+
"B": result_series,
120+
"C": result_series,
121+
"float": df["float"],
122+
}
123+
)
100124
result_df = df.genomics.encode_recessive()
101125
assert_frame_equal(result_df, expected)
102126

@@ -114,11 +138,19 @@ def test_encoding_codominant(data_for_encoding):
114138
expected = pd.Series(result)
115139
result_series = pd.Series(data_for_encoding()).genomics.encode_codominant()
116140
assert_series_equal(result_series, expected)
117-
# Test using DataFrame accessor
141+
# Test using DataFrame accessor with extra col
118142
df = pd.DataFrame.from_dict(
119143
{n: data_for_encoding() for n in "ABC"}, orient="columns"
120144
)
121-
expected = pd.DataFrame.from_dict({n: result_series for n in "ABC"})
145+
df["float"] = np.ones(len(df))
146+
expected = pd.DataFrame.from_dict(
147+
{
148+
"A": result_series,
149+
"B": result_series,
150+
"C": result_series,
151+
"float": df["float"],
152+
}
153+
)
122154
result_df = df.genomics.encode_codominant()
123155
assert_frame_equal(result_df, expected)
124156

@@ -171,6 +203,7 @@ def test_encoding_weighted(
171203
"var2": [0.0, 0.3, 1.0, None, None],
172204
"var3": [0.0, 0.4, 1.0, None, None],
173205
"var4": [0.0, 0.5, 1.0, None, None],
206+
"num": [1.0, 1.0, 1.0, 1.0, 1.0],
174207
},
175208
dtype="Float64",
176209
),
@@ -191,14 +224,17 @@ def test_encoding_weighted(
191224
"var0": [0.0, 0.1, 1.0, None, None],
192225
"var1": [0.0, 0.2, 1.0, None, None],
193226
"var4": [1.0, 0.5, 0.0, None, None],
227+
"num": [1.0, 1.0, 1.0, 1.0, 1.0],
194228
},
195229
dtype="Float64",
196230
),
197231
),
198232
],
199233
)
200234
def test_encoding_weighted_df(encoding_df, encoding_info, expected):
201-
result = encoding_df.genomics.encode_weighted(encoding_info)
235+
df = encoding_df.copy()
236+
df["num"] = pd.Series(np.ones(len(df))).astype("Float64")
237+
result = df.genomics.encode_weighted(encoding_info)
202238
assert_frame_equal(expected, result)
203239

204240

0 commit comments

Comments
 (0)