Skip to content

Commit 431f13c

Browse files
committed
Update to Python >= 3.10
1 parent 62439dc commit 431f13c

File tree

14 files changed

+190
-45
lines changed

14 files changed

+190
-45
lines changed

.python-version

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
3.10.3
1+
3.11.9

CHANGELOG.md

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
# Changelog
2+
3+
All notable changes to this project will be documented in this file.
4+
5+
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
6+
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
7+
8+
## [1.0.1] – 2025-05-27
9+
10+
### Added
11+
- Full compatibility with `pandas.tests.extension.base` test suite.
12+
- `__setitem__`: added support for `Genotype`, `GenotypeArray`, and `pd.Series` (with `GenotypeArray` values).
13+
14+
### Changed
15+
- `__getitem__`: validates keys with `pandas.api.indexers.check_array_indexer` and raises a clear `IndexError` for invalid keys such as strings.
16+
- `_from_sequence`: improved validation logic for variant compatibility and handling of scalar values.
17+
- `factorize`: now properly detects unique values using `allele_idxs` and accepts the `use_na_sentinel` argument to control how missing values are coded.
18+
19+
### Fixed
20+
- `insert()`: properly rejects non-Genotype scalars and raises informative errors.
21+
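- `Genotype.__eq__`: returns `False` when the two genotypes belong to different variants instead of raising `NotImplementedError`, and returns `NotImplemented` for non-`Genotype` operands.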
22+
- Compatibility adjustments for `pandas 2.x`, `Python 3.11+`.
23+
24+
### Deprecated
25+
- None.
26+
27+
### Removed
28+
- None.
29+
30+
### Security
31+
- None.

pandas_genomics/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
from .accessors import GenotypeSeriesAccessor, GenotypeDataframeAccessor
99

1010
# __version__ = importlib_metadata.version(__name__)
11+
__version__ = "1.0.1"
1112

1213
__all__ = [
1314
# "__version__",

pandas_genomics/accessors/dataframe_accessor.py

Lines changed: 15 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,9 @@ def __init__(self, pandas_obj):
2828
id_counts = Counter(
2929
[
3030
s.genomics.variant.id
31-
for _, s in pandas_obj.iteritems()
31+
# Andre: DataFrame.iteritems() was removed in pandas 2.0; items() is the replacement
32+
# for _, s in pandas_obj.iteritems()
33+
for _, s in pandas_obj.items()
3234
if GenotypeDtype.is_dtype(s)
3335
]
3436
)
@@ -52,7 +54,8 @@ def variant_info(self) -> pd.DataFrame:
5254
return pd.DataFrame.from_dict(
5355
{
5456
colname: series.genomics.variant_info
55-
for colname, series in genotypes.iteritems()
57+
# for colname, series in genotypes.iteritems()
58+
for colname, series in genotypes.items()
5659
},
5760
orient="index",
5861
)
@@ -90,7 +93,8 @@ def encode_additive(self) -> pd.DataFrame:
9093
return pd.concat(
9194
[
9295
s.genomics.encode_additive() if GenotypeDtype.is_dtype(s) else s
93-
for _, s in self._obj.iteritems()
96+
# for _, s in self._obj.iteritems()
97+
for _, s in self._obj.items()
9498
],
9599
axis=1,
96100
)
@@ -107,7 +111,8 @@ def encode_dominant(self) -> pd.DataFrame:
107111
return pd.concat(
108112
[
109113
s.genomics.encode_dominant() if GenotypeDtype.is_dtype(s) else s
110-
for _, s in self._obj.iteritems()
114+
# for _, s in self._obj.iteritems()
115+
for _, s in self._obj.items()
111116
],
112117
axis=1,
113118
)
@@ -124,7 +129,8 @@ def encode_recessive(self) -> pd.DataFrame:
124129
return pd.concat(
125130
[
126131
s.genomics.encode_recessive() if GenotypeDtype.is_dtype(s) else s
127-
for _, s in self._obj.iteritems()
132+
# for _, s in self._obj.iteritems()
133+
for _, s in self._obj.items()
128134
],
129135
axis=1,
130136
)
@@ -141,7 +147,8 @@ def encode_codominant(self) -> pd.DataFrame:
141147
return pd.concat(
142148
[
143149
s.genomics.encode_codominant() if GenotypeDtype.is_dtype(s) else s
144-
for _, s in self._obj.iteritems()
150+
# for _, s in self._obj.iteritems()
151+
for _, s in self._obj.items()
145152
],
146153
axis=1,
147154
)
@@ -204,7 +211,8 @@ def encode_edge(self, encoding_info: pd.DataFrame) -> pd.DataFrame:
204211

205212
# Process each variant
206213
results = []
207-
for _, s in self._obj.iteritems():
214+
# for _, s in self._obj.iteritems():
215+
for _, s in self._obj.items():
208216
if not GenotypeDtype.is_dtype(s):
209217
results.append(s)
210218
continue

pandas_genomics/arrays/genotype_array.py

Lines changed: 50 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -2,20 +2,27 @@
22
import re
33
from copy import copy
44
from typing import Dict, MutableMapping, Any, Optional, List, Union, Tuple, Iterable
5+
# from pandas.arrays import BooleanArray
6+
from pandas.arrays import BooleanArray, IntegerArray
57

68
import numpy as np
79
import pandas as pd
8-
from pandas.core.arrays import ExtensionArray, BooleanArray, IntegerArray
9-
from pandas.core.dtypes.dtypes import register_extension_dtype, PandasExtensionDtype
10-
from pandas.core.dtypes.inference import is_list_like
10+
11+
# Andre: Update to Python >=3.10
12+
# from pandas.core.arrays import ExtensionArray, BooleanArray, IntegerArray
13+
# from pandas.core.dtypes.dtypes import register_extension_dtype, PandasExtensionDtype
14+
# from pandas.core.dtypes.inference import is_list_like
15+
from pandas.api.extensions import ExtensionArray, register_extension_dtype, ExtensionDtype
16+
from pandas.api.types import is_list_like
1117

1218
from pandas_genomics.arrays.encoding_mixin import EncodingMixin
1319
from pandas_genomics.arrays.info_mixin import InfoMixin
1420
from pandas_genomics.scalars import Variant, Genotype, MISSING_IDX
1521

1622

1723
@register_extension_dtype
18-
class GenotypeDtype(PandasExtensionDtype):
24+
# class GenotypeDtype(PandasExtensionDtype):
25+
class GenotypeDtype(ExtensionDtype):
1926
"""
2027
An ExtensionDtype for genotype data.
2128
@@ -374,7 +381,15 @@ def _from_sequence(
374381
# Use the dtype variant
375382
variant = dtype.variant
376383
values = []
384+
377385
for idx, gt in enumerate(scalars):
386+
387+
# Andre: Update to Python >= 3.10
388+
if not isinstance(gt, Genotype):
389+
raise TypeError(
390+
f"Expected Genotype instance at index {idx}, got {type(gt).__name__}"
391+
)
392+
378393
if not variant.is_same_position(gt.variant):
379394
raise ValueError(
380395
f"Variant for Genotype {idx} of {len(scalars)} ({gt.variant}) "
@@ -387,6 +402,7 @@ def _from_sequence(
387402
)
388403
else:
389404
values.append((gt.allele_idxs, gt._float_score))
405+
390406
result = cls(values=[], dtype=GenotypeDtype(variant))
391407
result._data = np.array(values, dtype=result._dtype._record_type)
392408
return result
@@ -474,7 +490,15 @@ def __getitem__(self, index):
474490
# Check and convert the index
475491
index = pd.api.indexers.check_array_indexer(self._data, index)
476492

477-
result = operator.getitem(self._data, index)
493+
# Andre: Update to Python >= 3.10
494+
# result = operator.getitem(self._data, index)
495+
try:
496+
result = operator.getitem(self._data, index)
497+
except (TypeError, ValueError) as e:
498+
raise IndexError(
499+
"only integers, slices (`:`), ellipsis (`...`), numpy.newaxis "
500+
"(`None`) and integer or boolean arrays are valid indices"
501+
) from e
478502

479503
if isinstance(result, np.ndarray):
480504
return GenotypeArray(values=result, dtype=self.dtype)
@@ -528,6 +552,11 @@ def __setitem__(
528552
self._data[key] = value._data
529553
elif isinstance(value, pd.Series) and isinstance(value.values, GenotypeArray):
530554
self._data[key] = value.values._data
555+
# Andre: Update to Python >= 3.10
556+
elif isinstance(value, np.ndarray) and value.dtype == object:
557+
# Convert to GenotypeArray assuming array of Genotype
558+
value = self._from_sequence(value.tolist(), dtype=self.dtype)
559+
self._data[key] = value._data
531560
else:
532561
raise ValueError(
533562
f"Can't set the value in a GenotypeArray with '{type(value)}"
@@ -564,7 +593,13 @@ def take(self, indexer, allow_fill=False, fill_value=None):
564593
def copy(self):
565594
return GenotypeArray(self._data.copy(), copy(self.dtype))
566595

567-
def factorize(self, na_sentinel: int = -1) -> Tuple[np.ndarray, "GenotypeArray"]:
596+
# Andre: Update to Python >= 3.10
597+
# def factorize(self, na_sentinel: int = -1) -> Tuple[np.ndarray, "GenotypeArray"]:
598+
def factorize(
599+
self,
600+
na_sentinel: int = -1,
601+
use_na_sentinel: bool = True
602+
) -> Tuple[np.ndarray, "GenotypeArray"]:
568603
"""
569604
Return an array of ints indexing unique values
570605
"""
@@ -583,7 +618,15 @@ def factorize(self, na_sentinel: int = -1) -> Tuple[np.ndarray, "GenotypeArray"]
583618
codes[self == gt] = idx
584619

585620
# Update codes for NA values
586-
codes[self.isna()] = na_sentinel
621+
# Andre: Update to Python >= 3.10
622+
# codes[self.isna()] = na_sentinel
623+
# Handle NA values
624+
if use_na_sentinel:
625+
codes[self.isna()] = na_sentinel
626+
else:
627+
# NaNs become part of the unique codes
628+
nan_idx = len(set(codes)) # or: codes.max() + 1
629+
codes[self.isna()] = nan_idx
587630

588631
# Return the codes and unique values (not including NA)
589632
return codes, uniques[~uniques.isna()]

pandas_genomics/io/plink/from_plink.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -84,7 +84,11 @@ def load_sample_info(fam_file, categorical_phenotype):
8484
DEFAULT_CAT_MAP = {1: "Control", 2: "Case"}
8585
if categorical_phenotype:
8686
df["phenotype"] = df["phenotype"].astype("category")
87-
df["phenotype"].cat.rename_categories(DEFAULT_CAT_MAP, inplace=True)
87+
88+
# Andre: rename_categories() no longer accepts inplace=True in pandas 2.0; assign the result back instead
89+
# df["phenotype"].cat.rename_categories(DEFAULT_CAT_MAP, inplace=True)
90+
df["phenotype"] = df["phenotype"].cat.rename_categories(DEFAULT_CAT_MAP)
91+
8892
df.loc[~df["phenotype"].isin(DEFAULT_CAT_MAP.values()), "phenotype"] = None
8993
print(f"\tLoaded information for {len(df)} samples from '{fam_file.name}'")
9094
return df

pandas_genomics/io/plink/to_plink.py

Lines changed: 18 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -76,8 +76,12 @@ def save_fam(
7676
]:
7777
# Recode sex
7878
fam_data = data.index.to_frame()
79-
fam_data["sex"].cat.rename_categories(
80-
{"male": 1, "female": 2, "unknown": 0}, inplace=True
79+
# Andre: Update to Python >= 3.10
80+
# fam_data["sex"].cat.rename_categories(
81+
# {"male": 1, "female": 2, "unknown": 0}, inplace=True
82+
# )
83+
fam_data["sex"] = fam_data["sex"].cat.rename_categories(
84+
{"male": 1, "female": 2, "unknown": 0}
8185
)
8286
# Update phenotype if provided
8387
if phenotype_name is not None:
@@ -109,8 +113,12 @@ def save_fam(
109113
"The phenotype must be categorical to utilize 'phenotype_control' and 'phenotype_case' parameters"
110114
)
111115
pheno_dict = {phenotype_control: 1, phenotype_case: 2}
112-
fam_data["phenotype"].cat.rename_categories(
113-
lambda c: pheno_dict.get(c, 0), inplace=True
116+
# Andre: Update to Python >= 3.10
117+
# fam_data["phenotype"].cat.rename_categories(
118+
# lambda c: pheno_dict.get(c, 0), inplace=True
119+
# )
120+
fam_data["phenotype"] = fam_data["phenotype"].cat.rename_categories(
121+
lambda c: pheno_dict.get(c, 0)
114122
)
115123

116124
fam_data.to_csv(output_fam, sep=" ", header=False, index=False)
@@ -120,7 +128,9 @@ def save_fam(
120128
def save_bim(data, output_bim):
121129
variants = [
122130
col_val.genomics.variant
123-
for col_name, col_val in data.iteritems()
131+
# Andre: Update to Python >= 3.10
132+
# for col_name, col_val in data.iteritems()
133+
for col_name, col_val in data.items()
124134
if GenotypeDtype.is_dtype(col_val.dtype)
125135
]
126136
for var in variants:
@@ -149,7 +159,9 @@ def save_bed(data, output_bed):
149159
bytes = np.array(
150160
[
151161
gt_array_to_plink_bits(col_val)
152-
for col_name, col_val in data.iteritems()
162+
# Andre: Update to Python >= 3.10
163+
# for col_name, col_val in data.iteritems()
164+
for col_name, col_val in data.items()
153165
if GenotypeDtype.is_dtype(col_val.dtype)
154166
]
155167
)

pandas_genomics/io/vcf.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,9 @@ def from_vcf(
4242
continue
4343

4444
# Skip variants below the minimum quality
45-
if vcf_variant.QUAL < min_qual:
45+
# Andre: QUAL can be None (e.g. no quality value in the VCF record); skip such variants instead of comparing None with a number
46+
# if vcf_variant.QUAL < min_qual:
47+
if vcf_variant.QUAL is None or vcf_variant.QUAL < min_qual:
4648
continue
4749

4850
if len(vcf_variant.ALT) >= MISSING_IDX:

pandas_genomics/scalars.py

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -430,11 +430,20 @@ def __repr__(self):
430430
def __hash__(self):
431431
return hash(repr(self))
432432

433+
# # Andre: Update to Python >= 3.10
434+
# def __eq__(self, other):
435+
# if other.__class__ is not self.__class__:
436+
# return NotImplemented
437+
# # if self.variant != other.variant:
438+
# # raise NotImplementedError("Can't compare different variants")
433439
def __eq__(self, other):
434-
if other.__class__ is not self.__class__:
440+
if not isinstance(other, Genotype):
435441
return NotImplemented
436442
if self.variant != other.variant:
437-
raise NotImplementedError("Can't compare different variants")
443+
return False
444+
return self.allele_idxs == other.allele_idxs
445+
446+
438447
return self.allele_idxs == other.allele_idxs
439448

440449
def __lt__(self, other):

pyproject.toml

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[tool.poetry]
22
name = "pandas-genomics"
3-
version = "0.12.1"
3+
version = "1.0.0"
44
description = "Pandas ExtensionDtypes and ExtensionArray for working with genomics data"
55
license = "BSD-3-Clause"
66
authors = ["Andre Rico <[email protected]>"]
@@ -16,10 +16,10 @@ classifiers = [
1616
]
1717

1818
[tool.poetry.dependencies]
19-
python = ">=3.8.0,<3.11.0"
20-
numpy = "^1.24"
21-
pandas = "^1.3"
22-
cyvcf2 = {version="^0.30", markers = "sys_platform != 'win32'"}
19+
python = ">=3.10,<3.13"
20+
numpy = ">=1.24,<2.0"
21+
pandas = ">=1.3,<2.2"
22+
cyvcf2 = { version = ">=0.31.1", markers = "sys_platform != 'win32'" }
2323
sphinx_rtd_theme = {version = "^0.5.0", optional = true}
2424
numpydoc = {version = "^1.1.0", optional = true}
2525
sphinx-copybutton = {version = "^0.3.0", optional = true}
@@ -40,5 +40,5 @@ black = ">=22"
4040
docs = ["sphinx", "numpydoc", "sphinx_rtd_theme", "sphinx-copybutton", "ipython"]
4141

4242
[build-system]
43-
requires = ["poetry>=0.12"]
44-
build-backend = "poetry.masonry.api"
43+
requires = ["poetry-core>=1.0.0"]
44+
build-backend = "poetry.core.masonry.api"

0 commit comments

Comments
 (0)