Update to Python >= 3.10

AndreRico · AndreRico · commit 431f13c4415a · 2025-05-27T17:24:44.000-04:00
diff --git a/.python-version b/.python-version
@@ -1 +1 @@
-3.10.3
+3.11.9
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -0,0 +1,31 @@
+# Changelog
+
+All notable changes to this project will be documented in this file.
+
+The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
+and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
+
+## [1.0.1] – 2025-05-27
+
+### Added
+- Full compatibility with `pandas.tests.extension.base` test suite.
+- `__setitem__`: supports `Genotype`, `GenotypeArray`, and `pd.Series` (with `GenotypeArray` values), and now also object-dtype `np.ndarray` values containing `Genotype` scalars.
+
+### Changed
+- `__getitem__`: uses `pandas.api.indexers.check_array_indexer` and raises a clear `IndexError` for invalid keys such as strings.
+- `_from_sequence`: improved validation logic for variant compatibility and handling of scalar values.
+- `factorize`: now properly detects unique values using `allele_idxs` and accepts the `use_na_sentinel` argument alongside `na_sentinel`.
+
+### Fixed
+- `insert()`: properly rejects non-Genotype scalars and raises informative errors.
+- `__eq__`: returns `False` instead of raising `NotImplementedError` when the two genotypes belong to different variants.
+- Compatibility adjustments for pandas 2.x (`<2.2`) and Python 3.10–3.12.
+
+### Deprecated
+- None.
+
+### Removed
+- None.
+
+### Security
+- None.
diff --git a/pandas_genomics/__init__.py b/pandas_genomics/__init__.py
@@ -8,6 +8,7 @@
 from .accessors import GenotypeSeriesAccessor, GenotypeDataframeAccessor
 
 # __version__ = importlib_metadata.version(__name__)
+__version__ = "1.0.1"
 
 __all__ = [
     # "__version__",
diff --git a/pandas_genomics/accessors/dataframe_accessor.py b/pandas_genomics/accessors/dataframe_accessor.py
@@ -28,7 +28,9 @@ def __init__(self, pandas_obj):
         id_counts = Counter(
             [
                 s.genomics.variant.id
-                for _, s in pandas_obj.iteritems()
+                # Andre: pandas >= 2.0 removed DataFrame.iteritems(); use items() instead
+                # for _, s in pandas_obj.iteritems()
+                for _, s in pandas_obj.items()
                 if GenotypeDtype.is_dtype(s)
             ]
         )
@@ -52,7 +54,8 @@ def variant_info(self) -> pd.DataFrame:
         return pd.DataFrame.from_dict(
             {
                 colname: series.genomics.variant_info
-                for colname, series in genotypes.iteritems()
+                # for colname, series in genotypes.iteritems()
+                for colname, series in genotypes.items()
             },
             orient="index",
         )
@@ -90,7 +93,8 @@ def encode_additive(self) -> pd.DataFrame:
         return pd.concat(
             [
                 s.genomics.encode_additive() if GenotypeDtype.is_dtype(s) else s
-                for _, s in self._obj.iteritems()
+                # for _, s in self._obj.iteritems()
+                for _, s in self._obj.items()
             ],
             axis=1,
         )
@@ -107,7 +111,8 @@ def encode_dominant(self) -> pd.DataFrame:
         return pd.concat(
             [
                 s.genomics.encode_dominant() if GenotypeDtype.is_dtype(s) else s
-                for _, s in self._obj.iteritems()
+                # for _, s in self._obj.iteritems()
+                for _, s in self._obj.items()
             ],
             axis=1,
         )
@@ -124,7 +129,8 @@ def encode_recessive(self) -> pd.DataFrame:
         return pd.concat(
             [
                 s.genomics.encode_recessive() if GenotypeDtype.is_dtype(s) else s
-                for _, s in self._obj.iteritems()
+                # for _, s in self._obj.iteritems()
+                for _, s in self._obj.items()
             ],
             axis=1,
         )
@@ -141,7 +147,8 @@ def encode_codominant(self) -> pd.DataFrame:
         return pd.concat(
             [
                 s.genomics.encode_codominant() if GenotypeDtype.is_dtype(s) else s
-                for _, s in self._obj.iteritems()
+                # for _, s in self._obj.iteritems()
+                for _, s in self._obj.items()
             ],
             axis=1,
         )
@@ -204,7 +211,8 @@ def encode_edge(self, encoding_info: pd.DataFrame) -> pd.DataFrame:
 
         # Process each variant
         results = []
-        for _, s in self._obj.iteritems():
+        # for _, s in self._obj.iteritems():
+        for _, s in self._obj.items():
             if not GenotypeDtype.is_dtype(s):
                 results.append(s)
                 continue
diff --git a/pandas_genomics/arrays/genotype_array.py b/pandas_genomics/arrays/genotype_array.py
@@ -2,20 +2,27 @@
 import re
 from copy import copy
 from typing import Dict, MutableMapping, Any, Optional, List, Union, Tuple, Iterable
+# from pandas.arrays import BooleanArray
+from pandas.arrays import BooleanArray, IntegerArray
 
 import numpy as np
 import pandas as pd
-from pandas.core.arrays import ExtensionArray, BooleanArray, IntegerArray
-from pandas.core.dtypes.dtypes import register_extension_dtype, PandasExtensionDtype
-from pandas.core.dtypes.inference import is_list_like
+
+# Andre: Update to Python >=3.10
+# from pandas.core.arrays import ExtensionArray, BooleanArray, IntegerArray
+# from pandas.core.dtypes.dtypes import register_extension_dtype, PandasExtensionDtype
+# from pandas.core.dtypes.inference import is_list_like
+from pandas.api.extensions import ExtensionArray, register_extension_dtype, ExtensionDtype
+from pandas.api.types import is_list_like
 
 from pandas_genomics.arrays.encoding_mixin import EncodingMixin
 from pandas_genomics.arrays.info_mixin import InfoMixin
 from pandas_genomics.scalars import Variant, Genotype, MISSING_IDX
 
 
 @register_extension_dtype
-class GenotypeDtype(PandasExtensionDtype):
+# class GenotypeDtype(PandasExtensionDtype):
+class GenotypeDtype(ExtensionDtype):
     """
     An ExtensionDtype for genotype data.
 
@@ -374,7 +381,15 @@ def _from_sequence(
             # Use the dtype variant
             variant = dtype.variant
         values = []
+
         for idx, gt in enumerate(scalars):
+
+            # Andre: Update to Python >= 3.10
+            if not isinstance(gt, Genotype):
+                raise TypeError(
+                    f"Expected Genotype instance at index {idx}, got {type(gt).__name__}"
+                )
+
             if not variant.is_same_position(gt.variant):
                 raise ValueError(
                     f"Variant for Genotype {idx} of {len(scalars)} ({gt.variant}) "
@@ -387,6 +402,7 @@ def _from_sequence(
                 )
             else:
                 values.append((gt.allele_idxs, gt._float_score))
+
         result = cls(values=[], dtype=GenotypeDtype(variant))
         result._data = np.array(values, dtype=result._dtype._record_type)
         return result
@@ -474,7 +490,15 @@ def __getitem__(self, index):
         # Check and convert the index
         index = pd.api.indexers.check_array_indexer(self._data, index)
 
-        result = operator.getitem(self._data, index)
+        # Andre: Update to Python >= 3.10
+        # result = operator.getitem(self._data, index)
+        try:
+            result = operator.getitem(self._data, index)
+        except (TypeError, ValueError) as e:
+            raise IndexError(
+                "only integers, slices (`:`), ellipsis (`...`), numpy.newaxis "
+                "(`None`) and integer or boolean arrays are valid indices"
+            ) from e
 
         if isinstance(result, np.ndarray):
             return GenotypeArray(values=result, dtype=self.dtype)
@@ -528,6 +552,11 @@ def __setitem__(
             self._data[key] = value._data
         elif isinstance(value, pd.Series) and isinstance(value.values, GenotypeArray):
             self._data[key] = value.values._data
+        # Andre: Update to Python >= 3.10
+        elif isinstance(value, np.ndarray) and value.dtype == object:
+            # Convert to GenotypeArray assuming array of Genotype
+            value = self._from_sequence(value.tolist(), dtype=self.dtype)
+            self._data[key] = value._data
         else:
             raise ValueError(
                 f"Can't set the value in a GenotypeArray with '{type(value)}"
@@ -564,7 +593,13 @@ def take(self, indexer, allow_fill=False, fill_value=None):
     def copy(self):
         return GenotypeArray(self._data.copy(), copy(self.dtype))
 
-    def factorize(self, na_sentinel: int = -1) -> Tuple[np.ndarray, "GenotypeArray"]:
+    # Andre: Update to Python >= 3.10
+    # def factorize(self, na_sentinel: int = -1) -> Tuple[np.ndarray, "GenotypeArray"]:
+    def factorize(
+        self,
+        na_sentinel: int = -1,
+        use_na_sentinel: bool = True
+    ) -> Tuple[np.ndarray, "GenotypeArray"]:
         """
         Return an array of ints indexing unique values
         """
@@ -583,7 +618,15 @@ def factorize(self, na_sentinel: int = -1) -> Tuple[np.ndarray, "GenotypeArray"]
             codes[self == gt] = idx
 
         # Update codes for NA values
-        codes[self.isna()] = na_sentinel
+        # Andre: Update to Python >= 3.10
+        # codes[self.isna()] = na_sentinel
+        # Handle NA values
+        if use_na_sentinel:
+            codes[self.isna()] = na_sentinel
+        else:
+            # NaNs become part of the unique codes
+            nan_idx = len(set(codes))  # or: codes.max() + 1
+            codes[self.isna()] = nan_idx
 
         # Return the codes and unique values (not including NA)
         return codes, uniques[~uniques.isna()]
diff --git a/pandas_genomics/io/plink/from_plink.py b/pandas_genomics/io/plink/from_plink.py
@@ -84,7 +84,11 @@ def load_sample_info(fam_file, categorical_phenotype):
     DEFAULT_CAT_MAP = {1: "Control", 2: "Case"}
     if categorical_phenotype:
         df["phenotype"] = df["phenotype"].astype("category")
-        df["phenotype"].cat.rename_categories(DEFAULT_CAT_MAP, inplace=True)
+
+        # Andre: Update to Python >= 3.10
+        # df["phenotype"].cat.rename_categories(DEFAULT_CAT_MAP, inplace=True)
+        df["phenotype"] = df["phenotype"].cat.rename_categories(DEFAULT_CAT_MAP)
+
         df.loc[~df["phenotype"].isin(DEFAULT_CAT_MAP.values()), "phenotype"] = None
     print(f"\tLoaded information for {len(df)} samples from '{fam_file.name}'")
     return df
diff --git a/pandas_genomics/io/plink/to_plink.py b/pandas_genomics/io/plink/to_plink.py
@@ -76,8 +76,12 @@ def save_fam(
     ]:
         # Recode sex
         fam_data = data.index.to_frame()
-        fam_data["sex"].cat.rename_categories(
-            {"male": 1, "female": 2, "unknown": 0}, inplace=True
+        # Andre: Update to Python >= 3.10
+        # fam_data["sex"].cat.rename_categories(
+        #     {"male": 1, "female": 2, "unknown": 0}, inplace=True
+        # )
+        fam_data["sex"] = fam_data["sex"].cat.rename_categories(
+            {"male": 1, "female": 2, "unknown": 0}
         )
         # Update phenotype if provided
         if phenotype_name is not None:
@@ -109,8 +113,12 @@ def save_fam(
                 "The phenotype must be categorical to utilize 'phenotype_control' and 'phenotype_case' parameters"
             )
         pheno_dict = {phenotype_control: 1, phenotype_case: 2}
-        fam_data["phenotype"].cat.rename_categories(
-            lambda c: pheno_dict.get(c, 0), inplace=True
+        # Andre: Update to Python >= 3.10
+        # fam_data["phenotype"].cat.rename_categories(
+        #     lambda c: pheno_dict.get(c, 0), inplace=True
+        # )
+        fam_data["phenotype"] = fam_data["phenotype"].cat.rename_categories(
+            lambda c: pheno_dict.get(c, 0)
         )
 
     fam_data.to_csv(output_fam, sep=" ", header=False, index=False)
@@ -120,7 +128,9 @@ def save_fam(
 def save_bim(data, output_bim):
     variants = [
         col_val.genomics.variant
-        for col_name, col_val in data.iteritems()
+        # Andre: Update to Python >= 3.10
+        # for col_name, col_val in data.iteritems()
+        for col_name, col_val in data.items()
         if GenotypeDtype.is_dtype(col_val.dtype)
     ]
     for var in variants:
@@ -149,7 +159,9 @@ def save_bed(data, output_bed):
     bytes = np.array(
         [
             gt_array_to_plink_bits(col_val)
-            for col_name, col_val in data.iteritems()
+            # Andre: Update to Python >= 3.10
+            # for col_name, col_val in data.iteritems()
+            for col_name, col_val in data.items()
             if GenotypeDtype.is_dtype(col_val.dtype)
         ]
     )
diff --git a/pandas_genomics/io/vcf.py b/pandas_genomics/io/vcf.py
@@ -42,7 +42,9 @@ def from_vcf(
             continue
 
         # Skip variants below the minimum quality
-        if vcf_variant.QUAL < min_qual:
+        # Andre: QUAL can be None (no quality value in the record); skip it rather than compare None with min_qual
+        # if vcf_variant.QUAL < min_qual:
+        if vcf_variant.QUAL is None or vcf_variant.QUAL < min_qual:
             continue
 
         if len(vcf_variant.ALT) >= MISSING_IDX:
diff --git a/pandas_genomics/scalars.py b/pandas_genomics/scalars.py
@@ -430,11 +430,20 @@ def __repr__(self):
     def __hash__(self):
         return hash(repr(self))
 
+    #     # Andre: Update to Python >= 3.10
+    # def __eq__(self, other):
+    #     if other.__class__ is not self.__class__:
+    #         return NotImplemented
+    #     # if self.variant != other.variant:
+    #     #     raise NotImplementedError("Can't compare different variants")
     def __eq__(self, other):
-        if other.__class__ is not self.__class__:
+        if not isinstance(other, Genotype):
             return NotImplemented
         if self.variant != other.variant:
-            raise NotImplementedError("Can't compare different variants")
+            return False
+        return self.allele_idxs == other.allele_idxs
+
+
         return self.allele_idxs == other.allele_idxs
 
     def __lt__(self, other):
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "pandas-genomics"
-version = "0.12.1"
+version = "1.0.0"
 description = "Pandas ExtensionDtypes and ExtensionArray for working with genomics data"
 license = "BSD-3-Clause"
 authors = ["Andre Rico <alr6366@psu.edu>"]
@@ -16,10 +16,10 @@ classifiers = [
 ]
 
 [tool.poetry.dependencies]
-python = ">=3.8.0,<3.11.0"
-numpy = "^1.24"
-pandas = "^1.3"
-cyvcf2 = {version="^0.30", markers = "sys_platform != 'win32'"}
+python = ">=3.10,<3.13"
+numpy = ">=1.24,<2.0"
+pandas = ">=1.3,<2.2"
+cyvcf2 = { version = ">=0.31.1", markers = "sys_platform != 'win32'" }
 sphinx_rtd_theme = {version = "^0.5.0", optional = true}
 numpydoc = {version = "^1.1.0", optional = true}
 sphinx-copybutton = {version = "^0.3.0", optional = true}
@@ -40,5 +40,5 @@ black = ">=22"
 docs = ["sphinx", "numpydoc", "sphinx_rtd_theme", "sphinx-copybutton", "ipython"]
 
 [build-system]
-requires = ["poetry>=0.12"]
-build-backend = "poetry.masonry.api"
+requires = ["poetry-core>=1.0.0"]
+build-backend = "poetry.core.masonry.api"
diff --git a/tests/data/vcf/test.vcf b/tests/data/vcf/test.vcf
@@ -0,0 +1,31 @@
+##fileformat=VCFv4.0
+##fileDate=20090805
+##source=myImputationProgramV3.1
+##reference=1000GenomesPilot-NCBI36
+##phasing=partial
+##INFO=<ID=NS,Number=1,Type=Integer,Description="Number of Samples With Data">
+##INFO=<ID=AN,Number=1,Type=Integer,Description="Total number of alleles in called genotypes">
+##INFO=<ID=AC,Number=.,Type=Integer,Description="Allele count in genotypes, for each ALT allele, in the same order as listed">
+##INFO=<ID=DP,Number=1,Type=Integer,Description="Total Depth">
+##INFO=<ID=AF,Number=.,Type=Float,Description="Allele Frequency">
+##INFO=<ID=AA,Number=1,Type=String,Description="Ancestral Allele">
+##INFO=<ID=DB,Number=0,Type=Flag,Description="dbSNP membership, build 129">
+##INFO=<ID=H2,Number=0,Type=Flag,Description="HapMap2 membership">
+##FILTER=<ID=q10,Description="Quality below 10">
+##FILTER=<ID=s50,Description="Less than 50% of samples have data">
+##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
+##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
+##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read Depth">
+##FORMAT=<ID=HQ,Number=2,Type=Integer,Description="Haplotype Quality">
+##ALT=<ID=DEL:ME:ALU,Description="Deletion of ALU element">
+##ALT=<ID=CNV,Description="Copy number variable region">
+#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	NA00001	NA00002	NA00003
+19	111	.	A	C	9.6	.	.	GT:HQ	0|0:10,10	0|0:10,10	0/1:3,3
+19	112	.	A	G	10	.	.	GT:HQ	0|0:10,10	0|0:10,10	0/1:3,3
+20	14370	rs6054257	G	A	29	PASS	NS=3;DP=14;AF=0.5;DB;H2	GT:GQ:DP:HQ	0|0:48:1:51,51	1|0:48:8:51,51	1/1:43:5:.,.
+20	17330	.	T	A	3	q10	NS=3;DP=11;AF=0.017	GT:GQ:DP:HQ	0|0:49:3:58,50	0|1:3:5:65,3	0/0:41:3:.,.
+20	1110696	rs6040355	A	G,T	67	PASS	NS=2;DP=10;AF=0.333,0.667;AA=T;DB	GT:GQ:DP:HQ	1|2:21:6:23,27	2|1:2:0:18,2	2/2:35:4:.,.
+20	1230237	.	T	.	47	PASS	NS=3;DP=13;AA=T	GT:GQ:DP:HQ	0|0:54:.:56,60	0|0:48:4:51,51	0/0:61:2:.,.
+20	1234567	microsat1	G	GA,GAC	50	PASS	NS=3;DP=9;AA=G;AN=6;AC=3,1	GT:GQ:DP	0/1:.:4	0/2:17:2	1/1:40:3
+20	1235237	.	T	.	.	.	.	GT	0/0	0|0	./.
+X	10	rsTest	AC	A,ATG	10	PASS	.	GT	0	0/1	0|2
diff --git a/tests/genotype_array/test_ExtensionArray.py b/tests/genotype_array/test_ExtensionArray.py
diff --git a/tests/io/conftest.py b/tests/io/conftest.py
diff --git a/tests/io/test_vcf.py b/tests/io/test_vcf.py

Original file line number	Diff line number	Diff line change
`@@ -28,7 +28,9 @@ def __init__(self, pandas_obj):`
`28`	`28`	`id_counts = Counter(`
`29`	`29`	`[`
`30`	`30`	`s.genomics.variant.id`
`31`		`- for _, s in pandas_obj.iteritems()`
	`31`	`+ # Andre: pandas >= 2.0 removed DataFrame.iteritems(); use items() instead`
	`32`	`+ # for _, s in pandas_obj.iteritems()`
	`33`	`+ for _, s in pandas_obj.items()`
`32`	`34`	`if GenotypeDtype.is_dtype(s)`
`33`	`35`	`]`
`34`	`36`	`)`
`@@ -52,7 +54,8 @@ def variant_info(self) -> pd.DataFrame:`
`52`	`54`	`return pd.DataFrame.from_dict(`
`53`	`55`	`{`
`54`	`56`	`colname: series.genomics.variant_info`
`55`		`- for colname, series in genotypes.iteritems()`
	`57`	`+ # for colname, series in genotypes.iteritems()`
	`58`	`+ for colname, series in genotypes.items()`
`56`	`59`	`},`
`57`	`60`	`orient="index",`
`58`	`61`	`)`
`@@ -90,7 +93,8 @@ def encode_additive(self) -> pd.DataFrame:`
`90`	`93`	`return pd.concat(`
`91`	`94`	`[`
`92`	`95`	`s.genomics.encode_additive() if GenotypeDtype.is_dtype(s) else s`
`93`		`- for _, s in self._obj.iteritems()`
	`96`	`+ # for _, s in self._obj.iteritems()`
	`97`	`+ for _, s in self._obj.items()`
`94`	`98`	`],`
`95`	`99`	`axis=1,`
`96`	`100`	`)`
`@@ -107,7 +111,8 @@ def encode_dominant(self) -> pd.DataFrame:`
`107`	`111`	`return pd.concat(`
`108`	`112`	`[`
`109`	`113`	`s.genomics.encode_dominant() if GenotypeDtype.is_dtype(s) else s`
`110`		`- for _, s in self._obj.iteritems()`
	`114`	`+ # for _, s in self._obj.iteritems()`
	`115`	`+ for _, s in self._obj.items()`
`111`	`116`	`],`
`112`	`117`	`axis=1,`
`113`	`118`	`)`
`@@ -124,7 +129,8 @@ def encode_recessive(self) -> pd.DataFrame:`
`124`	`129`	`return pd.concat(`
`125`	`130`	`[`
`126`	`131`	`s.genomics.encode_recessive() if GenotypeDtype.is_dtype(s) else s`
`127`		`- for _, s in self._obj.iteritems()`
	`132`	`+ # for _, s in self._obj.iteritems()`
	`133`	`+ for _, s in self._obj.items()`
`128`	`134`	`],`
`129`	`135`	`axis=1,`
`130`	`136`	`)`
`@@ -141,7 +147,8 @@ def encode_codominant(self) -> pd.DataFrame:`
`141`	`147`	`return pd.concat(`
`142`	`148`	`[`
`143`	`149`	`s.genomics.encode_codominant() if GenotypeDtype.is_dtype(s) else s`
`144`		`- for _, s in self._obj.iteritems()`
	`150`	`+ # for _, s in self._obj.iteritems()`
	`151`	`+ for _, s in self._obj.items()`
`145`	`152`	`],`
`146`	`153`	`axis=1,`
`147`	`154`	`)`
`@@ -204,7 +211,8 @@ def encode_edge(self, encoding_info: pd.DataFrame) -> pd.DataFrame:`
`204`	`211`
`205`	`212`	`# Process each variant`
`206`	`213`	`results = []`
`207`		`- for _, s in self._obj.iteritems():`
	`214`	`+ # for _, s in self._obj.iteritems():`
	`215`	`+ for _, s in self._obj.items():`
`208`	`216`	`if not GenotypeDtype.is_dtype(s):`
`209`	`217`	`results.append(s)`
`210`	`218`	`continue`