22import re
33from copy import copy
44from typing import Dict , MutableMapping , Any , Optional , List , Union , Tuple , Iterable
5+ # from pandas.arrays import BooleanArray
6+ from pandas .arrays import BooleanArray , IntegerArray
57
68import numpy as np
79import pandas as pd
8- from pandas .core .arrays import ExtensionArray , BooleanArray , IntegerArray
9- from pandas .core .dtypes .dtypes import register_extension_dtype , PandasExtensionDtype
10- from pandas .core .dtypes .inference import is_list_like
10+
11+ # Andre: Update to Python >=3.10
12+ # from pandas.core.arrays import ExtensionArray, BooleanArray, IntegerArray
13+ # from pandas.core.dtypes.dtypes import register_extension_dtype, PandasExtensionDtype
14+ # from pandas.core.dtypes.inference import is_list_like
15+ from pandas .api .extensions import ExtensionArray , register_extension_dtype , ExtensionDtype
16+ from pandas .api .types import is_list_like
1117
1218from pandas_genomics .arrays .encoding_mixin import EncodingMixin
1319from pandas_genomics .arrays .info_mixin import InfoMixin
1420from pandas_genomics .scalars import Variant , Genotype , MISSING_IDX
1521
1622
1723@register_extension_dtype
18- class GenotypeDtype (PandasExtensionDtype ):
24+ # class GenotypeDtype(PandasExtensionDtype):
25+ class GenotypeDtype (ExtensionDtype ):
1926 """
2027 An ExtensionDtype for genotype data.
2128
@@ -374,7 +381,15 @@ def _from_sequence(
374381 # Use the dtype variant
375382 variant = dtype .variant
376383 values = []
384+
377385 for idx , gt in enumerate (scalars ):
386+
387+ # Andre: Update to Python >= 3.10
388+ if not isinstance (gt , Genotype ):
389+ raise TypeError (
390+ f"Expected Genotype instance at index { idx } , got { type (gt ).__name__ } "
391+ )
392+
378393 if not variant .is_same_position (gt .variant ):
379394 raise ValueError (
380395 f"Variant for Genotype { idx } of { len (scalars )} ({ gt .variant } ) "
@@ -387,6 +402,7 @@ def _from_sequence(
387402 )
388403 else :
389404 values .append ((gt .allele_idxs , gt ._float_score ))
405+
390406 result = cls (values = [], dtype = GenotypeDtype (variant ))
391407 result ._data = np .array (values , dtype = result ._dtype ._record_type )
392408 return result
@@ -474,7 +490,15 @@ def __getitem__(self, index):
474490 # Check and convert the index
475491 index = pd .api .indexers .check_array_indexer (self ._data , index )
476492
477- result = operator .getitem (self ._data , index )
493+ # Andre: Update to Python >= 3.10
494+ # result = operator.getitem(self._data, index)
495+ try :
496+ result = operator .getitem (self ._data , index )
497+ except (TypeError , ValueError ) as e :
498+ raise IndexError (
499+ "only integers, slices (`:`), ellipsis (`...`), numpy.newaxis "
500+ "(`None`) and integer or boolean arrays are valid indices"
501+ ) from e
478502
479503 if isinstance (result , np .ndarray ):
480504 return GenotypeArray (values = result , dtype = self .dtype )
@@ -528,6 +552,11 @@ def __setitem__(
528552 self ._data [key ] = value ._data
529553 elif isinstance (value , pd .Series ) and isinstance (value .values , GenotypeArray ):
530554 self ._data [key ] = value .values ._data
555+ # Andre: Update to Python >= 3.10
556+ elif isinstance (value , np .ndarray ) and value .dtype == object :
557+ # Convert to GenotypeArray assuming array of Genotype
558+ value = self ._from_sequence (value .tolist (), dtype = self .dtype )
559+ self ._data [key ] = value ._data
531560 else :
532561 raise ValueError (
533562 f"Can't set the value in a GenotypeArray with '{ type (value )} "
@@ -564,7 +593,13 @@ def take(self, indexer, allow_fill=False, fill_value=None):
564593 def copy (self ):
565594 return GenotypeArray (self ._data .copy (), copy (self .dtype ))
566595
567- def factorize (self , na_sentinel : int = - 1 ) -> Tuple [np .ndarray , "GenotypeArray" ]:
596+ # Andre: Update to Python >= 3.10
597+ # def factorize(self, na_sentinel: int = -1) -> Tuple[np.ndarray, "GenotypeArray"]:
598+ def factorize (
599+ self ,
600+ na_sentinel : int = - 1 ,
601+ use_na_sentinel : bool = True
602+ ) -> Tuple [np .ndarray , "GenotypeArray" ]:
568603 """
569604 Return an array of ints indexing unique values
570605 """
@@ -583,7 +618,15 @@ def factorize(self, na_sentinel: int = -1) -> Tuple[np.ndarray, "GenotypeArray"]
583618 codes [self == gt ] = idx
584619
585620 # Update codes for NA values
586- codes [self .isna ()] = na_sentinel
621+ # Andre: Update to Python >= 3.10
622+ # codes[self.isna()] = na_sentinel
623+ # Handle NA values
624+ if use_na_sentinel :
625+ codes [self .isna ()] = na_sentinel
626+ else :
627+ # NaNs become part of the unique codes
628+ nan_idx = len (set (codes )) # ou: codes.max() + 1
629+ codes [self .isna ()] = nan_idx
587630
588631 # Return the codes and unique values (not including NA)
589632 return codes , uniques [~ uniques .isna ()]
0 commit comments