Skip to content

Commit 0e69bf6

Browse files
committed
Merge branch 'main' into ref-stringarray-pyarrow-1
2 parents 1a38add + 4409d42 commit 0e69bf6

File tree

19 files changed

+394
-393
lines changed

19 files changed

+394
-393
lines changed

ci/code_checks.sh

Lines changed: 1 addition & 108 deletions
Large diffs are not rendered by default.

doc/source/whatsnew/v3.0.0.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,7 @@ Other enhancements
5353
- :meth:`pandas.concat` will raise a ``ValueError`` when ``ignore_index=True`` and ``keys`` is not ``None`` (:issue:`59274`)
5454
- Multiplying two :class:`DateOffset` objects will now raise a ``TypeError`` instead of a ``RecursionError`` (:issue:`59442`)
5555
- Restore support for reading Stata 104-format and enable reading 103-format dta files (:issue:`58554`)
56+
- Support passing a :class:`Iterable[Hashable]` input to :meth:`DataFrame.drop_duplicates` (:issue:`59237`)
5657
- Support reading Stata 102-format (Stata 1) dta files (:issue:`58978`)
5758
- Support reading Stata 110-format (Stata 7) dta files (:issue:`47176`)
5859

pandas/_libs/tslibs/offsets.pyx

Lines changed: 110 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -595,13 +595,42 @@ cdef class BaseOffset:
595595

596596
@property
597597
def rule_code(self) -> str:
598+
"""
599+
Return a string representing the base frequency.
600+
601+
See Also
602+
--------
603+
tseries.offsets.Hour.rule_code :
604+
Returns a string representing the base frequency of 'h'.
605+
tseries.offsets.Day.rule_code :
606+
Returns a string representing the base frequency of 'D'.
607+
608+
Examples
609+
--------
610+
>>> pd.offsets.Hour().rule_code
611+
'h'
612+
613+
>>> pd.offsets.Week(5).rule_code
614+
'W'
615+
"""
598616
return self._prefix
599617

600618
@cache_readonly
601619
def freqstr(self) -> str:
602620
"""
603621
Return a string representing the frequency.
604622

623+
See Also
624+
--------
625+
tseries.offsets.BusinessDay.freqstr :
626+
Return a string representing an offset frequency in Business Days.
627+
tseries.offsets.BusinessHour.freqstr :
628+
Return a string representing an offset frequency in Business Hours.
629+
tseries.offsets.Week.freqstr :
630+
Return a string representing an offset frequency in Weeks.
631+
tseries.offsets.Hour.freqstr :
632+
Return a string representing an offset frequency in Hours.
633+
605634
Examples
606635
--------
607636
>>> pd.DateOffset(5).freqstr
@@ -779,6 +808,26 @@ cdef class BaseOffset:
779808

780809
@property
781810
def nanos(self):
811+
"""
812+
Returns an integer of the total number of nanoseconds for fixed frequencies.
813+
814+
Raises
815+
------
816+
ValueError
817+
If the frequency is non-fixed.
818+
819+
See Also
820+
--------
821+
tseries.offsets.Hour.nanos :
822+
Returns an integer of the total number of nanoseconds.
823+
tseries.offsets.Day.nanos :
824+
Returns an integer of the total number of nanoseconds.
825+
826+
Examples
827+
--------
828+
>>> pd.offsets.Week(n=1).nanos
829+
ValueError: <Week: weekday=None> is a non-fixed frequency
830+
"""
782831
raise ValueError(f"{self} is a non-fixed frequency")
783832

784833
# ------------------------------------------------------------------
@@ -986,12 +1035,14 @@ cdef class Tick(SingleConstructorOffset):
9861035
@property
9871036
def nanos(self) -> int64_t:
9881037
"""
989-
Return an integer of the total number of nanoseconds.
1038+
Returns an integer of the total number of nanoseconds.
9901039

991-
Raises
992-
------
993-
ValueError
994-
If the frequency is non-fixed.
1040+
See Also
1041+
--------
1042+
tseries.offsets.Hour.nanos :
1043+
Returns an integer of the total number of nanoseconds.
1044+
tseries.offsets.Day.nanos :
1045+
Returns an integer of the total number of nanoseconds.
9951046

9961047
Examples
9971048
--------
@@ -2426,6 +2477,24 @@ cdef class WeekOfMonthMixin(SingleConstructorOffset):
24262477

24272478
@property
24282479
def rule_code(self) -> str:
2480+
"""
2481+
Return a string representing the base frequency.
2482+
2483+
See Also
2484+
--------
2485+
tseries.offsets.Hour.rule_code :
2486+
Returns a string representing the base frequency of 'h'.
2487+
tseries.offsets.Day.rule_code :
2488+
Returns a string representing the base frequency of 'D'.
2489+
2490+
Examples
2491+
--------
2492+
>>> pd.offsets.Week(5).rule_code
2493+
'W'
2494+
2495+
>>> pd.offsets.WeekOfMonth(n=1, week=0, weekday=0).rule_code
2496+
'WOM-1MON'
2497+
"""
24292498
weekday = int_to_weekday.get(self.weekday, "")
24302499
if self.week == -1:
24312500
# LastWeekOfMonth
@@ -2472,6 +2541,24 @@ cdef class YearOffset(SingleConstructorOffset):
24722541

24732542
@property
24742543
def rule_code(self) -> str:
2544+
"""
2545+
Return a string representing the base frequency.
2546+
2547+
See Also
2548+
--------
2549+
tseries.offsets.Hour.rule_code :
2550+
Returns a string representing the base frequency of 'h'.
2551+
tseries.offsets.Day.rule_code :
2552+
Returns a string representing the base frequency of 'D'.
2553+
2554+
Examples
2555+
--------
2556+
>>> pd.tseries.offsets.YearBegin(n=1, month=2).rule_code
2557+
'YS-FEB'
2558+
2559+
>>> pd.tseries.offsets.YearEnd(n=1, month=6).rule_code
2560+
'YE-JUN'
2561+
"""
24752562
month = MONTH_ALIASES[self.month]
24762563
return f"{self._prefix}-{month}"
24772564

@@ -3458,6 +3545,24 @@ cdef class Week(SingleConstructorOffset):
34583545

34593546
@property
34603547
def rule_code(self) -> str:
3548+
"""
3549+
Return a string representing the base frequency.
3550+
3551+
See Also
3552+
--------
3553+
tseries.offsets.Hour.name :
3554+
Returns a string representing the base frequency of 'h'.
3555+
tseries.offsets.Day.name :
3556+
Returns a string representing the base frequency of 'D'.
3557+
3558+
Examples
3559+
--------
3560+
>>> pd.offsets.Hour().rule_code
3561+
'h'
3562+
3563+
>>> pd.offsets.Week(5).rule_code
3564+
'W'
3565+
"""
34613566
suffix = ""
34623567
if self.weekday is not None:
34633568
weekday = int_to_weekday[self.weekday]

pandas/_libs/tslibs/period.pyx

Lines changed: 77 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1913,20 +1913,58 @@ cdef class _Period(PeriodMixin):
19131913
Parameters
19141914
----------
19151915
freq : str, BaseOffset
1916-
The desired frequency. If passing a `str`, it needs to be a
1917-
valid :ref:`period alias <timeseries.period_aliases>`.
1916+
The target frequency to convert the Period object to.
1917+
If a string is provided,
1918+
it must be a valid :ref:`period alias <timeseries.period_aliases>`.
1919+
19181920
how : {'E', 'S', 'end', 'start'}, default 'end'
1919-
Start or end of the timespan.
1921+
Specifies whether to align the period to the start or end of the interval:
1922+
- 'E' or 'end': Align to the end of the interval.
1923+
- 'S' or 'start': Align to the start of the interval.
19201924

19211925
Returns
19221926
-------
1923-
resampled : Period
1927+
Period : Period object with the specified frequency, aligned according to ``how``.
1928+
1929+
See Also
1930+
--------
1931+
Period.end_time : Return the end Timestamp.
1932+
Period.start_time : Return the start Timestamp.
1933+
Period.dayofyear : Return the day of the year.
1934+
Period.dayofweek : Return the day of the week.
19241935

19251936
Examples
19261937
--------
1927-
>>> period = pd.Period('2023-1-1', freq='D')
1938+
Convert a daily period to an hourly period, aligning to the end of the day:
1939+
1940+
>>> period = pd.Period('2023-01-01', freq='D')
19281941
>>> period.asfreq('h')
19291942
Period('2023-01-01 23:00', 'h')
1943+
1944+
Convert a monthly period to a daily period, aligning to the start of the month:
1945+
1946+
>>> period = pd.Period('2023-01', freq='M')
1947+
>>> period.asfreq('D', how='start')
1948+
Period('2023-01-01', 'D')
1949+
1950+
Convert a yearly period to a monthly period, aligning to the last month:
1951+
1952+
>>> period = pd.Period('2023', freq='Y')
1953+
>>> period.asfreq('M', how='end')
1954+
Period('2023-12', 'M')
1955+
1956+
Convert a monthly period to an hourly period,
1957+
aligning to the first day of the month:
1958+
1959+
>>> period = pd.Period('2023-01', freq='M')
1960+
>>> period.asfreq('h', how='start')
1961+
Period('2023-01-01 00:00', 'h')
1962+
1963+
Convert a weekly period to a daily period, aligning to the last day of the week:
1964+
1965+
>>> period = pd.Period('2023-08-01', freq='W')
1966+
>>> period.asfreq('D', how='end')
1967+
Period('2023-08-06', 'D')
19301968
"""
19311969
freq = self._maybe_convert_freq(freq)
19321970
how = validate_end_alias(how)
@@ -2014,11 +2052,45 @@ cdef class _Period(PeriodMixin):
20142052
"""
20152053
Return the month this Period falls on.
20162054

2055+
Returns
2056+
-------
2057+
int
2058+
2059+
See Also
2060+
--------
2061+
Period.week : Get the week of the year on the given Period.
2062+
Period.year : Return the year this Period falls on.
2063+
Period.day : Return the day of the month this Period falls on.
2064+
2065+
Notes
2066+
-----
2067+
The month is based on the `ordinal` and `base` attributes of the Period.
2068+
20172069
Examples
20182070
--------
2071+
Create a Period object for January 2022 and get the month:
2072+
20192073
>>> period = pd.Period('2022-01', 'M')
20202074
>>> period.month
20212075
1
2076+
2077+
Create a Period object with a yearly frequency; the reported month is the last month of the year:
2078+
2079+
>>> period = pd.Period('2022', 'Y')
2080+
>>> period.month
2081+
12
2082+
2083+
Create a Period object with a specified frequency but an incomplete date string:
2084+
2085+
>>> period = pd.Period('2022', 'M')
2086+
>>> period.month
2087+
1
2088+
2089+
Handle a case where the Period is missing (``NaT``), which results in ``nan``:
2090+
2091+
>>> period = pd.Period('nan', 'M')
2092+
>>> period.month
2093+
nan
20222094
"""
20232095
base = self._dtype._dtype_code
20242096
return pmonth(self.ordinal, base)

pandas/core/arrays/string_.py

Lines changed: 23 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -746,6 +746,12 @@ def _reduce(
746746
axis: AxisInt | None = 0,
747747
**kwargs,
748748
):
749+
if self.dtype.na_value is np.nan and name in ["any", "all"]:
750+
if name == "any":
751+
return nanops.nanany(self._ndarray, skipna=skipna)
752+
else:
753+
return nanops.nanall(self._ndarray, skipna=skipna)
754+
749755
if name in ["min", "max"]:
750756
result = getattr(self, name)(skipna=skipna, axis=axis)
751757
if keepdims:
@@ -754,6 +760,12 @@ def _reduce(
754760

755761
raise TypeError(f"Cannot perform reduction '{name}' with string dtype")
756762

763+
def _wrap_reduction_result(self, axis: AxisInt | None, result) -> Any:
764+
if self.dtype.na_value is np.nan and result is libmissing.NA:
765+
# the masked_reductions use pd.NA -> convert to np.nan
766+
return np.nan
767+
return super()._wrap_reduction_result(axis, result)
768+
757769
def min(self, axis=None, skipna: bool = True, **kwargs) -> Scalar:
758770
nv.validate_min((), kwargs)
759771
result = masked_reductions.min(
@@ -771,8 +783,11 @@ def max(self, axis=None, skipna: bool = True, **kwargs) -> Scalar:
771783
def value_counts(self, dropna: bool = True) -> Series:
772784
from pandas.core.algorithms import value_counts_internal as value_counts
773785

774-
result = value_counts(self._ndarray, sort=False, dropna=dropna).astype("Int64")
786+
result = value_counts(self._ndarray, sort=False, dropna=dropna)
775787
result.index = result.index.astype(self.dtype)
788+
789+
if self.dtype.na_value is libmissing.NA:
790+
result = result.astype("Int64")
776791
return result
777792

778793
def memory_usage(self, deep: bool = False) -> int:
@@ -823,7 +838,13 @@ def _cmp_method(self, other, op):
823838
# logical
824839
result = np.zeros(len(self._ndarray), dtype="bool")
825840
result[valid] = op(self._ndarray[valid], other)
826-
return BooleanArray(result, mask)
841+
res_arr = BooleanArray(result, mask)
842+
if self.dtype.na_value is np.nan:
843+
if op == operator.ne:
844+
return res_arr.to_numpy(np.bool_, na_value=True)
845+
else:
846+
return res_arr.to_numpy(np.bool_, na_value=False)
847+
return res_arr
827848

828849
_arith_method = _cmp_method
829850

@@ -864,37 +885,6 @@ def _from_backing_data(self, arr: np.ndarray) -> StringArrayNumpySemantics:
864885
# we always preserve the dtype
865886
return NDArrayBacked._from_backing_data(self, arr)
866887

867-
def _reduce(
868-
self, name: str, *, skipna: bool = True, keepdims: bool = False, **kwargs
869-
):
870-
if name in ["any", "all"]:
871-
if name == "any":
872-
return nanops.nanany(self._ndarray, skipna=skipna)
873-
else:
874-
return nanops.nanall(self._ndarray, skipna=skipna)
875-
else:
876-
return super()._reduce(name, skipna=skipna, keepdims=keepdims, **kwargs)
877-
878-
def _wrap_reduction_result(self, axis: AxisInt | None, result) -> Any:
879-
# the masked_reductions use pd.NA
880-
if result is libmissing.NA:
881-
return np.nan
882-
return super()._wrap_reduction_result(axis, result)
883-
884-
def _cmp_method(self, other, op):
885-
result = super()._cmp_method(other, op)
886-
if op == operator.ne:
887-
return result.to_numpy(np.bool_, na_value=True)
888-
else:
889-
return result.to_numpy(np.bool_, na_value=False)
890-
891-
def value_counts(self, dropna: bool = True) -> Series:
892-
from pandas.core.algorithms import value_counts_internal as value_counts
893-
894-
result = value_counts(self._ndarray, sort=False, dropna=dropna)
895-
result.index = result.index.astype(self.dtype)
896-
return result
897-
898888
# ------------------------------------------------------------------------
899889
# String methods interface
900890
_str_na_value = np.nan

pandas/core/dtypes/common.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1274,6 +1274,10 @@ def is_bool_dtype(arr_or_dtype) -> bool:
12741274
"""
12751275
Check whether the provided array or dtype is of a boolean dtype.
12761276
1277+
This function verifies whether a given object is a boolean data type. The input
1278+
can be an array or a dtype object. Accepted array types include instances
1279+
of ``np.array``, ``pd.Series``, ``pd.Index``, and similar array-like structures.
1280+
12771281
Parameters
12781282
----------
12791283
arr_or_dtype : array-like or dtype
@@ -1284,6 +1288,10 @@ def is_bool_dtype(arr_or_dtype) -> bool:
12841288
boolean
12851289
Whether or not the array or dtype is of a boolean dtype.
12861290
1291+
See Also
1292+
--------
1293+
api.types.is_bool : Check if an object is a boolean.
1294+
12871295
Notes
12881296
-----
12891297
An ExtensionArray is considered boolean when the ``_is_boolean``

0 commit comments

Comments
 (0)