Merge branch 'main' into bugfix-spss-kwargs

astronights · web-flow · commit 0d3d87c3f15f · 2024-01-24T08:22:29.000+08:00
diff --git a/ci/code_checks.sh b/ci/code_checks.sh
@@ -65,16 +65,8 @@ fi
 ### DOCSTRINGS ###
 if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
 
-    MSG='Validate docstrings (EX01, EX04, GL01, GL02, GL03, GL04, GL05, GL06, GL07, GL09, GL10, PR03, PR04, PR05, PR06, PR08, PR09, PR10, RT01, RT02, RT04, RT05, SA02, SA03, SA04, SS01, SS02, SS03, SS04, SS05, SS06)' ; echo $MSG
-    $BASE_DIR/scripts/validate_docstrings.py --format=actions --errors=EX01,EX04,GL01,GL02,GL03,GL04,GL05,GL06,GL07,GL09,GL10,PR03,PR04,PR05,PR06,PR08,PR09,PR10,RT01,RT02,RT04,RT05,SA02,SA03,SA04,SS01,SS02,SS03,SS04,SS05,SS06
-    RET=$(($RET + $?)) ; echo $MSG "DONE"
-
-    MSG='Partially validate docstrings (EX03)' ;  echo $MSG
-    $BASE_DIR/scripts/validate_docstrings.py --format=actions --errors=EX03 --ignore_functions \
-        pandas.Series.plot.line \
-        pandas.Series.to_sql \
-        pandas.read_json \
-        pandas.DataFrame.to_sql # There should be no backslash in the final line, please keep this comment in the last ignored function
+    MSG='Validate docstrings (EX01, EX03, EX04, GL01, GL02, GL03, GL04, GL05, GL06, GL07, GL09, GL10, PR03, PR04, PR05, PR06, PR08, PR09, PR10, RT01, RT02, RT04, RT05, SA02, SA03, SA04, SS01, SS02, SS03, SS04, SS05, SS06)' ; echo $MSG
+    $BASE_DIR/scripts/validate_docstrings.py --format=actions --errors=EX01,EX03,EX04,GL01,GL02,GL03,GL04,GL05,GL06,GL07,GL09,GL10,PR03,PR04,PR05,PR06,PR08,PR09,PR10,RT01,RT02,RT04,RT05,SA02,SA03,SA04,SS01,SS02,SS03,SS04,SS05,SS06
     RET=$(($RET + $?)) ; echo $MSG "DONE"
 
     MSG='Partially validate docstrings (PR02)' ;  echo $MSG
diff --git a/doc/source/reference/frame.rst b/doc/source/reference/frame.rst
@@ -49,6 +49,7 @@ Conversion
    DataFrame.infer_objects
    DataFrame.copy
    DataFrame.bool
+   DataFrame.to_numpy
 
 Indexing, iteration
 ~~~~~~~~~~~~~~~~~~~
diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst
@@ -107,6 +107,7 @@ Performance improvements
 - Performance improvement in :meth:`DataFrameGroupBy.ffill`, :meth:`DataFrameGroupBy.bfill`, :meth:`SeriesGroupBy.ffill`, and :meth:`SeriesGroupBy.bfill` (:issue:`56902`)
 - Performance improvement in :meth:`Index.take` when ``indices`` is a full range indexer from zero to length of index (:issue:`56806`)
 - Performance improvement in :meth:`MultiIndex.equals` for equal length indexes (:issue:`56990`)
+- Performance improvement in indexing operations for string dtypes (:issue:`56997`)
 -
 
 .. ---------------------------------------------------------------------------
diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in
@@ -933,6 +933,9 @@ cdef class StringHashTable(HashTable):
             kh_destroy_str(self.table)
             self.table = NULL
 
+    def __len__(self) -> int:
+        return self.table.size
+
     def sizeof(self, deep: bool = False) -> int:
         overhead = 4 * sizeof(uint32_t) + 3 * sizeof(uint32_t*)
         for_flags = max(1, self.table.n_buckets >> 5) * sizeof(uint32_t)
diff --git a/pandas/_libs/index.pyi b/pandas/_libs/index.pyi
@@ -50,6 +50,7 @@ class UInt32Engine(IndexEngine): ...
 class UInt16Engine(IndexEngine): ...
 class UInt8Engine(IndexEngine): ...
 class ObjectEngine(IndexEngine): ...
+class StringEngine(IndexEngine): ...
 class DatetimeEngine(Int64Engine): ...
 class TimedeltaEngine(DatetimeEngine): ...
 class PeriodEngine(Int64Engine): ...
diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx
@@ -533,6 +533,17 @@ cdef class ObjectEngine(IndexEngine):
         return loc
 
 
+cdef class StringEngine(IndexEngine):
+
+    cdef _make_hash_table(self, Py_ssize_t n):
+        return _hash.StringHashTable(n)
+
+    cdef _check_type(self, object val):
+        if not isinstance(val, str):
+            raise KeyError(val)
+        return str(val)
+
+
 cdef class DatetimeEngine(Int64Engine):
 
     cdef:
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
@@ -2999,7 +2999,7 @@ def to_sql(
         3
         >>> from sqlalchemy import text
         >>> with engine.connect() as conn:
-        ...    conn.execute(text("SELECT * FROM users")).fetchall()
+        ...     conn.execute(text("SELECT * FROM users")).fetchall()
         [(0, 'User 1'), (1, 'User 2'), (2, 'User 3')]
 
         An `sqlalchemy.engine.Connection` can also be passed to `con`:
@@ -3016,7 +3016,7 @@ def to_sql(
         >>> df2.to_sql(name='users', con=engine, if_exists='append')
         2
         >>> with engine.connect() as conn:
-        ...    conn.execute(text("SELECT * FROM users")).fetchall()
+        ...     conn.execute(text("SELECT * FROM users")).fetchall()
         [(0, 'User 1'), (1, 'User 2'), (2, 'User 3'),
          (0, 'User 4'), (1, 'User 5'), (0, 'User 6'),
          (1, 'User 7')]
@@ -3027,7 +3027,7 @@ def to_sql(
         ...            index_label='id')
         2
         >>> with engine.connect() as conn:
-        ...    conn.execute(text("SELECT * FROM users")).fetchall()
+        ...     conn.execute(text("SELECT * FROM users")).fetchall()
         [(0, 'User 6'), (1, 'User 7')]
 
         Use ``method`` to define a callable insertion method to do nothing
@@ -3040,13 +3040,14 @@ def to_sql(
         ...     stmt = insert(table.table).values(data).on_conflict_do_nothing(index_elements=["a"])
         ...     result = conn.execute(stmt)
         ...     return result.rowcount
-        >>> df_conflict.to_sql(name="conflict_table", con=conn, if_exists="append", method=insert_on_conflict_nothing)  # doctest: +SKIP
+        >>> df_conflict.to_sql(name="conflict_table", con=conn, if_exists="append",  # noqa: F821
+        ...                    method=insert_on_conflict_nothing)  # doctest: +SKIP
         0
 
         For MySQL, a callable to update columns ``b`` and ``c`` if there's a conflict
         on a primary key.
 
-        >>> from sqlalchemy.dialects.mysql import insert
+        >>> from sqlalchemy.dialects.mysql import insert   # noqa: F811
         >>> def insert_on_conflict_update(table, conn, keys, data_iter):
         ...     # update columns "b" and "c" on primary key conflict
         ...     data = [dict(zip(keys, row)) for row in data_iter]
@@ -3057,7 +3058,8 @@ def to_sql(
         ...     stmt = stmt.on_duplicate_key_update(b=stmt.inserted.b, c=stmt.inserted.c)
         ...     result = conn.execute(stmt)
         ...     return result.rowcount
-        >>> df_conflict.to_sql(name="conflict_table", con=conn, if_exists="append", method=insert_on_conflict_update)  # doctest: +SKIP
+        >>> df_conflict.to_sql(name="conflict_table", con=conn, if_exists="append",  # noqa: F821
+        ...                    method=insert_on_conflict_update)  # doctest: +SKIP
         2
 
         Specify the dtype (especially useful for integers with missing values).
@@ -3078,7 +3080,7 @@ def to_sql(
         3
 
         >>> with engine.connect() as conn:
-        ...   conn.execute(text("SELECT * FROM integers")).fetchall()
+        ...     conn.execute(text("SELECT * FROM integers")).fetchall()
         [(1,), (None,), (2,)]
         """  # noqa: E501
         from pandas.io import sql
diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
@@ -883,6 +883,8 @@ def _engine(
             # error: Item "ExtensionArray" of "Union[ExtensionArray,
             # ndarray[Any, Any]]" has no attribute "_ndarray"  [union-attr]
             target_values = self._data._ndarray  # type: ignore[union-attr]
+        elif is_string_dtype(self.dtype) and not is_object_dtype(self.dtype):
+            return libindex.StringEngine(target_values)
 
         # error: Argument 1 to "ExtensionEngine" has incompatible type
         # "ndarray[Any, Any]"; expected "ExtensionArray"
diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py
@@ -717,7 +717,7 @@ def read_json(
 "data":[["a","b"],["c","d"]]\
 }}\
 '
-    >>> pd.read_json(StringIO(_), orient='split')
+    >>> pd.read_json(StringIO(_), orient='split')  # noqa: F821
           col 1 col 2
     row 1     a     b
     row 2     c     d
@@ -727,7 +727,7 @@ def read_json(
     >>> df.to_json(orient='index')
     '{{"row 1":{{"col 1":"a","col 2":"b"}},"row 2":{{"col 1":"c","col 2":"d"}}}}'
 
-    >>> pd.read_json(StringIO(_), orient='index')
+    >>> pd.read_json(StringIO(_), orient='index')  # noqa: F821
           col 1 col 2
     row 1     a     b
     row 2     c     d
@@ -737,7 +737,7 @@ def read_json(
 
     >>> df.to_json(orient='records')
     '[{{"col 1":"a","col 2":"b"}},{{"col 1":"c","col 2":"d"}}]'
-    >>> pd.read_json(StringIO(_), orient='records')
+    >>> pd.read_json(StringIO(_), orient='records')  # noqa: F821
       col 1 col 2
     0     a     b
     1     c     d
diff --git a/pandas/tests/io/xml/test_xml.py b/pandas/tests/io/xml/test_xml.py
@@ -1361,16 +1361,18 @@ def test_stylesheet_with_etree(kml_cta_rail_lines, xsl_flatten_doc):
 
 @pytest.mark.parametrize("val", ["", b""])
 def test_empty_stylesheet(val):
-    pytest.importorskip("lxml")
+    lxml_etree = pytest.importorskip("lxml.etree")
+
     msg = (
         "Passing literal xml to 'read_xml' is deprecated and "
         "will be removed in a future version. To read from a "
         "literal string, wrap it in a 'StringIO' object."
     )
     kml = os.path.join("data", "xml", "cta_rail_lines.kml")
 
-    with pytest.raises(FutureWarning, match=msg):
-        read_xml(kml, stylesheet=val)
+    with pytest.raises(lxml_etree.XMLSyntaxError):
+        with tm.assert_produces_warning(FutureWarning, match=msg):
+            read_xml(kml, stylesheet=val)
 
 
 # ITERPARSE
diff --git a/pandas/tests/tslibs/test_to_offset.py b/pandas/tests/tslibs/test_to_offset.py
@@ -8,6 +8,8 @@
     to_offset,
 )
 
+import pandas._testing as tm
+
 
 @pytest.mark.parametrize(
     "freq_input,expected",
@@ -194,7 +196,7 @@ def test_to_offset_lowercase_frequency_deprecated(freq_depr):
     depr_msg = f"'{freq_depr[1:]}' is deprecated and will be removed in a "
     f"future version, please use '{freq_depr.upper()[1:]}' instead."
 
-    with pytest.raises(FutureWarning, match=depr_msg):
+    with tm.assert_produces_warning(FutureWarning, match=depr_msg):
         to_offset(freq_depr)
 
 
@@ -214,5 +216,5 @@ def test_to_offset_uppercase_frequency_deprecated(freq_depr):
     depr_msg = f"'{freq_depr[1:]}' is deprecated and will be removed in a "
     f"future version, please use '{freq_depr.lower()[1:]}' instead."
 
-    with pytest.raises(FutureWarning, match=depr_msg):
+    with tm.assert_produces_warning(FutureWarning, match=depr_msg):
         to_offset(freq_depr)

Original file line number	Diff line number	Diff line change
`@@ -107,6 +107,7 @@ Performance improvements`
`107`	`107`	- Performance improvement in :meth:`DataFrameGroupBy.ffill`, :meth:`DataFrameGroupBy.bfill`, :meth:`SeriesGroupBy.ffill`, and :meth:`SeriesGroupBy.bfill` (:issue:`56902`)
`108`	`108`	- Performance improvement in :meth:`Index.take` when ``indices`` is a full range indexer from zero to length of index (:issue:`56806`)
`109`	`109`	- Performance improvement in :meth:`MultiIndex.equals` for equal length indexes (:issue:`56990`)
	`110`	+- Performance improvement in indexing operations for string dtypes (:issue:`56997`)
`110`	`111`	`-`
`111`	`112`
`112`	`113`	`.. ---------------------------------------------------------------------------`