Merge remote-tracking branch 'upstream/main' into parametrize-test-common

fangchenli · fangchenli · commit 8d4d04cb16da · 2025-02-25T13:28:36.000-08:00
diff --git a/doc/source/development/contributing_codebase.rst b/doc/source/development/contributing_codebase.rst
@@ -198,7 +198,7 @@ In some cases you may be tempted to use ``cast`` from the typing module when you
            obj = cast(str, obj)  # Mypy complains without this!
            return obj.upper()
 
-The limitation here is that while a human can reasonably understand that ``is_number`` would catch the ``int`` and ``float`` types mypy cannot make that same inference just yet (see `mypy #5206 <https://github.com/python/mypy/issues/5206>`_. While the above works, the use of ``cast`` is **strongly discouraged**. Where applicable a refactor of the code to appease static analysis is preferable
+The limitation here is that while a human can reasonably understand that ``is_number`` would catch the ``int`` and ``float`` types, mypy cannot make that same inference just yet (see `mypy #5206 <https://github.com/python/mypy/issues/5206>`_). While the above works, the use of ``cast`` is **strongly discouraged**. Where applicable, a refactor of the code to appease static analysis is preferable:
 
 .. code-block:: python
 
diff --git a/doc/source/reference/arrays.rst b/doc/source/reference/arrays.rst
@@ -61,7 +61,7 @@ is an :class:`ArrowDtype`.
 support as NumPy including first-class nullability support for all data types, immutability and more.
 
 The table below shows the equivalent pyarrow-backed (``pa``), pandas extension, and numpy (``np``) types that are recognized by pandas.
-Pyarrow-backed types below need to be passed into :class:`ArrowDtype` to be recognized by pandas e.g. ``pd.ArrowDtype(pa.bool_())``
+Pyarrow-backed types below need to be passed into :class:`ArrowDtype` to be recognized by pandas e.g. ``pd.ArrowDtype(pa.bool_())``.
 
 =============================================== ========================== ===================
 PyArrow type                                    pandas extension type      NumPy type
@@ -114,7 +114,7 @@ values.
 
    ArrowDtype
 
-For more information, please see the :ref:`PyArrow user guide <pyarrow>`
+For more information, please see the :ref:`PyArrow user guide <pyarrow>`.
 
 .. _api.arrays.datetime:
 
@@ -495,7 +495,7 @@ a :class:`CategoricalDtype`.
    CategoricalDtype.categories
    CategoricalDtype.ordered
 
-Categorical data can be stored in a :class:`pandas.Categorical`
+Categorical data can be stored in a :class:`pandas.Categorical`:
 
 .. autosummary::
    :toctree: api/
diff --git a/doc/source/user_guide/text.rst b/doc/source/user_guide/text.rst
@@ -13,7 +13,7 @@ Text data types
 
 There are two ways to store text data in pandas:
 
-1. ``object`` -dtype NumPy array.
+1. ``object`` dtype NumPy array.
 2. :class:`StringDtype` extension type.
 
 We recommend using :class:`StringDtype` to store text data.
@@ -40,20 +40,20 @@ to significantly increase the performance and lower the memory overhead of
    and parts of the API may change without warning.
 
 For backwards-compatibility, ``object`` dtype remains the default type we
-infer a list of strings to
+infer a list of strings to:
 
 .. ipython:: python
 
    pd.Series(["a", "b", "c"])
 
-To explicitly request ``string`` dtype, specify the ``dtype``
+To explicitly request ``string`` dtype, specify the ``dtype``:
 
 .. ipython:: python
 
    pd.Series(["a", "b", "c"], dtype="string")
    pd.Series(["a", "b", "c"], dtype=pd.StringDtype())
 
-Or ``astype`` after the ``Series`` or ``DataFrame`` is created
+Or ``astype`` after the ``Series`` or ``DataFrame`` is created:
 
 .. ipython:: python
 
@@ -88,7 +88,7 @@ Behavior differences
 ^^^^^^^^^^^^^^^^^^^^
 
 These are places where the behavior of ``StringDtype`` objects differ from
-``object`` dtype
+``object`` dtype:
 
 l. For ``StringDtype``, :ref:`string accessor methods<api.series.str>`
    that return **numeric** output will always return a nullable integer dtype,
@@ -102,7 +102,7 @@ l. For ``StringDtype``, :ref:`string accessor methods<api.series.str>`
       s.str.count("a")
       s.dropna().str.count("a")
 
-   Both outputs are ``Int64`` dtype. Compare that with object-dtype
+   Both outputs are ``Int64`` dtype. Compare that with object-dtype:
 
    .. ipython:: python
 
diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst
@@ -790,6 +790,7 @@ ExtensionArray
 ^^^^^^^^^^^^^^
 - Bug in :class:`Categorical` when constructing with an :class:`Index` with :class:`ArrowDtype` (:issue:`60563`)
 - Bug in :meth:`.arrays.ArrowExtensionArray.__setitem__` which caused wrong behavior when using an integer array with repeated values as a key (:issue:`58530`)
+- Bug in :meth:`ArrowExtensionArray.factorize` where NA values were dropped when input was dictionary-encoded even when ``dropna`` was set to ``False`` (:issue:`60567`)
 - Bug in :meth:`api.types.is_datetime64_any_dtype` where a custom :class:`ExtensionDtype` would return ``False`` for array-likes (:issue:`57055`)
 - Bug in comparison between object with :class:`ArrowDtype` and incompatible-dtyped (e.g. string vs bool) incorrectly raising instead of returning all-``False`` (for ``==``) or all-``True`` (for ``!=``) (:issue:`59505`)
 - Bug in constructing pandas data structures when passing into ``dtype`` a string of the type followed by ``[pyarrow]`` while PyArrow is not installed would raise ``NameError`` rather than ``ImportError`` (:issue:`57928`)
diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py
@@ -1208,7 +1208,12 @@ def factorize(
             data = data.cast(pa.int64())
 
         if pa.types.is_dictionary(data.type):
-            encoded = data
+            if null_encoding == "encode":
+                # dictionary encode does nothing if an already encoded array is given
+                data = data.cast(data.type.value_type)
+                encoded = data.dictionary_encode(null_encoding=null_encoding)
+            else:
+                encoded = data
         else:
             encoded = data.dictionary_encode(null_encoding=null_encoding)
         if encoded.length() == 0:
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
@@ -6267,6 +6267,11 @@ def astype(
         """
         Cast a pandas object to a specified dtype ``dtype``.
 
+        This method allows the conversion of the data types of pandas objects,
+        including DataFrames and Series, to the specified dtype. It supports casting
+        entire objects to a single data type or applying different data types to
+        individual columns using a mapping.
+
         Parameters
         ----------
         dtype : str, data type, Series or Mapping of column name -> data type
diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py
@@ -3329,6 +3329,18 @@ def test_factorize_chunked_dictionary():
     tm.assert_index_equal(res_uniques, exp_uniques)
 
 
+def test_factorize_dictionary_with_na():
+    # GH#60567
+    arr = pd.array(
+        ["a1", pd.NA], dtype=ArrowDtype(pa.dictionary(pa.int32(), pa.utf8()))
+    )
+    indices, uniques = arr.factorize(use_na_sentinel=False)
+    expected_indices = np.array([0, 1], dtype=np.intp)
+    expected_uniques = pd.array(["a1", None], dtype=ArrowDtype(pa.string()))
+    tm.assert_numpy_array_equal(indices, expected_indices)
+    tm.assert_extension_array_equal(uniques, expected_uniques)
+
+
 def test_dictionary_astype_categorical():
     # GH#56672
     arrs = [
diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py
@@ -4282,11 +4282,11 @@ def test_xsqlite_execute_fail(sqlite_buildin):
     cur.execute(create_sql)
 
     with sql.pandasSQL_builder(sqlite_buildin) as pandas_sql:
-        pandas_sql.execute('INSERT INTO test VALUES("foo", "bar", 1.234)')
-        pandas_sql.execute('INSERT INTO test VALUES("foo", "baz", 2.567)')
+        pandas_sql.execute("INSERT INTO test VALUES('foo', 'bar', 1.234)")
+        pandas_sql.execute("INSERT INTO test VALUES('foo', 'baz', 2.567)")
 
         with pytest.raises(sql.DatabaseError, match="Execution failed on sql"):
-            pandas_sql.execute('INSERT INTO test VALUES("foo", "bar", 7)')
+            pandas_sql.execute("INSERT INTO test VALUES('foo', 'bar', 7)")
 
 
 def test_xsqlite_execute_closed_connection():
@@ -4304,7 +4304,7 @@ def test_xsqlite_execute_closed_connection():
         cur.execute(create_sql)
 
         with sql.pandasSQL_builder(conn) as pandas_sql:
-            pandas_sql.execute('INSERT INTO test VALUES("foo", "bar", 1.234)')
+            pandas_sql.execute("INSERT INTO test VALUES('foo', 'bar', 1.234)")
 
     msg = "Cannot operate on a closed database."
     with pytest.raises(sqlite3.ProgrammingError, match=msg):