Merge branch 'main' into bug#60723

Anurag-Varma · web-flow · commit bf40bb52964d · 2025-03-06T17:23:54.000-05:00
diff --git a/Dockerfile b/Dockerfile
@@ -13,5 +13,5 @@ COPY requirements-dev.txt /tmp
 RUN python -m pip install -r /tmp/requirements-dev.txt
 RUN git config --global --add safe.directory /home/pandas
 
-ENV SHELL "/bin/bash"
+ENV SHELL="/bin/bash"
 CMD ["/bin/bash"]
diff --git a/doc/source/getting_started/overview.rst b/doc/source/getting_started/overview.rst
@@ -174,3 +174,4 @@ License
 -------
 
 .. literalinclude:: ../../../LICENSE
+   :language: none
diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst
@@ -18,10 +18,10 @@ The pandas I/O API is a set of top level ``reader`` functions accessed like
     :widths: 30, 100, 60, 60
 
     text,`CSV <https://en.wikipedia.org/wiki/Comma-separated_values>`__, :ref:`read_csv<io.read_csv_table>`, :ref:`to_csv<io.store_in_csv>`
-    text,Fixed-Width Text File, :ref:`read_fwf<io.fwf_reader>` , NA
+    text,Fixed-Width Text File, :ref:`read_fwf<io.fwf_reader>`, NA
     text,`JSON <https://www.json.org/>`__, :ref:`read_json<io.json_reader>`, :ref:`to_json<io.json_writer>`
     text,`HTML <https://en.wikipedia.org/wiki/HTML>`__, :ref:`read_html<io.read_html>`, :ref:`to_html<io.html>`
-    text,`LaTeX <https://en.wikipedia.org/wiki/LaTeX>`__, :ref:`Styler.to_latex<io.latex>` , NA
+    text,`LaTeX <https://en.wikipedia.org/wiki/LaTeX>`__, NA, :ref:`Styler.to_latex<io.latex>`
     text,`XML <https://www.w3.org/standards/xml/core>`__, :ref:`read_xml<io.read_xml>`, :ref:`to_xml<io.xml>`
     text, Local clipboard, :ref:`read_clipboard<io.clipboard>`, :ref:`to_clipboard<io.clipboard>`
     binary,`MS Excel <https://en.wikipedia.org/wiki/Microsoft_Excel>`__ , :ref:`read_excel<io.excel_reader>`, :ref:`to_excel<io.excel_writer>`
diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst
@@ -718,6 +718,7 @@ I/O
 ^^^
 - Bug in :class:`DataFrame` and :class:`Series` ``repr`` of :py:class:`collections.abc.Mapping` elements. (:issue:`57915`)
 - Bug in :meth:`.DataFrame.to_json` when ``"index"`` was a value in the :attr:`DataFrame.columns` and :attr:`Index.name` was ``None``. Now, this will fail with a ``ValueError`` (:issue:`58925`)
+- Bug in :meth:`.io.common.is_fsspec_url` not recognizing chained fsspec URLs (:issue:`48978`)
 - Bug in :meth:`DataFrame._repr_html_` which ignored the ``"display.float_format"`` option (:issue:`59876`)
 - Bug in :meth:`DataFrame.from_records` where ``columns`` parameter with numpy structured array was not reordering and filtering out the columns (:issue:`59717`)
 - Bug in :meth:`DataFrame.to_dict` raises unnecessary ``UserWarning`` when columns are not unique and ``orient='tight'``. (:issue:`58281`)
diff --git a/pandas/_libs/tslibs/timedeltas.pyx b/pandas/_libs/tslibs/timedeltas.pyx
@@ -1740,7 +1740,8 @@ cdef class _Timedelta(timedelta):
         Format the Timedelta as ISO 8601 Duration.
 
         ``P[n]Y[n]M[n]DT[n]H[n]M[n]S``, where the ``[n]`` s are replaced by the
-        values. See https://en.wikipedia.org/wiki/ISO_8601#Durations.
+        values. See Wikipedia:
+        `ISO 8601 § Durations <https://en.wikipedia.org/wiki/ISO_8601#Durations>`_.
 
         Returns
         -------
diff --git a/pandas/_libs/tslibs/timestamps.pyx b/pandas/_libs/tslibs/timestamps.pyx
@@ -1309,7 +1309,7 @@ cdef class _Timestamp(ABCTimestamp):
         By default, the fractional part is omitted if self.microsecond == 0
         and self._nanosecond == 0.
 
-        If self.tzinfo is not None, the UTC offset is also attached, giving
+        If self.tzinfo is not None, the UTC offset is also attached,
         giving a full format of 'YYYY-MM-DD HH:MM:SS.mmmmmmnnn+HH:MM'.
 
         Parameters
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -5880,6 +5880,8 @@ def set_index(
             Delete columns to be used as the new index.
         append : bool, default False
             Whether to append columns to existing index.
+            Setting to True will add the new columns to the existing index.
+            When set to False, the current index will be dropped from the DataFrame.
         inplace : bool, default False
             Whether to modify the DataFrame rather than creating a new one.
         verify_integrity : bool, default False
@@ -5953,6 +5955,25 @@ def set_index(
         2 4       4  2014    40
         3 9       7  2013    84
         4 16     10  2014    31
+
+        Append a column to the existing index:
+
+        >>> df = df.set_index("month")
+        >>> df.set_index("year", append=True)
+                      sale
+        month  year
+        1      2012    55
+        4      2014    40
+        7      2013    84
+        10     2014    31
+
+        >>> df.set_index("year", append=False)
+               sale
+        year
+        2012    55
+        2014    40
+        2013    84
+        2014    31
         """
         inplace = validate_bool_kwarg(inplace, "inplace")
         self._check_inplace_and_allows_duplicate_labels(inplace)
@@ -10265,7 +10286,9 @@ def apply(
         either the DataFrame's index (``axis=0``) or the DataFrame's columns
         (``axis=1``). By default (``result_type=None``), the final return type
         is inferred from the return type of the applied function. Otherwise,
-        it depends on the `result_type` argument.
+        it depends on the `result_type` argument. The return type of the applied
+        function is inferred based on the first computed result obtained after
+        applying the function to a Series object.
 
         Parameters
         ----------
diff --git a/pandas/io/common.py b/pandas/io/common.py
@@ -71,7 +71,7 @@
 
 _VALID_URLS = set(uses_relative + uses_netloc + uses_params)
 _VALID_URLS.discard("")
-_RFC_3986_PATTERN = re.compile(r"^[A-Za-z][A-Za-z0-9+\-+.]*://")
+_FSSPEC_URL_PATTERN = re.compile(r"^[A-Za-z][A-Za-z0-9+\-+.]*(::[A-Za-z0-9+\-+.]+)*://")
 
 BaseBufferT = TypeVar("BaseBufferT", bound=BaseBuffer)
 
@@ -291,7 +291,7 @@ def is_fsspec_url(url: FilePath | BaseBuffer) -> bool:
     """
     return (
         isinstance(url, str)
-        and bool(_RFC_3986_PATTERN.match(url))
+        and bool(_FSSPEC_URL_PATTERN.match(url))
         and not url.startswith(("http://", "https://"))
     )
 
diff --git a/pandas/io/json/_normalize.py b/pandas/io/json/_normalize.py
@@ -147,7 +147,7 @@ def nested_to_record(
     return new_ds
 
 
-def _normalise_json(
+def _normalize_json(
     data: Any,
     key_string: str,
     normalized_dict: dict[str, Any],
@@ -177,7 +177,7 @@ def _normalise_json(
             if not key_string:
                 new_key = new_key.removeprefix(separator)
 
-            _normalise_json(
+            _normalize_json(
                 data=value,
                 key_string=new_key,
                 normalized_dict=normalized_dict,
@@ -188,7 +188,7 @@ def _normalise_json(
     return normalized_dict
 
 
-def _normalise_json_ordered(data: dict[str, Any], separator: str) -> dict[str, Any]:
+def _normalize_json_ordered(data: dict[str, Any], separator: str) -> dict[str, Any]:
     """
     Order the top level keys and then recursively go to depth
 
@@ -201,10 +201,10 @@ def _normalise_json_ordered(data: dict[str, Any], separator: str) -> dict[str, A
 
     Returns
     -------
-    dict or list of dicts, matching `normalised_json_object`
+    dict or list of dicts, matching `normalized_json_object`
     """
     top_dict_ = {k: v for k, v in data.items() if not isinstance(v, dict)}
-    nested_dict_ = _normalise_json(
+    nested_dict_ = _normalize_json(
         data={k: v for k, v in data.items() if isinstance(v, dict)},
         key_string="",
         normalized_dict={},
@@ -235,7 +235,7 @@ def _simple_json_normalize(
     Returns
     -------
     frame : DataFrame
-    d - dict or list of dicts, matching `normalised_json_object`
+    d - dict or list of dicts, matching `normalized_json_object`
 
     Examples
     --------
@@ -256,14 +256,14 @@ def _simple_json_normalize(
 }
 
     """
-    normalised_json_object = {}
+    normalized_json_object = {}
     # expect a dictionary, as most jsons are. However, lists are perfectly valid
     if isinstance(ds, dict):
-        normalised_json_object = _normalise_json_ordered(data=ds, separator=sep)
+        normalized_json_object = _normalize_json_ordered(data=ds, separator=sep)
     elif isinstance(ds, list):
-        normalised_json_list = [_simple_json_normalize(row, sep=sep) for row in ds]
-        return normalised_json_list
-    return normalised_json_object
+        normalized_json_list = [_simple_json_normalize(row, sep=sep) for row in ds]
+        return normalized_json_list
+    return normalized_json_object
 
 
 def json_normalize(
diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py
@@ -1753,6 +1753,7 @@ def test_read_timezone_information(self):
         [
             "s3://example-fsspec/",
             "gcs://another-fsspec/file.json",
+            "filecache::s3://yet-another-fsspec/file.json",
             "https://example-site.com/data",
             "some-protocol://data.txt",
         ],
diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py
@@ -501,6 +501,18 @@ def test_is_fsspec_url():
     assert icom.is_fsspec_url("RFC-3986+compliant.spec://something")
 
 
+def test_is_fsspec_url_chained():
+    # GH#48978 Support chained fsspec URLs
+    # See https://filesystem-spec.readthedocs.io/en/latest/features.html#url-chaining.
+    assert icom.is_fsspec_url("filecache::s3://pandas/test.csv")
+    assert icom.is_fsspec_url("zip://test.csv::filecache::gcs://bucket/file.zip")
+    assert icom.is_fsspec_url("filecache::zip://test.csv::gcs://bucket/file.zip")
+    assert icom.is_fsspec_url("filecache::dask::s3://pandas/test.csv")
+    assert not icom.is_fsspec_url("filecache:s3://pandas/test.csv")
+    assert not icom.is_fsspec_url("filecache:::s3://pandas/test.csv")
+    assert not icom.is_fsspec_url("filecache::://pandas/test.csv")
+
+
 @pytest.mark.parametrize("encoding", [None, "utf-8"])
 @pytest.mark.parametrize("format", ["csv", "json"])
 def test_codecs_encoding(encoding, format):
diff --git a/web/pandas/community/ecosystem.md b/web/pandas/community/ecosystem.md
@@ -590,7 +590,7 @@ df = pd.read_csv("big.csv")  # use all your cores!
 ### [Pandarallel](https://github.com/nalepae/pandarallel)
 
 Pandarallel provides a simple way to parallelize your pandas operations on all your CPUs by changing only one line of code.
-If also displays progress bars.
+It also displays progress bars.
 
 ```python
 from pandarallel import pandarallel

Original file line number	Diff line number	Diff line change
`@@ -174,3 +174,4 @@ License`
`174`	`174`	`-------`
`175`	`175`
`176`	`176`	`.. literalinclude:: ../../../LICENSE`
	`177`	`+ :language: none`
Original file line number	Diff line number	Diff line change
`@@ -1753,6 +1753,7 @@ def test_read_timezone_information(self):`
`1753`	`1753`	`[`
`1754`	`1754`	`"s3://example-fsspec/",`
`1755`	`1755`	`"gcs://another-fsspec/file.json",`
	`1756`	`+ "filecache::s3://yet-another-fsspec/file.json",`
`1756`	`1757`	`"https://example-site.com/data",`
`1757`	`1758`	`"some-protocol://data.txt",`
`1758`	`1759`	`],`