Escape and unescape hash (#), single-quote, and double-quote characters in column names for DataFrame.query

aram-cedarwood · aram-cedarwood · commit a251639c7529 · 2024-07-27T18:56:39.000+02:00
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -34,6 +34,7 @@
     cast,
     overload,
 )
+import urllib.parse
 import warnings
 
 import numpy as np
@@ -4554,14 +4555,8 @@ def query(self, expr: str, *, inplace: bool = False, **kwargs) -> DataFrame | No
         For other characters that fall outside the ASCII range (U+0001..U+007F)
         and those that are not further specified in PEP 3131,
         the query parser will raise an error.
-        This excludes whitespace different than the space character,
-        but also the hashtag (as it is used for comments) and the backtick
-        itself (backtick can also not be escaped).
-
-        In a special case, quotes that make a pair around a backtick can
-        confuse the parser.
-        For example, ```it's` > `that's``` will raise an error,
-        as it forms a quoted string (``'s > `that'``) with a backtick inside.
+        This excludes whitespace different than the space character
+        and the backtick itself (backtick cannot be escaped).
 
         See also the `Python documentation about lexical analysis
         <https://docs.python.org/3/reference/lexical_analysis.html>`__
@@ -4615,7 +4610,35 @@ def query(self, expr: str, *, inplace: bool = False, **kwargs) -> DataFrame | No
             raise ValueError(msg)
         kwargs["level"] = kwargs.pop("level", 0) + 1
         kwargs["target"] = None
-        res = self.eval(expr, **kwargs)
+
+        # GH 59285
+        if any(("#" in col) or ("'" in col) or ('"' in col) for col in self.columns):
+            # Create a copy of `self` with column names escaped
+            escaped_self = self.copy()
+            escaped_self.columns = [
+                urllib.parse.quote(col) for col in escaped_self.columns
+            ]
+
+            # In expr, escape column names between backticks
+            column_name_to_escaped_name = {
+                col: urllib.parse.quote(col) for col in self.columns
+            }
+            escaped_expr = "`".join(
+                (column_name_to_escaped_name.get(token, token) if (i % 2) else token)
+                for i, token in enumerate(expr.split("`"))
+            )
+
+            # eval
+            escaped_res = escaped_self.eval(escaped_expr, **kwargs)
+
+            # If `res` is a Series or DataFrame, unescape names
+            res = escaped_res.copy()
+            if isinstance(res, Series) and res.name:
+                res.name = urllib.parse.unquote(res.name)
+            elif isinstance(res, DataFrame):
+                res.columns = [urllib.parse.unquote(col) for col in res.columns]
+        else:
+            res = self.eval(expr, **kwargs)
 
         try:
             result = self.loc[res]
diff --git a/pandas/tests/computation/test_eval.py b/pandas/tests/computation/test_eval.py
@@ -1978,6 +1978,64 @@ def test_eval_no_support_column_name(request, column):
     tm.assert_frame_equal(result, expected)
 
 
+def test_query_on_column_name_with_hashtag_character():
+    # GH 59285: "#" inside a backtick-quoted name is part of the column name
+    df = DataFrame((1, 2, 3), columns=["a#"])
+    result = df.query("`a#` < 2")
+    expected = df[df["a#"] < 2]
+    tm.assert_frame_equal(result, expected)
+
+
+def test_query_on_expr_with_comment():
+    # GH 59285: a trailing "# ..." after the expression must not change the result
+    df = DataFrame((1, 2, 3), columns=["a#"])
+    result = df.query("`a#` < 2  # This is a comment")
+    expected = df[df["a#"] < 2]
+    tm.assert_frame_equal(result, expected)
+
+
+def test_query_on_column_names_with_single_quote_character():
+    df = DataFrame(
+        [
+            {"it's": 1, "that's": 2},
+            {"it's": 3, "that's": 4},
+            {"it's": -1, "that's": -2},
+            {"it's": -3, "that's": -4},
+        ]
+    )
+    result = df.query("`it's` < `that's`")  # the two "'" must not pair up
+    expected = df[df["it's"] < df["that's"]]
+    tm.assert_frame_equal(result, expected)
+
+
+def test_query_on_column_names_with_double_quote_character():
+    df = DataFrame(
+        [
+            {'it"s': 1, 'that"s': 2},
+            {'it"s': 3, 'that"s': 4},
+            {'it"s': -1, 'that"s': -2},
+            {'it"s': -3, 'that"s': -4},
+        ]
+    )
+    result = df.query('`it"s` < `that"s`')  # same check with '"' in both names
+    expected = df[df['it"s'] < df['that"s']]
+    tm.assert_frame_equal(result, expected)
+
+
+def test_query_on_column_names_with_single_quote_and_double_quote_character():
+    df = DataFrame(
+        [
+            {"it's": 1, 'that\'s "nice"': 2},
+            {"it's": 3, 'that\'s "nice"': 4},
+            {"it's": -1, 'that\'s "nice"': -2},
+            {"it's": -3, 'that\'s "nice"': -4},
+        ]
+    )
+    result = df.query("`it's` < `that's \"nice\"`")  # both quote kinds at once
+    expected = df[df["it's"] < df['that\'s "nice"']]
+    tm.assert_frame_equal(result, expected)
+
+
 def test_set_inplace():
     # https://github.com/pandas-dev/pandas/issues/47449
     # Ensure we don't only update the DataFrame inplace, but also the actual