Skip to content

Commit 5cb76d0

Browse files
authored
Merge branch 'main' into string-dtype-object-conversion
2 parents 5fbde9f + e1a79b2 commit 5cb76d0

File tree

13 files changed

+385
-59
lines changed

13 files changed

+385
-59
lines changed

.github/actions/build_pandas/action.yml

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,13 @@ runs:
2222
fi
2323
shell: bash -el {0}
2424

25+
- name: Uninstall nomkl
26+
run: |
27+
if conda list nomkl | grep nomkl 1>/dev/null; then
28+
conda remove nomkl -y
29+
fi
30+
shell: bash -el {0}
31+
2532
- name: Build Pandas
2633
run: |
2734
if [[ ${{ inputs.editable }} == "true" ]]; then

.pre-commit-config.yaml

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ repos:
2323
hooks:
2424
- id: ruff
2525
args: [--exit-non-zero-on-fix]
26+
exclude: ^pandas/tests/frame/test_query_eval.py
2627
- id: ruff
2728
# TODO: remove autofix-only rules when they are checked by ruff
2829
name: ruff-selected-autofixes
@@ -31,7 +32,7 @@ repos:
3132
exclude: ^pandas/tests
3233
args: [--select, "ANN001,ANN2", --fix-only, --exit-non-zero-on-fix]
3334
- id: ruff-format
34-
exclude: ^scripts
35+
exclude: ^scripts|^pandas/tests/frame/test_query_eval.py
3536
- repo: https://github.com/jendrikseipp/vulture
3637
rev: 'v2.11'
3738
hooks:
@@ -85,6 +86,7 @@ repos:
8586
types: [text] # overwrite types: [rst]
8687
types_or: [python, rst]
8788
- id: rst-inline-touching-normal
89+
exclude: ^pandas/tests/frame/test_query_eval.py
8890
types: [text] # overwrite types: [rst]
8991
types_or: [python, rst]
9092
- repo: https://github.com/sphinx-contrib/sphinx-lint

ci/code_checks.sh

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -191,9 +191,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
191191
-i "pandas.Timestamp.nanosecond GL08" \
192192
-i "pandas.Timestamp.resolution PR02" \
193193
-i "pandas.Timestamp.second GL08" \
194-
-i "pandas.Timestamp.strptime PR01,SA01" \
195-
-i "pandas.Timestamp.timetz SA01" \
196-
-i "pandas.Timestamp.to_datetime64 SA01" \
197194
-i "pandas.Timestamp.tzinfo GL08" \
198195
-i "pandas.Timestamp.value GL08" \
199196
-i "pandas.Timestamp.year GL08" \

doc/source/whatsnew/v3.0.0.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -685,6 +685,7 @@ Other
685685
- Bug in :meth:`DataFrame.apply` where passing ``engine="numba"`` ignored ``args`` passed to the applied function (:issue:`58712`)
686686
- Bug in :meth:`DataFrame.eval` and :meth:`DataFrame.query` which caused an exception when using NumPy attributes via ``@`` notation, e.g., ``df.eval("@np.floor(a)")``. (:issue:`58041`)
687687
- Bug in :meth:`DataFrame.eval` and :meth:`DataFrame.query` which did not allow the use of the ``tan`` function. (:issue:`55091`)
688+
- Bug in :meth:`DataFrame.query` which raised an exception or produced incorrect results when expressions contained backtick-quoted column names containing the hash character ``#``, backticks, or characters that fall outside the ASCII range (U+0001..U+007F). (:issue:`59285`, :issue:`49633`)
688689
- Bug in :meth:`DataFrame.sort_index` when passing ``axis="columns"`` and ``ignore_index=True`` and ``ascending=False`` not returning a :class:`RangeIndex` columns (:issue:`57293`)
689690
- Bug in :meth:`DataFrame.transform` that was returning the wrong order unless the index was monotonically increasing. (:issue:`57069`)
690691
- Bug in :meth:`DataFrame.where` where using a non-bool type array in the function would return a ``ValueError`` instead of a ``TypeError`` (:issue:`56330`)

pandas/_libs/tslibs/nattype.pyx

Lines changed: 44 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -229,7 +229,17 @@ cdef class _NaT(datetime):
229229

230230
def to_datetime64(self) -> np.datetime64:
231231
"""
232-
Return a numpy.datetime64 object with same precision.
232+
Return a NumPy datetime64 object with same precision.
233+
234+
This method returns a numpy.datetime64 object with the same
235+
date and time information and precision as the pd.Timestamp object.
236+
237+
See Also
238+
--------
239+
numpy.datetime64 : Class to represent dates and times with high precision.
240+
Timestamp.to_numpy : Alias for this method.
241+
Timestamp.asm8 : Alias for this method.
242+
pd.to_datetime : Convert argument to datetime.
233243

234244
Examples
235245
--------
@@ -764,6 +774,19 @@ class NaTType(_NaT):
764774
"""
765775
Return time object with same time and tzinfo.
766776
777+
This method returns a datetime.time object with
778+
the time and tzinfo corresponding to the pd.Timestamp
779+
object, ignoring any information about the day/date.
780+
781+
See Also
782+
--------
783+
datetime.datetime.timetz : Return datetime.time object with the
784+
same time attributes as the datetime object.
785+
datetime.time : Class to represent the time of day, independent
786+
of any particular day.
787+
datetime.datetime.tzinfo : Attribute of datetime.datetime objects
788+
representing the timezone of the datetime object.
789+
767790
Examples
768791
--------
769792
>>> ts = pd.Timestamp('2023-01-01 10:00:00', tz='Europe/Brussels')
@@ -860,9 +883,27 @@ class NaTType(_NaT):
860883
strptime = _make_error_func(
861884
"strptime",
862885
"""
863-
Timestamp.strptime(string, format)
886+
Convert string argument to datetime.
864887
865-
Function is not implemented. Use pd.to_datetime().
888+
This method is not implemented; calling it will raise NotImplementedError.
889+
Use pd.to_datetime() instead.
890+
891+
Parameters
892+
----------
893+
date_string : str
894+
String to convert to a datetime.
895+
format : str, default None
896+
The format string to parse time, e.g. "%d/%m/%Y".
897+
898+
See Also
899+
--------
900+
pd.to_datetime : Convert argument to datetime.
901+
datetime.datetime.strptime : Return a datetime corresponding to a string
902+
representing a date and time, parsed according to a separate
903+
format string.
904+
datetime.datetime.strftime : Return a string representing the date and
905+
time, controlled by an explicit format string.
906+
Timestamp.isoformat : Return the time formatted according to ISO 8601.
866907
867908
Examples
868909
--------

pandas/_libs/tslibs/timestamps.pyx

Lines changed: 46 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1342,7 +1342,17 @@ cdef class _Timestamp(ABCTimestamp):
13421342
13431343
cpdef to_datetime64(self):
13441344
"""
1345-
Return a numpy.datetime64 object with same precision.
1345+
Return a NumPy datetime64 object with same precision.
1346+
1347+
This method returns a numpy.datetime64 object with the same
1348+
date and time information and precision as the pd.Timestamp object.
1349+
1350+
See Also
1351+
--------
1352+
numpy.datetime64 : Class to represent dates and times with high precision.
1353+
Timestamp.to_numpy : Alias for this method.
1354+
Timestamp.asm8 : Alias for this method.
1355+
pd.to_datetime : Convert argument to datetime.
13461356

13471357
Examples
13481358
--------
@@ -2093,6 +2103,19 @@ class Timestamp(_Timestamp):
20932103
"""
20942104
Return time object with same time and tzinfo.
20952105

2106+
This method returns a datetime.time object with
2107+
the time and tzinfo corresponding to the pd.Timestamp
2108+
object, ignoring any information about the day/date.
2109+
2110+
See Also
2111+
--------
2112+
datetime.datetime.timetz : Return datetime.time object with the
2113+
same time attributes as the datetime object.
2114+
datetime.time : Class to represent the time of day, independent
2115+
of any particular day.
2116+
datetime.datetime.tzinfo : Attribute of datetime.datetime objects
2117+
representing the timezone of the datetime object.
2118+
20962119
Examples
20972120
--------
20982121
>>> ts = pd.Timestamp('2023-01-01 10:00:00', tz='Europe/Brussels')
@@ -2141,9 +2164,27 @@ class Timestamp(_Timestamp):
21412164
@classmethod
21422165
def strptime(cls, date_string, format):
21432166
"""
2144-
Timestamp.strptime(string, format)
2167+
Convert string argument to datetime.
21452168

2146-
Function is not implemented. Use pd.to_datetime().
2169+
This method is not implemented; calling it will raise NotImplementedError.
2170+
Use pd.to_datetime() instead.
2171+
2172+
Parameters
2173+
----------
2174+
date_string : str
2175+
String to convert to a datetime.
2176+
format : str, default None
2177+
The format string to parse time, e.g. "%d/%m/%Y".
2178+
2179+
See Also
2180+
--------
2181+
pd.to_datetime : Convert argument to datetime.
2182+
datetime.datetime.strptime : Return a datetime corresponding to a string
2183+
representing a date and time, parsed according to a separate
2184+
format string.
2185+
datetime.datetime.strftime : Return a string representing the date and
2186+
time, controlled by an explicit format string.
2187+
Timestamp.isoformat : Return the time formatted according to ISO 8601.
21472188

21482189
Examples
21492190
--------
@@ -3073,7 +3114,8 @@ default 'raise'
30733114
"""
30743115
Convert Timestamp to a Julian Date.
30753116

3076-
0 Julian date is noon January 1, 4713 BC.
3117+
This method returns the number of days as a float since
3118+
0 Julian date, which is noon January 1, 4713 BC.
30773119

30783120
See Also
30793121
--------

pandas/core/computation/parsing.py

Lines changed: 115 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44

55
from __future__ import annotations
66

7+
from enum import Enum
78
from io import StringIO
89
from keyword import iskeyword
910
import token
@@ -32,13 +33,21 @@ def create_valid_python_identifier(name: str) -> str:
3233
------
3334
SyntaxError
3435
If the returned name is not a Python valid identifier, raise an exception.
35-
This can happen if there is a hashtag in the name, as the tokenizer will
36-
than terminate and not find the backtick.
37-
But also for characters that fall out of the range of (U+0001..U+007F).
3836
"""
3937
if name.isidentifier() and not iskeyword(name):
4038
return name
4139

40+
# Escape characters that fall outside the ASCII range (U+0001..U+007F).
41+
# GH 49633
42+
gen = (
43+
(c, "".join(chr(b) for b in c.encode("ascii", "backslashreplace")))
44+
for c in name
45+
)
46+
name = "".join(
47+
c_escaped.replace("\\", "_UNICODE_" if c != c_escaped else "_BACKSLASH_")
48+
for c, c_escaped in gen
49+
)
50+
4251
# Create a dict with the special characters and their replacement string.
4352
# EXACT_TOKEN_TYPES contains these special characters
4453
# token.tok_name contains a readable description of the replacement string.
@@ -54,11 +63,10 @@ def create_valid_python_identifier(name: str) -> str:
5463
"$": "_DOLLARSIGN_",
5564
"€": "_EUROSIGN_",
5665
"°": "_DEGREESIGN_",
57-
# Including quotes works, but there are exceptions.
5866
"'": "_SINGLEQUOTE_",
5967
'"': "_DOUBLEQUOTE_",
60-
# Currently not possible. Terminates parser and won't find backtick.
61-
# "#": "_HASH_",
68+
"#": "_HASH_",
69+
"`": "_BACKTICK_",
6270
}
6371
)
6472

@@ -127,6 +135,9 @@ def clean_column_name(name: Hashable) -> Hashable:
127135
which is not caught and propagates to the user level.
128136
"""
129137
try:
138+
# Escape backticks
139+
name = name.replace("`", "``") if isinstance(name, str) else name
140+
130141
tokenized = tokenize_string(f"`{name}`")
131142
tokval = next(tokenized)[1]
132143
return create_valid_python_identifier(tokval)
@@ -168,6 +179,91 @@ def tokenize_backtick_quoted_string(
168179
return BACKTICK_QUOTED_STRING, source[string_start:string_end]
169180

170181

182+
class ParseState(Enum):
183+
DEFAULT = 0
184+
IN_BACKTICK = 1
185+
IN_SINGLE_QUOTE = 2
186+
IN_DOUBLE_QUOTE = 3
187+
188+
189+
def _split_by_backtick(s: str) -> list[tuple[bool, str]]:
190+
"""
191+
Splits a str into substrings along backtick characters (`).
192+
193+
Disregards backticks inside quotes.
194+
195+
Parameters
196+
----------
197+
s : str
198+
The Python source code string.
199+
200+
Returns
201+
-------
202+
substrings: list[tuple[bool, str]]
203+
List of tuples, where each tuple has two elements:
204+
The first is a boolean indicating if the substring is backtick-quoted.
205+
The second is the actual substring.
206+
"""
207+
substrings = []
208+
substr: list[str] = [] # Will join into a string before adding to `substrings`
209+
i = 0
210+
parse_state = ParseState.DEFAULT
211+
while i < len(s):
212+
char = s[i]
213+
214+
match char:
215+
case "`":
216+
# start of a backtick-quoted string
217+
if parse_state == ParseState.DEFAULT:
218+
if substr:
219+
substrings.append((False, "".join(substr)))
220+
221+
substr = [char]
222+
i += 1
223+
parse_state = ParseState.IN_BACKTICK
224+
continue
225+
226+
elif parse_state == ParseState.IN_BACKTICK:
227+
# escaped backtick inside a backtick-quoted string
228+
next_char = s[i + 1] if (i != len(s) - 1) else None
229+
if next_char == "`":
230+
substr.append(char)
231+
substr.append(next_char)
232+
i += 2
233+
continue
234+
235+
# end of the backtick-quoted string
236+
else:
237+
substr.append(char)
238+
substrings.append((True, "".join(substr)))
239+
240+
substr = []
241+
i += 1
242+
parse_state = ParseState.DEFAULT
243+
continue
244+
case "'":
245+
# start of a single-quoted string
246+
if parse_state == ParseState.DEFAULT:
247+
parse_state = ParseState.IN_SINGLE_QUOTE
248+
# end of a single-quoted string
249+
elif (parse_state == ParseState.IN_SINGLE_QUOTE) and (s[i - 1] != "\\"):
250+
parse_state = ParseState.DEFAULT
251+
case '"':
252+
# start of a double-quoted string
253+
if parse_state == ParseState.DEFAULT:
254+
parse_state = ParseState.IN_DOUBLE_QUOTE
255+
# end of a double-quoted string
256+
elif (parse_state == ParseState.IN_DOUBLE_QUOTE) and (s[i - 1] != "\\"):
257+
parse_state = ParseState.DEFAULT
258+
substr.append(char)
259+
i += 1
260+
261+
if substr:
262+
substrings.append((False, "".join(substr)))
263+
264+
return substrings
265+
266+
171267
def tokenize_string(source: str) -> Iterator[tuple[int, str]]:
172268
"""
173269
Tokenize a Python source code string.
@@ -182,18 +278,19 @@ def tokenize_string(source: str) -> Iterator[tuple[int, str]]:
182278
tok_generator : Iterator[Tuple[int, str]]
183279
An iterator yielding all tokens with only toknum and tokval (Tuple[int, str]).
184280
"""
281+
# GH 59285
282+
# Escape characters, including backticks
283+
source = "".join(
284+
(
285+
create_valid_python_identifier(substring[1:-1])
286+
if is_backtick_quoted
287+
else substring
288+
)
289+
for is_backtick_quoted, substring in _split_by_backtick(source)
290+
)
291+
185292
line_reader = StringIO(source).readline
186293
token_generator = tokenize.generate_tokens(line_reader)
187294

188-
# Loop over all tokens till a backtick (`) is found.
189-
# Then, take all tokens till the next backtick to form a backtick quoted string
190-
for toknum, tokval, start, _, _ in token_generator:
191-
if tokval == "`":
192-
try:
193-
yield tokenize_backtick_quoted_string(
194-
token_generator, source, string_start=start[1] + 1
195-
)
196-
except Exception as err:
197-
raise SyntaxError(f"Failed to parse backticks in '{source}'.") from err
198-
else:
199-
yield toknum, tokval
295+
for toknum, tokval, _, _, _ in token_generator:
296+
yield toknum, tokval

0 commit comments

Comments
 (0)