From 9af396d09bba5d1c96e59036982059d53e95727b Mon Sep 17 00:00:00 2001 From: Sergey B Kirpichev Date: Sun, 7 Jan 2024 07:52:57 +0300 Subject: [PATCH 1/2] gh-114667: Support hexadecimal floating point literals MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This adds hexadecimal floating point literals (IEEE 754-2008 §5.12.3) and supports construction of floats from hexadecimal strings. Note that the syntax is more permissive: it accepts everything that is currently accepted by ``float.fromhex()``, but with a mandatory base specifier; it also allows grouping digits with underscores. Examples: ```pycon >>> 0x1.1p-1 0.53125 >>> float('0x1.1') 1.0625 >>> 0x1.1 1.0625 >>> 0x1.1_1_1 1.066650390625 ``` Added compatibility code to not break access of existing int attributes. E.g. 0x1.bit_length() will not require parentheses around the hexadecimal integer literal (unlike decimal int literals, where 1.bit_length() is a syntax error). Minor changes: Py_ISDIGIT/ISXDIGIT macros were transformed to functions. --- Doc/library/functions.rst | 14 ++- Doc/reference/lexical_analysis.rst | 12 +++ Doc/tutorial/floatingpoint.rst | 2 +- Include/cpython/pyctype.h | 10 +- Include/internal/pycore_floatobject.h | 1 + Lib/test/support/numbers.py | 25 ++++- Lib/test/test_float.py | 18 ++-- Lib/test/test_grammar.py | 35 +++++- Lib/test/test_tokenize.py | 10 ++ Lib/tokenize.py | 5 +- ...-01-28-08-17-08.gh-issue-114667.8w_l9I.rst | 3 + Objects/floatobject.c | 100 ++++++++++++------ Parser/lexer/lexer.c | 83 +++++++++++---- Python/dtoa.c | 4 + Python/pystrtod.c | 39 ++++--- 15 files changed, 277 insertions(+), 84 deletions(-) create mode 100644 Misc/NEWS.d/next/Core_and_Builtins/2024-01-28-08-17-08.gh-issue-114667.8w_l9I.rst diff --git a/Doc/library/functions.rst b/Doc/library/functions.rst index 857b40f3ba155c..2cf9e286147235 100644 --- a/Doc/library/functions.rst +++ b/Doc/library/functions.rst @@ -770,7 +770,8 @@ are always available. They are listed here in alphabetical order. 
>>> float('-Infinity') -inf - If the argument is a string, it should contain a decimal number, optionally + If the argument is a string, it should contain a decimal number + or a hexadecimal number, optionally preceded by a sign, and optionally embedded in whitespace. The optional sign may be ``'+'`` or ``'-'``; a ``'+'`` sign has no effect on the value produced. The argument may also be a string representing a NaN @@ -787,12 +788,16 @@ are always available. They are listed here in alphabetical order. digitpart: `digit` (["_"] `digit`)* number: [`digitpart`] "." `digitpart` | `digitpart` ["."] exponent: ("e" | "E") [`sign`] `digitpart` - floatnumber: `number` [`exponent`] + floatnumber: (`number` [`exponent`]) | `hexfloatnumber` absfloatvalue: `floatnumber` | `infinity` | `nan` floatvalue: [`sign`] `absfloatvalue` + hexfloatnumber: `~python-grammar:hexinteger` | `~python-grammar:hexfraction` | `~python-grammar:hexfloat` Case is not significant, so, for example, "inf", "Inf", "INFINITY", and - "iNfINity" are all acceptable spellings for positive infinity. + "iNfINity" are all acceptable spellings for positive infinity. Note also + that the exponent of a hexadecimal floating point number is written in + decimal, and that it gives the power of 2 by which to multiply the + coefficient. Otherwise, if the argument is an integer or a floating-point number, a floating-point number with the same value (within Python's floating-point @@ -818,6 +823,9 @@ are always available. They are listed here in alphabetical order. .. versionchanged:: 3.8 Falls back to :meth:`~object.__index__` if :meth:`~object.__float__` is not defined. + .. versionchanged:: next + Added support for hexadecimal floating-point numbers. + .. 
index:: single: __format__ diff --git a/Doc/reference/lexical_analysis.rst b/Doc/reference/lexical_analysis.rst index e320eedfa67a27..f69bf024c4411e 100644 --- a/Doc/reference/lexical_analysis.rst +++ b/Doc/reference/lexical_analysis.rst @@ -1265,6 +1265,9 @@ The ``e`` or ``E`` represents "times ten raised to the power of":: 1.166e-5 # (represents 1.166×10⁻⁵, or 0.00001166) 6.02214076e+23 # (represents 6.02214076×10²³, or 602214076000000000000000.) +The exponent of a hexadecimal floating point literal is written in decimal, and +it gives the power of 2 by which to multiply the coefficient. + In floats with only integer and exponent parts, the decimal point may be omitted:: @@ -1281,12 +1284,21 @@ lexical definitions: | `digitpart` "." [`digitpart`] [`exponent`] | "." `digitpart` [`exponent`] | `digitpart` `exponent` + | `hexfloat` digitpart: `digit` (["_"] `digit`)* exponent: ("e" | "E") ["+" | "-"] `digitpart` + hexfloat: ("0x" | "0X") ["_"] (`hexdigitpart` | `hexpointfloat`) [`binexponent`] + hexpointfloat: [`hexdigitpart`] `hexfraction` | `hexdigitpart` "." + hexfraction: "." `hexdigitpart` + hexdigitpart: `hexdigit` (["_"] `hexdigit`)* + binexponent: ("p" | "P") ["+" | "-"] `digitpart` .. versionchanged:: 3.6 Underscores are now allowed for grouping purposes in literals. +.. versionchanged:: next + Added support for hexadecimal floating-point literals. + .. index:: single: j; in numeric literal diff --git a/Doc/tutorial/floatingpoint.rst b/Doc/tutorial/floatingpoint.rst index dfe2d1d3a8378f..44baeee12d1165 100644 --- a/Doc/tutorial/floatingpoint.rst +++ b/Doc/tutorial/floatingpoint.rst @@ -210,7 +210,7 @@ the float value exactly: .. 
doctest:: - >>> x == float.fromhex('0x1.921f9f01b866ep+1') + >>> x == 0x1.921f9f01b866ep+1 True Since the representation is exact, it is useful for reliably porting values diff --git a/Include/cpython/pyctype.h b/Include/cpython/pyctype.h index 729d93275e6c53..71c870080fe5ad 100644 --- a/Include/cpython/pyctype.h +++ b/Include/cpython/pyctype.h @@ -21,11 +21,17 @@ PyAPI_DATA(const unsigned int) _Py_ctype_table[256]; #define Py_ISLOWER(c) (_Py_ctype_table[Py_CHARMASK(c)] & PY_CTF_LOWER) #define Py_ISUPPER(c) (_Py_ctype_table[Py_CHARMASK(c)] & PY_CTF_UPPER) #define Py_ISALPHA(c) (_Py_ctype_table[Py_CHARMASK(c)] & PY_CTF_ALPHA) -#define Py_ISDIGIT(c) (_Py_ctype_table[Py_CHARMASK(c)] & PY_CTF_DIGIT) -#define Py_ISXDIGIT(c) (_Py_ctype_table[Py_CHARMASK(c)] & PY_CTF_XDIGIT) #define Py_ISALNUM(c) (_Py_ctype_table[Py_CHARMASK(c)] & PY_CTF_ALNUM) #define Py_ISSPACE(c) (_Py_ctype_table[Py_CHARMASK(c)] & PY_CTF_SPACE) +static inline int Py_ISDIGIT(char c) { + return _Py_ctype_table[Py_CHARMASK(c)] & PY_CTF_DIGIT; +} + +static inline int Py_ISXDIGIT(char c) { + return _Py_ctype_table[Py_CHARMASK(c)] & PY_CTF_XDIGIT; +} + PyAPI_DATA(const unsigned char) _Py_ctype_tolower[256]; PyAPI_DATA(const unsigned char) _Py_ctype_toupper[256]; diff --git a/Include/internal/pycore_floatobject.h b/Include/internal/pycore_floatobject.h index 317f984188bad8..0e513cc897f906 100644 --- a/Include/internal/pycore_floatobject.h +++ b/Include/internal/pycore_floatobject.h @@ -42,6 +42,7 @@ extern double _Py_parse_inf_or_nan(const char *p, char **endptr); extern int _Py_convert_int_to_double(PyObject **v, double *dbl); +extern double _Py_dg_strtod_hex(const char *str, char **ptr); #ifdef __cplusplus } diff --git a/Lib/test/support/numbers.py b/Lib/test/support/numbers.py index d5dbb41acebc38..ce7bde75495b9f 100644 --- a/Lib/test/support/numbers.py +++ b/Lib/test/support/numbers.py @@ -24,6 +24,16 @@ '.1_4j', '(1_2.5+3_3j)', '(.5_6j)', + '0x_.1p1', + '0X_.1p1', + '0x1_1.p1', + '0x_1_1.p1', + 
'0x1.1_1p1', + '0x1.p1_1', + '0xa.p1', + '0x.ap1', + '0xa_c.p1', + '0x.a_cp1', ] INVALID_UNDERSCORE_LITERALS = [ # Trailing underscores: @@ -35,6 +45,8 @@ '0xf_', '0o5_', '0 if 1_Else 1', + '0x1p1_', + '0x1.1p1_', # Underscores in the base selector: '0_b0', '0_xf', @@ -52,17 +64,23 @@ '0o5__77', '1e1__0', '1e1__0j', + '0x1__1.1p1', # Underscore right before a dot: '1_.4', '1_.4j', + '0x1_.p1', + '0xa_.p1', # Underscore right after a dot: '1._4', '1._4j', '._5', '._5j', + '0x1._p1', + '0xa._p1', # Underscore right after a sign: '1.0e+_1', '1.0e+_1j', + '0x1.1p+_1', # Underscore right before j: '1.4_j', '1.4e5_j', @@ -70,10 +88,15 @@ '1_e1', '1.4_e1', '1.4_e1j', - # Underscore right after e: + '0x1.1p1_j', + # Underscore right after e or p: '1e_1', '1.4e_1', '1.4e_1j', + '0x1_p1', + '0x1_P1', + '0x1.1_p1', + '0x1.1_P1', # Complex cases with parens: '(1+1.5_j_)', '(1+1.5_j)', diff --git a/Lib/test/test_float.py b/Lib/test/test_float.py index 00518abcb11b46..70b5bc6ca83f47 100644 --- a/Lib/test/test_float.py +++ b/Lib/test/test_float.py @@ -63,9 +63,9 @@ def test_float(self): self.assertEqual(float(3.14), 3.14) self.assertEqual(float(314), 314.0) self.assertEqual(float(" 3.14 "), 3.14) - self.assertRaises(ValueError, float, " 0x3.1 ") - self.assertRaises(ValueError, float, " -0x3.p-1 ") - self.assertRaises(ValueError, float, " +0x3.p-1 ") + self.assertEqual(float(" 0x3.1 "), 3.0625) + self.assertEqual(float(" -0x3.p-1 "), -1.5) + self.assertEqual(float(" +0x3.p-1 "), 1.5) self.assertRaises(ValueError, float, "++3.14") self.assertRaises(ValueError, float, "+-3.14") self.assertRaises(ValueError, float, "-+3.14") @@ -95,13 +95,13 @@ def test_noargs(self): def test_underscores(self): for lit in VALID_UNDERSCORE_LITERALS: - if not any(ch in lit for ch in 'jJxXoObB'): + if not any(ch in lit for ch in 'jJoObB'): self.assertEqual(float(lit), eval(lit)) self.assertEqual(float(lit), float(lit.replace('_', ''))) for lit in INVALID_UNDERSCORE_LITERALS: if lit in ('0_7', '09_99'): 
# octals are not recognized here continue - if not any(ch in lit for ch in 'jJxXoObB'): + if not any(ch in lit for ch in 'jJoObB'): self.assertRaises(ValueError, float, lit) # Additional test cases; nan and inf are never valid as literals, # only in the float() constructor, but we don't allow underscores @@ -198,9 +198,9 @@ def test_float_with_comma(self): self.assertRaises(ValueError, float, " 3,14 ") self.assertRaises(ValueError, float, " +3,14 ") self.assertRaises(ValueError, float, " -3,14 ") - self.assertRaises(ValueError, float, " 0x3.1 ") - self.assertRaises(ValueError, float, " -0x3.p-1 ") - self.assertRaises(ValueError, float, " +0x3.p-1 ") + self.assertEqual(float(" 0x3.1 "), 3.0625) + self.assertEqual(float(" -0x3.p-1 "), -1.5) + self.assertEqual(float(" +0x3.p-1 "), 1.5) self.assertEqual(float(" 25.e-1 "), 2.5) self.assertAlmostEqual(float(" .25e-1 "), .025) @@ -1559,7 +1559,7 @@ def roundtrip(x): except OverflowError: pass else: - self.identical(x, fromHex(toHex(x))) + self.identical(x, roundtrip(x)) def test_subclass(self): class F(float): diff --git a/Lib/test/test_grammar.py b/Lib/test/test_grammar.py index 7f5d48b9c63ab7..818aff9db3eaf2 100644 --- a/Lib/test/test_grammar.py +++ b/Lib/test/test_grammar.py @@ -74,6 +74,15 @@ def test_plain_integers(self): else: self.fail('Weird maxsize value %r' % maxsize) + def test_attrs_on_hexintegers(self): + good_meth = [m for m in dir(int) if not m.startswith('_')] + for m in good_meth: + self.assertEqual(eval('0x1.' + m), eval('(0x1).' 
+ m)) + self.check_syntax_error('0x1.spam', "invalid hexadecimal literal", + lineno=1, offset=4) + self.check_syntax_error('0x1.foo', "invalid hexadecimal literal", + lineno=1, offset=5) + def test_long_integers(self): x = 0 x = 0xffffffffffffffff @@ -97,6 +106,23 @@ def test_floats(self): x = 3.e14 x = .3e14 x = 3.1e4 + x = 0x1.2p1 + x = 0x1.2p+1 + x = 0x1.p1 + x = 0x1.p-1 + x = 0x1p0 + x = 0x1ap1 + x = 0x1P1 + x = 0x1cp2 + x = 0x1.p1 + x = 0x1.P1 + x = 0x001.1p2 + x = 0X1p1 + x = 0x1.1_1p1 + x = 0x1.1p1_1 + x = 0x1. + x = 0x1.1 + x = 0x.1 def test_float_exponent_tokenization(self): # See issue 21642. @@ -134,7 +160,14 @@ def test_bad_numerical_literals(self): "use an 0o prefix for octal integers") check("1.2_", "invalid decimal literal") check("1e2_", "invalid decimal literal") - check("1e+", "invalid decimal literal") + check("1e+", "invalid float literal") + check("0x.p", "invalid float literal") + check("0x_.p", "invalid float literal") + check("0x1.1p", "invalid float literal") + check("0x1.1_p", "invalid float literal") + check("0x1.1p_", "invalid float literal") + check("0xp", "invalid hexadecimal literal") + check("0xP", "invalid hexadecimal literal") def test_end_of_numerical_literals(self): def check(test, error=False): diff --git a/Lib/test/test_tokenize.py b/Lib/test/test_tokenize.py index 865e0c5b40ddd3..a8e5438424d345 100644 --- a/Lib/test/test_tokenize.py +++ b/Lib/test/test_tokenize.py @@ -273,6 +273,16 @@ def test_float(self): NAME 'x' (1, 0) (1, 1) OP '=' (1, 2) (1, 3) NUMBER '3.14e159' (1, 4) (1, 12) + """) + self.check_tokenize("x = 0x1p1", """\ + NAME 'x' (1, 0) (1, 1) + OP '=' (1, 2) (1, 3) + NUMBER '0x1p1' (1, 4) (1, 9) + """) + self.check_tokenize("x = 0x.1p1", """\ + NAME 'x' (1, 0) (1, 1) + OP '=' (1, 2) (1, 3) + NUMBER '0x.1p1' (1, 4) (1, 10) """) def test_underscore_literals(self): diff --git a/Lib/tokenize.py b/Lib/tokenize.py index 7e71755068e1df..82cae0d3e9da07 100644 --- a/Lib/tokenize.py +++ b/Lib/tokenize.py @@ -77,7 +77,10 @@ def 
maybe(*choices): return group(*choices) + '?' Pointfloat = group(r'[0-9](?:_?[0-9])*\.(?:[0-9](?:_?[0-9])*)?', r'\.[0-9](?:_?[0-9])*') + maybe(Exponent) Expfloat = r'[0-9](?:_?[0-9])*' + Exponent -Floatnumber = group(Pointfloat, Expfloat) +HexExponent = r'[pP][-+]?[0-9](?:_?[0-9])*' +Hexfloat = group(r'0[xX]_?[0-9a-f](?:_?[0-9a-f])*\.(?:[0-9a-f](?:_?[0-9a-f])*)?', + r'0[xX]_?\.[0-9a-f](?:_?[0-9a-f])*') + HexExponent +Floatnumber = group(Pointfloat, Expfloat, Hexfloat) Imagnumber = group(r'[0-9](?:_?[0-9])*[jJ]', Floatnumber + r'[jJ]') Number = group(Imagnumber, Floatnumber, Intnumber) diff --git a/Misc/NEWS.d/next/Core_and_Builtins/2024-01-28-08-17-08.gh-issue-114667.8w_l9I.rst b/Misc/NEWS.d/next/Core_and_Builtins/2024-01-28-08-17-08.gh-issue-114667.8w_l9I.rst new file mode 100644 index 00000000000000..c01c0cde3e1892 --- /dev/null +++ b/Misc/NEWS.d/next/Core_and_Builtins/2024-01-28-08-17-08.gh-issue-114667.8w_l9I.rst @@ -0,0 +1,3 @@ +Add hexadecimal floating point literals (IEEE 754-2008 §5.12.3) and support +construction of floats from hexadecimal strings. Patch by Sergey B +Kirpichev. 
diff --git a/Objects/floatobject.c b/Objects/floatobject.c index 93e1973d6b32fc..fb09d1b098083a 100644 --- a/Objects/floatobject.c +++ b/Objects/floatobject.c @@ -1247,11 +1247,64 @@ float_fromhex_impl(PyTypeObject *type, PyObject *string) /*[clinic end generated code: output=c54b4923552e5af5 input=0407bebd354bca89]*/ { PyObject *result; + Py_ssize_t length; + const char *s, *end, *last; + double x; + + s = PyUnicode_AsUTF8AndSize(string, &length); + if (s == NULL) { + return NULL; + } + last = s + length; + + while (Py_ISSPACE(*s)) { + s++; + } + while (s < last - 1 && Py_ISSPACE(last[-1])) { + last--; + } + + errno = 0; + x = _Py_dg_strtod_hex(s, (char **)&end); + + if (errno == ERANGE) { + PyErr_SetString(PyExc_OverflowError, + "hexadecimal value too large to represent as a float"); + return NULL; + } + + if (end != last) { + if (end != s && (*end && !Py_ISSPACE(*end))) { + PyErr_SetString(PyExc_ValueError, + "hexadecimal string too long to convert"); + return NULL; + } + /* Nothing parsed, maybe inf/nan? */ + x = _Py_parse_inf_or_nan(s, (char **)&end); + } + if (end != last || end == s) { + PyErr_SetString(PyExc_ValueError, + "invalid hexadecimal floating-point string"); + return NULL; + } + + result = PyFloat_FromDouble(x); + if (type != &PyFloat_Type && result != NULL) { + Py_SETREF(result, PyObject_CallOneArg((PyObject *)type, result)); + } + return result; +} + +double +_Py_dg_strtod_hex(const char *s00, char **se) +{ double x; long exp, top_exp, lsb, key_digit; - const char *s, *coeff_start, *s_store, *coeff_end, *exp_start, *s_end; + const char *coeff_start, *s_store, *coeff_end, *exp_start, *s = s00; int half_eps, digit, round_up, negate=0; - Py_ssize_t length, ndigits, fdigits, i; + Py_ssize_t ndigits, fdigits, i; + + *se = (char *)s00; /* * For the sake of simplicity and correctness, we impose an artificial @@ -1298,11 +1351,6 @@ float_fromhex_impl(PyTypeObject *type, PyObject *string) * exp+4*ndigits and exp-4*ndigits are within the range of a long. 
*/ - s = PyUnicode_AsUTF8AndSize(string, &length); - if (s == NULL) - return NULL; - s_end = s + length; - /******************** * Parse the string * ********************/ @@ -1311,13 +1359,6 @@ float_fromhex_impl(PyTypeObject *type, PyObject *string) while (Py_ISSPACE(*s)) s++; - /* infinities and nans */ - x = _Py_parse_inf_or_nan(s, (char **)&coeff_end); - if (coeff_end != s) { - s = coeff_end; - goto finished; - } - /* optional sign */ if (*s == '-') { s++; @@ -1356,8 +1397,10 @@ float_fromhex_impl(PyTypeObject *type, PyObject *string) if (ndigits == 0) goto parse_error; if (ndigits > Py_MIN(DBL_MIN_EXP - DBL_MANT_DIG - LONG_MIN/2, - LONG_MAX/2 + 1 - DBL_MAX_EXP)/4) + LONG_MAX/2 + 1 - DBL_MAX_EXP)/4) { + *se = (char*)coeff_end; goto insane_length_error; + } /* [p ] */ if (*s == 'p' || *s == 'P') { @@ -1456,31 +1499,20 @@ float_fromhex_impl(PyTypeObject *type, PyObject *string) x = ldexp(x, (int)(exp+4*key_digit)); finished: - /* optional trailing whitespace leading to the end of the string */ - while (Py_ISSPACE(*s)) - s++; - if (s != s_end) + if (*s && !Py_ISSPACE(*s)) goto parse_error; - result = PyFloat_FromDouble(negate ? -x : x); - if (type != &PyFloat_Type && result != NULL) { - Py_SETREF(result, PyObject_CallOneArg((PyObject *)type, result)); - } - return result; + *se = (char *)s; + errno = 0; + return negate ? 
-x : x; overflow_error: - PyErr_SetString(PyExc_OverflowError, - "hexadecimal value too large to represent as a float"); - return NULL; + errno = ERANGE; + return HUGE_VAL; parse_error: - PyErr_SetString(PyExc_ValueError, - "invalid hexadecimal floating-point string"); - return NULL; - insane_length_error: - PyErr_SetString(PyExc_ValueError, - "hexadecimal string too long to convert"); - return NULL; + errno = 0; + return 0.0; } /*[clinic input] diff --git a/Parser/lexer/lexer.c b/Parser/lexer/lexer.c index 81363cf8e810fe..a38f264c40aa47 100644 --- a/Parser/lexer/lexer.c +++ b/Parser/lexer/lexer.c @@ -410,21 +410,23 @@ verify_identifier(struct tok_state *tok) } static int -tok_decimal_tail(struct tok_state *tok) +tok_digits_tail(struct tok_state *tok, int base) { int c; + int (*_isdigit)(char) = base == 16 ? &Py_ISXDIGIT : &Py_ISDIGIT; while (1) { do { c = tok_nextc(tok); - } while (Py_ISDIGIT(c)); + } while ((*_isdigit)(c)); if (c != '_') { break; } c = tok_nextc(tok); - if (!Py_ISDIGIT(c)) { + if (!(*_isdigit)(c)) { tok_backup(tok, c); - _PyTokenizer_syntaxerror(tok, "invalid decimal literal"); + _PyTokenizer_syntaxerror(tok, "invalid %s literal", + base == 16 ? "hexadecimal" : "decimal"); return 0; } } @@ -856,20 +858,61 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t /* Hex, octal or binary -- maybe. 
*/ c = tok_nextc(tok); if (c == 'x' || c == 'X') { - /* Hex */ + /* Hex integer/float */ c = tok_nextc(tok); - do { - if (c == '_') { - c = tok_nextc(tok); - } + if (c == '_') { + c = tok_nextc(tok); + } + if (c == '.') { + c = tok_nextc(tok); if (!Py_ISXDIGIT(c)) { tok_backup(tok, c); - return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok, "invalid hexadecimal literal")); + return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok, "invalid float literal")); } - do { - c = tok_nextc(tok); - } while (Py_ISXDIGIT(c)); - } while (c == '_'); + goto hexfraction; + } + else if (!Py_ISXDIGIT(c)) { + tok_backup(tok, c); + return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok, "invalid hexadecimal literal")); + } + c = tok_digits_tail(tok, 16); + if (c == 0) { + return MAKE_TOKEN(ERRORTOKEN); + } + if (c == '.') { + c = tok_nextc(tok); + hexfraction: + /* Allow attribute access on hexadecimal integer literals for + * for existing public attributes on int's, e.g. 0x1.bit_length(). */ + if ((c == 'a' && lookahead(tok, "s_integer_ratio")) || + (c == 't' && lookahead(tok, "o_bytes")) || + (c == 'b' && (lookahead(tok, "it_count") || + lookahead(tok, "it_length"))) || + (c == 'c' && lookahead(tok, "onjugate")) || + (c == 'd' && lookahead(tok, "enominator")) || + (c == 'f' && lookahead(tok, "rom_bytes")) || + (c == 'i' && (lookahead(tok, "mag") || + lookahead(tok, "s_integer"))) || + (c == 'n' && lookahead(tok, "umerator")) || + (c == 'r' && lookahead(tok, "eal"))) + { + tok_backup(tok, c); + c = '.'; + goto hexint; + } + if (Py_ISXDIGIT(c)) { + c = tok_digits_tail(tok, 16); + if (c == 0) { + tok->done = E_OK; + _PyTokenizer_syntaxerror(tok, "invalid float literal"); + return MAKE_TOKEN(ERRORTOKEN); + } + } + } + if (c == 'p' || c == 'P') { + goto exponent; + } + hexint: if (!verify_end_of_number(tok, c, "hexadecimal")) { return MAKE_TOKEN(ERRORTOKEN); } @@ -950,7 +993,7 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t char* zeros_end = tok->cur; if (Py_ISDIGIT(c)) { 
nonzero = 1; - c = tok_decimal_tail(tok); + c = tok_digits_tail(tok, 10); if (c == 0) { return MAKE_TOKEN(ERRORTOKEN); } @@ -982,7 +1025,7 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t } else { /* Decimal */ - c = tok_decimal_tail(tok); + c = tok_digits_tail(tok, 10); if (c == 0) { return MAKE_TOKEN(ERRORTOKEN); } @@ -993,7 +1036,7 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t fraction: /* Fraction */ if (Py_ISDIGIT(c)) { - c = tok_decimal_tail(tok); + c = tok_digits_tail(tok, 10); if (c == 0) { return MAKE_TOKEN(ERRORTOKEN); } @@ -1009,11 +1052,11 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t c = tok_nextc(tok); if (!Py_ISDIGIT(c)) { tok_backup(tok, c); - return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok, "invalid decimal literal")); + return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok, "invalid float literal")); } } else if (!Py_ISDIGIT(c)) { tok_backup(tok, c); - if (!verify_end_of_number(tok, e, "decimal")) { + if (!verify_end_of_number(tok, e, "float")) { return MAKE_TOKEN(ERRORTOKEN); } tok_backup(tok, e); @@ -1021,7 +1064,7 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t p_end = tok->cur; return MAKE_TOKEN(NUMBER); } - c = tok_decimal_tail(tok); + c = tok_digits_tail(tok, 10); if (c == 0) { return MAKE_TOKEN(ERRORTOKEN); } diff --git a/Python/dtoa.c b/Python/dtoa.c index 3de150351a4ef8..9bf01562982b98 100644 --- a/Python/dtoa.c +++ b/Python/dtoa.c @@ -118,6 +118,7 @@ /* Linking of Python's #defines to Gay's #defines starts here. 
*/ #include "Python.h" +#include "pycore_floatobject.h" // _Py_dg_strtod_hex() #include "pycore_dtoa.h" // _PY_SHORT_FLOAT_REPR #include "pycore_interp_structs.h"// struct Bigint #include "pycore_pystate.h" // _PyInterpreterState_GET() @@ -1412,6 +1413,9 @@ _Py_dg_strtod(const char *s00, char **se) c = *++s; } + if (*s == '0' && (*(s + 1) == 'x' || *(s + 1) == 'X')) + return _Py_dg_strtod_hex(s, se) * (sign ? -1: 1); + /* Skip leading zeros: lz is true iff there were leading zeros. */ s1 = s; while (c == '0') diff --git a/Python/pystrtod.c b/Python/pystrtod.c index 7b74f613ed563b..57206bb7fe73d6 100644 --- a/Python/pystrtod.c +++ b/Python/pystrtod.c @@ -169,13 +169,8 @@ _PyOS_ascii_strtod(const char *nptr, char **endptr) p++; } - /* Some platform strtods accept hex floats; Python shouldn't (at the - moment), so we check explicitly for strings starting with '0x'. */ - if (*p == '0' && (*(p+1) == 'x' || *(p+1) == 'X')) - goto invalid_string; - /* Check that what's left begins with a digit or decimal point */ - if (!Py_ISDIGIT(*p) && *p != '.') + if (!Py_ISXDIGIT(*p) && *p != '.') goto invalid_string; digits_pos = p; @@ -186,7 +181,7 @@ _PyOS_ascii_strtod(const char *nptr, char **endptr) swapped for the current locale's decimal point before we call strtod. On the other hand, if we find the current locale's decimal point then the input is invalid. 
*/ - while (Py_ISDIGIT(*p)) + while (Py_ISXDIGIT(*p)) p++; if (*p == '.') @@ -194,10 +189,10 @@ _PyOS_ascii_strtod(const char *nptr, char **endptr) decimal_point_pos = p++; /* locate end of number */ - while (Py_ISDIGIT(*p)) + while (Py_ISXDIGIT(*p)) p++; - if (*p == 'e' || *p == 'E') + if (*p == 'e' || *p == 'E' || *p == 'p' || *p == 'P') p++; if (*p == '+' || *p == '-') p++; @@ -350,6 +345,7 @@ _Py_string_to_number_with_underscores( const char *p, *last; char *dup, *end; PyObject *result; + int (*_isdigit)(char) = &Py_ISDIGIT; assert(s[orig_len] == '\0'); @@ -364,21 +360,40 @@ _Py_string_to_number_with_underscores( end = dup; prev = '\0'; last = s + orig_len; - for (p = s; *p; p++) { + p = s; + /* Has hexadecimal prefix? */ + if (*p == '0' && (*(p+1) == 'x' || *(p+1) == 'X')) { + _isdigit = &Py_ISXDIGIT; + /* Accept prefix. */ + *end++ = *p; + p++; + *end++ = *p; + p++; + /* Underscore allowed right after the prefix and before '.' */ + if (*p == '_') { + p++; + if (*p == '.') { + *end++ = *p; + p++; + } + } + } + while (*p) { if (*p == '_') { /* Underscores are only allowed after digits. */ - if (!(prev >= '0' && prev <= '9')) { + if (!(*_isdigit)(prev)) { goto error; } } else { *end++ = *p; /* Underscores are only allowed before digits. */ - if (prev == '_' && !(*p >= '0' && *p <= '9')) { + if (prev == '_' && !(*_isdigit)(*p)) { goto error; } } prev = *p; + p++; } /* Underscores are not allowed at the end. 
*/ if (prev == '_') { From cd9bf163c829f23aaf6d45366b840101cd57a9dc Mon Sep 17 00:00:00 2001 From: Sergey B Kirpichev Date: Tue, 2 Apr 2024 16:06:01 +0300 Subject: [PATCH 2/2] As it requires a PEP, let's trigger a syntax warning --- Lib/test/test_grammar.py | 4 +++- Lib/test/test_zstd.py | 2 +- Parser/lexer/lexer.c | 5 +++++ 3 files changed, 9 insertions(+), 2 deletions(-) diff --git a/Lib/test/test_grammar.py b/Lib/test/test_grammar.py index 818aff9db3eaf2..8afdcdb9cab34a 100644 --- a/Lib/test/test_grammar.py +++ b/Lib/test/test_grammar.py @@ -77,7 +77,9 @@ def test_plain_integers(self): def test_attrs_on_hexintegers(self): good_meth = [m for m in dir(int) if not m.startswith('_')] for m in good_meth: - self.assertEqual(eval('0x1.' + m), eval('(0x1).' + m)) + with self.assertWarns(SyntaxWarning): + v = eval('0x1.' + m) + self.assertEqual(v, eval('(0x1).' + m)) self.check_syntax_error('0x1.spam', "invalid hexadecimal literal", lineno=1, offset=4) self.check_syntax_error('0x1.foo', "invalid hexadecimal literal", diff --git a/Lib/test/test_zstd.py b/Lib/test/test_zstd.py index 6358cc78739cd9..d692a3ff0c53d7 100644 --- a/Lib/test/test_zstd.py +++ b/Lib/test/test_zstd.py @@ -1239,7 +1239,7 @@ def test_is_raw(self): ZstdDict(desk333=345) def test_invalid_dict(self): - DICT_MAGIC = 0xEC30A437.to_bytes(4, byteorder='little') + DICT_MAGIC = (0xEC30A437).to_bytes(4, byteorder='little') dict_content = DICT_MAGIC + b'abcdefghighlmnopqrstuvwxyz' # corrupted diff --git a/Parser/lexer/lexer.c b/Parser/lexer/lexer.c index a38f264c40aa47..e1c5e3fca4e2c7 100644 --- a/Parser/lexer/lexer.c +++ b/Parser/lexer/lexer.c @@ -896,6 +896,11 @@ tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct t (c == 'n' && lookahead(tok, "umerator")) || (c == 'r' && lookahead(tok, "eal"))) { + if (_PyTokenizer_parser_warn(tok, PyExc_SyntaxWarning, + "invalid float literal")) + { + return 0; + } tok_backup(tok, c); c = '.'; goto hexint;