@@ -1,20 +1,20 @@
-from test import support
-from test.support import os_helper
+import os
+import re
+import token
+import unittest
 from tokenize import (tokenize, untokenize, NUMBER, NAME, OP,
                      STRING, ENDMARKER, ENCODING, tok_name, detect_encoding,
                      open as tokenize_open, Untokenizer, generate_tokens,
                      NEWLINE, _generate_tokens_from_c_tokenizer, DEDENT, TokenInfo,
                      TokenError)
 from io import BytesIO, StringIO
-import unittest
 from textwrap import dedent
 from unittest import TestCase, mock
+from test import support
 from test.test_grammar import (VALID_UNDERSCORE_LITERALS,
                                INVALID_UNDERSCORE_LITERALS)
 from test.support import os_helper
 from test.support.script_helper import run_test_script, make_script, run_python_until_end
-import os
-import token
 
 # Converts a source string into a list of textual representation
 # of the tokens such as:
@@ -1816,6 +1816,22 @@ def test_iter_compat(self):
         self.assertEqual(untokenize(iter(tokens)), b'Hello ')
 
 
+def contains_ambiguous_backslash(source):
+    """Return `True` if the source contains a backslash on a
+    line by itself. For example:
+
+    a = (1
+        \\
+    )
+
+    Code like this cannot be untokenized exactly. This is because
+    the tokenizer does not produce any tokens for the line containing
+    the backslash and so there is no way to know its indent.
+    """
+    pattern = re.compile(br'\n\s*\\\r?\n')
+    return pattern.search(source) is not None
+
+
 class TestRoundtrip(TestCase):
 
     def check_roundtrip(self, f):
@@ -1826,6 +1842,9 @@ def check_roundtrip(self, f):
         tokenize.untokenize(), and the latter tokenized again to 2-tuples.
         The test fails if the 3 pair tokenizations do not match.
 
+        If the source code can be untokenized unambiguously, the
+        untokenized code must match the original code exactly.
+
         When untokenize bugs are fixed, untokenize with 5-tuples should
         reproduce code that does not contain a backslash continuation
         following spaces. A proper test should test this.
@@ -1849,6 +1868,13 @@ def check_roundtrip(self, f):
         tokens2_from5 = [tok[:2] for tok in tokenize(readline5)]
         self.assertEqual(tokens2_from5, tokens2)
 
+        if not contains_ambiguous_backslash(code):
+            # The BOM does not produce a token so there is no way to preserve it.
+            code_without_bom = code.removeprefix(b'\xef\xbb\xbf')
+            readline = iter(code_without_bom.splitlines(keepends=True)).__next__
+            untokenized_code = untokenize(tokenize(readline))
+            self.assertEqual(code_without_bom, untokenized_code)
+
     def check_line_extraction(self, f):
         if isinstance(f, str):
             code = f.encode('utf-8')
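
For illustration only (not part of the diff above), a minimal sketch of the ambiguity the new `contains_ambiguous_backslash` helper detects: a line holding nothing but a backslash continuation produces no tokens, so `untokenize` cannot recover its indentation and an exact byte-for-byte roundtrip is impossible. The snippet and its variable names are illustrative, not from the commit.

```python
from io import BytesIO
from tokenize import tokenize, untokenize

# Source with a backslash continuation on a line by itself,
# indented by four spaces.
source = b"a = (1\n    \\\n)\n"

tokens = list(tokenize(BytesIO(source).readline))
roundtripped = untokenize(tokens)

# The logical content survives, but no token was emitted for the
# backslash-only line, so its indentation cannot be reproduced.
print(source)        # b'a = (1\n    \\\n)\n'
print(roundtripped)  # indentation before the backslash is gone,
                     # e.g. b'a = (1\n\\\n)\n'
```

Sources like this are skipped by the new exact-match assertion in `check_roundtrip`; everything else (minus a leading BOM) must untokenize back to the original bytes.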