Simplify BREAK_LOOP detection...

rocky · rocky · commit d731d32c1172 · 2024-07-14T14:45:25.000-04:00
by making more use of linestart. At least for now...
diff --git a/uncompyle6/scanners/scanner2.py b/uncompyle6/scanners/scanner2.py
@@ -495,7 +495,8 @@ def ingest(self, co, classname=None, code_objects={}, show_asm=None):
 
         if show_asm in ("both", "after"):
             print("\n# ---- tokenization:")
-            for t in new_tokens:
+            # FIXME: t.format() is changing tokens!
+            for t in new_tokens.copy():
                 print(t.format(line_prefix=""))
             print()
         return new_tokens, customize
diff --git a/uncompyle6/scanners/scanner26.py b/uncompyle6/scanners/scanner26.py
@@ -353,7 +353,8 @@ def ingest(self, co, classname=None, code_objects={}, show_asm=None):
 
         if show_asm in ("both", "after"):
             print("\n# ---- tokenization:")
-            for t in tokens:
+            # FIXME: t.format() is changing tokens!
+            for t in tokens.copy():
                 print(t.format(line_prefix=""))
             print()
         return tokens, customize
diff --git a/uncompyle6/scanners/scanner3.py b/uncompyle6/scanners/scanner3.py
@@ -797,7 +797,8 @@ def ingest(
 
         if show_asm in ("both", "after"):
             print("\n# ---- tokenization:")
-            for t in new_tokens:
+            # FIXME: t.format() is changing tokens!
+            for t in new_tokens.copy():
                 print(t.format(line_prefix=""))
             print()
         return new_tokens, customize
diff --git a/uncompyle6/scanners/scanner37base.py b/uncompyle6/scanners/scanner37base.py
@@ -228,13 +228,13 @@ def tokens_append(j, token):
 
         if show_asm in ("both", "before"):
             print("\n# ---- disassembly:")
-            self.insts = bytecode.disassemble_bytes(
+            bytecode.disassemble_bytes(
                 co.co_code,
                 varnames=co.co_varnames,
                 names=co.co_names,
                 constants=co.co_consts,
                 cells=bytecode._cell_names,
-                linestarts=bytecode._linestarts,
+                line_starts=bytecode._linestarts,
                 asm_format="extended",
                 filename=co.co_filename,
                 show_source=True,
@@ -481,12 +481,17 @@ def tokens_append(j, token):
                     next_opname = self.insts[i + 1].opname
 
                     # 'Continue's include jumps to loops that are not
-                    # and the end of a block which follow with POP_BLOCK and COME_FROM_LOOP.
-                    # If the JUMP_ABSOLUTE is to a FOR_ITER and it is followed by another JUMP_FORWARD
-                    # then we'll take it as a "continue".
-                    is_continue = (
-                        self.insts[self.offset2inst_index[target]].opname == "FOR_ITER"
-                        and self.insts[i + 1].opname == "JUMP_FORWARD"
+                    # and the end of a block which follow with
+                    # POP_BLOCK and COME_FROM_LOOP.  If the
+                    # JUMP_ABSOLUTE is to a FOR_ITER, and it is
+                    # followed by another JUMP_FORWARD then we'll take
+                    # it as a "continue".
+                    next_inst = self.insts[i + 1]
+                    is_continue = self.insts[
+                        self.offset2inst_index[target]
+                    ].opname == "FOR_ITER" and next_inst.opname in (
+                        "JUMP_FORWARD",
+                        "JUMP_ABSOLUTE",
                     )
 
                     if self.version < (3, 8) and (
@@ -501,21 +506,65 @@ def tokens_append(j, token):
                     ):
                         opname = "CONTINUE"
                     else:
+                        # "continue" versus "break_loop" detection is more complicated
+                        # because "continue" to an outer loop is really a "break loop"
                         opname = "JUMP_BACK"
+
                         # FIXME: this is a hack to catch stuff like:
                         #   if x: continue
                         # the "continue" is not on a new line.
-                        # There are other situations where we don't catch
-                        # CONTINUE as well.
-                        if tokens[-1].kind == "JUMP_BACK" and tokens[-1].attr <= argval:
+                        #
+                        # Another situation is where we have
+                        #   for method in methods:
+                        #      for B in method:
+                        #         if c:
+                        #           return
+                        #        break  # A "continue" but not the innermost one
+                        if tokens[-1].kind == "JUMP_LOOP" and tokens[-1].attr <= argval:
                             if tokens[-2].kind == "BREAK_LOOP":
                                 del tokens[-1]
+                                j -= 1
                             else:
-                                # intern is used because we are changing the *previous* token
-                                tokens[-1].kind = sys.intern("CONTINUE")
-                    if last_op_was_break and opname == "CONTINUE":
-                        last_op_was_break = False
-                        continue
+                                # "intern" is used because we are
+                                # changing the *previous* token.  A
+                                # POP_TOP suggests a "break" rather
+                                # than a "continue"?
+                                if tokens[-2] == "POP_TOP" and (
+                                    is_continue and next_inst.argval != tokens[-1].attr
+                                ):
+                                    tokens[-1].kind = sys.intern("BREAK_LOOP")
+                                else:
+                                    tokens[-1].kind = sys.intern("CONTINUE")
+                                    last_continue = tokens[-1]
+                                    pass
+                                pass
+                            pass
+                    #     elif (
+                    #         last_continue is not None
+                    #         and tokens[-1].kind == "JUMP_LOOP"
+                    #         and last_continue.attr <= tokens[-1].attr
+                    #         and last_continue.offset > tokens[-1].attr
+                    #     ):
+                    #         # Handle mis-characterized "CONTINUE"
+                    #         # We have a situation like:
+                    #         # loop ... (for or while)
+                    #         #   loop
+                    #         #     if ...:   # code below starts here
+                    #         #       break  # not continue
+                    #         #
+                    #         #   POP_JUMP_IF_FALSE_LOOP   # to outer loop
+                    #         #   JUMP_LOOP                # to inner loop
+                    #         #   ...
+                    #         #   JUMP_LOOP                # to outer loop
+                    #         tokens[-2].kind = sys.intern("BREAK_LOOP")
+                    #         pass
+
+                    # if last_op_was_break and opname == "CONTINUE":
+                    #     last_op_was_break = False
+                    #     continue
+                    pass
+                else:
+                    opname = "JUMP_FORWARD"
 
             elif inst.offset in self.load_asserts:
                 opname = "LOAD_ASSERT"
@@ -538,9 +587,10 @@ def tokens_append(j, token):
             )
             pass
 
-        if show_asm in ("both", "after"):
+        if show_asm in ("both", "after") and self.version < (3, 8):
             print("\n# ---- tokenization:")
-            for t in tokens:
+            # FIXME: t.format() is changing tokens!
+            for t in tokens.copy():
                 print(t.format(line_prefix=""))
             print()
         return tokens, customize
diff --git a/uncompyle6/scanners/scanner38.py b/uncompyle6/scanners/scanner38.py
@@ -24,13 +24,13 @@
 
 from typing import Dict, Tuple
 
-from uncompyle6.scanners.tok import off2int
-from uncompyle6.scanners.scanner37 import Scanner37
-from uncompyle6.scanners.scanner37base import Scanner37Base
-
 # bytecode verification, verify(), uses JUMP_OPs from here
 from xdis.opcodes import opcode_38 as opc
 
+from uncompyle6.scanners.scanner37 import Scanner37
+from uncompyle6.scanners.scanner37base import Scanner37Base
+from uncompyle6.scanners.tok import off2int
+
 # bytecode verification, verify(), uses JUMP_OPS from here
 JUMP_OPs = opc.JUMP_OPS
 
@@ -121,35 +121,26 @@ def ingest(
                     new_tokens.append(token)
                     continue
 
-                # We also want to avoid confusing BREAK_LOOPS with parts of the
-                # grammar rules for loops. (Perhaps we should change the grammar.)
-                # Try to find an adjacent JUMP_BACK which is part of the normal loop end.
-
-                if i + 1 < len(tokens) and tokens[i + 1] == "JUMP_BACK":
-                    # Sometimes the jump back is after the "break" instruction..
-                    jump_back_index = i + 1
-                else:
-                    # and sometimes, because of jump-to-jump optimization, it is before the
-                    # jump target instruction.
-                    jump_back_index = self.offset2tok_index[jump_target] - 1
-                    while tokens[jump_back_index].kind.startswith("COME_FROM_"):
-                        jump_back_index -= 1
-                        pass
-                    pass
-                jump_back_token = tokens[jump_back_index]
-
-                # Is this a forward jump not next to a JUMP_BACK ? ...
-                break_loop = token.linestart and jump_back_token != "JUMP_BACK"
+                j = i
+                while tokens[j - 1] in ("POP_TOP", "POP_BLOCK", "POP_EXCEPT"):
+                    j -= 1
+                    if tokens[j].linestart:
+                        break
+                token_with_linestart = tokens[j]
 
-                # or if there is looping jump back, then that loop
-                # should start before where the "break" instruction sits.
-                if break_loop or (
-                    jump_back_token == "JUMP_BACK"
-                    and jump_back_token.attr < token.off2int()
-                ):
+                if token_with_linestart.linestart:
                     token.kind = "BREAK_LOOP"
+
                 pass
             new_tokens.append(token)
+
+        if show_asm in ("both", "after"):
+            print("\n# ---- tokenization:")
+            # FIXME: t.format() is changing tokens!
+            for t in new_tokens.copy():
+                print(t.format(line_prefix=""))
+            print()
+
         return new_tokens, customize