diff --git a/pywb/rewrite/content_rewriter.py b/pywb/rewrite/content_rewriter.py index 98c6d39ad..de38794f5 100644 --- a/pywb/rewrite/content_rewriter.py +++ b/pywb/rewrite/content_rewriter.py @@ -315,6 +315,17 @@ def _get_adaptive_metadata(self, rwinfo): # ============================================================================ class StreamingRewriter(object): + # Regex to match ES6 import statements and common comment patterns + # Matches import statements at the start, allowing for leading whitespace/comments + # The regex captures: + # - Group 1: Leading comments and whitespace + # - Group 2: One or more import statements (with their trailing newlines) + # Note: Does not capture blank lines after the last import + IMPORT_REGEX = re.compile( + r'^((?:\s|//[^\n]*\n|/\*(?:[^*]|\*(?!/))*\*/)*)((?:import\s+[^;\n]+;?\n)+)', + re.MULTILINE + ) + def __init__(self, url_rewriter, align_to_line=True, first_buff=''): self.url_rewriter = url_rewriter self.align_to_line = align_to_line @@ -327,7 +338,24 @@ def rewrite(self, string): return string def rewrite_complete(self, string, **kwargs): - return self.first_buff + self.rewrite(string) + self.final_read() + return self._insert_with_import_check(string, **kwargs) + + def _insert_with_import_check(self, string, **kwargs): + """Insert first_buff after any leading ES6 import statements.""" + if not self.first_buff: + return self.rewrite(string) + self.final_read() + + # Check if the string starts with import statements (after comments/whitespace) + match = self.IMPORT_REGEX.match(string) + if match: + # Insert after imports + leading = match.group(1) # comments/whitespace + imports = match.group(2) # import statements + rest = string[match.end():] + return leading + imports + self.first_buff + self.rewrite(rest) + self.final_read() + else: + # No imports at start, insert at beginning as before + return self.first_buff + self.rewrite(string) + self.final_read() def final_read(self): return '' @@ -339,7 +367,9 @@ def rewrite_text_stream_to_gen(self, stream, rwinfo): Align to line boundaries if needed. """ try: - buff = self.first_buff + insert_buff = self.first_buff + first_chunk = True + import_check_done = False # for html rewriting: # if charset is utf-8, use that, otherwise default to encode to ascii-compatible encoding @@ -349,8 +379,11 @@ def rewrite_text_stream_to_gen(self, stream, rwinfo): else: charset = 'iso-8859-1' - if buff: - yield buff.encode(charset) + # Check if we should look for ES6 imports + # Only do this for JavaScript files (not HTML, CSS, etc.) + should_check_imports = (insert_buff and + rwinfo.text_type in ('js', 'js-proxy', 'js-worker') and + not rwinfo.text_type == 'html') decoder = codecs.getincrementaldecoder(charset)() @@ -371,7 +404,30 @@ def rewrite_text_stream_to_gen(self, stream, rwinfo): decoder = codecs.getincrementaldecoder(charset)() buff = decoder.decode(buff) - buff = self.rewrite(buff) + # On first chunk, check for ES6 imports if needed + if first_chunk and should_check_imports and not import_check_done: + match = self.IMPORT_REGEX.match(buff) + if match: + # Insert after imports + leading = match.group(1) # comments/whitespace + imports = match.group(2) # import statements + rest = buff[match.end():] + buff = leading + imports + insert_buff + self.rewrite(rest) + insert_buff = '' # Don't insert again + else: + # No imports, insert at beginning + buff = insert_buff + self.rewrite(buff) + insert_buff = '' + import_check_done = True + first_chunk = False + elif first_chunk and insert_buff: + # Not checking imports, insert at beginning + yield insert_buff.encode(charset) + insert_buff = '' + first_chunk = False + buff = self.rewrite(buff) + else: + buff = self.rewrite(buff) yield buff.encode(charset) diff --git a/pywb/rewrite/test/test_content_rewriter.py b/pywb/rewrite/test/test_content_rewriter.py index 4555291ae..cb60a56dd 100644 --- a/pywb/rewrite/test/test_content_rewriter.py +++ b/pywb/rewrite/test/test_content_rewriter.py @@ -838,3 +838,82 @@ def test_json_body_but_mime_html(self): assert headers.headers == [('Content-Type', 'text/html')] result = b''.join(gen).decode('utf-8') assert result == content + + def test_es6_imports_insertion_after_imports(self): + """Test that first_buff is inserted after ES6 import statements, not before.""" + headers = {'Content-Type': 'text/javascript'} + # ES6 module with imports at the beginning + content = """import { foo } from 'module1'; +import bar from 'module2'; + +console.log('test'); +location = 'http://example.com/'; +""" + + rwheaders, gen, is_rw = self.rewrite_record(headers, content, ts='201701js_') + + result = b''.join(gen).decode('utf-8') + + # Check that imports are at the very beginning + assert result.startswith("import { foo } from 'module1';") + + # Check that the variable declarations (from first_buff) come AFTER imports + import_end = result.find("import bar from 'module2';") + assert import_end != -1 + + # Find where the injected variables start (they should be after all imports) + var_window_pos = result.find('let window =') + imports_end_pos = result.find('\n', import_end) + 1 + + # The injected code should come after the imports + assert var_window_pos > imports_end_pos, \ + f"Expected injected code after imports, but found at {var_window_pos} vs imports ending at {imports_end_pos}" + + # Verify location rewriting still works + assert 'WB_wombat_location' in result + + def test_es6_imports_with_comments(self): + """Test that ES6 imports are detected even with leading comments.""" + headers = {'Content-Type': 'text/javascript'} + # ES6 module with comments before imports + content = """// This is a comment +/* Multi-line + comment */ +import { foo } from 'module1'; + +console.log('test'); +""" + + rwheaders, gen, is_rw = self.rewrite_record(headers, content, ts='201701js_') + + result = b''.join(gen).decode('utf-8') + + # Comments should be preserved at the beginning + assert result.startswith("// This is a comment") + + # Import should come after comments + assert "import { foo } from 'module1';" in result + + # The injected code should come after imports + import_pos = result.find("import { foo } from 'module1';") + var_window_pos = result.find('let window =') + assert var_window_pos > import_pos + + def test_no_es6_imports_normal_insertion(self): + """Test that JS without imports still gets first_buff inserted at the beginning.""" + headers = {'Content-Type': 'text/javascript'} + # Regular JS without imports + content = """console.log('test'); +location = 'http://example.com/'; +""" + + rwheaders, gen, is_rw = self.rewrite_record(headers, content, ts='201701js_') + + result = b''.join(gen).decode('utf-8') + + # The injected code should come at the very beginning (before console.log) + var_window_pos = result.find('let window =') + console_pos = result.find('console.log') + + assert var_window_pos < console_pos, \ + "Expected injected code at the beginning when no imports present"