Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
66 changes: 61 additions & 5 deletions pywb/rewrite/content_rewriter.py
Original file line number Diff line number Diff line change
Expand Up @@ -315,6 +315,17 @@ def _get_adaptive_metadata(self, rwinfo):

# ============================================================================
class StreamingRewriter(object):
# Regex matching a run of ES6 static import statements at the very start of
# a script, optionally preceded by whitespace and comments.
# The regex captures:
# - Group 1: leading whitespace, `//` line comments and `/* ... */` comments
# - Group 2: one or more consecutive import statements, each including its
#   own line terminator
# An import only counts when it is complete, i.e. it ends with its quoted
# module specifier (optionally followed by import attributes and `;`).
# The binding list may span several lines (`import {\n a,\n b\n} from 'x';`),
# so the match never stops in the middle of an import clause.
# Both LF and CRLF line endings are accepted, as are trailing spaces/tabs.
# Note: Does not capture blank lines after the last import
IMPORT_REGEX = re.compile(
    # Group 1: leading whitespace and comments
    r'^((?:\s|//[^\n]*\n|/\*(?:[^*]|\*(?!/))*\*/)*)'
    # Group 2: one or more complete import statements
    r'((?:import\s+'
    # optional bindings followed by `from` (may contain newlines)
    r'''(?:[^;'"]*?\bfrom\s*)?'''
    # quoted module specifier
    r'''(?:'[^'\n]*'|"[^"\n]*")'''
    # optional import attributes: with { ... } / assert { ... }
    r'(?:[ \t]*(?:with|assert)[ \t]*\{[^{}]*\})?'
    # optional semicolon, trailing blanks, LF or CRLF
    r'[ \t]*;?[ \t]*\r?\n'
    r')+)',
    re.MULTILINE
)

def __init__(self, url_rewriter, align_to_line=True, first_buff=''):
self.url_rewriter = url_rewriter
self.align_to_line = align_to_line
Expand All @@ -327,7 +338,24 @@ def rewrite(self, string):
return string

def rewrite_complete(self, string, **kwargs):
return self.first_buff + self.rewrite(string) + self.final_read()
return self._insert_with_import_check(string, **kwargs)

def _insert_with_import_check(self, string, **kwargs):
"""Insert first_buff after any leading ES6 import statements."""
if not self.first_buff:
return self.rewrite(string) + self.final_read()

# Check if the string starts with import statements (after comments/whitespace)
match = self.IMPORT_REGEX.match(string)
if match:
# Insert after imports
leading = match.group(1) # comments/whitespace
imports = match.group(2) # import statements
rest = string[match.end():]
return leading + imports + self.first_buff + self.rewrite(rest) + self.final_read()
else:
# No imports at start, insert at beginning as before
return self.first_buff + self.rewrite(string) + self.final_read()

def final_read(self):
    """Return the text to append after the last rewritten piece of input.

    This implementation has nothing to add and returns an empty string.
    """
    return ''
Expand All @@ -339,7 +367,9 @@ def rewrite_text_stream_to_gen(self, stream, rwinfo):
Align to line boundaries if needed.
"""
try:
buff = self.first_buff
insert_buff = self.first_buff
first_chunk = True
import_check_done = False

# for html rewriting:
# if charset is utf-8, use that, otherwise default to encode to ascii-compatible encoding
Expand All @@ -349,8 +379,11 @@ def rewrite_text_stream_to_gen(self, stream, rwinfo):
else:
charset = 'iso-8859-1'

if buff:
yield buff.encode(charset)
# Check if we should look for ES6 imports
# Only do this for JavaScript files (not HTML, CSS, etc.)
should_check_imports = (insert_buff and
rwinfo.text_type in ('js', 'js-proxy', 'js-worker') and
not rwinfo.text_type == 'html')

decoder = codecs.getincrementaldecoder(charset)()

Expand All @@ -371,7 +404,30 @@ def rewrite_text_stream_to_gen(self, stream, rwinfo):
decoder = codecs.getincrementaldecoder(charset)()
buff = decoder.decode(buff)

buff = self.rewrite(buff)
# On first chunk, check for ES6 imports if needed
if first_chunk and should_check_imports and not import_check_done:
match = self.IMPORT_REGEX.match(buff)
if match:
# Insert after imports
leading = match.group(1) # comments/whitespace
imports = match.group(2) # import statements
rest = buff[match.end():]
buff = leading + imports + insert_buff + self.rewrite(rest)
insert_buff = '' # Don't insert again
else:
# No imports, insert at beginning
buff = insert_buff + self.rewrite(buff)
insert_buff = ''
import_check_done = True
first_chunk = False
elif first_chunk and insert_buff:
# Not checking imports, insert at beginning
yield insert_buff.encode(charset)
insert_buff = ''
first_chunk = False
buff = self.rewrite(buff)
else:
buff = self.rewrite(buff)

yield buff.encode(charset)

Expand Down
79 changes: 79 additions & 0 deletions pywb/rewrite/test/test_content_rewriter.py
Original file line number Diff line number Diff line change
Expand Up @@ -838,3 +838,82 @@ def test_json_body_but_mime_html(self):
assert headers.headers == [('Content-Type', 'text/html')]
result = b''.join(gen).decode('utf-8')
assert result == content

def test_es6_imports_insertion_after_imports(self):
    """The injected first_buff must follow the ES6 imports, never precede them."""
    first_import = "import { foo } from 'module1';"
    last_import = "import bar from 'module2';"

    # ES6 module whose first two lines are import statements
    content = """import { foo } from 'module1';
import bar from 'module2';

console.log('test');
location = 'http://example.com/';
"""

    _, gen, _ = self.rewrite_record(
        {'Content-Type': 'text/javascript'}, content, ts='201701js_')
    result = b''.join(gen).decode('utf-8')

    # The output still opens with the first import
    assert result.startswith(first_import)

    # The second import must be present as well
    last_import_start = result.find(last_import)
    assert last_import_start != -1

    # Offset just past the newline that ends the last import line
    imports_end_pos = result.find('\n', last_import_start) + 1
    # Offset of the injected variable declarations
    var_window_pos = result.find('let window =')

    # Injected code has to start after the import section
    assert var_window_pos > imports_end_pos, \
        f"Expected injected code after imports, but found at {var_window_pos} vs imports ending at {imports_end_pos}"

    # Location rewriting must still be applied to the rest of the script
    assert 'WB_wombat_location' in result

def test_es6_imports_with_comments(self):
    """Leading comments must not prevent ES6 imports from being detected."""
    import_stmt = "import { foo } from 'module1';"

    # ES6 module with a line comment and a block comment before the import
    content = """// This is a comment
/* Multi-line
comment */
import { foo } from 'module1';

console.log('test');
"""

    _, gen, _ = self.rewrite_record(
        {'Content-Type': 'text/javascript'}, content, ts='201701js_')
    result = b''.join(gen).decode('utf-8')

    # The leading comment stays at the very start of the output
    assert result.startswith("// This is a comment")

    # The import statement is still present in the output
    import_pos = result.find(import_stmt)
    assert import_pos != -1

    # The injected declarations appear after the import
    var_window_pos = result.find('let window =')
    assert var_window_pos > import_pos

def test_no_es6_imports_normal_insertion(self):
    """Test that JS without imports still gets first_buff inserted at the beginning."""
    headers = {'Content-Type': 'text/javascript'}
    # Regular JS without imports
    content = """console.log('test');
location = 'http://example.com/';
"""

    rwheaders, gen, is_rw = self.rewrite_record(headers, content, ts='201701js_')

    result = b''.join(gen).decode('utf-8')

    var_window_pos = result.find('let window =')
    console_pos = result.find('console.log')

    # str.find() returns -1 when the substring is missing. Without these
    # checks a completely missing injection would satisfy
    # `-1 < console_pos` and the test would pass vacuously.
    assert var_window_pos != -1, \
        "Expected injected code to be present in the rewritten output"
    assert console_pos != -1, \
        "Expected the original script body in the rewritten output"

    # The injected code should come before the original script body
    assert var_window_pos < console_pos, \
        "Expected injected code at the beginning when no imports present"
Loading