Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
120 changes: 96 additions & 24 deletions archivebox/plugins/chrome/on_Crawl__30_chrome_launch.bg.js
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
* NOTE: We use Chromium instead of Chrome because Chrome 137+ removed support for
* --load-extension and --disable-extensions-except flags.
*
* Usage: on_Crawl__20_chrome_launch.bg.js --crawl-id=<uuid> --source-url=<url>
* Usage: on_Crawl__30_chrome_launch.bg.js --crawl-id=<uuid> --source-url=<url>
* Output: Writes to current directory (executor creates chrome/ dir):
* - cdp_url.txt: WebSocket URL for CDP connection
* - chrome.pid: Chromium process ID (for cleanup)
Expand Down Expand Up @@ -165,14 +165,6 @@ async function main() {
chromePid = result.pid;
const cdpUrl = result.cdpUrl;

// Write extensions metadata
if (installedExtensions.length > 0) {
fs.writeFileSync(
path.join(OUTPUT_DIR, 'extensions.json'),
JSON.stringify(installedExtensions, null, 2)
);
}

// Connect puppeteer for extension verification
console.error(`[*] Connecting puppeteer to CDP...`);
const browser = await puppeteer.connect({
Expand All @@ -181,30 +173,102 @@ async function main() {
});
browserInstance = browser;

// Verify extensions loaded
// Get actual extension IDs from chrome://extensions page
if (extensionPaths.length > 0) {
await new Promise(r => setTimeout(r, 3000));
await new Promise(r => setTimeout(r, 2000));

try {
const extPage = await browser.newPage();
await extPage.goto('chrome://extensions', { waitUntil: 'domcontentloaded', timeout: 10000 });
await new Promise(r => setTimeout(r, 2000));

// Parse extension info from the page
const extensionsFromPage = await extPage.evaluate(() => {
const extensions = [];
// Extensions manager uses shadow DOM
const manager = document.querySelector('extensions-manager');
if (!manager || !manager.shadowRoot) return extensions;

const itemList = manager.shadowRoot.querySelector('extensions-item-list');
if (!itemList || !itemList.shadowRoot) return extensions;

const items = itemList.shadowRoot.querySelectorAll('extensions-item');
for (const item of items) {
const id = item.getAttribute('id');
const nameEl = item.shadowRoot?.querySelector('#name');
const name = nameEl?.textContent?.trim() || '';
if (id && name) {
extensions.push({ id, name });
}
}
return extensions;
});

const targets = browser.targets();
console.error(`[*] All browser targets (${targets.length}):`);
for (const t of targets) {
console.error(` - ${t.type()}: ${t.url().slice(0, 80)}`);
}
console.error(`[*] Found ${extensionsFromPage.length} extension(s) on chrome://extensions`);
for (const e of extensionsFromPage) {
console.error(` - ${e.id}: "${e.name}"`);
}

const extTargets = targets.filter(t =>
t.url().startsWith('chrome-extension://') ||
t.type() === 'service_worker' ||
t.type() === 'background_page'
);
// Match extensions by name (strict matching)
for (const ext of installedExtensions) {
// Read the extension's manifest to get its display name
const manifestPath = path.join(ext.unpacked_path, 'manifest.json');
if (fs.existsSync(manifestPath)) {
const manifest = JSON.parse(fs.readFileSync(manifestPath, 'utf-8'));
let manifestName = manifest.name || '';

// Resolve message placeholder (e.g., __MSG_extName__)
if (manifestName.startsWith('__MSG_') && manifestName.endsWith('__')) {
const msgKey = manifestName.slice(6, -2); // Extract key from __MSG_key__
const defaultLocale = manifest.default_locale || 'en';
const messagesPath = path.join(ext.unpacked_path, '_locales', defaultLocale, 'messages.json');
if (fs.existsSync(messagesPath)) {
try {
const messages = JSON.parse(fs.readFileSync(messagesPath, 'utf-8'));
if (messages[msgKey] && messages[msgKey].message) {
manifestName = messages[msgKey].message;
}
} catch (e) {
console.error(`[!] Failed to read messages.json: ${e.message}`);
}
}
}

console.error(`[*] Looking for match: ext.name="${ext.name}" manifest.name="${manifestName}"`);

// Find matching extension from page by exact name match first
let match = extensionsFromPage.find(e => e.name === manifestName);

// Filter out built-in extensions
// If no exact match, try case-insensitive exact match
if (!match) {
match = extensionsFromPage.find(e =>
e.name.toLowerCase() === manifestName.toLowerCase()
);
}

if (match) {
ext.id = match.id;
console.error(`[+] Matched extension: ${ext.name} (${manifestName}) -> ${match.id}`);
} else {
console.error(`[!] No match found for: ${ext.name} (${manifestName})`);
}
}
}

await extPage.close();
} catch (e) {
console.error(`[!] Failed to get extensions from chrome://extensions: ${e.message}`);
}

// Fallback: check browser targets
const targets = browser.targets();
const builtinIds = [
'nkeimhogjdpnpccoofpliimaahmaaome',
'fignfifoniblkonapihmkfakmlgkbkcf',
'ahfgeienlihckogmohjhadlkjgocpleb',
'mhjfbmdgcfjbbpaeojofohoefgiehjai',
];
const customExtTargets = extTargets.filter(t => {
const customExtTargets = targets.filter(t => {
const url = t.url();
if (!url.startsWith('chrome-extension://')) return false;
const extId = url.split('://')[1].split('/')[0];
Expand All @@ -216,7 +280,7 @@ async function main() {
for (const target of customExtTargets) {
const url = target.url();
const extId = url.split('://')[1].split('/')[0];
console.error(`[+] Extension loaded: ${extId} (${target.type()})`);
console.error(`[+] Extension target: ${extId} (${target.type()})`);
}

if (customExtTargets.length === 0 && extensionPaths.length > 0) {
Expand All @@ -225,6 +289,14 @@ async function main() {
}
}

// Write extensions metadata with actual IDs
if (installedExtensions.length > 0) {
fs.writeFileSync(
path.join(OUTPUT_DIR, 'extensions.json'),
JSON.stringify(installedExtensions, null, 2)
);
}

console.error(`[+] Chromium session started for crawl ${crawlId}`);
console.error(`[+] CDP URL: ${cdpUrl}`);
console.error(`[+] PID: ${chromePid}`);
Expand Down
4 changes: 2 additions & 2 deletions archivebox/plugins/chrome/on_Snapshot__20_chrome_tab.bg.js
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
/**
* Create a Chrome tab for this snapshot in the shared crawl Chrome session.
*
* If a crawl-level Chrome session exists (from on_Crawl__20_chrome_launch.bg.js),
* If a crawl-level Chrome session exists (from on_Crawl__30_chrome_launch.bg.js),
* this connects to it and creates a new tab. Otherwise, falls back to launching
* its own Chrome instance.
*
Expand Down Expand Up @@ -215,7 +215,7 @@ async function launchNewChrome(url, binary) {
console.log(`[*] Launched Chrome (PID: ${chromePid}), waiting for debug port...`);

// Write PID immediately for cleanup
fs.writeFileSync(path.join(OUTPUT_DIR, 'pid.txt'), String(chromePid));
fs.writeFileSync(path.join(OUTPUT_DIR, 'chrome.pid'), String(chromePid));

try {
// Wait for Chrome to be ready
Expand Down
17 changes: 10 additions & 7 deletions archivebox/plugins/chrome/tests/test_chrome.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@
import platform

PLUGIN_DIR = Path(__file__).parent.parent
CHROME_LAUNCH_HOOK = PLUGIN_DIR / 'on_Crawl__20_chrome_launch.bg.js'
CHROME_LAUNCH_HOOK = PLUGIN_DIR / 'on_Crawl__30_chrome_launch.bg.js'
CHROME_TAB_HOOK = PLUGIN_DIR / 'on_Snapshot__20_chrome_tab.bg.js'
CHROME_NAVIGATE_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_chrome_navigate.*'), None)

Expand Down Expand Up @@ -176,6 +176,7 @@ def test_chrome_launch_and_tab_creation():
crawl_dir = Path(tmpdir) / 'crawl'
crawl_dir.mkdir()
chrome_dir = crawl_dir / 'chrome'
chrome_dir.mkdir()

# Get test environment with NODE_MODULES_DIR set
env = get_test_env()
Expand All @@ -184,7 +185,7 @@ def test_chrome_launch_and_tab_creation():
# Launch Chrome at crawl level (background process)
chrome_launch_process = subprocess.Popen(
['node', str(CHROME_LAUNCH_HOOK), '--crawl-id=test-crawl-123'],
cwd=str(crawl_dir),
cwd=str(chrome_dir),
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
text=True,
Expand Down Expand Up @@ -292,7 +293,7 @@ def test_chrome_navigation():
# Launch Chrome (background process)
chrome_launch_process = subprocess.Popen(
['node', str(CHROME_LAUNCH_HOOK), '--crawl-id=test-crawl-nav'],
cwd=str(crawl_dir),
cwd=str(chrome_dir),
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
text=True,
Expand Down Expand Up @@ -363,7 +364,7 @@ def test_tab_cleanup_on_sigterm():
# Launch Chrome (background process)
chrome_launch_process = subprocess.Popen(
['node', str(CHROME_LAUNCH_HOOK), '--crawl-id=test-cleanup'],
cwd=str(crawl_dir),
cwd=str(chrome_dir),
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
text=True,
Expand Down Expand Up @@ -423,11 +424,12 @@ def test_multiple_snapshots_share_chrome():
crawl_dir = Path(tmpdir) / 'crawl'
crawl_dir.mkdir()
chrome_dir = crawl_dir / 'chrome'
chrome_dir.mkdir()

# Launch Chrome at crawl level
chrome_launch_process = subprocess.Popen(
['node', str(CHROME_LAUNCH_HOOK), '--crawl-id=test-multi-crawl'],
cwd=str(crawl_dir),
cwd=str(chrome_dir),
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
text=True,
Expand Down Expand Up @@ -513,7 +515,7 @@ def test_chrome_cleanup_on_crawl_end():
# Launch Chrome in background
chrome_launch_process = subprocess.Popen(
['node', str(CHROME_LAUNCH_HOOK), '--crawl-id=test-crawl-end'],
cwd=str(crawl_dir),
cwd=str(chrome_dir),
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
text=True,
Expand Down Expand Up @@ -554,11 +556,12 @@ def test_zombie_prevention_hook_killed():
crawl_dir = Path(tmpdir) / 'crawl'
crawl_dir.mkdir()
chrome_dir = crawl_dir / 'chrome'
chrome_dir.mkdir()

# Launch Chrome
chrome_launch_process = subprocess.Popen(
['node', str(CHROME_LAUNCH_HOOK), '--crawl-id=test-zombie'],
cwd=str(crawl_dir),
cwd=str(chrome_dir),
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
text=True,
Expand Down
5 changes: 3 additions & 2 deletions archivebox/plugins/infiniscroll/tests/test_infiniscroll.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
PLUGIN_DIR = Path(__file__).parent.parent
PLUGINS_ROOT = PLUGIN_DIR.parent
INFINISCROLL_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_infiniscroll.*'), None)
CHROME_LAUNCH_HOOK = PLUGINS_ROOT / 'chrome' / 'on_Crawl__20_chrome_launch.bg.js'
CHROME_LAUNCH_HOOK = PLUGINS_ROOT / 'chrome' / 'on_Crawl__30_chrome_launch.bg.js'
CHROME_TAB_HOOK = PLUGINS_ROOT / 'chrome' / 'on_Snapshot__20_chrome_tab.bg.js'
CHROME_NAVIGATE_HOOK = next((PLUGINS_ROOT / 'chrome').glob('on_Snapshot__*_chrome_navigate.*'), None)
TEST_URL = 'https://www.singsing.movie/'
Expand Down Expand Up @@ -122,14 +122,15 @@ def setup_chrome_session(tmpdir):
crawl_dir = Path(tmpdir) / 'crawl'
crawl_dir.mkdir()
chrome_dir = crawl_dir / 'chrome'
chrome_dir.mkdir()

env = get_test_env()
env['CHROME_HEADLESS'] = 'true'

# Launch Chrome at crawl level
chrome_launch_process = subprocess.Popen(
['node', str(CHROME_LAUNCH_HOOK), '--crawl-id=test-infiniscroll'],
cwd=str(crawl_dir),
cwd=str(chrome_dir),
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
text=True,
Expand Down
Loading
Loading