-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathoffscreen.js
More file actions
92 lines (78 loc) · 3.63 KB
/
offscreen.js
File metadata and controls
92 lines (78 loc) · 3.63 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
// offscreen.js
// Receives 'scrape' requests over chrome.runtime messaging and replies with
// the cleaned text of the requested page (see scrape()).

/** Maximum time, in milliseconds, a single scrape request may take before it is aborted. */
const SCRAPE_TIMEOUT_MS = 15000; // 15 seconds

/**
 * Message entry point.
 *
 * Handles messages shaped like `{ type: 'scrape', payload: <url> }` and answers
 * through `sendResponse` with whatever scrape() resolves to: `{ text }` on
 * success or `{ error }` on failure. Any other message type is ignored.
 */
chrome.runtime.onMessage.addListener((message, _sender, sendResponse) => {
  if (message.type !== 'scrape') {
    return undefined;
  }

  scrape(message.payload)
    // Safety net: always answer the sender, even if scrape() rejects
    // unexpectedly, so the channel kept open below is never left hanging.
    .catch((error) => ({
      error: `[INTERNAL_ERROR] Failed to scrape ${message.payload}: ${
        error instanceof Error ? error.message : String(error)
      }`,
    }))
    .then(sendResponse);

  // Returning true tells Chrome that sendResponse will be called asynchronously.
  return true;
});
/**
 * Patterns for content that is stripped from scraped text: URLs, file paths,
 * code snippets, log lines and similar non-prose noise. Every pattern carries
 * the `g` flag and is only ever used with String.prototype.replace, which
 * resets `lastIndex`, so sharing the instances across calls is safe.
 */
const CLEANING_PATTERNS = [
  /https?:\/\/[^\s/$.?#].[^\s]*/g, // URLs
  /(?:[a-zA-Z]:)?(?:\\|\/)[^\s:"|*?<>]+\/[^\s:"|*?<>]*/g, // File paths
  /^\s*[\$#%>]\s*.*/gm, // Lines starting with shell prompts
  /^\s*\w+\s*=\s*.*$/gm, // Lines with variable assignments (e.g., x = 10)
  /^\s*.*\b\w+\.\w+\(.*?\).*$/gm, // Lines with method calls (e.g., object.method())
  /^\s*\[.*,.*\]\s*$/gm, // Lines that look like lists/arrays
  /^\s*".*"\s*:\s*".*",?\s*$/gm, // Lines that look like key-value pairs (JSON/headers)
  /^\s*(.)\1{4,}\s*$/gm, // Lines with repeated characters (e.g., -----, =====)
  /^\s*[\w.-]+\s*\(\d{4}-\d{2}-\d{2}\)\s*$/gm, // Changelog entries (e.g., 1.2.3 (2023-01-01))
  // Python-style byte strings (e.g., b'...'). The leading \b keeps the pattern
  // from firing on ordinary words that end in "b" before an apostrophe
  // ("web's", "job's"), which used to delete the prose up to the next quote.
  /\bb(["']).*?\1/g,
  /^.*\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\s*→\s*\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}.*$/gm, // Network logs (e.g., 127.0.0.1 → 127.0.0.1)
  // ISO-8601 timestamps: optional fractional seconds, optional "Z" and/or
  // numeric UTC offset. Still matches the legacy "…Z+hh:mm" form.
  /\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}(?:\.\d+)?(?:Z(?:[+-]\d{2}:\d{2})?|[+-]\d{2}:\d{2})?/g,
];

/** Reasonable maximum length for a single "word" in the cleaned output. */
const MAX_WORD_LENGTH = 150;

/** Splits a word longer than MAX_WORD_LENGTH into consecutive chunks of at most that length. */
function chunkWord(word) {
  if (word.length <= MAX_WORD_LENGTH) {
    return [word];
  }
  const chunks = [];
  for (let start = 0; start < word.length; start += MAX_WORD_LENGTH) {
    chunks.push(word.substring(start, start + MAX_WORD_LENGTH));
  }
  return chunks;
}

/** Removes every CLEANING_PATTERNS match, caps word length, and collapses all whitespace to single spaces. */
function cleanScrapedText(rawText) {
  const stripped = CLEANING_PATTERNS.reduce(
    (text, pattern) => text.replace(pattern, ''),
    rawText,
  );
  // split(/\s+/) leaves at most one empty string at each end, so after the
  // join a trim() is all that is needed to normalise whitespace.
  return stripped.split(/\s+/).flatMap(chunkWord).join(' ').trim();
}

/**
 * Fetches an HTML page and returns its main text content with noise removed.
 *
 * The whole request, including reading the response body, is bounded by
 * SCRAPE_TIMEOUT_MS. This function does not throw: every failure is reported
 * through the returned object.
 *
 * @param {string} url - Address of the page to fetch.
 * @returns {Promise<{text: string} | {error: string}>} `{ text }` with the
 *   cleaned page text on success, or `{ error }` whose message starts with one
 *   of the tags `[FETCH_FAILED:<status>]`, `[UNSUPPORTED_CONTENT]`, `[TIMEOUT]`
 *   or `[NETWORK_ERROR]`.
 */
async function scrape(url) {
  const controller = new AbortController();
  const timeoutId = setTimeout(() => controller.abort(), SCRAPE_TIMEOUT_MS);

  try {
    const response = await fetch(url, { signal: controller.signal });

    if (!response.ok) {
      // Prefix with a tag for easy identification
      return { error: `[FETCH_FAILED:${response.status}] Failed to fetch ${url}: ${response.statusText}` };
    }

    const contentType = response.headers.get('Content-Type');
    // Media types are case-insensitive, so compare in lower case.
    if (!contentType || !contentType.toLowerCase().includes('text/html')) {
      return { error: `[UNSUPPORTED_CONTENT] Unsupported content type: ${contentType || 'N/A'}` };
    }

    // The abort timer is still armed here on purpose: a server that sends
    // headers quickly but stalls on the body must also hit the timeout.
    const html = await response.text();
    const doc = new DOMParser().parseFromString(html, 'text/html');

    // Remove script, style, nav, footer, header, and aside elements
    doc.querySelectorAll('script, style, nav, footer, header, aside').forEach((el) => el.remove());

    // Try to find the main content of the page
    const mainContent = doc.querySelector('article') || doc.querySelector('main') || doc.body;
    if (!mainContent) {
      return { text: '' };
    }

    return { text: cleanScrapedText(mainContent.innerText) };
  } catch (error) {
    if (error.name === 'AbortError') {
      return { error: `[TIMEOUT] Failed to scrape ${url}: Request timed out after ${SCRAPE_TIMEOUT_MS / 1000} seconds.` };
    }
    return { error: `[NETWORK_ERROR] Failed to scrape ${url}: ${error.message}` };
  } finally {
    // Single cleanup point for every exit path (success, early return, error).
    clearTimeout(timeoutId);
  }
}