Skip to content
Draft
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
254 changes: 254 additions & 0 deletions article_scraper.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,254 @@
// n8n Code Node - JavaScript
//
// Step 1: read the article URL from the first input item and download the
// page as raw HTML. On a download failure the node does not throw; it returns
// a single item of the shape { error, sourceUrl } so the workflow can branch.

// Optional chaining: with no input item, `$input.first()` has nothing to
// dereference, and we want the explicit "URL is missing" error below rather
// than an opaque TypeError.
const url = $input.first()?.json?.url;

if (typeof url !== 'string' || url.trim() === '') {
  throw new Error("URL is missing from input item.");
}

let htmlText = '';
try {
  const response = await this.helpers.httpRequest({
    url,
    method: 'GET',
    json: false, // We want the raw HTML string
  });

  // The parsing stage calls string methods on `htmlText`, so make sure it
  // really is a string here instead of failing later with an unrelated
  // "match is not a function" error.
  if (typeof response === 'string') {
    htmlText = response;
  } else if (typeof Buffer !== 'undefined' && Buffer.isBuffer(response)) {
    // NOTE(review): assumes UTF-8 when the helper hands back raw bytes — confirm.
    htmlText = response.toString('utf8');
  } else {
    const received = response === null ? 'null' : typeof response;
    throw new TypeError(`expected an HTML string but received ${received}`);
  }
} catch (error) {
  console.error(`Error fetching URL ${url}:`, error.message);
  return [{ json: { error: `Failed to fetch URL: ${error.message}`, sourceUrl: url } }];
}

// ---- PARSING LOGIC ----
// IMPORTANT: Cheerio is NOT available in n8n Cloud Code Nodes.
// The following code will ALWAYS use the regex fallback in n8n Cloud.
// For a more robust solution in n8n, use the HTTP Request Node followed by the HTML Node.

let $; // This will remain undefined in n8n Cloud

try {
// This check will fail in n8n Cloud, leading to the regex fallback.
if (typeof cheerio !== 'undefined') {
// This block will NOT be executed in n8n Cloud
$ = cheerio.load(htmlText);

// --- Cheerio-based Extraction Logic (NOT USED IN N8N CLOUD) ---
let title = $('head title').text().trim();
if (!title) title = $('meta[property="og:title"]').attr('content')?.trim() || '';
if (!title) title = $('h1').first().text().trim();
if (!title) title = $('article h1').first().text().trim();
if (!title) title = $('.entry-title').first().text().trim();
if (!title) title = 'Title not found (Cheerio)';

const images = [];
const imageUrls = new Set();
// Example: Target images with specific classes or within specific containers
$('article img, .entry-content img, .post-content img, .article-body img, main img, .td-post-content img, .single-post-content img, img.featured-image, img.wp-post-image, img.article-thumbnail').each((i, el) => {
let src = $(el).attr('src') || $(el).attr('data-src') || $(el).attr('data-lazy-src');
if (src) {
try {
const base = new URL(url);
// Resolve relative URLs
if (src.startsWith('//')) src = base.protocol + src;
else if (src.startsWith('/')) src = base.origin + src;
else if (!src.startsWith('http')) src = new URL(src, base.origin + (base.pathname.endsWith('/') ? base.pathname : base.pathname.substring(0, base.pathname.lastIndexOf('/') + 1))).href;

// Ensure it's a valid URL before adding
src = new URL(src).href;

} catch (e) {
// Attempt basic concatenation if advanced resolution fails
if (src && !src.startsWith('http')) {
try {
src = new URL(url).origin + (src.startsWith('/') ? '' : '/') + src;
src = new URL(src).href; // Validate again
} catch (e2) {
console.warn(`Could not resolve image URL: ${$(el).attr('src')} with base ${url}`);
return; // Skip if still not resolvable
}
} else if (!src) {
return; // Skip if src is empty
}
}

const alt = ($(el).attr('alt') || '').toLowerCase();
const className = ($(el).attr('class') || '').toLowerCase();
const parentClasses = ($(el).parent().attr('class') || '').toLowerCase();
const width = parseInt($(el).attr('width') || $(el).css('width'), 10) || 0;
const height = parseInt($(el).attr('height') || $(el).css('height'), 10) || 0;

if (alt.includes('logo') || alt.includes('avatar') || alt.includes('icon') || className.includes('logo') || className.includes('avatar') || className.includes('ad') || parentClasses.includes('ad') || parentClasses.includes('logo')) return;
if ((width && width < 100 && !src.endsWith('.svg')) || (height && height < 100 && !src.endsWith('.svg'))) return;
if (src.includes('data:image') || ['doubleclick.net', 'googleadservices.com', 'googlesyndication.com', 'amazon-adsystem.com', 'adnxs.com', '/spinner.gif', 'loading.gif', 'placeholder.svg'].some(p => src.includes(p))) return;

if (!imageUrls.has(src)) {
// Modified to return only the src URL as requested for the primary task
images.push({ src: src });
imageUrls.add(src);
}
}
});

let mainText = '';
const contentSelectors = ['article.hentry .entry-content', 'article .td-post-content', 'article .entry-content', 'article .post-content', 'div[itemprop="articleBody"]', '.entry-content', '.article-body', 'article'];
let contentElement = null;
for (const selector of contentSelectors) {
const found = $(selector);
if (found.length) { contentElement = found.first(); break; }
}

if (contentElement && contentElement.length) {
const $content = contentElement.clone();
$content.find('script, style, noscript, iframe, form, button, input, textarea, svg, header, footer, nav, aside, .noprint, .advertisement, .ad, .banner, .sidebar, .related-posts, .comments, .author-bio, .share-buttons, .post-meta, .entry-meta, .toc, #toc, [class*="social"], [aria-hidden="true"]').remove();
const textBlocks = [];
$content.children('p, h1, h2, h3, h4, h5, h6, li, blockquote, pre').each((i, el) => {
const blockText = $(el).text().trim();
if (blockText && blockText.length > 10) textBlocks.push(blockText);
});
mainText = textBlocks.join('\n\n');
if (mainText.length < 200 && contentElement.text().length > mainText.length) { // Check if heuristic stripping was too aggressive
mainText = contentElement.text().replace(/\s\s+/g, ' ').replace(/(\r\n|\n|\r){2,}/g, '\n\n').trim();
}
} else {
const $bodyClone = $('body').clone();
$bodyClone.find('header, footer, nav, aside, script, style, noscript, iframe, .advertisement, .ad, .banner, .noprint').remove();
mainText = $bodyClone.text().replace(/\s\s+/g, ' ').replace(/(\r\n|\n|\r){2,}/g, '\n\n').trim();
}
mainText = mainText.replace(/\n{3,}/g, '\n\n').trim() || 'Main text not found (Cheerio).';

return [{
json: {
title: title,
images: images.slice(0,10), // Return more images if available with Cheerio
mainText: mainText.substring(0, 5000), // Return more text if available
sourceUrl: url,
parser: "cheerio" // Indicate which parser was theoretically used
}
}];
// --- End of Cheerio-based Extraction Logic ---

} else {
// --- REGEX FALLBACK LOGIC (This WILL be used in n8n Cloud) ---
console.warn("Warning: HTML parsing library (like Cheerio) not available. Attempting simplified regex extraction. Results will be unreliable. Consider using the n8n HTML Node for robust parsing.");

const titleMatch = htmlText.match(/<title>([^<]+)<\/title>/i);
const extractedTitle = titleMatch ? titleMatch[1].trim() : 'Title not found (regex)';

const extractedImages = [];
const imageUrlsRegex = new Set(); // To avoid duplicate image URLs

// Regex to capture img tags and their src, alt, and class attributes
// This is more complex and still less reliable than a proper parser.
// It tries to account for different orders of src, alt, class attributes.
const imgTagRegex = /<img[^>]*src\s*=\s*["']([^"']+)["'](?:[^>]*alt\s*=\s*["']([^"']*)["'])?(?:[^>]*class\s*=\s*["']([^"']*)["'])?[^>]*>|<img[^>]*alt\s*=\s*["']([^"']*)["'](?:[^>]*src\s*=\s*["']([^"']+)["'])?(?:[^>]*class\s*=\s*["']([^"']*)["'])?[^>]*>|<img[^>]*class\s*=\s*["']([^"']*)["'](?:[^>]*src\s*=\s*["']([^"']+)["'])?(?:[^>]*alt\s*=\s*["']([^"']*)["'])?[^>]*>/gi;

let imgMatch;
while((imgMatch = imgTagRegex.exec(htmlText)) !== null) {
let src, alt, classAttr;

if (imgMatch[1] !== undefined) { // src first
src = imgMatch[1];
alt = imgMatch[2];
classAttr = imgMatch[3];
} else if (imgMatch[5] !== undefined) { // alt first, then src
alt = imgMatch[4];
src = imgMatch[5];
classAttr = imgMatch[6];
} else if (imgMatch[8] !== undefined) { // class first, then src
classAttr = imgMatch[7];
src = imgMatch[8];
alt = imgMatch[9];
} else {
// Fallback for simple src if others fail (e.g. if only src is present)
const simpleSrcMatch = /<img[^>]+src="([^" >]+)"/i.exec(imgMatch[0]);
if (simpleSrcMatch && simpleSrcMatch[1]) {
src = simpleSrcMatch[1];
} else {
continue;
}
alt = ''; // Assume no alt if not captured by complex regex
classAttr = ''; // Assume no class
}

if (!src) continue;

src = src.trim();
alt = (alt || '').trim().toLowerCase(); // Ensure alt is a string
// classAttr = (classAttr || '').trim().toLowerCase(); // For potential future use

// Basic filtering (similar to original, can be expanded)
const commonExclusions = ['logo', 'avatar', 'icon', 'ad', 'banner', 'spinner', 'loading', 'placeholder', 'transparent.gif', 'data:image', '.svg'];
if (commonExclusions.some(ex => src.includes(ex) || alt.includes(ex))) {
continue;
}
// Example of how you might try to use classAttr, though it's still limited:
// if (classAttr && (classAttr.includes('ad-icon') || classAttr.includes('profile-pic'))) {
// continue;
// }

try {
let resolvedSrc;
const base = new URL(url); // Assuming `url` is the base URL of the article
if (src.startsWith('//')) resolvedSrc = base.protocol + src;
else if (src.startsWith('/')) resolvedSrc = base.origin + src;
else if (!src.startsWith('http')) resolvedSrc = new URL(src, base.href).href; // More robust relative URL resolution
else resolvedSrc = src;

resolvedSrc = new URL(resolvedSrc).href; // Validate and normalize

if (!imageUrlsRegex.has(resolvedSrc)) {
extractedImages.push({ src: resolvedSrc }); // Return only src as requested
imageUrlsRegex.add(resolvedSrc);
}
} catch (e) {
console.warn(`Regex: Could not resolve or validate image URL: ${src} with base ${url}. Error: ${e.message}`);
}
}

const bodyTextMatch = htmlText.match(/<body[^>]*>([\s\S]*?)<\/body>/i);
let extractedText = 'Main text not found (regex)';
if (bodyTextMatch && bodyTextMatch[1]) {
extractedText = bodyTextMatch[1].replace(/<script[^>]*>[\s\S]*?<\/script>/gi, '')
.replace(/<style[^>]*>[\s\S]*?<\/style>/gi, '')
.replace(/<nav[^>]*>[\s\S]*?<\/nav>/gi, '')
.replace(/<header[^>]*>[\s\S]*?<\/header>/gi, '')
.replace(/<footer[^>]*>[\s\S]*?<\/footer>/gi, '')
.replace(/<aside[^>]*>[\s\S]*?<\/aside>/gi, '')
.replace(/<form[^>]*>[\s\S]*?<\/form>/gi, '')
.replace(/<button[^>]*>[\s\S]*?<\/button>/gi, '')
.replace(/<iframe[^>]*>[\s\S]*?<\/iframe>/gi, '')
.replace(/<svg[^>]*>[\s\S]*?<\/svg>/gi, '')
.replace(/<!--[\s\S]*?-->/g, '') // Remove comments
.replace(/<[^>]+>/g, ' ') // Strip remaining tags
.replace(/\s\s+/g, ' ').trim(); // Normalize whitespace
}

return [{ json: {
title: extractedTitle,
images: extractedImages.slice(0, 5), // Limit to 5 images for regex output
mainText: extractedText.substring(0,1500), // Limit text length
sourceUrl: url,
parser: "regex", // Indicate regex was used
warning: 'Used unreliable regex extraction due to missing parser. Consider using n8n HTML Node.'
}}];
// --- End of Simplified Regex Extraction ---
}

} catch (e) {
console.error("Failed to load HTML into parser or during regex processing:", e.message);
return [{ json: { error: `Failed during parsing: ${e.message}`, sourceUrl: url } }];
}

// This part of the code should ideally not be reached if the try/catch above handles all returns.
// It's a fallback for unexpected scenarios.
return [{
json: {
title: "Error: Unexpected end of script execution.",
images: [],
mainText: "HTML processing did not complete as expected.",
sourceUrl: url,
parser: "unknown",
warning: "Execution path terminated unexpectedly."
}
}];