diff --git a/article_scraper.js b/article_scraper.js new file mode 100644 index 00000000..ccbb6dac --- /dev/null +++ b/article_scraper.js @@ -0,0 +1,254 @@ +// n8n Code Node - JavaScript + +// Get the URL from the input or a parameter +const url = $input.first().json.url; + +if (!url) { + throw new Error("URL is missing from input item."); +} + +let htmlText = ''; +try { + const response = await this.helpers.httpRequest({ + url: url, + method: 'GET', + json: false, // We want the raw HTML string + }); + htmlText = response; +} catch (error) { + console.error(`Error fetching URL ${url}:`, error.message); + return [{ json: { error: `Failed to fetch URL: ${error.message}`, sourceUrl: url } }]; +} + +// ---- PARSING LOGIC ---- +// IMPORTANT: Cheerio is NOT available in n8n Cloud Code Nodes. +// The following code will ALWAYS use the regex fallback in n8n Cloud. +// For a more robust solution in n8n, use the HTTP Request Node followed by the HTML Node. + +let $; // This will remain undefined in n8n Cloud + +try { + // This check will fail in n8n Cloud, leading to the regex fallback. + if (typeof cheerio !== 'undefined') { + // This block will NOT be executed in n8n Cloud + $ = cheerio.load(htmlText); + + // --- Cheerio-based Extraction Logic (NOT USED IN N8N CLOUD) --- + let title = $('head title').text().trim(); + if (!title) title = $('meta[property="og:title"]').attr('content')?.trim() || ''; + if (!title) title = $('h1').first().text().trim(); + if (!title) title = $('article h1').first().text().trim(); + if (!title) title = $('.entry-title').first().text().trim(); + if (!title) title = 'Title not found (Cheerio)'; + + const images = []; + const imageUrls = new Set(); + // Example: Target images with specific classes or within specific containers + $('article img, .entry-content img, .post-content img, .article-body img, main img, .td-post-content img, .single-post-content img, img.featured-image, img.wp-post-image, img.article-thumbnail').each((i, el) => { + let src = $(el).attr('src') || $(el).attr('data-src') || $(el).attr('data-lazy-src'); + if (src) { + try { + const base = new URL(url); + // Resolve relative URLs + if (src.startsWith('//')) src = base.protocol + src; + else if (src.startsWith('/')) src = base.origin + src; + else if (!src.startsWith('http')) src = new URL(src, base.origin + (base.pathname.endsWith('/') ? base.pathname : base.pathname.substring(0, base.pathname.lastIndexOf('/') + 1))).href; + + // Ensure it's a valid URL before adding + src = new URL(src).href; + + } catch (e) { + // Attempt basic concatenation if advanced resolution fails + if (src && !src.startsWith('http')) { + try { + src = new URL(url).origin + (src.startsWith('/') ? '' : '/') + src; + src = new URL(src).href; // Validate again + } catch (e2) { + console.warn(`Could not resolve image URL: ${$(el).attr('src')} with base ${url}`); + return; // Skip if still not resolvable + } + } else if (!src) { + return; // Skip if src is empty + } + } + + const alt = ($(el).attr('alt') || '').toLowerCase(); + const className = ($(el).attr('class') || '').toLowerCase(); + const parentClasses = ($(el).parent().attr('class') || '').toLowerCase(); + const width = parseInt($(el).attr('width') || $(el).css('width'), 10) || 0; + const height = parseInt($(el).attr('height') || $(el).css('height'), 10) || 0; + + if (alt.includes('logo') || alt.includes('avatar') || alt.includes('icon') || className.includes('logo') || className.includes('avatar') || className.includes('ad') || parentClasses.includes('ad') || parentClasses.includes('logo')) return; + if ((width && width < 100 && !src.endsWith('.svg')) || (height && height < 100 && !src.endsWith('.svg'))) return; + if (src.includes('data:image') || ['doubleclick.net', 'googleadservices.com', 'googlesyndication.com', 'amazon-adsystem.com', 'adnxs.com', '/spinner.gif', 'loading.gif', 'placeholder.svg'].some(p => src.includes(p))) return; + + if (!imageUrls.has(src)) { + // Modified to return only the src URL as requested for the primary task + images.push({ src: src }); + imageUrls.add(src); + } + } + }); + + let mainText = ''; + const contentSelectors = ['article.hentry .entry-content', 'article .td-post-content', 'article .entry-content', 'article .post-content', 'div[itemprop="articleBody"]', '.entry-content', '.article-body', 'article']; + let contentElement = null; + for (const selector of contentSelectors) { + const found = $(selector); + if (found.length) { contentElement = found.first(); break; } + } + + if (contentElement && contentElement.length) { + const $content = contentElement.clone(); + $content.find('script, style, noscript, iframe, form, button, input, textarea, svg, header, footer, nav, aside, .noprint, .advertisement, .ad, .banner, .sidebar, .related-posts, .comments, .author-bio, .share-buttons, .post-meta, .entry-meta, .toc, #toc, [class*="social"], [aria-hidden="true"]').remove(); + const textBlocks = []; + $content.children('p, h1, h2, h3, h4, h5, h6, li, blockquote, pre').each((i, el) => { + const blockText = $(el).text().trim(); + if (blockText && blockText.length > 10) textBlocks.push(blockText); + }); + mainText = textBlocks.join('\n\n'); + if (mainText.length < 200 && contentElement.text().length > mainText.length) { // Check if heuristic stripping was too aggressive + mainText = contentElement.text().replace(/\s\s+/g, ' ').replace(/(\r\n|\n|\r){2,}/g, '\n\n').trim(); + } + } else { + const $bodyClone = $('body').clone(); + $bodyClone.find('header, footer, nav, aside, script, style, noscript, iframe, .advertisement, .ad, .banner, .noprint').remove(); + mainText = $bodyClone.text().replace(/\s\s+/g, ' ').replace(/(\r\n|\n|\r){2,}/g, '\n\n').trim(); + } + mainText = mainText.replace(/\n{3,}/g, '\n\n').trim() || 'Main text not found (Cheerio).'; + + return [{ + json: { + title: title, + images: images.slice(0,10), // Return more images if available with Cheerio + mainText: mainText.substring(0, 5000), // Return more text if available + sourceUrl: url, + parser: "cheerio" // Indicate which parser was theoretically used + } + }]; + // --- End of Cheerio-based Extraction Logic --- + + } else { + // --- REGEX FALLBACK LOGIC (This WILL be used in n8n Cloud) --- + console.warn("Warning: HTML parsing library (like Cheerio) not available. Attempting simplified regex extraction. Results will be unreliable. Consider using the n8n HTML Node for robust parsing."); + + const titleMatch = htmlText.match(/([^<]+)<\/title>/i); + const extractedTitle = titleMatch ? titleMatch[1].trim() : 'Title not found (regex)'; + + const extractedImages = []; + const imageUrlsRegex = new Set(); // To avoid duplicate image URLs + + // Regex to capture img tags and their src, alt, and class attributes + // This is more complex and still less reliable than a proper parser. + // It tries to account for different orders of src, alt, class attributes. + const imgTagRegex = /<img[^>]*src\s*=\s*["']([^"']+)["'](?:[^>]*alt\s*=\s*["']([^"']*)["'])?(?:[^>]*class\s*=\s*["']([^"']*)["'])?[^>]*>|<img[^>]*alt\s*=\s*["']([^"']*)["'](?:[^>]*src\s*=\s*["']([^"']+)["'])?(?:[^>]*class\s*=\s*["']([^"']*)["'])?[^>]*>|<img[^>]*class\s*=\s*["']([^"']*)["'](?:[^>]*src\s*=\s*["']([^"']+)["'])?(?:[^>]*alt\s*=\s*["']([^"']*)["'])?[^>]*>/gi; + + let imgMatch; + while((imgMatch = imgTagRegex.exec(htmlText)) !== null) { + let src, alt, classAttr; + + if (imgMatch[1] !== undefined) { // src first + src = imgMatch[1]; + alt = imgMatch[2]; + classAttr = imgMatch[3]; + } else if (imgMatch[5] !== undefined) { // alt first, then src + alt = imgMatch[4]; + src = imgMatch[5]; + classAttr = imgMatch[6]; + } else if (imgMatch[8] !== undefined) { // class first, then src + classAttr = imgMatch[7]; + src = imgMatch[8]; + alt = imgMatch[9]; + } else { + // Fallback for simple src if others fail (e.g. if only src is present) + const simpleSrcMatch = /<img[^>]+src="([^" >]+)"/i.exec(imgMatch[0]); + if (simpleSrcMatch && simpleSrcMatch[1]) { + src = simpleSrcMatch[1]; + } else { + continue; + } + alt = ''; // Assume no alt if not captured by complex regex + classAttr = ''; // Assume no class + } + + if (!src) continue; + + src = src.trim(); + alt = (alt || '').trim().toLowerCase(); // Ensure alt is a string + // classAttr = (classAttr || '').trim().toLowerCase(); // For potential future use + + // Basic filtering (similar to original, can be expanded) + const commonExclusions = ['logo', 'avatar', 'icon', 'ad', 'banner', 'spinner', 'loading', 'placeholder', 'transparent.gif', 'data:image', '.svg']; + if (commonExclusions.some(ex => src.includes(ex) || alt.includes(ex))) { + continue; + } + // Example of how you might try to use classAttr, though it's still limited: + // if (classAttr && (classAttr.includes('ad-icon') || classAttr.includes('profile-pic'))) { + // continue; + // } + + try { + let resolvedSrc; + const base = new URL(url); // Assuming `url` is the base URL of the article + if (src.startsWith('//')) resolvedSrc = base.protocol + src; + else if (src.startsWith('/')) resolvedSrc = base.origin + src; + else if (!src.startsWith('http')) resolvedSrc = new URL(src, base.href).href; // More robust relative URL resolution + else resolvedSrc = src; + + resolvedSrc = new URL(resolvedSrc).href; // Validate and normalize + + if (!imageUrlsRegex.has(resolvedSrc)) { + extractedImages.push({ src: resolvedSrc }); // Return only src as requested + imageUrlsRegex.add(resolvedSrc); + } + } catch (e) { + console.warn(`Regex: Could not resolve or validate image URL: ${src} with base ${url}. Error: ${e.message}`); + } + } + + const bodyTextMatch = htmlText.match(/<body[^>]*>([\s\S]*?)<\/body>/i); + let extractedText = 'Main text not found (regex)'; + if (bodyTextMatch && bodyTextMatch[1]) { + extractedText = bodyTextMatch[1].replace(/<script[^>]*>[\s\S]*?<\/script>/gi, '') + .replace(/<style[^>]*>[\s\S]*?<\/style>/gi, '') + .replace(/<nav[^>]*>[\s\S]*?<\/nav>/gi, '') + .replace(/<header[^>]*>[\s\S]*?<\/header>/gi, '') + .replace(/<footer[^>]*>[\s\S]*?<\/footer>/gi, '') + .replace(/<aside[^>]*>[\s\S]*?<\/aside>/gi, '') + .replace(/<form[^>]*>[\s\S]*?<\/form>/gi, '') + .replace(/<button[^>]*>[\s\S]*?<\/button>/gi, '') + .replace(/<iframe[^>]*>[\s\S]*?<\/iframe>/gi, '') + .replace(/<svg[^>]*>[\s\S]*?<\/svg>/gi, '') + .replace(/<!--[\s\S]*?-->/g, '') // Remove comments + .replace(/<[^>]+>/g, ' ') // Strip remaining tags + .replace(/\s\s+/g, ' ').trim(); // Normalize whitespace + } + + return [{ json: { + title: extractedTitle, + images: extractedImages.slice(0, 5), // Limit to 5 images for regex output + mainText: extractedText.substring(0,1500), // Limit text length + sourceUrl: url, + parser: "regex", // Indicate regex was used + warning: 'Used unreliable regex extraction due to missing parser. Consider using n8n HTML Node.' + }}]; + // --- End of Simplified Regex Extraction --- + } + +} catch (e) { + console.error("Failed to load HTML into parser or during regex processing:", e.message); + return [{ json: { error: `Failed during parsing: ${e.message}`, sourceUrl: url } }]; +} + +// This part of the code should ideally not be reached if the try/catch above handles all returns. +// It's a fallback for unexpected scenarios. +return [{ + json: { + title: "Error: Unexpected end of script execution.", + images: [], + mainText: "HTML processing did not complete as expected.", + sourceUrl: url, + parser: "unknown", + warning: "Execution path terminated unexpectedly." + } +}];