diff --git a/article_scraper.js b/article_scraper.js new file mode 100644 index 00000000..ccbb6dac --- /dev/null +++ b/article_scraper.js @@ -0,0 +1,254 @@ +// n8n Code Node - JavaScript + +// Get the URL from the input or a parameter +const url = $input.first().json.url; + +if (!url) { + throw new Error("URL is missing from input item."); +} + +let htmlText = ''; +try { + const response = await this.helpers.httpRequest({ + url: url, + method: 'GET', + json: false, // We want the raw HTML string + }); + htmlText = response; +} catch (error) { + console.error(`Error fetching URL ${url}:`, error.message); + return [{ json: { error: `Failed to fetch URL: ${error.message}`, sourceUrl: url } }]; +} + +// ---- PARSING LOGIC ---- +// IMPORTANT: Cheerio is NOT available in n8n Cloud Code Nodes. +// The following code will ALWAYS use the regex fallback in n8n Cloud. +// For a more robust solution in n8n, use the HTTP Request Node followed by the HTML Node. + +let $; // This will remain undefined in n8n Cloud + +try { + // This check will fail in n8n Cloud, leading to the regex fallback. + if (typeof cheerio !== 'undefined') { + // This block will NOT be executed in n8n Cloud + $ = cheerio.load(htmlText); + + // --- Cheerio-based Extraction Logic (NOT USED IN N8N CLOUD) --- + let title = $('head title').text().trim(); + if (!title) title = $('meta[property="og:title"]').attr('content')?.trim() || ''; + if (!title) title = $('h1').first().text().trim(); + if (!title) title = $('article h1').first().text().trim(); + if (!title) title = $('.entry-title').first().text().trim(); + if (!title) title = 'Title not found (Cheerio)'; + + const images = []; + const imageUrls = new Set(); + // Example: Target images with specific classes or within specific containers + $('article img, .entry-content img, .post-content img, .article-body img, main img, .td-post-content img, .single-post-content img, img.featured-image, img.wp-post-image, img.article-thumbnail').each((i, el) => { + let src = $(el).attr('src') || $(el).attr('data-src') || $(el).attr('data-lazy-src'); + if (src) { + try { + const base = new URL(url); + // Resolve relative URLs + if (src.startsWith('//')) src = base.protocol + src; + else if (src.startsWith('/')) src = base.origin + src; + else if (!src.startsWith('http')) src = new URL(src, base.origin + (base.pathname.endsWith('/') ? base.pathname : base.pathname.substring(0, base.pathname.lastIndexOf('/') + 1))).href; + + // Ensure it's a valid URL before adding + src = new URL(src).href; + + } catch (e) { + // Attempt basic concatenation if advanced resolution fails + if (src && !src.startsWith('http')) { + try { + src = new URL(url).origin + (src.startsWith('/') ? '' : '/') + src; + src = new URL(src).href; // Validate again + } catch (e2) { + console.warn(`Could not resolve image URL: ${$(el).attr('src')} with base ${url}`); + return; // Skip if still not resolvable + } + } else if (!src) { + return; // Skip if src is empty + } + } + + const alt = ($(el).attr('alt') || '').toLowerCase(); + const className = ($(el).attr('class') || '').toLowerCase(); + const parentClasses = ($(el).parent().attr('class') || '').toLowerCase(); + const width = parseInt($(el).attr('width') || $(el).css('width'), 10) || 0; + const height = parseInt($(el).attr('height') || $(el).css('height'), 10) || 0; + + if (alt.includes('logo') || alt.includes('avatar') || alt.includes('icon') || className.includes('logo') || className.includes('avatar') || className.includes('ad') || parentClasses.includes('ad') || parentClasses.includes('logo')) return; + if ((width && width < 100 && !src.endsWith('.svg')) || (height && height < 100 && !src.endsWith('.svg'))) return; + if (src.includes('data:image') || ['doubleclick.net', 'googleadservices.com', 'googlesyndication.com', 'amazon-adsystem.com', 'adnxs.com', '/spinner.gif', 'loading.gif', 'placeholder.svg'].some(p => src.includes(p))) return; + + if (!imageUrls.has(src)) { + // Modified to return only the src URL as requested for the primary task + images.push({ src: src }); + imageUrls.add(src); + } + } + }); + + let mainText = ''; + const contentSelectors = ['article.hentry .entry-content', 'article .td-post-content', 'article .entry-content', 'article .post-content', 'div[itemprop="articleBody"]', '.entry-content', '.article-body', 'article']; + let contentElement = null; + for (const selector of contentSelectors) { + const found = $(selector); + if (found.length) { contentElement = found.first(); break; } + } + + if (contentElement && contentElement.length) { + const $content = contentElement.clone(); + $content.find('script, style, noscript, iframe, form, button, input, textarea, svg, header, footer, nav, aside, .noprint, .advertisement, .ad, .banner, .sidebar, .related-posts, .comments, .author-bio, .share-buttons, .post-meta, .entry-meta, .toc, #toc, [class*="social"], [aria-hidden="true"]').remove(); + const textBlocks = []; + $content.children('p, h1, h2, h3, h4, h5, h6, li, blockquote, pre').each((i, el) => { + const blockText = $(el).text().trim(); + if (blockText && blockText.length > 10) textBlocks.push(blockText); + }); + mainText = textBlocks.join('\n\n'); + if (mainText.length < 200 && contentElement.text().length > mainText.length) { // Check if heuristic stripping was too aggressive + mainText = contentElement.text().replace(/\s\s+/g, ' ').replace(/(\r\n|\n|\r){2,}/g, '\n\n').trim(); + } + } else { + const $bodyClone = $('body').clone(); + $bodyClone.find('header, footer, nav, aside, script, style, noscript, iframe, .advertisement, .ad, .banner, .noprint').remove(); + mainText = $bodyClone.text().replace(/\s\s+/g, ' ').replace(/(\r\n|\n|\r){2,}/g, '\n\n').trim(); + } + mainText = mainText.replace(/\n{3,}/g, '\n\n').trim() || 'Main text not found (Cheerio).'; + + return [{ + json: { + title: title, + images: images.slice(0,10), // Return more images if available with Cheerio + mainText: mainText.substring(0, 5000), // Return more text if available + sourceUrl: url, + parser: "cheerio" // Indicate which parser was theoretically used + } + }]; + // --- End of Cheerio-based Extraction Logic --- + + } else { + // --- REGEX FALLBACK LOGIC (This WILL be used in n8n Cloud) --- + console.warn("Warning: HTML parsing library (like Cheerio) not available. Attempting simplified regex extraction. Results will be unreliable. Consider using the n8n HTML Node for robust parsing."); + + const titleMatch = htmlText.match(/