martinsbalodis · chawboiii · Jul 5, 2025
diff --git a/article_scraper.js b/article_scraper.js
@@ -0,0 +1,252 @@
+// n8n Code Node - JavaScript
+//
+// Fetches the page at `json.url` of the first input item and returns one item
+// shaped { title, images: [{ src }], mainText, sourceUrl, parser }. A failed
+// fetch or parse returns one item shaped { error, sourceUrl } instead.
+//
+// IMPORTANT: Cheerio is NOT available in n8n Cloud Code Nodes, so the regex
+// fallback is what runs there. For a more robust solution in n8n, use the
+// HTTP Request Node followed by the HTML Node.
+
+// Selectors tried, in order, to locate the article body.
+const CONTENT_SELECTORS = ['article.hentry .entry-content', 'article .td-post-content', 'article .entry-content', 'article .post-content', 'div[itemprop="articleBody"]', '.entry-content', '.article-body', 'article'];
+
+// Image candidates: images inside likely article containers plus common featured-image classes.
+const IMAGE_SELECTOR = 'article img, .entry-content img, .post-content img, .article-body img, main img, .td-post-content img, .single-post-content img, img.featured-image, img.wp-post-image, img.article-thumbnail';
+
+// Elements removed from the article container before its text is collected.
+const CONTENT_NOISE_SELECTOR = 'script, style, noscript, iframe, form, button, input, textarea, svg, header, footer, nav, aside, .noprint, .advertisement, .ad, .banner, .sidebar, .related-posts, .comments, .author-bio, .share-buttons, .post-meta, .entry-meta, .toc, #toc, [class*="social"], [aria-hidden="true"]';
+
+// Elements removed from <body> when no article container is found.
+const BODY_NOISE_SELECTOR = 'header, footer, nav, aside, script, style, noscript, iframe, .advertisement, .ad, .banner, .noprint';
+
+// URL fragments that identify ad networks and loading placeholders.
+const BLOCKED_URL_PARTS = ['doubleclick.net', 'googleadservices.com', 'googlesyndication.com', 'amazon-adsystem.com', 'adnxs.com', '/spinner.gif', 'loading.gif', 'placeholder.svg'];
+
+// Attributes checked, in order, for the image URL; an inline data: value in one falls through to the next.
+const IMAGE_SOURCE_ATTRS = ['src', 'data-src', 'data-lazy-src'];
+
+// Word lists are matched against whole tokens, never substrings: a substring
+// test for "ad" also matches "uploads", "header" and "lazyload", so it
+// rejected images served from paths such as /wp-content/uploads/.
+const REGEX_NOISE_WORDS = ['logo', 'avatar', 'icon', 'favicon', 'ad', 'advert', 'advertisement', 'adsbygoogle', 'banner', 'spinner', 'loading', 'placeholder'];
+
+// Container tags whose whole content is dropped by the regex text extraction.
+const STRIPPED_TAGS = ['script', 'style', 'noscript', 'nav', 'header', 'footer', 'aside', 'form', 'button', 'iframe', 'svg'];
+
+/** Returns true when `text` contains one of `words` (or its plural) as a whole token. */
+function hasWord(text, words) {
+  return String(text || '')
+    .toLowerCase()
+    .split(/[^a-z0-9]+/)
+    .some((token) => words.some((word) => token === word || token === `${word}s`));
+}
+
+/** Picks the first candidate that is non-empty and not an inline data: URI. */
+function pickImageSource(candidates) {
+  return candidates.find((value) => value && !/^\s*data:/i.test(value));
+}
+
+/** Resolves `src` against `baseUrl`; returns null unless the result is an http(s) URL. */
+function resolveImageUrl(src, baseUrl) {
+  try {
+    // The URL constructor handles absolute, protocol-relative, root-relative and path-relative input.
+    const resolved = new URL(src, baseUrl);
+    return /^https?:$/.test(resolved.protocol) ? resolved.href : null;
+  } catch (e) {
+    return null; // Callers log the unresolved source and skip the image.
+  }
+}
+
+/** Decodes the few HTML entities that commonly appear in titles and body text. */
+function decodeEntities(text) {
+  return text
+    .replace(/&nbsp;/gi, ' ')
+    .replace(/&lt;/gi, '<')
+    .replace(/&gt;/gi, '>')
+    .replace(/&quot;/gi, '"')
+    .replace(/&#0*39;|&apos;/gi, "'")
+    .replace(/&amp;/gi, '&'); // Last, so "&amp;lt;" decodes to "&lt;" rather than "<".
+}
+
+/** Reads a quoted attribute value from one tag's markup; returns '' when absent. */
+function readAttr(tag, name) {
+  const match = new RegExp(`\\s${name}\\s*=\\s*(?:"([^"]*)"|'([^']*)')`, 'i').exec(tag);
+  return match ? (match[1] ?? match[2] ?? '') : '';
+}
+
+/**
+ * Extracts title, images and main text with Cheerio.
+ * Only runs where a global `cheerio` exists.
+ */
+function extractWithCheerio(html, pageUrl) {
+  const $ = cheerio.load(html);
+
+  const title =
+    $('head title').text().trim() ||
+    $('meta[property="og:title"]').attr('content')?.trim() ||
+    $('h1').first().text().trim() ||
+    $('article h1').first().text().trim() ||
+    $('.entry-title').first().text().trim() ||
+    'Title not found (Cheerio)';
+
+  const images = [];
+  const seen = new Set();
+  $(IMAGE_SELECTOR).each((i, el) => {
+    const $img = $(el);
+    const rawSrc = pickImageSource(IMAGE_SOURCE_ATTRS.map((attr) => $img.attr(attr)));
+    if (!rawSrc) return;
+
+    const src = resolveImageUrl(rawSrc, pageUrl);
+    if (!src) {
+      console.warn(`Could not resolve image URL: ${rawSrc} with base ${pageUrl}`);
+      return;
+    }
+
+    const isChrome =
+      hasWord($img.attr('alt'), ['logo', 'avatar', 'icon']) ||
+      hasWord($img.attr('class'), ['logo', 'avatar', 'ad']) ||
+      hasWord($img.parent().attr('class'), ['ad', 'logo']);
+    if (isChrome) return;
+
+    // Images with a declared dimension under 100px are skipped; SVGs are exempt.
+    const width = parseInt($img.attr('width') || $img.css('width'), 10) || 0;
+    const height = parseInt($img.attr('height') || $img.css('height'), 10) || 0;
+    const isTiny = (width && width < 100) || (height && height < 100);
+    if (isTiny && !src.endsWith('.svg')) return;
+
+    if (BLOCKED_URL_PARTS.some((part) => src.includes(part))) return;
+    if (seen.has(src)) return;
+
+    seen.add(src);
+    images.push({ src });
+  });
+
+  let contentElement = null;
+  for (const selector of CONTENT_SELECTORS) {
+    const found = $(selector);
+    if (found.length) {
+      contentElement = found.first();
+      break;
+    }
+  }
+
+  let mainText = '';
+  if (contentElement) {
+    const $content = contentElement.clone();
+    $content.find(CONTENT_NOISE_SELECTOR).remove();
+    const textBlocks = [];
+    $content.children('p, h1, h2, h3, h4, h5, h6, li, blockquote, pre').each((i, el) => {
+      const blockText = $(el).text().trim();
+      if (blockText.length > 10) textBlocks.push(blockText);
+    });
+    mainText = textBlocks.join('\n\n');
+    if (mainText.length < 200) {
+      // Little direct-child text: fall back to the cleaned container so removed noise stays out.
+      const cleanedText = $content.text().replace(/\s+/g, ' ').trim();
+      if (cleanedText.length > mainText.length) mainText = cleanedText;
+    }
+  } else {
+    const $body = $('body').clone();
+    $body.find(BODY_NOISE_SELECTOR).remove();
+    mainText = $body.text().replace(/\s+/g, ' ').trim();
+  }
+  mainText = mainText.replace(/\n{3,}/g, '\n\n').trim() || 'Main text not found (Cheerio).';
+
+  return {
+    title,
+    images: images.slice(0, 10), // Return more images if available with Cheerio
+    mainText: mainText.substring(0, 5000), // Return more text if available
+    sourceUrl: pageUrl,
+    parser: 'cheerio', // Indicate which parser was used
+  };
+}
+
+/**
+ * Extracts title, images and main text with regular expressions.
+ * Best-effort only; this is the path taken when no global `cheerio` exists.
+ */
+function extractWithRegex(html, pageUrl) {
+  console.warn("Warning: HTML parsing library (like Cheerio) not available. Attempting simplified regex extraction. Results will be unreliable. Consider using the n8n HTML Node for robust parsing.");
+
+  // Allow attributes on <title> and line breaks inside it.
+  const titleMatch = /<title\b[^>]*>([\s\S]*?)<\/title>/i.exec(html);
+  const title = (titleMatch && decodeEntities(titleMatch[1]).replace(/\s+/g, ' ').trim()) || 'Title not found (regex)';
+
+  const images = [];
+  const seen = new Set();
+  // Match each <img> tag once and read its attributes individually, so filtering does not depend on attribute order.
+  for (const [tag] of html.matchAll(/<img\b[^>]*>/gi)) {
+    const rawSrc = pickImageSource(IMAGE_SOURCE_ATTRS.map((attr) => readAttr(tag, attr)));
+    if (!rawSrc) continue;
+
+    const lowerSrc = rawSrc.toLowerCase();
+    if (hasWord(lowerSrc, REGEX_NOISE_WORDS) || hasWord(readAttr(tag, 'alt'), REGEX_NOISE_WORDS)) continue;
+    if (lowerSrc.includes('transparent.gif') || lowerSrc.includes('.svg')) continue;
+
+    const src = resolveImageUrl(rawSrc, pageUrl);
+    if (!src) {
+      console.warn(`Regex: Could not resolve or validate image URL: ${rawSrc} with base ${pageUrl}.`);
+      continue;
+    }
+    if (seen.has(src)) continue;
+
+    seen.add(src);
+    images.push({ src });
+  }
+
+  let mainText = '';
+  const bodyMatch = /<body\b[^>]*>([\s\S]*?)<\/body>/i.exec(html);
+  if (bodyMatch) {
+    const withoutContainers = STRIPPED_TAGS.reduce(
+      (markup, tag) => markup.replace(new RegExp(`<${tag}\\b[^>]*>[\\s\\S]*?</${tag}>`, 'gi'), ''),
+      bodyMatch[1],
+    );
+    const withoutTags = withoutContainers
+      .replace(/<!--[\s\S]*?-->/g, '') // Remove comments
+      .replace(/<[^>]+>/g, ' '); // Strip remaining tags
+    mainText = decodeEntities(withoutTags).replace(/\s+/g, ' ').trim();
+  }
+
+  return {
+    title,
+    images: images.slice(0, 5), // Limit to 5 images for regex output
+    mainText: (mainText || 'Main text not found (regex)').substring(0, 1500), // Limit text length
+    sourceUrl: pageUrl,
+    parser: 'regex', // Indicate regex was used
+    warning: 'Used unreliable regex extraction due to missing parser. Consider using n8n HTML Node.',
+  };
+}
+
+// ---- MAIN ----
+// Get the URL from the input item.
+const url = $input.first().json.url;
+
+if (!url) {
+  throw new Error("URL is missing from input item.");
+}
+
+let htmlText = '';
+try {
+  const response = await this.helpers.httpRequest({
+    url,
+    method: 'GET',
+    json: false, // We want the raw HTML string
+  });
+  // Coerce a non-string body so the extractors can rely on string methods.
+  htmlText = typeof response === 'string' ? response : String(response ?? '');
+} catch (error) {
+  console.error(`Error fetching URL ${url}:`, error.message);
+  return [{ json: { error: `Failed to fetch URL: ${error.message}`, sourceUrl: url } }];
+}
+
+try {
+  // Use Cheerio only when the host exposes it as a global; otherwise fall back to regex.
+  const extracted = typeof cheerio !== 'undefined'
+    ? extractWithCheerio(htmlText, url)
+    : extractWithRegex(htmlText, url);
+  return [{ json: extracted }];
+} catch (e) {
+  console.error("Failed to load HTML into parser or during regex processing:", e.message);
+  return [{ json: { error: `Failed during parsing: ${e.message}`, sourceUrl: url } }];
+}