diff --git a/maxun-core/src/browserSide/scraper.js b/maxun-core/src/browserSide/scraper.js
index fdf1ff9c9..756c630fd 100644
--- a/maxun-core/src/browserSide/scraper.js
+++ b/maxun-core/src/browserSide/scraper.js
@@ -1510,4 +1510,127 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
     return results;
   };
 
-})(window);
\ No newline at end of file
+})(window);
+
+/* Media extraction support - listens for clicks on media elements and extracts text.
+   Sends { url, tag, selector, extractedText } via postMessage to the parent window. */
+
+// Extract text from image: alt/title first, then OCR via Tesseract if available
+async function extractImageText(img) {
+  try {
+    const altTitle = (img.alt || img.title || '').trim();
+    if (altTitle) return altTitle;
+
+    if (window.Tesseract && typeof window.Tesseract.recognize === 'function') {
+      // Use the image src (may be a data: or remote URL)
+      const src = img.currentSrc || img.src || '';
+      if (!src) return '';
+      try {
+        const result = await window.Tesseract.recognize(src, 'eng');
+        return (result?.data?.text || '').trim();
+      } catch (e) {
+        return '';
+      }
+    }
+  } catch (e) {
+    return '';
+  }
+  return '';
+}
+
+// Extract text from PDF using pdf.js if available
+async function extractPdfText(url) {
+  try {
+    if (!window.pdfjsLib) return '';
+    const loadingTask = window.pdfjsLib.getDocument(url);
+    const pdf = await loadingTask.promise;
+    let text = '';
+    for (let i = 1; i <= pdf.numPages; i++) {
+      // eslint-disable-next-line no-await-in-loop
+      const page = await pdf.getPage(i);
+      // eslint-disable-next-line no-await-in-loop
+      const content = await page.getTextContent();
+      text += content.items.map((it) => it.str).join(' ') + '\n';
+    }
+    return text.trim();
+  } catch (e) {
+    return '';
+  }
+}
+
+// Helper to generate structural selector if function is available
+function structuralSelector(el) {
+  try {
+    if (typeof GetSelectorStructural === 'function') return GetSelectorStructural(el);
+  } catch (e) {
+    // fallthrough
+  }
+  return '';
+}
+
+// Click listener for media elements
+document.addEventListener('click', async (ev) => {
+  try {
+    const el = ev.target;
+    if (!el || !el.tagName) return;
+    const tag = el.tagName.toLowerCase();
+    let url = '';
+    const selector = structuralSelector(el);
+    let extractedText = '';
+
+    if (tag === 'img') {
+      url = el.currentSrc || el.src || '';
+      // extractImageText() already prefers alt/title and only falls back to OCR
+      extractedText = await extractImageText(el);
+    } else if (tag === 'iframe' || tag === 'embed') {
+      url = el.src || el.data || '';
+      if (url && /\.pdf([?#]|$)/i.test(url)) {
+        extractedText = await extractPdfText(url);
+      }
+    } else if (tag === 'object') {
+      // <object data="..."> style embed
+      url = el.data || '';
+      if (url && /\.pdf([?#]|$)/i.test(url)) {
+        extractedText = await extractPdfText(url);
+      }
+    }
+
+    if (url && extractedText) {
+      // Relay to the parent (recorder frontend). NOTE(review): targetOrigin '*' lets any embedding page read the text
+      try {
+        window.parent.postMessage({
+          type: 'maxun:media-extracted',
+          url,
+          tag,
+          selector,
+          extractedText
+        }, '*');
+      } catch (e) {
+        // ignore
+      }
+    }
+  } catch (e) {
+    // best-effort: extraction failures are intentionally ignored
+  }
+});
+
+// Load Tesseract and PDF.js if not already present (CDN).
+if (!window.Tesseract) {
+  const s = document.createElement('script');
+  s.src = 'https://cdn.jsdelivr.net/npm/tesseract.js@4.0.2/dist/tesseract.min.js';
+  s.async = true;
+  (document.head || document.documentElement).appendChild(s);
+}
+if (!window.pdfjsLib) {
+  const s2 = document.createElement('script');
+  s2.src = 'https://cdn.jsdelivr.net/npm/pdfjs-dist@3.11.174/build/pdf.min.js';
+  s2.async = true;
+  s2.onload = () => {
+    try {
+      // Keep a pdfjsLib global if the bundle already defined one; otherwise use the UMD export name
+      window.pdfjsLib = window.pdfjsLib || window['pdfjs-dist/build/pdf'];
+      if (window.pdfjsLib) window.pdfjsLib.GlobalWorkerOptions.workerSrc = 'https://cdn.jsdelivr.net/npm/pdfjs-dist@3.11.174/build/pdf.worker.min.js';
+    } catch (e) {}
+  };
+  (document.head || document.documentElement).appendChild(s2);
+}
\ No newline at end of file
diff --git a/mediaParser.js b/mediaParser.js
new file mode 100644
index 000000000..c8b0b192e
--- /dev/null
+++ b/mediaParser.js
@@ -0,0 +1,123 @@
+// mediaParser.js
+
+import axios from 'axios';
+import * as cheerio from 'cheerio';
+import { URL } from 'url';
+import logger from './logger'; // Adjust path if necessary
+
+/**
+ * Fetches and extracts all images from a webpage, including responsive images.
+ * This includes regular <img> tags, srcset URLs, and <source> tags within <picture> elements.
+ *
+ * @param {string} url - The webpage URL to extract images from.
+ *                       Must be a valid, non-empty string.
+ * @returns {Promise<Array<{url: string, altText: string}>>} - Resolves to an array of objects:
+ *   {
+ *     url: string,     // The absolute URL of the image
+ *     altText: string  // The alt text of the image (if any)
+ *   }
+ * @throws {TypeError} - If the URL is missing or not a string.
+ * @throws {Error} - If the fetch fails or the response is not HTML (original error in `cause`).
+ */
+async function extractImages(url) {
+  if (!url || typeof url !== 'string') {
+    throw new TypeError('URL must be a non-empty string');
+  }
+
+  try {
+    // Fetch the page. NOTE(review): callers should validate `url` (scheme/host) to avoid SSRF
+    const response = await axios.get(url, {
+      timeout: 10000,
+      maxContentLength: 10 * 1024 * 1024,
+      maxBodyLength: 10 * 1024 * 1024,
+      headers: {
+        'User-Agent': 'Mozilla/5.0 (compatible; MaxunBot/1.0; +https://maxun.dev)'
+      },
+      maxRedirects: 5
+    });
+
+    // Validate that content is HTML (media types are case-insensitive)
+    const contentType = String(response.headers['content-type'] || '').toLowerCase();
+    if (!contentType.includes('text/html')) {
+      throw new Error(`Expected HTML but got ${contentType}`);
+    }
+
+    const html = response.data;
+    const $ = cheerio.load(html, {
+      decodeEntities: true,
+      normalizeWhitespace: false
+    });
+
+    const images = [];
+    const seen = new Set();
+
+    // Extract <img> tags
+    $('img').each((_index, element) => {
+      const alt = $(element).attr('alt') || '';
+      const src = $(element).attr('src');
+
+      if (src) {
+        try {
+          const absoluteUrl = new URL(src, url).href;
+          if (!seen.has(absoluteUrl) && !absoluteUrl.startsWith('data:')) {
+            seen.add(absoluteUrl);
+            images.push({ url: absoluteUrl, altText: alt });
+          }
+        } catch {
+          logger.warn(`Invalid image URL: ${src}`);
+        }
+      }
+
+      // Handle srcset (responsive images). NOTE: the naive comma split breaks URLs that contain commas
+      const srcset = $(element).attr('srcset');
+      if (srcset) {
+        const srcsetUrls = srcset.split(',')
+          .map(s => s.trim().split(/\s+/)[0])
+          .filter(Boolean);
+
+        for (const srcsetUrl of srcsetUrls) {
+          try {
+            const absoluteUrl = new URL(srcsetUrl, url).href;
+            if (!seen.has(absoluteUrl) && !absoluteUrl.startsWith('data:')) {
+              seen.add(absoluteUrl);
+              images.push({ url: absoluteUrl, altText: alt });
+            }
+          } catch {
+            logger.warn(`Invalid srcset URL: ${srcsetUrl}`);
+          }
+        }
+      }
+    });
+
+    // Extract <source> tags inside <picture> elements
+    $('picture source').each((_i, element) => {
+      const srcset = $(element).attr('srcset');
+      if (srcset) {
+        const srcsetUrls = srcset.split(',')
+          .map(s => s.trim().split(/\s+/)[0])
+          .filter(Boolean);
+
+        for (const srcsetUrl of srcsetUrls) {
+          try {
+            const absoluteUrl = new URL(srcsetUrl, url).href;
+            if (!seen.has(absoluteUrl) && !absoluteUrl.startsWith('data:')) {
+              seen.add(absoluteUrl);
+              images.push({ url: absoluteUrl, altText: '' });
+            }
+          } catch {
+            logger.warn(`Invalid srcset URL in <source>: ${srcsetUrl}`);
+          }
+        }
+      }
+    });
+
+    return images;
+
+  } catch (error) {
+    // Wrap with context; the original error stays available as `cause`
+    throw new Error(`Failed to extract images from ${url}`, { cause: error });
+  }
+}
+
+// Export function for other modules
+export { extractImages };
diff --git a/package.json b/package.json
index a5eab10c6..4d62fcb7c 100644
--- a/package.json
+++ b/package.json
@@ -24,10 +24,11 @@
     "@types/react-dom": "^18.0.1",
     "@types/uuid": "^8.3.4",
     "airtable": "^0.12.2",
-    "axios": "^1.9.0",
+    "axios": "^1.12.2",
     "bcrypt": "^5.1.1",
     "body-parser": "^1.20.3",
     "buffer": "^6.0.3",
+    "cheerio": "^1.1.2",
     "connect-pg-simple": "^10.0.0",
     "cookie-parser": "^1.4.6",
     "cors": "^2.8.5",
@@ -55,6 +56,7 @@
     "minio": "^8.0.1",
     "moment-timezone": "^0.5.45",
     "node-cron": "^3.0.3",
+    "pdfjs-dist": "^5.4.296",
     "pg": "^8.13.0",
     "pg-boss": "^10.1.6",
     "pkce-challenge": "^4.1.0",
@@ -80,6 +82,7 @@
     "styled-components": "^5.3.3",
     "swagger-jsdoc": "^6.2.8",
     "swagger-ui-express": "^5.0.1",
+    "tesseract.js": "^6.0.1",
     "typedoc": "^0.23.8",
     "typescript": "^4.6.3",
     "uuid": "^8.3.2",
diff --git a/server/src/browser-management/inputHandlers.ts b/server/src/browser-management/inputHandlers.ts
index c014af3d9..a3859b429 100644
--- a/server/src/browser-management/inputHandlers.ts
+++ b/server/src/browser-management/inputHandlers.ts
@@ -608,6 +608,34 @@ const handleGoForward = async (activeBrowser: RemoteBrowser, page: Page) => {
     }
 };
 
+/**
+ * Handle media extracted event forwarded from client (via postMessage relay).
+ * data: { url, tag, selector, extractedText }
+ */
+const onMediaExtracted = async (data: { url: string; tag: string; selector: string; extractedText: string }, userId: string) => {
+    logger.log('debug', 'Handling media-extracted event emitted from client');
+    await handleWrapper(handleMediaExtracted, userId, data);
+};
+
+const handleMediaExtracted = async (activeBrowser: RemoteBrowser, page: Page, data: { url: string; tag: string; selector: string; extractedText: string }) => {
+    try {
+        if (page.isClosed()) {
+            logger.log("debug", `Ignoring media-extracted event: page is closed`);
+            return;
+        }
+        const generator = activeBrowser.generator;
+        if (generator && typeof generator.handleMediaExtracted === 'function') {
+            await generator.handleMediaExtracted(data, page);
+            logger.log('debug', `Media extracted added: ${data.url}`);
+        } else {
+            logger.log('warn', 'Generator does not implement handleMediaExtracted');
+        }
+    } catch (e) {
+        const message = e instanceof Error ? e.message : String(e);
+        logger.log('warn', `Error handling media-extracted event: ${message}`);
+    }
+};
+
 /**
  * Handles the click action event.
  * @param activeBrowser - the active remote browser {@link RemoteBrowser}
@@ -851,6 +879,7 @@ const registerInputHandlers = (socket: Socket, userId: string) => {
     socket.on("dom:click", (data) => onDOMClickAction(data, userId));
     socket.on("dom:keypress", (data) => onDOMKeyboardAction(data, userId));
     socket.on("dom:addpair", (data) => onDOMWorkflowPair(data, userId));
+    socket.on("dom:media-extracted", (data) => onMediaExtracted(data, userId));
 };
 
 export default registerInputHandlers;
diff --git a/server/src/workflow-management/classes/Generator.ts b/server/src/workflow-management/classes/Generator.ts
index a5bc2edc4..c90407c16 100644
--- a/server/src/workflow-management/classes/Generator.ts
+++ b/server/src/workflow-management/classes/Generator.ts
@@ -149,6 +149,36 @@ export class WorkflowGenerator {
     });
   }
 
+  /**
+   * Handle media extraction event from browser-side snippet.
+   * Appends a media event object to workflowRecord.events.
+   */
+  public async handleMediaExtracted(data: { url: string; tag: string; selector: string; extractedText: string }, page: Page) {
+    try {
+      if (!this.workflowRecord) this.workflowRecord = { workflow: [] } as WorkflowFile;
+      // `events` is a non-standard addition to the record, so widen the type locally instead of casting to `any`
+      const record: WorkflowFile & { events?: unknown[] } = this.workflowRecord;
+      record.events = record.events ?? [];
+      record.events.push({
+        type: 'media',
+        url: data.url,
+        tag: data.tag,
+        selector: data.selector,
+        extractedText: data.extractedText,
+        timestamp: Date.now(),
+      });
+
+      // notify client of new event if needed
+      try {
+        this.socket.emit('workflow:media-added', { url: data.url, selector: data.selector });
+      } catch (e) {
+        // ignore
+      }
+    } catch (e) {
+      logger.log('warn', `handleMediaExtracted failed: ${e instanceof Error ? e.message : String(e)}`);
+    }
+  }
+
   /**
    * Registers the event handlers for all generator-related events on the socket.
    * @param socket The socket used to communicate with the client.
diff --git a/src/components/recorder/DOMBrowserRenderer.tsx b/src/components/recorder/DOMBrowserRenderer.tsx
index 7fcafdeb4..f9770b492 100644
--- a/src/components/recorder/DOMBrowserRenderer.tsx
+++ b/src/components/recorder/DOMBrowserRenderer.tsx
@@ -199,6 +199,60 @@ export const DOMBrowserRenderer: React.FC = ({
     clientSelectorGenerator.setPaginationMode(paginationMode);
   }, [getList, listSelector, paginationMode]);
 
+  // Relay media-extracted postMessage from the iframe to server socket
+  useEffect(() => {
+    const handler = (ev: MessageEvent) => {
+      try {
+        const data = ev.data;
+        if (!data || data.type !== 'maxun:media-extracted') return;
+
+        // Ensure the message comes from the recorded iframe only. When the iframe
+        // window is unavailable, a message whose `source` is also null must not
+        // slip through the comparison.
+        const iframeWindow = iframeRef.current?.contentWindow ?? null;
+        if (!iframeWindow || ev.source !== iframeWindow) {
+          return;
+        }
+
+        // Require a non-null origin for messages.
+        // NOTE(review): frames rendered from `srcdoc` or sandboxed without
+        // `allow-same-origin` report the embedder's origin or 'null' - confirm
+        // the origin checks below do not drop every legitimate message.
+        if (!ev.origin || ev.origin === 'null') {
+          return;
+        }
+
+        // The payload is forwarded to the server, so insist on the expected shape
+        const { url, tag, selector, extractedText } = data;
+        if ([url, tag, selector, extractedText].some((field) => typeof field !== 'string')) {
+          return;
+        }
+
+        // If snapshot.baseUrl is available, validate origin and also verify the reported url origin
+        if (snapshot?.baseUrl) {
+          try {
+            const expectedOrigin = new URL(snapshot.baseUrl).origin;
+            // Drop on sender-origin mismatch, or when the reported url is not from the recorded page origin
+            if (ev.origin !== expectedOrigin || new URL(url).origin !== expectedOrigin) {
+              return;
+            }
+          } catch (e) {
+            // snapshot.baseUrl or the reported url failed to parse - drop the message
+            return;
+          }
+        }
+
+        if (socket && socket.emit) {
+          socket.emit('dom:media-extracted', { url, tag, selector, extractedText });
+        }
+      } catch (e) {
+        // ignore
+      }
+    };
+    window.addEventListener('message', handler);
+    return () => window.removeEventListener('message', handler);
+  }, [socket, iframeRef, snapshot]);
+
   useEffect(() => {
     if (listSelector) {
       clientSelectorGenerator.setListSelector(listSelector);