diff --git a/EXAMPLES.md b/EXAMPLES.md new file mode 100644 index 0000000..a2eb3c0 --- /dev/null +++ b/EXAMPLES.md @@ -0,0 +1,298 @@ +# Examples + +## Task Document Structure + +### Query Type Examples + +#### ID Selector +```json +{ + "id": "header", + "type": "id", + "value": "header", + "target": "html" +} +``` +Selects element with ID `#header` + +#### Class Selector +```json +{ + "id": "menu", + "type": "class", + "value": "menu-item", + "target": "html" +} +``` +Selects elements with class `.menu-item` + +#### Tag Selector +```json +{ + "id": "links", + "type": "tag", + "value": "a", + "target": "html" +} +``` +Selects all `<a>` tags + +#### Attribute Selector +```json +{ + "id": "buttons", + "type": "attribute", + "value": "data-role", + "target": "attribute", + "attr": "data-role" +} +``` +Selects elements with matching attribute + +#### CSS Selector +```json +{ + "id": "content", + "type": "selector", + "value": "div.content", + "target": "html" +} +``` +Selects elements using CSS selector + +--- + +### Target Type Examples + +#### HTML Target +```json +{ + "id": "content", + "type": "class", + "value": "content", + "target": "html" +} +``` +Extracts the HTML content of selected elements + +#### Text Target +```json +{ + "id": "title", + "type": "tag", + "value": "h1", + "target": "text" +} +``` +Extracts the text content of selected elements + +#### Attribute Target +```json +{ + "id": "links", + "type": "tag", + "value": "a", + "target": "attribute", + "attr": "href" +} +``` +Extracts the value of specified attribute from selected elements + +--- + +### Complete Example + +#### Task Document (Before Processing) +```json +{ + "url": "https://example.com", + "queries": [ + { + "id": "title", + "type": "tag", + "value": "h1", + "target": "text" + }, + { + "id": "description", + "type": "class", + "value": "description", + "target": "html" + }, + { + "id": "links", + "type": "tag", + "value": "a", + "target": "attribute", + "attr": "href" + } + ] +} +``` + 
+Extracts the text content of the `<h1>` tag, the HTML content of the element with class `description`, and the value of the `href` attribute from all `<a>` tags. + +#### Result Document (After Processing) +```json +{ + "url": "https://example.com", + "queries": [ + { + "id": "title", + "type": "tag", + "value": "h1", + "target": "text" + }, + { + "id": "description", + "type": "class", + "value": "description", + "target": "html" + }, + { + "id": "links", + "type": "tag", + "value": "a", + "target": "attribute", + "attr": "href" + } + ], + "data": { + "title": ["Example Domain"], + "description": ["<p>This domain is for use in illustrative examples...</p>"], + "links": ["https://www.iana.org/domains/example", "https://www.iana.org/domains/reserved"] + }, + "startedAt": "2023-01-01T00:00:00Z", + "concludedAt": "2023-01-01T00:00:00Z", + "stage": "Success" +} +``` + +## Template Document Structure +Using the document id as the template id, the following is an example of a template document with the id `template-id`: + +```json +{ + "url": "https://example.com", + "queries": [ + { + "id": "title", + "type": "tag", + "value": "h1", + "target": "text" + }, + { + "id": "description", + "type": "class", + "value": "description", + "target": "html" + }, + { + "id": "links", + "type": "tag", + "value": "a", + "target": "attribute", + "attr": "href" + } + ] +} +``` + +### Using a Template in a Task + +To use a template in a task, add the template id to the task document in the **`template`** field. + +```json +{ + "template": "template-id" +} +``` + +The template will be merged with the task document and the queries will be added to the task document. For example, by using the template above, the task document will now have the following queries: + +```json +{ + "url": "https://example.com", + "queries": [ + { + "id": "title", + "type": "tag", + "value": "h1", + "target": "text" + }, + { + "id": "description", + "type": "class", + "value": "description", + "target": "html" + }, + { + "id": "links", + "type": "tag", + "value": "a", + "target": "attribute", + "attr": "href" + } + ] +} +``` + +### Template Merging + +When you specify a template in a task, the queries from the template and the task are combined. If both the template and the task contain a query with the same `id`, the version from the task will override the one from the template. The same rule applies to the `url` field: if the task provides a `url`, it will take precedence over the template's `url`. This allows you to customize or override specific queries or the URL in your task while still reusing the rest of the template. 
+ +**template document** +```json +{ + "url": "https://example.com", + "queries": [ + { + "id": "title", + "type": "tag", + "value": "h1", + "target": "inner" + }, + { + "id": "description", + "type": "class", + "value": "description", + "target": "html" + } + ] +} +``` + +**task document** +```json +{ + "queries": [ + { + "id": "title", + "type": "tag", + "value": "h1", + "target": "text" + } + ] +} +``` + +**merged task document** +```json +{ + "url": "https://example.com", + "queries": [ + { + "id": "title", + "type": "tag", + "value": "h1", + "target": "text" + }, + { + "id": "description", + "type": "class", + "value": "description", + "target": "html" + } + ] +} +``` \ No newline at end of file diff --git a/POSTINSTALL.md b/POSTINSTALL.md index bdd7648..a082eda 100644 --- a/POSTINSTALL.md +++ b/POSTINSTALL.md @@ -1,203 +1,45 @@ # Post-Installation Guide -After installing the extension, follow this guide to configure scraping tasks and manage extracted data. Below you'll find detailed instructions, document structures, and examples. +Welcome! After installing the extension, use this guide to set up scraping tasks and manage your extracted data. This document covers configuration steps, document structures, and helpful examples. -## Setting Up a Task -Create a document in your tasks collection **`${param:SCRAPE_COLLECTION}`** to define a scraping task. +## Tasks ---- +A **task** defines what to scrape and how. Each task is stored in the **`${param:SCRAPE_COLLECTION}`** collection. ### Task Document Structure -#### Required Fields: -- **url** (string): Target URL to scrape (e.g., `"https://example.com"`) -- **queries** (array of objects): List of queries to extract data from the HTML content +A task document must include: ---- +- **url** (`string`): The target URL to scrape (e.g., `"https://example.com"`). +- **queries** (`array` of objects): An array of queries specifying what data to extract from the HTML. 
### Query Configuration -Each query in the `queries` array narrows down specific elements from the HTML. Multiple queries can be used to extract different types of data from the same HTML. -#### Query Object Fields: -- **id** (string, required): Unique identifier for the query. Will be used as the key in the output `data` object. -- **type** (string, required): Selector type. Supported values: +Each query in the `queries` array describes how to select elements and what to extract from them. You can include multiple queries to extract different data from the same page. + +#### Query Object Fields + +- **id** (`string`, required): Unique identifier for the query. This becomes the key in the output `data` object. +- **type** (`string`, required): Selector type. Supported values: - `id`: Select by element ID - `class`: Select by CSS class - `tag`: Select by HTML tag - `attribute`: Select by attribute - - `text`: Select by text content - - `selector`: Select using CSS selector -- **value** (string, required): Value for the selector -- **target** (string, optional): What to extract from selected elements - - `html`: Extract HTML content (default) - - `text`: Extract text content - - `attribute`: Extract attribute value -- **attr** (string, optional): Attribute name to extract when `target` is set to `attribute`. Only allowed when `type` is `attribute`. - ---- - -### Query Type Examples - -#### ID Selector -```json -{ - "id": "header", - "type": "id", - "value": "header", - "target": "html" -} -``` -Selects element with ID `#header` - -#### Class Selector -```json -{ - "id": "menu", - "type": "class", - "value": "menu-item", - "target": "html" -} -``` -Selects elements with class `.menu-item` - -#### Tag Selector -```json -{ - "id": "links", - "type": "tag", - "value": "a", - "target": "html" -} -``` -Selects all `
` tags - -#### Attribute Selector -```json -{ - "id": "buttons", - "type": "attribute", - "value": "data-role", - "target": "attribute", - "attr": "data-role" -} -``` -Selects elements with matching attribute - -#### CSS Selector -```json -{ - "id": "content", - "type": "selector", - "value": "div.content", - "target": "html" -} -``` -Selects elements using CSS selector - ---- - -### Target Type Examples - -#### HTML Target -```json -{ - "id": "content", - "type": "class", - "value": "content", - "target": "html" -} -``` -Extracts the HTML content of selected elements - -#### Text Target -```json -{ - "id": "title", - "type": "tag", - "value": "h1", - "target": "text" -} -``` -Extracts the text content of selected elements - -#### Attribute Target -```json -{ - "id": "links", - "type": "tag", - "value": "a", - "target": "attribute", - "attr": "href" -} -``` -Extracts the value of specified attribute from selected elements + - `selector`: Select using a CSS selector +- **value** (`string`, required): The selector value (e.g., class name, tag name, attribute name, or CSS selector). +- **target** (`string`, optional): What to extract from the selected elements. Supported values: + - `html`: Extract the HTML content (default) + - `text`: Extract the text content + - `inner`: Extract the inner HTML content + - `attribute`: Extract the value of a specific attribute +- **attr** (`string`, only required if `target` is `attribute`): The attribute name to extract (e.g., `"href"`). ---- +## Templates -### Complete Example +Templates let you reuse query configurations across multiple tasks. To create a template, add a document to your templates collection (**`${param:TEMPLATES_COLLECTION}`**) with the same structure as a task document. 
-#### Task Document (Before Processing) -```json -{ - "url": "https://example.com", - "queries": [ - { - "id": "title", - "type": "tag", - "value": "h1", - "target": "text" - }, - { - "id": "description", - "type": "class", - "value": "description", - "target": "html" - }, - { - "id": "links", - "type": "tag", - "value": "a", - "target": "attribute", - "attr": "href" - } - ] -} -``` +You can then reference a template in your task documents to avoid repeating query definitions. -Extracts the text content of the `

` tag, the text content of the element with class `description`, and the value of the `href` attribute from all `` tags. +## Examples -#### Result Document (After Processing) -```json -{ - "url": "https://example.com", - "queries": [ - { - "id": "title", - "type": "tag", - "value": "h1", - "target": "text" - }, - { - "id": "description", - "type": "class", - "value": "description", - "target": "html" - }, - { - "id": "links", - "type": "tag", - "value": "a", - "target": "attribute", - "attr": "href" - } - ], - "data": { - "title": ["Example Domain"], - "description": ["

This domain is for use in illustrative examples...

"], - "links": ["https://www.iana.org/domains/example", "https://www.iana.org/domains/reserved"] - }, - "startedAt": "2023-01-01T00:00:00Z", - "concludedAt": "2023-01-01T00:00:00Z", - "stage": "Success" -} -``` \ No newline at end of file +See the [Examples](https://github.com/CorieW/firestore-web-scraper/blob/master/EXAMPLES.md) file for examples of the document structures for tasks and templates. \ No newline at end of file diff --git a/extension.yaml b/extension.yaml index 033d403..8fa14ce 100644 --- a/extension.yaml +++ b/extension.yaml @@ -164,6 +164,15 @@ params: validationErrorMessage: Must be a valid Cloud Firestore collection required: true + - param: TEMPLATES_COLLECTION + label: Templates collection + description: The collection in which templates are stored. + type: string + default: templates + validationRegex: "^[^/]+(/[^/]+/[^/]+)*$" + validationErrorMessage: Must be a valid Cloud Firestore collection + required: false + - param: LOG_LEVEL label: Log level description: The log level to use. 
diff --git a/functions/src/config.ts b/functions/src/config.ts index 00d13a3..2b91d37 100644 --- a/functions/src/config.ts +++ b/functions/src/config.ts @@ -5,6 +5,7 @@ const config: Config = { location: process.env.LOCATION, database: process.env.DATABASE, scrapeCollection: process.env.SCRAPE_COLLECTION, + templatesCollection: process.env.TEMPLATES_COLLECTION, logLevel: process.env.LOG_LEVEL as LogLevel, } diff --git a/functions/src/firebase.ts b/functions/src/firebase.ts new file mode 100644 index 0000000..38f10be --- /dev/null +++ b/functions/src/firebase.ts @@ -0,0 +1,20 @@ +import * as admin from 'firebase-admin' +import * as events from './events' + +let db: admin.firestore.Firestore +let initialized = false + +/** + * Initializes Admin SDK, Firestore, and Eventarc + */ +async function initialize() { + if (initialized === true) return + initialized = true + admin.initializeApp() + db = admin.firestore() + + /** setup events */ + events.setupEventChannel() +} + +export { db, initialize } diff --git a/functions/src/index.ts b/functions/src/index.ts index 34dd0f5..2377552 100644 --- a/functions/src/index.ts +++ b/functions/src/index.ts @@ -1,4 +1,3 @@ -import * as admin from 'firebase-admin' import { Timestamp } from 'firebase-admin/firestore' import { FirestoreEvent, @@ -9,26 +8,12 @@ import { import { logger } from './logger' import config from './config' import * as events from './events' -import { Task } from './types/Task' +import { QUERIES_KEY, Task, TEMPLATE_KEY, URL_KEY } from './types/Task' import { validateTask } from './validation/task-validation' import { sendHttpRequestTo } from './http' import { TaskStage } from './types/TaskStage' - -let db: admin.firestore.Firestore -let initialized = false - -/** - * Initializes Admin SDK, Firestore, and Eventarc - */ -async function initialize() { - if (initialized === true) return - initialized = true - admin.initializeApp() - db = admin.firestore() - - /** setup events */ - events.setupEventChannel() -} 
+import { Template } from './types/Template' +import { db, initialize } from './firebase' export const processQueue = onDocumentCreated( config.scrapeCollection, @@ -54,7 +39,7 @@ export const processQueue = onDocumentCreated( } ) -async function processWrite(snapshot: QueryDocumentSnapshot) { +export async function processWrite(snapshot: QueryDocumentSnapshot) { if (!snapshot.exists) { logger.error('Process called with non-existent document') return @@ -63,17 +48,18 @@ async function processWrite(snapshot: QueryDocumentSnapshot) { logger.info(`Starting task: ${snapshot.id}`) const startedAtTimestamp = Timestamp.now() - const task: Task = snapshot.data() as Task const doc = db.collection(config.scrapeCollection).doc(snapshot.id) + let task: Task = snapshot.data() as Task logger.info(`Validating task: ${snapshot.id}`) // The task is invalid, set the error and return - const isNotValid = validateTask(task) // is a message (invalid) or null (valid) - if (isNotValid) { + try { + validateTask(task) + } catch (err) { await doc.update({ ...task, - error: isNotValid, + error: err.toString().replace(/^Error: /, ''), startedAt: startedAtTimestamp, concludedAt: Timestamp.now(), stage: TaskStage.ERROR, @@ -82,7 +68,26 @@ async function processWrite(snapshot: QueryDocumentSnapshot) { return } - const { url, queries } = task + // When a template is provided, load in values + if (task[TEMPLATE_KEY]) { + try { + const template = new Template(task[TEMPLATE_KEY]) + await template.initialize() + task = template.mergeWithTask(task) + } catch (err) { + await doc.update({ + ...task, + error: err.toString().replace(/^Error: /, ''), + startedAt: startedAtTimestamp, + concludedAt: Timestamp.now(), + stage: TaskStage.ERROR, + }) + return + } + } + + const url = task[URL_KEY] + const queries = task[QUERIES_KEY] // Set the task to processing logger.info(`Processing task: ${snapshot.id}`) diff --git a/functions/src/test/integration.test.ts b/functions/src/test/integration.test.ts index 
da869e8..ce9d45a 100644 --- a/functions/src/test/integration.test.ts +++ b/functions/src/test/integration.test.ts @@ -60,8 +60,7 @@ describe('Integration Tests - Complete Workflow with Test HTML Page', () => { ], } - const validationResult = validateTask(task) - expect(validationResult).toBeNull() + expect(() => validateTask(task)).not.toThrow() }) it('should reject task with invalid URL', () => { @@ -77,8 +76,7 @@ describe('Integration Tests - Complete Workflow with Test HTML Page', () => { ], } - const validationResult = validateTask(task) - expect(validationResult).toBe("Task URL ('url') is not a valid URL") + expect(() => validateTask(task)).toThrow("Task URL ('url') is not a valid URL") }) it('should reject task with missing URL', () => { @@ -94,8 +92,7 @@ describe('Integration Tests - Complete Workflow with Test HTML Page', () => { ], } - const validationResult = validateTask(task) - expect(validationResult).toBe("Task URL ('url') is not a valid URL") + expect(() => validateTask(task)).toThrow("Task URL ('url') is not a valid URL") }) it('should reject task with no queries', () => { @@ -104,8 +101,7 @@ describe('Integration Tests - Complete Workflow with Test HTML Page', () => { queries: [], } - const validationResult = validateTask(task) - expect(validationResult).toBe("Task queries ('queries') are empty") + expect(() => validateTask(task)).toThrow("Task queries ('queries') are empty") }) it('should reject task with non-array queries', () => { @@ -114,13 +110,13 @@ describe('Integration Tests - Complete Workflow with Test HTML Page', () => { queries: 'not-an-array', } - const validationResult = validateTask(task) - expect(validationResult).toBe("Task queries ('queries') must be provided as an array") + expect(() => validateTask(task)).toThrow( + "Task queries ('queries') must be provided as an array" + ) }) it('should reject undefined task', () => { - const validationResult = validateTask(undefined) - expect(validationResult).toBe('Task is missing') + 
expect(() => validateTask(undefined)).toThrow('Task is missing') }) }) @@ -139,8 +135,7 @@ describe('Integration Tests - Complete Workflow with Test HTML Page', () => { } // Validate the task - const validationResult = validateTask(task) - expect(validationResult).toBeNull() + expect(() => validateTask(task)).not.toThrow() // Execute the task const queriable = await sendHttpRequestTo(task.url) @@ -166,8 +161,7 @@ describe('Integration Tests - Complete Workflow with Test HTML Page', () => { } // Validate the task - const validationResult = validateTask(task) - expect(validationResult).toBeNull() + expect(() => validateTask(task)).not.toThrow() // Execute the task const queriable = await sendHttpRequestTo(task.url) @@ -194,8 +188,7 @@ describe('Integration Tests - Complete Workflow with Test HTML Page', () => { } // Validate the task - const validationResult = validateTask(task) - expect(validationResult).toBeNull() + expect(() => validateTask(task)).not.toThrow() // Execute the task const queriable = await sendHttpRequestTo(task.url) @@ -263,8 +256,7 @@ describe('Integration Tests - Complete Workflow with Test HTML Page', () => { } // Validate the task - const validationResult = validateTask(task) - expect(validationResult).toBeNull() + expect(() => validateTask(task)).not.toThrow() // Execute the task const queriable = await sendHttpRequestTo(task.url) @@ -352,8 +344,7 @@ describe('Integration Tests - Complete Workflow with Test HTML Page', () => { } // Validate the task - const validationResult = validateTask(task) - expect(validationResult).toBeNull() + expect(() => validateTask(task)).not.toThrow() // Execute the task const queriable = await sendHttpRequestTo(task.url) @@ -411,8 +402,7 @@ describe('Integration Tests - Complete Workflow with Test HTML Page', () => { } // Validate the task - const validationResult = validateTask(task) - expect(validationResult).toBeNull() + expect(() => validateTask(task)).not.toThrow() // Execute the task const queriable = await 
sendHttpRequestTo(task.url) @@ -461,8 +451,7 @@ describe('Integration Tests - Complete Workflow with Test HTML Page', () => { ], } - const validationResult = validateTask(task) - expect(validationResult).toBeNull() + expect(() => validateTask(task)).not.toThrow() const queriable = await sendHttpRequestTo(task.url) const results = queriable.multiQuery(task.queries) @@ -512,8 +501,7 @@ describe('Integration Tests - Complete Workflow with Test HTML Page', () => { ], } - const validationResult = validateTask(task) - expect(validationResult).toBeNull() + expect(() => validateTask(task)).not.toThrow() const queriable = await sendHttpRequestTo(task.url) const results = queriable.multiQuery(task.queries) @@ -575,8 +563,7 @@ describe('Integration Tests - Complete Workflow with Test HTML Page', () => { ], } - const validationResult = validateTask(task) - expect(validationResult).toBeNull() + expect(() => validateTask(task)).not.toThrow() const queriable = await sendHttpRequestTo(task.url) const results = queriable.multiQuery(task.queries) diff --git a/functions/src/test/validation.test.ts b/functions/src/test/task-validation.test.ts similarity index 90% rename from functions/src/test/validation.test.ts rename to functions/src/test/task-validation.test.ts index 9b659b6..fdaf292 100644 --- a/functions/src/test/validation.test.ts +++ b/functions/src/test/task-validation.test.ts @@ -8,18 +8,15 @@ describe('Validation Tests - Complete Coverage', () => { describe('Task Validation - From Empty to Valid', () => { describe('Null/Undefined/Empty Tasks', () => { it('should reject null task', () => { - const result = validateTask(null as any) - expect(result).toBeTruthy() + expect(() => validateTask(null as any)).toThrow() }) it('should reject undefined task', () => { - const result = validateTask(undefined) - expect(result).toBeTruthy() + expect(() => validateTask(undefined)).toThrow() }) it('should reject empty object task', () => { - const result = validateTask({} as any) - 
expect(result).toBeTruthy() + expect(() => validateTask({} as any)).toThrow() }) }) @@ -36,8 +33,7 @@ describe('Validation Tests - Complete Coverage', () => { ], } as any - const result = validateTask(task) - expect(result).toBeTruthy() + expect(() => validateTask(task)).toThrow() }) it('should reject task missing queries field', () => { @@ -45,8 +41,7 @@ describe('Validation Tests - Complete Coverage', () => { url: 'https://example.com', } as any - const result = validateTask(task) - expect(result).toBeTruthy() + expect(() => validateTask(task)).toThrow() }) it('should reject task missing both URL and queries', () => { @@ -54,8 +49,7 @@ describe('Validation Tests - Complete Coverage', () => { someOtherField: 'value', } as any - const result = validateTask(task) - expect(result).toBeTruthy() + expect(() => validateTask(task)).toThrow() }) }) @@ -66,8 +60,7 @@ describe('Validation Tests - Complete Coverage', () => { queries: [], } as any - const result = validateTask(task) - expect(result).toBeTruthy() + expect(() => validateTask(task)).toThrow() }) it('should reject task with boolean URL', () => { @@ -76,8 +69,7 @@ describe('Validation Tests - Complete Coverage', () => { queries: [], } as any - const result = validateTask(task) - expect(result).toBeTruthy() + expect(() => validateTask(task)).toThrow() }) it('should reject task with object URL', () => { @@ -86,8 +78,7 @@ describe('Validation Tests - Complete Coverage', () => { queries: [], } as any - const result = validateTask(task) - expect(result).toBeTruthy() + expect(() => validateTask(task)).toThrow() }) it('should reject task with non-array queries', () => { @@ -96,8 +87,7 @@ describe('Validation Tests - Complete Coverage', () => { queries: 'not an array', } as any - const result = validateTask(task) - expect(result).toBeTruthy() + expect(() => validateTask(task)).toThrow() }) it('should reject task with object queries', () => { @@ -106,8 +96,7 @@ describe('Validation Tests - Complete Coverage', () => { 
queries: { id: 'test' }, } as any - const result = validateTask(task) - expect(result).toBeTruthy() + expect(() => validateTask(task)).toThrow() }) it('should reject task with null queries', () => { @@ -116,8 +105,7 @@ describe('Validation Tests - Complete Coverage', () => { queries: null, } as any - const result = validateTask(task) - expect(result).toBeTruthy() + expect(() => validateTask(task)).toThrow() }) }) @@ -135,8 +123,7 @@ describe('Validation Tests - Complete Coverage', () => { ], } - const result = validateTask(task) - expect(result).toBeTruthy() + expect(() => validateTask(task)).toThrow() }) it('should reject task with whitespace-only URL', () => { @@ -152,8 +139,7 @@ describe('Validation Tests - Complete Coverage', () => { ], } - const result = validateTask(task) - expect(result).toBeTruthy() + expect(() => validateTask(task)).toThrow() }) it('should reject task with invalid URL format', () => { @@ -169,8 +155,7 @@ describe('Validation Tests - Complete Coverage', () => { ], } - const result = validateTask(task) - expect(result).toBeTruthy() + expect(() => validateTask(task)).toThrow() }) it('should reject task with malformed URL', () => { @@ -186,8 +171,7 @@ describe('Validation Tests - Complete Coverage', () => { ], } - const result = validateTask(task) - expect(result).toBeTruthy() + expect(() => validateTask(task)).toThrow() }) it('should reject task with URL containing only protocol', () => { @@ -203,8 +187,7 @@ describe('Validation Tests - Complete Coverage', () => { ], } - const result = validateTask(task) - expect(result).toBeTruthy() + expect(() => validateTask(task)).toThrow() }) }) @@ -215,8 +198,7 @@ describe('Validation Tests - Complete Coverage', () => { queries: [], } - const result = validateTask(task) - expect(result).toBeTruthy() + expect(() => validateTask(task)).toThrow() }) }) @@ -234,8 +216,7 @@ describe('Validation Tests - Complete Coverage', () => { ], } - const result = validateTask(task) - expect(result).toBeNull() + 
expect(() => validateTask(task)).not.toThrow() }) it('should accept task with HTTPS URL', () => { @@ -251,8 +232,7 @@ describe('Validation Tests - Complete Coverage', () => { ], } - const result = validateTask(task) - expect(result).toBeNull() + expect(() => validateTask(task)).not.toThrow() }) it('should accept task with URL containing port', () => { @@ -268,8 +248,7 @@ describe('Validation Tests - Complete Coverage', () => { ], } - const result = validateTask(task) - expect(result).toBeNull() + expect(() => validateTask(task)).not.toThrow() }) it('should accept task with URL containing path', () => { @@ -285,8 +264,7 @@ describe('Validation Tests - Complete Coverage', () => { ], } - const result = validateTask(task) - expect(result).toBeNull() + expect(() => validateTask(task)).not.toThrow() }) it('should accept task with URL containing query parameters', () => { @@ -302,8 +280,7 @@ describe('Validation Tests - Complete Coverage', () => { ], } - const result = validateTask(task) - expect(result).toBeNull() + expect(() => validateTask(task)).not.toThrow() }) it('should accept task with URL containing fragment', () => { @@ -319,8 +296,7 @@ describe('Validation Tests - Complete Coverage', () => { ], } - const result = validateTask(task) - expect(result).toBeNull() + expect(() => validateTask(task)).not.toThrow() }) it('should accept task with localhost URL', () => { @@ -336,8 +312,7 @@ describe('Validation Tests - Complete Coverage', () => { ], } - const result = validateTask(task) - expect(result).toBeNull() + expect(() => validateTask(task)).not.toThrow() }) }) @@ -355,8 +330,7 @@ describe('Validation Tests - Complete Coverage', () => { ], } - const result = validateTask(task) - expect(result).toBeNull() + expect(() => validateTask(task)).not.toThrow() }) it('should accept task with multiple queries', () => { @@ -385,8 +359,7 @@ describe('Validation Tests - Complete Coverage', () => { ], } - const result = validateTask(task) - expect(result).toBeNull() + expect(() 
=> validateTask(task)).not.toThrow() }) it('should accept task with complex queries', () => { @@ -428,8 +401,7 @@ describe('Validation Tests - Complete Coverage', () => { ], } - const result = validateTask(task) - expect(result).toBeNull() + expect(() => validateTask(task)).not.toThrow() }) }) }) @@ -886,8 +858,7 @@ describe('Validation Tests - Complete Coverage', () => { ], } - const result = validateTask(task) - expect(result).toBeNull() + expect(() => validateTask(task)).not.toThrow() }) it('should accept task with single query', () => { @@ -903,8 +874,7 @@ describe('Validation Tests - Complete Coverage', () => { ], } - const result = validateTask(task) - expect(result).toBeNull() + expect(() => validateTask(task)).not.toThrow() }) it('should accept task with attribute extraction query', () => { @@ -921,8 +891,7 @@ describe('Validation Tests - Complete Coverage', () => { ], } - const result = validateTask(task) - expect(result).toBeNull() + expect(() => validateTask(task)).not.toThrow() }) it('should accept task with mixed query types', () => { @@ -964,8 +933,7 @@ describe('Validation Tests - Complete Coverage', () => { ], } - const result = validateTask(task) - expect(result).toBeNull() + expect(() => validateTask(task)).not.toThrow() }) it('should accept task with all supported target types', () => { @@ -1000,8 +968,7 @@ describe('Validation Tests - Complete Coverage', () => { ], } - const result = validateTask(task) - expect(result).toBeNull() + expect(() => validateTask(task)).not.toThrow() }) }) }) diff --git a/functions/src/test/template-integration.test.ts b/functions/src/test/template-integration.test.ts new file mode 100644 index 0000000..d69359b --- /dev/null +++ b/functions/src/test/template-integration.test.ts @@ -0,0 +1,380 @@ +import { describe, it, expect, vi, beforeEach } from 'vitest' +import { Timestamp } from 'firebase-admin/firestore' +import { Task, TEMPLATE_KEY, URL_KEY, QUERIES_KEY } from '../types/Task' +import { TemplateData } from 
'../types/Template' +import { Query, QueryType, TargetType } from '../types/Query' +import { TaskStage } from '../types/TaskStage' + +// Mock all dependencies +const mockUpdate = vi.fn() +const mockDoc = vi.fn(() => ({ update: mockUpdate })) + +const mockTemplateGet = vi.fn() +const mockTemplateDoc = vi.fn(() => ({ get: mockTemplateGet })) + +vi.mock('../firebase', () => ({ + db: { + collection: vi.fn((collectionName: string) => { + if (collectionName === 'templates') { + return { doc: mockTemplateDoc } + } + return { doc: mockDoc } + }), + }, + initialize: vi.fn(), +})) + +vi.mock('../config', () => ({ + default: { + scrapeCollection: 'scrape-tasks', + templatesCollection: 'templates', + }, +})) + +vi.mock('../logger', () => ({ + logger: { + debug: vi.fn(), + info: vi.fn(), + error: vi.fn(), + }, +})) + +vi.mock('../events', () => ({ + recordErrorEvent: vi.fn(), + recordCompleteEvent: vi.fn(), +})) + +vi.mock('../http', () => ({ + sendHttpRequestTo: vi.fn(), +})) + +vi.mock('../validation/task-validation', () => ({ + validateTask: vi.fn(), +})) + +// Import the function after mocks are set up +import { processWrite } from '../index' +import { sendHttpRequestTo } from '../http' +import { validateTask } from '../validation/task-validation' + +describe('Template Integration Tests', () => { + const mockTaskId = 'test-task-id' + const mockTemplateId = 'test-template-id' + + const mockTemplateData: TemplateData = { + [URL_KEY]: 'https://template-url.com', + [QUERIES_KEY]: [ + { + id: 'template-title', + type: QueryType.TAG, + value: 'h1', + target: TargetType.TEXT, + }, + { + id: 'template-description', + type: QueryType.CLASS, + value: 'description', + target: TargetType.TEXT, + }, + ], + } + + const mockTaskQueries: Query[] = [ + { + id: 'task-specific-query', + type: QueryType.ID, + value: 'content', + target: TargetType.HTML, + }, + ] + + beforeEach(() => { + vi.clearAllMocks() + + // Mock Firestore template retrieval + mockTemplateGet.mockResolvedValue({ + exists: 
true, + data: () => mockTemplateData, + }) + + // Mock HTTP response + ;(sendHttpRequestTo as any).mockResolvedValue({ + html: '

Test Title

Test Description
Test Content
', + multiQuery: vi.fn().mockReturnValue({ + 'template-title': 'Test Title', + 'template-description': 'Test Description', + 'task-specific-query': 'Test Content', + }), + }) + + // Mock task validation + ;(validateTask as any).mockImplementation(() => {}) // No-op, validation passes + }) + + describe('Task with Template Processing', () => { + it('should process task with template successfully', async () => { + const taskWithTemplate: Task = { + [URL_KEY]: 'https://original-task-url.com', // This should be overridden by template + [TEMPLATE_KEY]: mockTemplateId, + [QUERIES_KEY]: mockTaskQueries, + } + + const mockSnapshot = { + data: vi.fn().mockReturnValue(taskWithTemplate), + id: mockTaskId, + exists: true, + } + + await processWrite(mockSnapshot as any) + + // Verify template was fetched + expect(mockTemplateDoc).toHaveBeenCalledWith(mockTemplateId) + expect(mockTemplateGet).toHaveBeenCalled() + + // Verify task was updated to processing state + expect(mockUpdate).toHaveBeenCalledWith( + expect.objectContaining({ + stage: TaskStage.PROCESSING, + startedAt: expect.any(Timestamp), + // URL should be from template + [URL_KEY]: taskWithTemplate[URL_KEY], + // Queries should be merged (template queries first, then task queries) + [QUERIES_KEY]: [...mockTemplateData[QUERIES_KEY]!, ...mockTaskQueries], + }) + ) + + // Verify final success update + expect(mockUpdate).toHaveBeenCalledWith( + expect.objectContaining({ + stage: TaskStage.SUCCESS, + data: { + 'template-title': 'Test Title', + 'template-description': 'Test Description', + 'task-specific-query': 'Test Content', + }, + concludedAt: expect.any(Timestamp), + }) + ) + }) + + it('should process task with template that only has URL', async () => { + const urlOnlyTemplate: TemplateData = { + [URL_KEY]: 'https://template-only-url.com', + } + + mockTemplateGet.mockResolvedValue({ + exists: true, + data: () => urlOnlyTemplate, + }) + + const taskWithTemplate: Task = { + [URL_KEY]: 'https://original-task-url.com', + 
[TEMPLATE_KEY]: mockTemplateId, + [QUERIES_KEY]: mockTaskQueries, + } + + const mockSnapshot = { + data: vi.fn().mockReturnValue(taskWithTemplate), + id: mockTaskId, + exists: true, + } + + await processWrite(mockSnapshot as any) + + // Verify task was updated with template URL but original queries + expect(mockUpdate).toHaveBeenCalledWith( + expect.objectContaining({ + stage: TaskStage.PROCESSING, + [URL_KEY]: taskWithTemplate[URL_KEY], + [QUERIES_KEY]: mockTaskQueries, // Should preserve original task queries + }) + ) + }) + + it('should process task with template that only has queries', async () => { + const queriesOnlyTemplate: TemplateData = { + [QUERIES_KEY]: mockTemplateData[QUERIES_KEY], + } + + mockTemplateGet.mockResolvedValue({ + exists: true, + data: () => queriesOnlyTemplate, + }) + + const taskWithTemplate: Task = { + [URL_KEY]: 'https://original-task-url.com', + [TEMPLATE_KEY]: mockTemplateId, + [QUERIES_KEY]: mockTaskQueries, + } + + const mockSnapshot = { + data: vi.fn().mockReturnValue(taskWithTemplate), + id: mockTaskId, + exists: true, + } + + await processWrite(mockSnapshot as any) + + // Verify task was updated with original URL but merged queries + expect(mockUpdate).toHaveBeenCalledWith( + expect.objectContaining({ + stage: TaskStage.PROCESSING, + [URL_KEY]: taskWithTemplate[URL_KEY], // Should preserve original task URL + [QUERIES_KEY]: [...queriesOnlyTemplate[QUERIES_KEY]!, ...mockTaskQueries], + }) + ) + }) + + it('should handle template not found error', async () => { + mockTemplateGet.mockResolvedValue({ + exists: false, + }) + + const taskWithTemplate: Task = { + [URL_KEY]: 'https://original-task-url.com', + [TEMPLATE_KEY]: 'non-existent-template', + [QUERIES_KEY]: mockTaskQueries, + } + + const mockSnapshot = { + data: vi.fn().mockReturnValue(taskWithTemplate), + id: mockTaskId, + exists: true, + } + + await processWrite(mockSnapshot as any) + + // Verify error was recorded + expect(mockUpdate).toHaveBeenCalledWith( + 
expect.objectContaining({ + stage: TaskStage.ERROR, + error: 'Template not found: non-existent-template', + startedAt: expect.any(Timestamp), + concludedAt: expect.any(Timestamp), + }) + ) + }) + + it('should process task without template normally', async () => { + const taskWithoutTemplate: Task = { + [URL_KEY]: 'https://task-url.com', + [QUERIES_KEY]: mockTaskQueries, + } + + const mockSnapshot = { + data: vi.fn().mockReturnValue(taskWithoutTemplate), + id: mockTaskId, + exists: true, + } + + await processWrite(mockSnapshot as any) + + // Verify no template was fetched + expect(mockTemplateGet).not.toHaveBeenCalled() + + // Verify task was processed with original data + expect(mockUpdate).toHaveBeenCalledWith( + expect.objectContaining({ + stage: TaskStage.PROCESSING, + [URL_KEY]: taskWithoutTemplate[URL_KEY], + [QUERIES_KEY]: taskWithoutTemplate[QUERIES_KEY], + }) + ) + }) + + it('should handle task with template but no original queries', async () => { + const taskWithTemplateNoQueries: Task = { + [URL_KEY]: 'https://original-task-url.com', + [TEMPLATE_KEY]: mockTemplateId, + // No queries in the original task + } + + const mockSnapshot = { + data: vi.fn().mockReturnValue(taskWithTemplateNoQueries), + id: mockTaskId, + exists: true, + } + + await processWrite(mockSnapshot as any) + + // Verify task was updated with template data + expect(mockUpdate).toHaveBeenCalledWith( + expect.objectContaining({ + stage: TaskStage.PROCESSING, + [URL_KEY]: taskWithTemplateNoQueries[URL_KEY], // Should keep the original task URL + [QUERIES_KEY]: mockTemplateData[QUERIES_KEY], // Should only have template queries + [TEMPLATE_KEY]: mockTemplateId, + }) + ) + }) + }) + + describe('Template Error Handling in Integration', () => { + it('should handle Firebase errors when fetching template', async () => { + const firebaseError = new Error('Firebase connection failed') + mockTemplateGet.mockRejectedValue(firebaseError) + + const taskWithTemplate: Task = { + [URL_KEY]: 
'https://original-task-url.com', + [TEMPLATE_KEY]: mockTemplateId, + [QUERIES_KEY]: mockTaskQueries, + } + + const mockSnapshot = { + data: vi.fn().mockReturnValue(taskWithTemplate), + id: mockTaskId, + exists: true, + } + + await processWrite(mockSnapshot as any) + + // Verify error was recorded + expect(mockUpdate).toHaveBeenCalledWith( + expect.objectContaining({ + stage: TaskStage.ERROR, + error: 'Firebase connection failed', + startedAt: expect.any(Timestamp), + concludedAt: expect.any(Timestamp), + }) + ) + }) + + it('should handle malformed template data', async () => { + const malformedTemplateData = { + invalidField: 'invalid-data', + // Missing expected URL_KEY and QUERIES_KEY + } + + mockTemplateGet.mockResolvedValue({ + exists: true, + data: () => malformedTemplateData, + }) + + const taskWithTemplate: Task = { + [URL_KEY]: 'https://original-task-url.com', + [TEMPLATE_KEY]: mockTemplateId, + [QUERIES_KEY]: mockTaskQueries, + } + + const mockSnapshot = { + data: vi.fn().mockReturnValue(taskWithTemplate), + id: mockTaskId, + exists: true, + } + + await processWrite(mockSnapshot as any) + + // The processing should continue with the malformed template data + // (the Template class should handle undefined values gracefully) + expect(mockUpdate).toHaveBeenCalledWith( + expect.objectContaining({ + stage: TaskStage.PROCESSING, + // URL should remain from original task since template has no URL + [URL_KEY]: taskWithTemplate[URL_KEY], + // Queries should remain from original task since template has no queries + [QUERIES_KEY]: taskWithTemplate[QUERIES_KEY], + }) + ) + }) + }) +}) diff --git a/functions/src/test/template.test.ts b/functions/src/test/template.test.ts new file mode 100644 index 0000000..2d7e881 --- /dev/null +++ b/functions/src/test/template.test.ts @@ -0,0 +1,456 @@ +import { describe, it, expect, vi, beforeEach } from 'vitest' +import { Template, TemplateData, URL_KEY, QUERIES_KEY } from '../types/Template' +import { validateTemplate } from 
'../validation/template-validation' +import { Task, TEMPLATE_KEY } from '../types/Task' +import { Query, QueryType, TargetType } from '../types/Query' + +// Mock Firebase using vi.hoisted to avoid initialization order issues +const { mockGet, mockDoc, mockCollection } = vi.hoisted(() => { + const mockGet = vi.fn() + const mockDoc = vi.fn(() => ({ get: mockGet })) + const mockCollection = vi.fn(() => ({ doc: mockDoc })) + + return { mockGet, mockDoc, mockCollection } +}) + +vi.mock('../firebase', () => ({ + db: { + collection: mockCollection, + }, +})) + +vi.mock('../config', () => ({ + default: { + templatesCollection: 'templates', + }, +})) + +describe('Template Class', () => { + const mockTemplateId = 'test-template-id' + const mockTemplateData: TemplateData = { + [URL_KEY]: 'https://example.com', + [QUERIES_KEY]: [ + { + id: 'title', + type: QueryType.TAG, + value: 'h1', + target: TargetType.TEXT, + }, + { + id: 'description', + type: QueryType.CLASS, + value: 'description', + target: TargetType.TEXT, + }, + ], + } + + beforeEach(() => { + vi.clearAllMocks() + }) + + describe('constructor', () => { + it('should create a template instance with correct initial state', () => { + const template = new Template(mockTemplateId) + expect(template).toBeInstanceOf(Template) + expect(template['_templateId']).toBe(mockTemplateId) + expect(template['_initialized']).toBe(false) + }) + }) + + describe('initialize', () => { + it('should initialize template with data from Firestore', async () => { + mockGet.mockResolvedValue({ + exists: true, + data: () => mockTemplateData, + }) + + const template = new Template(mockTemplateId) + await template.initialize() + + expect(mockCollection).toHaveBeenCalledWith('templates') + expect(mockDoc).toHaveBeenCalledWith(mockTemplateId) + expect(mockGet).toHaveBeenCalled() + expect(template[URL_KEY]).toBe(mockTemplateData[URL_KEY]) + expect(template[QUERIES_KEY]).toEqual(mockTemplateData[QUERIES_KEY]) + 
expect(template['_initialized']).toBe(true) + }) + + it('should not reinitialize if already initialized', async () => { + mockGet.mockResolvedValue({ + exists: true, + data: () => mockTemplateData, + }) + + const template = new Template(mockTemplateId) + await template.initialize() + + // Clear mock calls + vi.clearAllMocks() + + // Try to initialize again + await template.initialize() + + // Should not make any Firebase calls + expect(mockCollection).not.toHaveBeenCalled() + expect(mockDoc).not.toHaveBeenCalled() + expect(mockGet).not.toHaveBeenCalled() + }) + + it('should throw error when template document does not exist', async () => { + mockGet.mockResolvedValue({ + exists: false, + }) + + const template = new Template(mockTemplateId) + + await expect(template.initialize()).rejects.toThrow(`Template not found: ${mockTemplateId}`) + expect(template['_initialized']).toBe(false) + }) + + it('should handle template with only URL', async () => { + const urlOnlyTemplate: TemplateData = { + [URL_KEY]: 'https://example.com', + } + + mockGet.mockResolvedValue({ + exists: true, + data: () => urlOnlyTemplate, + }) + + const template = new Template(mockTemplateId) + await template.initialize() + + expect(template[URL_KEY]).toBe(urlOnlyTemplate[URL_KEY]) + expect(template[QUERIES_KEY]).toBeUndefined() + expect(template['_initialized']).toBe(true) + }) + + it('should handle template with only queries', async () => { + const queriesOnlyTemplate: TemplateData = { + [QUERIES_KEY]: mockTemplateData[QUERIES_KEY], + } + + mockGet.mockResolvedValue({ + exists: true, + data: () => queriesOnlyTemplate, + }) + + const template = new Template(mockTemplateId) + await template.initialize() + + expect(template[URL_KEY]).toBeUndefined() + expect(template[QUERIES_KEY]).toEqual(queriesOnlyTemplate[QUERIES_KEY]) + expect(template['_initialized']).toBe(true) + }) + }) + + describe('mergeWithTask', () => { + let template: Template + + beforeEach(async () => { + mockGet.mockResolvedValue({ + 
exists: true, + data: () => mockTemplateData, + }) + + template = new Template(mockTemplateId) + await template.initialize() + }) + + it('should throw error if template is not initialized', () => { + const uninitializedTemplate = new Template('uninit-template') + const task: Task = { + [URL_KEY]: 'https://task-url.com', + } + + expect(() => uninitializedTemplate.mergeWithTask(task)).toThrow('Template not initialized') + }) + + it('should use task URL if provided, overriding template URL', () => { + const task: Task = { + [URL_KEY]: 'https://different-task-url.com', // Task URL should override template + } + + const mergedTask = template.mergeWithTask(task) + + expect(mergedTask[URL_KEY]).toBe('https://different-task-url.com') + }) + + it('should use template URL if task does not provide one', () => { + // This test checks the behaviour when no url is provided in task, but only in the template + const task: Task = { + [URL_KEY]: undefined, + // No URL in task + } + + const mergedTask = template.mergeWithTask(task) + + expect(mergedTask[URL_KEY]).toBe(mockTemplateData[URL_KEY]) + }) + + it('should merge template queries with existing task queries', () => { + const taskQueries: Query[] = [ + { + id: 'task-query', + type: QueryType.ID, + value: 'content', + target: TargetType.HTML, + }, + ] + + const task: Task = { + [URL_KEY]: 'https://task-url.com', + [QUERIES_KEY]: taskQueries, + } + + const mergedTask = template.mergeWithTask(task) + + expect(mergedTask[QUERIES_KEY]).toHaveLength(3) // 2 from template + 1 from task + expect(mergedTask[QUERIES_KEY]).toEqual([...mockTemplateData[QUERIES_KEY]!, ...taskQueries]) + }) + + it('should handle task with no queries (undefined)', () => { + const task: Task = { + [URL_KEY]: 'https://task-url.com', + } + + const mergedTask = template.mergeWithTask(task) + + expect(mergedTask[QUERIES_KEY]).toEqual(mockTemplateData[QUERIES_KEY]) + }) + + it('should handle task with empty queries array', () => { + const task: Task = { + [URL_KEY]: 
'https://task-url.com', + [QUERIES_KEY]: [], + } + + const mergedTask = template.mergeWithTask(task) + + expect(mergedTask[QUERIES_KEY]).toEqual(mockTemplateData[QUERIES_KEY]) + }) + + it('should preserve other task properties', () => { + const task: Task = { + [URL_KEY]: 'https://task-url.com', + [TEMPLATE_KEY]: 'some-template', + } + + const mergedTask = template.mergeWithTask(task) + + expect(mergedTask[TEMPLATE_KEY]).toBe(task[TEMPLATE_KEY]) + expect(mergedTask[URL_KEY]).toBe('https://task-url.com') // Task URL should override template + }) + + it('should handle template with no URL (preserving task URL)', async () => { + const templateWithoutUrl: TemplateData = { + [QUERIES_KEY]: mockTemplateData[QUERIES_KEY], + } + + mockGet.mockResolvedValue({ + exists: true, + data: () => templateWithoutUrl, + }) + + const templateNoUrl = new Template('no-url-template') + await templateNoUrl.initialize() + + const task: Task = { + [URL_KEY]: 'https://task-url.com', + } + + const mergedTask = templateNoUrl.mergeWithTask(task) + + expect(mergedTask[URL_KEY]).toBe(task[URL_KEY]) // Task URL should be used + expect(mergedTask[QUERIES_KEY]).toEqual(templateWithoutUrl[QUERIES_KEY]) + }) + + it('should handle template with no URL and no task URL', async () => { + const templateWithoutUrl: TemplateData = { + [QUERIES_KEY]: mockTemplateData[QUERIES_KEY], + } + + mockGet.mockResolvedValue({ + exists: true, + data: () => templateWithoutUrl, + }) + + const templateNoUrl = new Template('no-url-template') + await templateNoUrl.initialize() + + const task: Task = { + [URL_KEY]: undefined, + // No URL + } + + const mergedTask = templateNoUrl.mergeWithTask(task) + + expect(mergedTask[URL_KEY]).toBeUndefined() + expect(mergedTask[QUERIES_KEY]).toEqual(templateWithoutUrl[QUERIES_KEY]) + }) + + it('should handle template with no queries (preserving task queries)', async () => { + const templateWithoutQueries: TemplateData = { + [URL_KEY]: 'https://template-url.com', + } + + 
mockGet.mockResolvedValue({ + exists: true, + data: () => templateWithoutQueries, + }) + + const templateNoQueries = new Template('no-queries-template') + await templateNoQueries.initialize() + + const taskQueries: Query[] = [ + { + id: 'task-query', + type: QueryType.TAG, + value: 'p', + target: TargetType.TEXT, + }, + ] + + const task: Task = { + [URL_KEY]: 'https://task-url.com', + [QUERIES_KEY]: taskQueries, + } + + const mergedTask = templateNoQueries.mergeWithTask(task) + + expect(mergedTask[URL_KEY]).toBe('https://task-url.com') // Task URL should override template + expect(mergedTask[QUERIES_KEY]).toEqual(taskQueries) // Should preserve task queries + }) + + it('should handle template with no queries and no task queries', async () => { + const templateWithoutQueries: TemplateData = { + [URL_KEY]: 'https://template-url.com', + } + + mockGet.mockResolvedValue({ + exists: true, + data: () => templateWithoutQueries, + }) + + const templateNoQueries = new Template('no-queries-template') + await templateNoQueries.initialize() + + const task: Task = { + [URL_KEY]: 'https://task-url.com', + // No queries + } + + const mergedTask = templateNoQueries.mergeWithTask(task) + + expect(mergedTask[URL_KEY]).toBe('https://task-url.com') // Task URL should override template + expect(mergedTask[QUERIES_KEY]).toBeUndefined() + }) + }) +}) + +describe('Template Validation', () => { + describe('validateTemplate', () => { + it('should throw error for missing template', () => { + expect(() => validateTemplate(null as any)).toThrow('Template is missing') + expect(() => validateTemplate(undefined as any)).toThrow('Template is missing') + }) + + it('should validate template with valid URL', () => { + const template: TemplateData = { + [URL_KEY]: 'https://example.com', + [QUERIES_KEY]: [ + { + id: 'test', + type: QueryType.TAG, + value: 'h1', + target: TargetType.TEXT, + }, + ], + } + + expect(() => validateTemplate(template)).not.toThrow() + }) + + it('should throw error for invalid 
URL type', () => { + const template: any = { + [URL_KEY]: 123, // Invalid type + [QUERIES_KEY]: [ + { + id: 'test', + type: QueryType.TAG, + value: 'h1', + target: TargetType.TEXT, + }, + ], + } + + expect(() => validateTemplate(template)).toThrow( + `Template URL ('${URL_KEY}') must be provided as a string` + ) + }) + + it('should throw error for malformed URL', () => { + const template: TemplateData = { + [URL_KEY]: 'not-a-valid-url', + [QUERIES_KEY]: [ + { + id: 'test', + type: QueryType.TAG, + value: 'h1', + target: TargetType.TEXT, + }, + ], + } + + expect(() => validateTemplate(template)).toThrow( + `Template URL ('${URL_KEY}') is not a valid URL` + ) + }) + + it('should validate template without URL', () => { + const template: TemplateData = { + [QUERIES_KEY]: [ + { + id: 'test', + type: QueryType.TAG, + value: 'h1', + target: TargetType.TEXT, + }, + ], + } + + expect(() => validateTemplate(template)).not.toThrow() + }) + + it('should throw error for invalid queries (not array)', () => { + const template: any = { + [URL_KEY]: 'https://example.com', + [QUERIES_KEY]: 'not-an-array', + } + + expect(() => validateTemplate(template)).toThrow( + `Task queries ('${QUERIES_KEY}') must be provided as an array` + ) + }) + + it('should throw error for empty queries array', () => { + const template: TemplateData = { + [URL_KEY]: 'https://example.com', + [QUERIES_KEY]: [], + } + + expect(() => validateTemplate(template)).toThrow(`Task queries ('${QUERIES_KEY}') are empty`) + }) + + it('should validate template with only URL (no queries)', () => { + const template: TemplateData = { + [URL_KEY]: 'https://example.com', + } + + // This should not throw because queries are optional in templates + // The validation only checks if queries exist, are they valid + expect(() => validateTemplate(template)).not.toThrow() + }) + }) +}) diff --git a/functions/src/types/Config.ts b/functions/src/types/Config.ts index 4b65714..8123c60 100644 --- a/functions/src/types/Config.ts +++ 
b/functions/src/types/Config.ts @@ -4,5 +4,6 @@ export interface Config { location: string database: string scrapeCollection: string + templatesCollection?: string logLevel: LogLevel } diff --git a/functions/src/types/Queriable.ts b/functions/src/types/Queriable.ts index 4ccce44..a74a0bb 100644 --- a/functions/src/types/Queriable.ts +++ b/functions/src/types/Queriable.ts @@ -2,7 +2,6 @@ import { JSDOM, DOMWindow } from 'jsdom' // import * as xpath from 'xpath'; import { Query, QueryType, TargetType } from './Query' -import { logger } from '../logger' import { validateQuery } from '../validation/query-validation' export class Queriable { diff --git a/functions/src/types/Task.ts b/functions/src/types/Task.ts index 8b98ba2..5a3ec70 100644 --- a/functions/src/types/Task.ts +++ b/functions/src/types/Task.ts @@ -2,9 +2,11 @@ import { Query } from './Query' // These are used for dynamic logging where the key of a property is output export const URL_KEY = 'url' +export const TEMPLATE_KEY = 'template' export const QUERIES_KEY = 'queries' export interface Task { [URL_KEY]: string // The target URL for the task. - [QUERIES_KEY]: Query[] // Element queries to run on the page html to extract data. + [TEMPLATE_KEY]?: string // The template to use for the task. + [QUERIES_KEY]?: Query[] // Element queries to run on the page html to extract data. } diff --git a/functions/src/types/Template.ts b/functions/src/types/Template.ts new file mode 100644 index 0000000..c939222 --- /dev/null +++ b/functions/src/types/Template.ts @@ -0,0 +1,78 @@ +import config from '../config' +import { Query } from './Query' +import { Task } from './Task' +import { db } from '../firebase' + +// These are used for dynamic logging where the key of a property is output +export const URL_KEY = 'url' +export const QUERIES_KEY = 'queries' + +export interface TemplateData { + [URL_KEY]?: string // The target URL for the task. + [QUERIES_KEY]?: Query[] // Element queries to run on the page html to extract data. 
+} + +export class Template implements TemplateData { + private _initialized = false + private _templateId: string; + [URL_KEY]?: string; + [QUERIES_KEY]?: Query[] + + constructor(templateId: string) { + this._initialized = false + this._templateId = templateId + } + + /** + * Initializes the template by loading the template from Firestore. + */ + public async initialize() { + if (this._initialized) { + return + } + + const templateDoc = await db.collection(config.templatesCollection).doc(this._templateId).get() + if (!templateDoc.exists) { + throw new Error(`Template not found: ${this._templateId}`) + } + + const templateData = templateDoc.data() as TemplateData + + this[URL_KEY] = templateData[URL_KEY] + this[QUERIES_KEY] = templateData[QUERIES_KEY] + this._initialized = true + } + + /** + * Merges the template with the task. + * @param task The task to merge the template with. + * @returns The task with the template merged in. + */ + public mergeWithTask(task: Task): Task { + if (!this._initialized) { + throw new Error('Template not initialized') + } + + // replace url with template url, if provided (task url will take precedence if both are provided) + if (this[URL_KEY] && !task[URL_KEY]) { + task[URL_KEY] = this[URL_KEY] + } + + // merge template queries with task queries (task queries may be undefined, so we need to check) + if (this[QUERIES_KEY]) { + const mergedQueries = [ + ...this[QUERIES_KEY], + ...(Array.isArray(task[QUERIES_KEY]) ? 
task[QUERIES_KEY] : []), + ] + + // deduplicate queries by id (first occurrence wins) + const deduplicatedQueries = mergedQueries.filter( + (query, index, self) => index === self.findIndex((t) => t.id === query.id) + ) + + task[QUERIES_KEY] = deduplicatedQueries + } + + return task + } +} diff --git a/functions/src/validation/common.ts b/functions/src/validation/common.ts new file mode 100644 index 0000000..4e07446 --- /dev/null +++ b/functions/src/validation/common.ts @@ -0,0 +1,18 @@ +import { QUERIES_KEY } from '../types/Task' +import { Task } from '../types/Task' +import { TemplateData } from '../types/Template' + +export function validateQueriesAttribute(task: Task | TemplateData): void { + // Only validate if queries exist - they are optional for templates + if (task[QUERIES_KEY] === undefined) { + return + } + + if (!Array.isArray(task[QUERIES_KEY])) { + throw new Error(`Task queries ('${QUERIES_KEY}') must be provided as an array`) + } + + if (task[QUERIES_KEY].length === 0) { + throw new Error(`Task queries ('${QUERIES_KEY}') are empty`) + } +} diff --git a/functions/src/validation/task-validation.ts b/functions/src/validation/task-validation.ts index 8897898..0b5a6f2 100644 --- a/functions/src/validation/task-validation.ts +++ b/functions/src/validation/task-validation.ts @@ -1,32 +1,47 @@ -import { QUERIES_KEY, Task, URL_KEY } from '../types/Task' +import { QUERIES_KEY, Task, TEMPLATE_KEY, URL_KEY } from '../types/Task' +import { validateQueriesAttribute } from './common' /** * Validates a task and returns an error message if the task is invalid. * @param task - The task to validate. * @returns An error message if the task is invalid, or null if the task is valid. 
*/ -export function validateTask(task?: Task): string | null { +export function validateTask(task?: Task): void { if (!task) { - return 'Task is missing' + throw new Error('Task is missing') } if (typeof task[URL_KEY] !== 'string') { - return `Task URL ('${URL_KEY}') must be provided as a string` + throw new Error(`Task URL ('${URL_KEY}') must be provided as a string`) } try { new URL(task[URL_KEY]) } catch (error) { - return `Task URL ('${URL_KEY}') is not a valid URL` + throw new Error(`Task URL ('${URL_KEY}') is not a valid URL`) } - if (!Array.isArray(task[QUERIES_KEY])) { - return `Task queries ('${QUERIES_KEY}') must be provided as an array` + if (!task[QUERIES_KEY] && !task[TEMPLATE_KEY]) { + throw new Error( + `Task must have either template ('${TEMPLATE_KEY}') or queries ('${QUERIES_KEY}')` + ) } - if (task[QUERIES_KEY].length === 0) { - return `Task queries ('${QUERIES_KEY}') are empty` + if (task[QUERIES_KEY]) { + validateQueriesAttribute(task) } - return null + if (task[TEMPLATE_KEY]) { + validateTemplateAttribute(task) + } +} + +function validateTemplateAttribute(task: Task): void { + if (typeof task[TEMPLATE_KEY] !== 'string') { + throw new Error(`Task template ('${TEMPLATE_KEY}') must be provided as a string`) + } + + if (task[TEMPLATE_KEY].length === 0) { + throw new Error(`Task template ('${TEMPLATE_KEY}') is empty`) + } } diff --git a/functions/src/validation/template-validation.ts b/functions/src/validation/template-validation.ts new file mode 100644 index 0000000..294ed33 --- /dev/null +++ b/functions/src/validation/template-validation.ts @@ -0,0 +1,22 @@ +import { TemplateData, URL_KEY } from '../types/Template' +import { validateQueriesAttribute } from './common' + +export function validateTemplate(template: TemplateData): void { + if (!template) { + throw new Error('Template is missing') + } + + if (template[URL_KEY]) { + if (typeof template[URL_KEY] !== 'string') { + throw new Error(`Template URL ('${URL_KEY}') must be provided as a string`) 
+ } + + try { + new URL(template[URL_KEY]) + } catch (error) { + throw new Error(`Template URL ('${URL_KEY}') is not a valid URL`) + } + } + + validateQueriesAttribute(template) +}