The file is referenced from the main [Actor file (.actor/actor.json)](ACTOR_FILE.md) using the `input` directive,
and it is typically stored in `.actor/input_schema.json`.

The file is a JSON schema with our extensions describing a single Actor input object
and its properties, including documentation, default value, and user interface definition.

**For full reference, see [Input schema specification](https://docs.apify.com/platform/actors/development/actor-definition/input-schema/specification/v1) in Apify documentation.**

<!-- TODO: Move the full specs including JSON meta schema to this repo -->
<!-- TODO: Consider renaming "editor" values to camelCase, for consistency -->
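For context, a minimal `.actor/actor.json` that references the input schema via the `input` directive might look like this (a sketch only; see the [Actor file](ACTOR_FILE.md) page for the authoritative format):

```jsonc
{
    "actorSpecification": 1,
    "name": "my-actor",
    "version": "0.1",
    // Relative path to the input schema file described on this page
    "input": "./input_schema.json"
}
```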
## Example Actor input schema

```jsonc
{
    "actorInputSchemaVersion": 1,

    "title": "Input schema for an Actor",
    "description": "Enter the start URL(s) of the website(s) to crawl, configure other optional settings, and run the Actor to crawl the pages and extract their text content.",
    "type": "object",

    "properties": {

        "startUrls": {
            "title": "Start URLs",
            "type": "array",
            "description": "One or more URLs of the pages where the crawler will start. Note that the Actor will additionally only crawl sub-pages of these URLs. For example, for the start URL `https://www.example.com/blog`, it will crawl pages like `https://example.com/blog/article-1`, but will skip `https://example.com/docs/something-else`.",
            "editor": "requestListSources",
            "prefill": [{ "url": "https://docs.apify.com/" }]
        },

        // The input value is another Dataset. The system can generate a UI to make it easy to select the dataset.
        "processDatasetId": {
            "title": "Input dataset",
            "type": "string",
            "resourceType": "dataset",
            "description": "Dataset to be processed by the Actor",
            // Optional link to dataset schema, used by the system to validate the input dataset
            // ...
        },

        // ...

        "crawlerType": {
            "title": "Crawler type",
            "type": "string",
            "editor": "select",
            "enum": ["playwright:chrome", "cheerio", "jsdom"],
            "enumTitles": ["Headless web browser (Chrome+Playwright)", "Raw HTTP client (Cheerio)", "Raw HTTP client with JS execution (JSDOM) (experimental!)"],
            "description": "Select the crawling engine:\n- **Headless web browser** (default) - Useful for modern websites with anti-scraping protections and JavaScript rendering. It recognizes common blocking patterns like CAPTCHAs and automatically retries blocked requests through new sessions. However, running web browsers is more expensive as it requires more computing resources and is slower. It is recommended to use at least 8 GB of RAM.\n- **Raw HTTP client** - High-performance crawling mode that uses raw HTTP requests to fetch the pages. It is faster and cheaper, but it might not work on all websites.",
            "default": "playwright:chrome"
        },

        "maxCrawlDepth": {
            "title": "Max crawling depth",
            "type": "integer",
            "description": "The maximum number of links starting from the start URL that the crawler will recursively descend. The start URLs have a depth of 0, the pages linked directly from the start URLs have a depth of 1, and so on.\n\nThis setting is useful to prevent accidental crawler runaway. By setting it to 0, the Actor will only crawl start URLs.",
            "minimum": 0,
            "default": 20
        },

        "maxCrawlPages": {
            "title": "Max pages",
            "type": "integer",
            "description": "The maximum number of pages to crawl. It includes the start URLs, pagination pages, pages with no content, etc. The crawler will automatically finish after reaching this number. This setting is useful to prevent accidental crawler runaway.",
            "minimum": 0,
            "default": 9999999
        }
    }
}
```
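To illustrate how a platform might consume such a schema, here is a rough sketch of filling missing input fields from the schema's `default` values. This is a hypothetical helper for illustration only, not Apify's actual implementation:

```python
def apply_defaults(schema: dict, user_input: dict) -> dict:
    """Fill in missing input fields from the schema's `default` values."""
    result = dict(user_input)
    for name, prop in schema.get("properties", {}).items():
        if name not in result and "default" in prop:
            result[name] = prop["default"]
    return result


# Trimmed-down version of the example schema above
schema = {
    "actorInputSchemaVersion": 1,
    "type": "object",
    "properties": {
        "maxCrawlDepth": {"type": "integer", "minimum": 0, "default": 20},
        "maxCrawlPages": {"type": "integer", "minimum": 0, "default": 9999999},
    },
}

print(apply_defaults(schema, {"maxCrawlDepth": 5}))
# → {'maxCrawlDepth': 5, 'maxCrawlPages': 9999999}
```

Note that the Apify-specific keywords (`editor`, `prefill`, `enumTitles`, `resourceType`) are extensions on top of plain JSON Schema, so a generic JSON Schema validator would simply ignore them.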

## Random notes

We could also add an `actor` resource type. The use case could be, for example, a testing Actor with three inputs:

- Actor to be tested
- test function containing, for example, a Jest unit test over the output
- input for the Actor

...and the testing Actor would call the given Actor with the given input and, at the end, execute the tests to check that the results are correct.
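A hypothetical shape for such a property (a sketch only; `"resourceType": "actor"` is a proposal, not part of the current specification, and the property name is made up for illustration):

```jsonc
"testedActorId": {
    "title": "Actor to be tested",
    "type": "string",
    // Proposed value; only storage resource types such as "dataset" exist in the spec today
    "resourceType": "actor",
    "description": "Actor that the testing Actor will call with the given input"
}
```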