specs/crawler/common/schemas/action.yml (+5 −5)
@@ -21,9 +21,9 @@ Action:
   discoveryPatterns:
     type: array
     description: |
-      Indicates additional pages that the crawler should visit.
+      Indicates _intermediary_ pages that the crawler should visit.
 
-      For more information, see the [`discoveryPatterns` documentation](https://www.algolia.com/doc/tools/crawler/apis/configuration/discovery-patterns/).
+      For more information, see the [`discoveryPatterns` documentation](https://www.algolia.com/doc/tools/crawler/apis/discoverypatterns/).
     items:
       $ref: '#/urlPattern'
   fileTypesToMatch:
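The distinction between matched and intermediary pages can be sketched in a hypothetical crawler configuration. The `example.com` URLs and the `products` index name are illustrative assumptions, not values from the spec:

```yaml
# Sketch of an action using discoveryPatterns (hypothetical values).
# Category pages are visited only so their links can be followed;
# records are extracted solely from pages matching pathsToMatch.
actions:
  - indexName: products            # assumed index name
    pathsToMatch:
      - https://www.example.com/products/**
    discoveryPatterns:
      - https://www.example.com/categories/**
```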
@@ -71,7 +71,7 @@ Action:
       Function for extracting information from a crawled page and transforming it into Algolia records for indexing.
       The Crawler has an [editor](https://www.algolia.com/doc/tools/crawler/getting-started/crawler-configuration/#the-editor) with autocomplete and validation to help you update the `recordExtractor` property.
 
-      For details, consult the [`recordExtractor` documentation](https://www.algolia.com/doc/tools/crawler/apis/configuration/actions/#parameter-param-recordextractor).
+      For details, consult the [`recordExtractor` documentation](https://www.algolia.com/doc/tools/crawler/apis/recordextractor/).
     properties:
       __type:
         $ref: '#/configurationRecordExtractorType'
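As a rough sketch of the serialized shape this schema describes (the spec shows a `__type` discriminator), a `recordExtractor` in a configuration might look like the following. The `source` body and its field names are illustrative assumptions, not taken from the spec:

```yaml
# Hypothetical serialized recordExtractor; the source body is
# JavaScript that the Crawler evaluates for each crawled page.
recordExtractor:
  __type: function
  source: |
    ({ url, $ }) => {
      // '$' is assumed to be the Crawler's Cheerio-like page handle
      return [{ objectID: url.pathname, title: $('title').text() }];
    }
```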
@@ -140,7 +140,7 @@ hostnameAliases:
     Key-value pairs to replace matching hostnames found in a sitemap,
     on a page, in canonical links, or redirects.
 
-    For more information, see the [`hostnameAliases` documentation](https://www.algolia.com/doc/tools/crawler/apis/configuration/hostname-aliases/).
+    For more information, see the [`hostnameAliases` documentation](https://www.algolia.com/doc/tools/crawler/apis/hostnamealiases/).
   additionalProperties:
     type: string
     description: Hostname that should be added in the records.
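A minimal sketch of the key-value shape this schema describes; the hostnames are illustrative placeholders:

```yaml
hostnameAliases:
  # Replace the staging hostname with the production one wherever it
  # appears in sitemaps, pages, canonical links, or redirects.
  'dev.example.com': 'www.example.com'
```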
@@ -174,7 +174,7 @@ cache:
   description: |
     Whether the crawler should cache crawled pages.
 
-    For more information, see the [`cache` documentation](https://www.algolia.com/doc/tools/crawler/apis/configuration/cache/).
+    For more information, see the [`cache` documentation](https://www.algolia.com/doc/tools/crawler/apis/cache/).
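In a crawler configuration, the cache setting described above might be written as follows; this is a sketch assuming the object form with an `enabled` flag:

```yaml
cache:
  enabled: true  # reuse previously crawled pages when possible
```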
specs/crawler/common/schemas/configuration.yml (+34 −20)
@@ -18,7 +18,7 @@ Configuration:
     description: |
       Algolia API key for indexing the records.
 
-      For more information, see the [`apiKey` documentation](https://www.algolia.com/doc/tools/crawler/apis/configuration/api-key/).
+      For more information, see the [`apiKey` documentation](https://www.algolia.com/doc/tools/crawler/apis/apikey/).
   appId:
     $ref: '../parameters.yml#/applicationID'
   exclusionPatterns:
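Put together, the credential parameters above sit at the top level of a configuration; the values here are placeholders:

```yaml
appId: YOUR_APP_ID            # placeholder application ID
apiKey: YOUR_CRAWLER_API_KEY  # placeholder key used for indexing records
```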
@@ -50,9 +50,8 @@ Configuration:
     type: array
     maxItems: 9999
     description: |
-      URLs from where to start crawling.
-
-      For more information, see the [`extraUrls` documentation](https://www.algolia.com/doc/tools/crawler/apis/configuration/extra-urls/).
+      The Crawler treats `extraUrls` the same as `startUrls`.
+      Specify `extraUrls` if you want to differentiate between URLs you manually added to fix site crawling from those you initially specified in `startUrls`.
     items:
       type: string
   ignoreCanonicalTo:
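The relationship between `startUrls` and `extraUrls` described in the new wording can be sketched as follows; the URLs are illustrative:

```yaml
startUrls:
  - https://www.example.com/
extraUrls:
  # Crawled exactly like startUrls; kept separate only to mark
  # URLs added manually to fix gaps in site crawling.
  - https://www.example.com/orphaned-landing-page
```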
@@ -62,7 +61,7 @@ Configuration:
     description: |
       Whether to ignore the `nofollow` meta tag or link attribute.
 
-      For more information, see the [`ignoreNoFollowTo` documentation](https://www.algolia.com/doc/tools/crawler/apis/configuration/ignore-no-follow-to/).
+      For more information, see the [`ignoreNoFollowTo` documentation](https://www.algolia.com/doc/tools/crawler/apis/ignorenofollowto/).
   ignoreNoIndex:
     type: boolean
     description: |
@@ -97,7 +96,9 @@ Configuration:
     description: |
       Crawler index settings.
 
-      For more information, see the [`initialIndexSettings` documentation](https://www.algolia.com/doc/tools/crawler/apis/configuration/initial-index-settings/).
+      These index settings are only applied during the first crawl of an index.
+      Any subsequent changes won't be applied to the index.
+      Instead, make changes to your index settings in the [Algolia dashboard](https://dashboard.algolia.com/explorer/configuration/).
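Because these settings apply only on the first crawl of an index, they are typically keyed by index name, for example (a sketch; the index and attribute names are illustrative):

```yaml
initialIndexSettings:
  products:                 # assumed index name
    searchableAttributes:   # applied only on the index's first crawl
      - title
      - description
```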
@@ … @@
+      Change it to a low value, such as 100, for quick crawling tests.
+      Change it to a higher explicit value for full crawls to prevent it from getting "lost" in complex site structures.
 
-      Setting `maxUrls` doesn't guarantee consistency between crawls
-      because the crawler processes URLs in parallel.
+      Because the Crawler works on many pages simultaneously, `maxUrls` doesn't guarantee finding the same pages each time it runs.
     minimum: 1
     maximum: 15000000
   rateLimit:
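The guidance above translates to a one-line setting; the value is the diff's own suggestion for quick tests:

```yaml
maxUrls: 100  # quick crawling test; raise (up to 15000000) for full crawls
```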
@@ -194,9 +197,12 @@ ignoreCanonicalTo:
   oneOf:
     - type: boolean
       description: |
-        Whether to ignore canonical redirects.
+        Determines if the crawler should extract records from a page with a [canonical URL](https://www.algolia.com/doc/tools/crawler/getting-started/crawler-configuration/#canonical-urls-and-crawler-behaviorr).
 
-        If true, canonical URLs for pages are ignored.
+        If ignoreCanonicalTo is set to:
+
+        - `true` all canonical URLs are ignored.
+        - One or more URL patterns, the crawler will ignore the canonical URL if it matches a pattern.
     - type: array
       description: |
         Canonical URLs or URL patterns to ignore.
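The two accepted forms described above can be sketched as follows; the URL pattern is illustrative:

```yaml
# Boolean form: ignore every canonical URL.
ignoreCanonicalTo: true

# Pattern form: ignore the canonical only when it matches a pattern.
# ignoreCanonicalTo:
#   - https://www.example.com/products/**
```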
@@ -209,9 +215,10 @@ ignoreCanonicalTo:
 
 renderJavaScript:
   description: |
-    Crawl JavaScript-rendered pages with a headless browser.
+    If `true`, use a Chrome headless browser to crawl pages.
 
-    For more information, see the [`renderJavaScript` documentation](https://www.algolia.com/doc/tools/crawler/apis/configuration/render-java-script/).
+    Because crawling JavaScript-based web pages is slower than crawling regular HTML pages, you can apply this setting to a specific list of pages.
+    Use [micromatch](https://github.com/micromatch/micromatch) to define URL patterns, including negations and wildcards.
   oneOf:
     - type: boolean
       description: Whether to render all pages.
@@ -220,24 +227,29 @@ renderJavaScript:
       items:
         type: string
         description: URL or URL pattern to render.
-        example: https://www.example.com
+        example:
+          - http://www.mysite.com/dynamic-pages/**
     - title: headlessBrowserConfig
       type: object
       description: Configuration for rendering HTML.
       properties:
         enabled:
           type: boolean
-          description: Whether to render matching URLs.
+          description: Whether to enable JavaScript rendering.
+          example: true
         patterns:
           type: array
           description: URLs or URL patterns to render.
           items:
             type: string
+          example:
+            - http://www.mysite.com/dynamic-pages/**
         adBlock:
           type: boolean
+          default: false
           description: |
-            Whether to turn on the built-in adblocker.
-            This blocks most ads and tracking scripts but can break some sites.
+            Whether to use the Crawler's ad blocker.
+            It blocks most ads and tracking scripts but can break some sites.
         waitTime:
           $ref: '#/waitTime'
       required:
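Combining the `headlessBrowserConfig` fields shown in this hunk, the object form might look like the following sketch; the values reuse the diff's own examples:

```yaml
renderJavaScript:
  enabled: true
  patterns:
    - http://www.mysite.com/dynamic-pages/**
  adBlock: false      # default; true blocks most ads and trackers
  waitTime:
    min: 7000
    max: 15000
```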
@@ -246,7 +258,7 @@ renderJavaScript:
 
 requestOptions:
   type: object
-  description: Options to add to all HTTP requests made by the crawler.
+  description: Lets you add options to HTTP requests made by the crawler.
   properties:
     proxy:
       type: string
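A sketch of the `proxy` option under `requestOptions`; the proxy URL is a placeholder:

```yaml
requestOptions:
  proxy: http://10.0.0.1:8080  # placeholder proxy for crawler requests
```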
@@ -270,10 +282,12 @@ waitTime:
       type: number
       default: 0
       description: Minimum waiting time in milliseconds.
+      example: 7000
     max:
       type: number
       default: 20000
       description: Maximum waiting time in milliseconds.
+      example: 15000
 
 initialIndexSettings:
   type: object
@@ -450,5 +464,5 @@ schedule:
   description: |
     Schedule for running the crawl.
 
-    For more information, see the [`schedule` documentation](https://www.algolia.com/doc/tools/crawler/apis/configuration/schedule/).
+    For more information, see the [`schedule` documentation](https://www.algolia.com/doc/tools/crawler/apis/schedule/).
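A schedule value, as a sketch; the natural-language expression format is an assumption based on Algolia's crawler documentation:

```yaml
schedule: every weekday at 12:00 pm  # assumed expression format
```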