
Commit 8dd28fc

docs: improve docstrings of storages (#465)
### Description

- Improve docstrings of storage classes.
- I also changed the list of main classes to at least roughly reflect the current public interface.

### Issues

- Relates: #304

### Testing

- Website was rendered locally.

### Checklist

- [x] CI passed
1 parent dbf3b2e commit 8dd28fc

9 files changed (+169, -138 lines)


.github/workflows/check_pr_title.yaml

Lines changed: 2 additions & 2 deletions
@@ -2,11 +2,11 @@ name: Check PR title

on:
  pull_request_target:
-    types: [ opened, edited, synchronize ]
+    types: [opened, edited, synchronize]

jobs:
  check_pr_title:
-    name: 'Check PR title'
+    name: Check PR title
    runs-on: ubuntu-latest
    steps:
      - uses: amannn/[email protected]

.github/workflows/docs.yaml

Lines changed: 63 additions & 0 deletions
@@ -0,0 +1,63 @@
+name: docs
+
+on:
+  push:
+    branches:
+      - master
+  workflow_dispatch:
+
+jobs:
+  build:
+    environment:
+      name: github-pages
+    permissions:
+      contents: write
+      pages: write
+      id-token: write
+    runs-on: ubuntu-latest
+
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Use Node.js 20
+        uses: actions/setup-node@v4
+        with:
+          node-version: 20
+
+      - name: Enable corepack
+        run: |
+          corepack enable
+          corepack prepare yarn@stable --activate
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: 3.12
+
+      - name: Install Python dependencies
+        run: make install-dev
+
+      - name: Build generated API reference
+        run: make build-api-reference
+
+      - name: Build & deploy docs
+        run: |
+          # go to website dir
+          cd website
+          # install website deps
+          yarn
+          # build the docs
+          yarn build
+        env:
+          APIFY_SIGNING_TOKEN: ${{ secrets.APIFY_SIGNING_TOKEN }}
+
+      - name: Set up GitHub Pages
+        uses: actions/configure-pages@v5
+
+      - name: Upload GitHub Pages artifact
+        uses: actions/upload-pages-artifact@v3
+        with:
+          path: ./website/build
+
+      - name: Deploy artifact to GitHub Pages
+        uses: actions/deploy-pages@v4

.github/workflows/docs.yml

Lines changed: 0 additions & 63 deletions
This file was deleted.

.github/workflows/run_release.yaml

Lines changed: 2 additions & 2 deletions
@@ -23,7 +23,7 @@ on:
        description: The custom version to bump to (only for "custom" type)
        required: false
        type: string
-        default: ''
+        default: ""

jobs:
  # This job determines if the conditions are met for a release to occur. It will proceed if triggered manually,
@@ -110,7 +110,7 @@ jobs:
      with:
        author_name: Apify Release Bot
        author_email: [email protected]
-        message: 'chore(release): Update changelog and package version [skip ci]'
+        message: "chore(release): Update changelog and package version [skip ci]"

  create_github_release:
    name: Create github release

src/crawlee/_request.py

Lines changed: 5 additions & 5 deletions
@@ -37,14 +37,14 @@ class BaseRequestData(BaseModel):
    """URL of the web page to crawl"""

    unique_key: Annotated[str, Field(alias='uniqueKey')]
-    """A unique key identifying the request. Two requests with the same `uniqueKey` are considered as pointing to the
-    same URL.
+    """A unique key identifying the request. Two requests with the same `unique_key` are considered as pointing
+    to the same URL.

-    If `uniqueKey` is not provided, then it is automatically generated by normalizing the URL.
-    For example, the URL of `HTTP://www.EXAMPLE.com/something/` will produce the `uniqueKey`
+    If `unique_key` is not provided, then it is automatically generated by normalizing the URL.
+    For example, the URL of `HTTP://www.EXAMPLE.com/something/` will produce the `unique_key`
    of `http://www.example.com/something`.

-    Pass an arbitrary non-empty text value to the `uniqueKey` property
+    Pass an arbitrary non-empty text value to the `unique_key` property
    to override the default behavior and specify which URLs shall be considered equal.
    """

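For context, the normalization described in the new docstring can be sketched in a few lines. The `normalize_url` helper below is hypothetical and is not part of this commit or of Crawlee's actual `unique_key` logic; it only illustrates the documented example of lowercasing the scheme and host and dropping a trailing slash:

```python
# Hypothetical illustration only - Crawlee computes `unique_key` internally;
# this sketch merely reproduces the transformation shown in the docstring.
from urllib.parse import urlsplit, urlunsplit


def normalize_url(url: str) -> str:
    """Lowercase the scheme and host and strip a trailing slash from the path."""
    parts = urlsplit(url.strip())
    return urlunsplit(
        (parts.scheme.lower(), parts.netloc.lower(), parts.path.rstrip('/'), parts.query, parts.fragment)
    )


print(normalize_url('HTTP://www.EXAMPLE.com/something/'))  # http://www.example.com/something
```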

src/crawlee/storages/_dataset.py

Lines changed: 21 additions & 11 deletions
@@ -75,22 +75,32 @@ class ExportToKwargs(TypedDict):


class Dataset(BaseStorage):
-    """Represents an append-only structured storage, ideal for tabular data akin to database tables.
+    """Represents an append-only structured storage, ideal for tabular data similar to database tables.

-    Represents a structured data store similar to a table, where each object (row) has consistent attributes (columns).
-    Datasets operate on an append-only basis, allowing for the addition of new records without the modification or
-    removal of existing ones. This class is typically used for storing crawling results.
+    The `Dataset` class is designed to store structured data, where each entry (row) maintains consistent attributes
+    (columns) across the dataset. It operates in an append-only mode, allowing new records to be added, but not
+    modified or deleted. This makes it particularly useful for storing results from web crawling operations.

-    Data can be stored locally or in the cloud, with local storage paths formatted as:
-    `{CRAWLEE_STORAGE_DIR}/datasets/{DATASET_ID}/{INDEX}.json`. Here, `{DATASET_ID}` is either "default" or
-    a specific dataset ID, and `{INDEX}` represents the zero-based index of the item in the dataset.
+    Data can be stored either locally or in the cloud. It depends on the setup of underlying storage client.
+    By default a `MemoryStorageClient` is used, but it can be changed to a different one.

-    To open a dataset, use the `open` class method with an `id`, `name`, or `config`. If unspecified, the default
-    dataset for the current crawler run is used. Opening a non-existent dataset by `id` raises an error, while
-    by `name`, it is created.
+    By default, data is stored using the following path structure:
+    ```
+    {CRAWLEE_STORAGE_DIR}/datasets/{DATASET_ID}/{INDEX}.json
+    ```
+    - `{CRAWLEE_STORAGE_DIR}`: The root directory for all storage data specified by the environment variable.
+    - `{DATASET_ID}`: Specifies the dataset, either "default" or a custom dataset ID.
+    - `{INDEX}`: Represents the zero-based index of the record within the dataset.
+
+    To open a dataset, use the `open` class method by specifying an `id`, `name`, or `configuration`. If none are
+    provided, the default dataset for the current crawler run is used. Attempting to open a dataset by `id` that does
+    not exist will raise an error; however, if accessed by `name`, the dataset will be created if it doesn't already
+    exist.

    Usage:
-        dataset = await Dataset.open(id='my_dataset_id')
+    ```python
+    dataset = await Dataset.open(name='my_dataset')
+    ```
    """

    _MAX_PAYLOAD_SIZE = ByteSize.from_mb(9)
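For context, a slightly fuller usage sketch of the append-only interface described by the new docstring might look as follows. This is not part of the commit; the record fields are made up, and it assumes `Dataset` is importable from `crawlee.storages` with `push_data` and `get_data` available, as in the current public interface:

```python
import asyncio

from crawlee.storages import Dataset


async def main() -> None:
    # Opening by name creates the dataset if it does not already exist.
    dataset = await Dataset.open(name='my_dataset')

    # Records can only be appended; existing items are never modified or removed.
    await dataset.push_data({'url': 'https://example.com', 'title': 'Example Domain'})
    await dataset.push_data([{'url': 'https://example.com/a'}, {'url': 'https://example.com/b'}])

    # Read the stored items back.
    data = await dataset.get_data()
    print(data.items)


asyncio.run(main())
```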

src/crawlee/storages/_key_value_store.py

Lines changed: 26 additions & 16 deletions
@@ -15,24 +15,34 @@


class KeyValueStore(BaseStorage):
-    """Represents a key-value based storage for reading data records or files.
-
-    Each record is identified by a unique key and associated with a MIME content type. This class is used within
-    crawler runs to store inputs and outputs, typically in JSON format, but supports other types as well.
-
-    The data can be stored on a local filesystem or in the cloud, determined by the `CRAWLEE_STORAGE_DIR`
-    environment variable.
-
-    By default, data is stored in `{CRAWLEE_STORAGE_DIR}/key_value_stores/{STORE_ID}/{INDEX}.{EXT}`, where
-    `{STORE_ID}` is either "default" or specified by `CRAWLEE_DEFAULT_KEY_VALUE_STORE_ID`, `{KEY}` is the record key,
-    and `{EXT}` is the MIME type.
-
-    To open a key-value store, use the class method `open`, providing either an `id` or `name` along with optional
-    `config`. If neither is provided, the default store for the crawler run is used. Opening a non-existent store by
-    `id` raises an error, while a non-existent store by `name` is created.
+    """Represents a key-value based storage for reading and writing data records or files.
+
+    Each data record is identified by a unique key and associated with a specific MIME content type. This class is
+    commonly used in crawler runs to store inputs and outputs, typically in JSON format, but it also supports other
+    content types.
+
+    Data can be stored either locally or in the cloud. It depends on the setup of underlying storage client.
+    By default a `MemoryStorageClient` is used, but it can be changed to a different one.
+
+    By default, data is stored using the following path structure:
+    ```
+    {CRAWLEE_STORAGE_DIR}/key_value_stores/{STORE_ID}/{KEY}.{EXT}
+    ```
+    - `{CRAWLEE_STORAGE_DIR}`: The root directory for all storage data specified by the environment variable.
+    - `{STORE_ID}`: The identifier for the key-value store, either "default" or as specified by
+      `CRAWLEE_DEFAULT_KEY_VALUE_STORE_ID`.
+    - `{KEY}`: The unique key for the record.
+    - `{EXT}`: The file extension corresponding to the MIME type of the content.
+
+    To open a key-value store, use the `open` class method, providing an `id`, `name`, or optional `configuration`.
+    If none are specified, the default store for the current crawler run is used. Attempting to open a store by `id`
+    that does not exist will raise an error; however, if accessed by `name`, the store will be created if it does not
+    already exist.

    Usage:
-        kvs = await KeyValueStore.open(id='my_kvs_id')
+    ```python
+    kvs = await KeyValueStore.open(name='my_kvs')
+    ```
    """

    def __init__(
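Similarly, a short usage sketch of the key-value interface described above might look like this. It is not part of the commit; it assumes `KeyValueStore` is importable from `crawlee.storages` with `set_value` and `get_value` available:

```python
import asyncio

from crawlee.storages import KeyValueStore


async def main() -> None:
    # Opening by name creates the store if it does not already exist.
    kvs = await KeyValueStore.open(name='my_kvs')

    # Values are serialized according to their content type (JSON by default).
    await kvs.set_value('some-key', {'foo': 'bar'})

    # Read the record back; a missing key yields None.
    value = await kvs.get_value('some-key')
    print(value)


asyncio.run(main())
```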
