[ENHANCEMENT] expose dataset progress values within the python SDK (#5479)

frascuchon · pre-commit-ci[bot] · nataliaElv · web-flow · commit bafd92fa1650 · 2024-09-16T09:18:32.000+02:00
# Description  Closes #5476 **Type of change**  - Improvement (change adding some improvement to an existing functionality) - Documentation update **How Has This Been Tested**  **Checklist**  - I added relevant documentation - I followed the style guidelines of this project - I did a self-review of my code - I made corresponding changes to the documentation - I confirm My changes generate no new warnings - I have added tests that prove my fix is effective or that my feature works - I have added relevant notes to the CHANGELOG.md file (See https://keepachangelog.com/) --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Natalia Elvira <126158523+nataliaElv@users.noreply.github.com>
diff --git a/argilla/docs/how_to_guides/annotate.md b/argilla/docs/how_to_guides/annotate.md
@@ -125,6 +125,9 @@ You can track the progress of an annotation task in the progress bar shown in th
 
 You can also track your own progress in real time expanding the right-bottom panel inside the dataset page. There you can see the number of records for which you have `Pending`, `Draft`, `Submitted` and `Discarded` responses.
 
+!!! note
+    You can also explore the dataset progress from the SDK. See the [Track your team's progress](./distribution.md#track-your-teams-progress) section to learn more.
+
 ## Use search, filters, and sort
 
 The UI offers various features designed for data exploration and understanding. Combining these features with bulk labelling can save you and your team hours of time.
diff --git a/argilla/docs/how_to_guides/distribution.md b/argilla/docs/how_to_guides/distribution.md
@@ -77,4 +77,61 @@ dataset = client.datasets("my_dataset")
 dataset.settings.distribution.min_submitted = 4
 
 dataset.update()
-```
+```
+
+## Track your team's progress
+
+You can check the progress of the annotation task by using the `dataset.progress` method.
+This method will return the number of records that have the status `completed`, `pending`, and the
+total number of records in the dataset.
+
+```python
+import argilla as rg
+
+client = rg.Argilla(api_url="<api_url>", api_key="<api_key>")
+
+dataset = client.datasets("my_dataset")
+
+progress = dataset.progress()
+```
+```json
+{
+    "total": 100,
+    "completed": 10,
+    "pending": 90
+}
+```
+
+You can also include the per-user distribution in the progress by setting the `with_users_distribution` parameter to `True`.
+This will return the number of records that have the status `completed`, `pending`, and the total number of records in the dataset,
+as well as the number of completed submissions per user. You can visit the [Annotation Progress](../how_to_guides/annotate.md#annotation-progress) section for more information.
+
+```python
+import argilla as rg
+
+client = rg.Argilla(api_url="<api_url>", api_key="<api_key>")
+
+dataset = client.datasets("my_dataset")
+
+progress = dataset.progress(with_users_distribution=True)
+```
+```json
+{
+    "total": 100,
+    "completed": 50,
+    "pending": 50,
+    "users": {
+        "user1": {
+           "completed": { "submitted": 10, "draft": 5, "discarded": 5},
+           "pending": { "submitted": 5, "draft": 10, "discarded": 10}
+        },
+        "user2": {
+           "completed": { "submitted": 20, "draft": 10, "discarded": 5},
+           "pending": { "submitted": 2, "draft": 25, "discarded": 0}
+        }
+    }
+}
+```
+
+!!! note
+    Since the completed records can contain submissions from multiple users, the number of completed submissions per user may not match the total number of completed records.
diff --git a/argilla/src/argilla/_api/_datasets.py b/argilla/src/argilla/_api/_datasets.py
@@ -22,6 +22,8 @@
 
 __all__ = ["DatasetsAPI"]
 
+from argilla._models._dataset_progress import UserProgressModel, DatasetProgressModel
+
 
 class DatasetsAPI(ResourceAPI[DatasetModel]):
     """Manage datasets via the API"""
@@ -80,6 +82,24 @@ def exists(self, dataset_id: UUID) -> bool:
     # Utility methods #
     ####################
 
+    @api_error_handler
+    def get_progress(self, dataset_id: UUID) -> DatasetProgressModel:
+        """GET `<url_stub>/<dataset_id>/progress` and validate the JSON body as a progress model."""
+        reply = self.http_client.get(f"{self.url_stub}/{dataset_id}/progress")
+        reply.raise_for_status()
+        payload = reply.json()
+        self._log_message(message=f"Got progress for dataset {dataset_id}")
+        return DatasetProgressModel.model_validate(payload)
+
+    @api_error_handler
+    def list_users_progress(self, dataset_id: UUID) -> List[UserProgressModel]:
+        """GET `<url_stub>/<dataset_id>/users/progress` and validate each item under `users`."""
+        reply = self.http_client.get(f"{self.url_stub}/{dataset_id}/users/progress")
+        reply.raise_for_status()
+        payload = reply.json()
+        self._log_message(message=f"Got users progress for dataset {dataset_id}")
+        return list(map(UserProgressModel.model_validate, payload["users"]))
+
     @api_error_handler
     def publish(self, dataset_id: UUID) -> "DatasetModel":
         response = self.http_client.put(url=f"{self.url_stub}/{dataset_id}/publish")
diff --git a/argilla/src/argilla/_models/_dataset_progress.py b/argilla/src/argilla/_models/_dataset_progress.py
@@ -0,0 +1,39 @@
+# Copyright 2024-present, Argilla, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from pydantic import BaseModel
+
+
+class DatasetProgressModel(BaseModel):
+    """Record counters describing the progress of a dataset; each counter defaults to 0."""
+
+    total: int = 0  # total number of records in the dataset
+    completed: int = 0  # number of records with status `completed`
+    pending: int = 0  # number of records with status `pending`
+
+
+class RecordResponseDistributionModel(BaseModel):
+    """Response counters split by response status; each counter defaults to 0."""
+
+    submitted: int = 0  # count of `submitted` responses
+    draft: int = 0  # count of `draft` responses
+    discarded: int = 0  # count of `discarded` responses
+
+
+class UserProgressModel(BaseModel):
+    """Progress of one user: response distributions for `completed` and `pending` records."""
+
+    username: str  # required; used as the key of the `users` mapping built by `Dataset.progress`
+    completed: RecordResponseDistributionModel = RecordResponseDistributionModel()  # all-zero default
+    pending: RecordResponseDistributionModel = RecordResponseDistributionModel()  # all-zero default
diff --git a/argilla/src/argilla/datasets/_resource.py b/argilla/src/argilla/datasets/_resource.py
@@ -174,6 +174,53 @@ def update(self) -> "Dataset":
         self.settings.update()
         return self
 
+    def progress(self, with_users_distribution: bool = False) -> dict:
+        """Report the annotation progress of the team on this dataset.
+
+        Parameters:
+            with_users_distribution (bool): When True, a `users` entry is added to the result,
+                mapping each username to its `completed` and `pending` response counts.
+
+        Returns:
+            dict: The dataset progress counters, plus `users` when it was requested.
+
+        An example of a response when `with_users_distribution` is `True`:
+        ```json
+        {
+            "total": 100,
+            "completed": 50,
+            "pending": 50,
+            "users": {
+                "user1": {
+                    "completed": {"submitted": 10, "draft": 5, "discarded": 5},
+                    "pending": {"submitted": 5, "draft": 10, "discarded": 10}
+                },
+                "user2": {
+                    "completed": {"submitted": 20, "draft": 10, "discarded": 5},
+                    "pending": {"submitted": 2, "draft": 25, "discarded": 0}
+                }
+            }
+        }
+        ```
+        """
+
+        # Dataset-wide counters, dumped from the API model into a plain dict.
+        summary = self._api.get_progress(dataset_id=self._model.id).model_dump()
+        if not with_users_distribution:
+            return summary
+
+        # One entry per user, keyed by username.
+        per_user = {}
+        for entry in self._api.list_users_progress(dataset_id=self._model.id):
+            per_user[entry.username] = {
+                "completed": entry.completed.model_dump(),
+                "pending": entry.pending.model_dump(),
+            }
+
+        summary["users"] = per_user
+
+        return summary
+
     @classmethod
     def from_model(cls, model: DatasetModel, client: "Argilla") -> "Dataset":
         instance = cls(client=client, workspace=model.workspace_id, name=model.name)