Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion dingo/exec/local.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,7 @@ def evaluate(self):
with concurrent.futures.ThreadPoolExecutor(max_workers=self.input_args.max_workers) as thread_executor, \
concurrent.futures.ProcessPoolExecutor(max_workers=self.input_args.max_workers) as process_executor:
data_iter = self.load_data()
data_iter = itertools.islice(data_iter, self.input_args.start_index, None)
data_iter = itertools.islice(data_iter, self.input_args.start_index, self.input_args.end_index if self.input_args.end_index >= 0 else None )
pbar = tqdm(total=None, unit='items')

def process_batch(batch: List):
Expand Down
4 changes: 4 additions & 0 deletions dingo/io/input/InputArgs.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ class InputArgs(BaseModel):

# Resume settings
start_index: int = 0
end_index: int = -1
interval_size: int = 1000

# Concurrent settings
Expand Down Expand Up @@ -85,6 +86,9 @@ def check_args(self):
if self.start_index < 0:
raise ValueError("start_index must be non negative.")

if self.end_index >= 0 and self.end_index < self.start_index:
raise ValueError("if end_index is non negative, end_index must be greater than start_index")

# check interval size
if self.interval_size <= 0:
raise ValueError("interval_size must be positive.")
Expand Down
4 changes: 4 additions & 0 deletions dingo/run/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,8 @@ def parse_args():
default=None, help="Save raw data in output path")
parser.add_argument("--start_index", type=int,
default=None, help="The number of data start to check.")
parser.add_argument("--end_index", type=int,
default=None, help="The number of data end to check.")
parser.add_argument("--interval_size", type=int,
default=None, help="The number of size to save while checking.")
parser.add_argument("--max_workers", type=int,
Expand Down Expand Up @@ -108,6 +110,8 @@ def parse_args():
input_data['save_raw'] = args.save_raw
if args.start_index:
input_data['start_index'] = args.start_index
if args.end_index:
input_data['end_index'] = args.end_index
if args.interval_size:
input_data['interval_size'] = args.interval_size
if args.max_workers:
Expand Down
94 changes: 48 additions & 46 deletions docs/config.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,57 +6,59 @@

用户在命令行输入指令启动项目时会使用到的参数,本质是为了实例化`InputArgs`类:

| Parameter | Type | Default | Required | Description |
|---------------------------|------|:--------------------------------:|:--------:|---------------------------------------------------------------------------------------|
| --task_name / -n | str | "dingo" | No | task name. |
| --eval_group / -e | str | "" | Yes | Eval models, can be specified multiple times like '-e default' or '-e pretrain' |
| --input_path / -i | str | "test/data/test_local_json.json" | Yes | file or directory path to check. |
| --output_path | str | "outputs/" | No | output path of result. |
| --save_data | bool | False | No | whether save results into files. |
| --save_correct | bool | False | No | whether save correct data. |
| --save_raw | bool | False | No | whether save raw data. |
| --start_index | int | 0 | No | the number of data start to check. |
| --interval_size | int | 1000 | No | the number of size to save while checking. |
| --max_workers | int | 1 | No | the number of max workers to concurrent check. |
| --batch_size | int | 1 | No | the number of max data for concurrent check. |
| --dataset | str | "hugging_face" | Yes | dataset type, in ['hugging_face', 'local'] |
| --data_format | str | "json" | Yes | data format, such as: ['json', 'jsonl', 'plaintext', 'listjson']. |
| --huggingface_split | str | "" | No | Huggingface split, default is 'train' |
| --huggingface_config_name | str | None | No | Huggingface config name |
| --column_id | str | "" | Depends | Column name of id in the input file. If exists multiple levels, use '.' separate |
| --column_prompt | str | "" | Depends | Column name of prompt in the input file. If exists multiple levels, use '.' separate |
| --column_content | str | "" | Yes | Column name of content in the input file. If exists multiple levels, use '.' separate |
| --column_image | str | "" | Depends | Column name of image in the input file. If exists multiple levels, use '.' separate |
| --custom_config | str | None | Depends | Custom config file path |
| --log_level | str | "WARNING" | No | printing level of logs, in ['DEBUG', 'INFO', 'WARNING', 'ERROR'] |
| Parameter | Type | Default | Required | Description |
|---------------------------|------|:--------------------------------:|:--------:|----------------------------------------------------------------------------------------------|
| --task_name / -n | str | "dingo" | No | task name. |
| --eval_group / -e | str | "" | Yes | Eval models, can be specified multiple times like '-e default' or '-e pretrain' |
| --input_path / -i | str | "test/data/test_local_json.json" | Yes | file or directory path to check. |
| --output_path | str | "outputs/" | No | output path of result. |
| --save_data | bool | False | No | whether save results into files. |
| --save_correct | bool | False | No | whether save correct data. |
| --save_raw | bool | False | No | whether save raw data. |
| --start_index | int | 0 | No | the number of data start to check. |
| --end_index               | int  |                -1                |    No    | the number of data end to check. If it is negative, all data from start_index to the end is included. |
| --interval_size | int | 1000 | No | the number of size to save while checking. |
| --max_workers | int | 1 | No | the number of max workers to concurrent check. |
| --batch_size | int | 1 | No | the number of max data for concurrent check. |
| --dataset | str | "hugging_face" | Yes | dataset type, in ['hugging_face', 'local'] |
| --data_format | str | "json" | Yes | data format, such as: ['json', 'jsonl', 'plaintext', 'listjson']. |
| --huggingface_split | str | "" | No | Huggingface split, default is 'train' |
| --huggingface_config_name | str | None | No | Huggingface config name |
| --column_id | str | "" | Depends | Column name of id in the input file. If exists multiple levels, use '.' separate |
| --column_prompt | str | "" | Depends | Column name of prompt in the input file. If exists multiple levels, use '.' separate |
| --column_content | str | "" | Yes | Column name of content in the input file. If exists multiple levels, use '.' separate |
| --column_image | str | "" | Depends | Column name of image in the input file. If exists multiple levels, use '.' separate |
| --custom_config | str | None | Depends | Custom config file path |
| --log_level | str | "WARNING" | No | printing level of logs, in ['DEBUG', 'INFO', 'WARNING', 'ERROR'] |

## SDK Config

用户通过SDK方式启动项目时会使用到的参数,即`InputArgs`类:

| Parameter | Type | Default | Required | Description |
|-------------------------|-----------------------|:--------------------------------:|:--------:|---------------------------------------------------------------------------------------|
| task_name | str | "dingo" | No | task name . |
| eval_group | str | "" | Yes | eval model. |
| input_path | str | "test/data/test_local_json.json" | Yes | file or directory path to check. |
| output_path | str | "outputs/" | No | output path of result. |
| save_data | bool | False | No | whether save results into files. |
| save_correct | bool | False | No | whether save correct data. |
| save_raw | bool | False | No | whether save raw data. |
| start_index | int | 0 | No | the number of data start to check. |
| interval_size | int | 1000 | No | the number of size to save while checking. |
| max_workers | int | 1 | No | the number of max workers to concurrent check. |
| batch_size | int | 1 | No | the number of max data for concurrent check. |
| dataset | str | "hugging_face" | Yes | dataset type, in ['hugging_face', 'local'] |
| data_format | str | "json" | Yes | data format, such as: ['json', 'jsonl', 'plaintext', 'listjson']. |
| huggingface_split | str | "" | No | Huggingface split |
| huggingface_config_name | Optional[str] | None | No | Huggingface config name |
| column_id | str | "" | Depends | Column name of id in the input file. If exists multiple levels, use '.' separate |
| column_prompt | str | "" | Depends | Column name of prompt in the input file. If exists multiple levels, use '.' separate |
| column_content | str | "" | Yes | Column name of content in the input file. If exists multiple levels, use '.' separate |
| column_image | str | "" | Depends | Column name of image in the input file. If exists multiple levels, use '.' separate |
| custom_config | Optional[str \| dict] | None | Depends | custom config, file path or dict |
| log_level | str | "WARNING" | No | printing level of logs, in ['DEBUG', 'INFO', 'WARNING', 'ERROR'] |
| Parameter | Type | Default | Required | Description |
|-------------------------|-----------------------|:--------------------------------:|:--------:|----------------------------------------------------------------------------------------------|
| task_name | str | "dingo" | No | task name . |
| eval_group | str | "" | Yes | eval model. |
| input_path | str | "test/data/test_local_json.json" | Yes | file or directory path to check. |
| output_path | str | "outputs/" | No | output path of result. |
| save_data | bool | False | No | whether save results into files. |
| save_correct | bool | False | No | whether save correct data. |
| save_raw | bool | False | No | whether save raw data. |
| start_index | int | 0 | No | the number of data start to check. |
| end_index               | int                   |                -1                |    No    | the number of data end to check. If it is negative, all data from start_index to the end is included. |
| interval_size | int | 1000 | No | the number of size to save while checking. |
| max_workers | int | 1 | No | the number of max workers to concurrent check. |
| batch_size | int | 1 | No | the number of max data for concurrent check. |
| dataset | str | "hugging_face" | Yes | dataset type, in ['hugging_face', 'local'] |
| data_format | str | "json" | Yes | data format, such as: ['json', 'jsonl', 'plaintext', 'listjson']. |
| huggingface_split | str | "" | No | Huggingface split |
| huggingface_config_name | Optional[str] | None | No | Huggingface config name |
| column_id | str | "" | Depends | Column name of id in the input file. If exists multiple levels, use '.' separate |
| column_prompt | str | "" | Depends | Column name of prompt in the input file. If exists multiple levels, use '.' separate |
| column_content | str | "" | Yes | Column name of content in the input file. If exists multiple levels, use '.' separate |
| column_image | str | "" | Depends | Column name of image in the input file. If exists multiple levels, use '.' separate |
| custom_config | Optional[str \| dict] | None | Depends | custom config, file path or dict |
| log_level | str | "WARNING" | No | printing level of logs, in ['DEBUG', 'INFO', 'WARNING', 'ERROR'] |

## Custom Config

Expand Down