diff --git a/dingo/exec/local.py b/dingo/exec/local.py
index 4e3b28e1..c996afb1 100644
--- a/dingo/exec/local.py
+++ b/dingo/exec/local.py
@@ -94,7 +94,7 @@ def evaluate(self):
         with concurrent.futures.ThreadPoolExecutor(max_workers=self.input_args.max_workers) as thread_executor, \
                 concurrent.futures.ProcessPoolExecutor(max_workers=self.input_args.max_workers) as process_executor:
             data_iter = self.load_data()
-            data_iter = itertools.islice(data_iter, self.input_args.start_index, None)
+            data_iter = itertools.islice(data_iter, self.input_args.start_index, self.input_args.end_index if self.input_args.end_index >= 0 else None)
             pbar = tqdm(total=None, unit='items')

             def process_batch(batch: List):
diff --git a/dingo/io/input/InputArgs.py b/dingo/io/input/InputArgs.py
index 07d7a778..2d5d2c06 100644
--- a/dingo/io/input/InputArgs.py
+++ b/dingo/io/input/InputArgs.py
@@ -23,6 +23,7 @@ class InputArgs(BaseModel):

     # Resume settings
     start_index: int = 0
+    end_index: int = -1
     interval_size: int = 1000

     # Concurrent settings
@@ -85,6 +86,9 @@ def check_args(self):
         if self.start_index < 0:
             raise ValueError("start_index must be non negative.")

+        if self.end_index >= 0 and self.end_index < self.start_index:
+            raise ValueError("if end_index is non-negative, it must be greater than or equal to start_index.")
+
         # check interval size
         if self.interval_size <= 0:
             raise ValueError("interval_size must be positive.")
diff --git a/dingo/run/cli.py b/dingo/run/cli.py
index ec27709d..cad2059c 100644
--- a/dingo/run/cli.py
+++ b/dingo/run/cli.py
@@ -28,6 +28,8 @@ def parse_args():
                         default=None, help="Save raw data in output path")
     parser.add_argument("--start_index", type=int,
                         default=None, help="The number of data start to check.")
+    parser.add_argument("--end_index", type=int,
+                        default=None, help="The index of data at which to stop checking (exclusive); negative means check to the end.")
     parser.add_argument("--interval_size", type=int,
                         default=None, help="The number of size to save while checking.")
     parser.add_argument("--max_workers", type=int,
@@ -108,6 +110,8 @@ def parse_args():
         input_data['save_raw'] = args.save_raw
     if args.start_index:
         input_data['start_index'] = args.start_index
+    if args.end_index is not None:
+        input_data['end_index'] = args.end_index
     if args.interval_size:
         input_data['interval_size'] = args.interval_size
     if args.max_workers:
diff --git a/docs/config.md b/docs/config.md
index ac1daf8b..90bb287e 100644
--- a/docs/config.md
+++ b/docs/config.md
@@ -6,57 +6,59 @@

 Parameters used when starting the project from the command line; essentially they are used to instantiate the `InputArgs` class:

-| Parameter | Type | Default | Required | Description |
-|---------------------------|------|:--------------------------------:|:--------:|---------------------------------------------------------------------------------------|
-| --task_name / -n | str | "dingo" | No | task name. |
-| --eval_group / -e | str | "" | Yes | Eval models, can be specified multiple times like '-e default' or '-e pretrain' |
-| --input_path / -i | str | "test/data/test_local_json.json" | Yes | file or directory path to check. |
-| --output_path | str | "outputs/" | No | output path of result. |
-| --save_data | bool | False | No | whether save results into files. |
-| --save_correct | bool | False | No | whether save correct data. |
-| --save_raw | bool | False | No | whether save raw data. |
-| --start_index | int | 0 | No | the number of data start to check. |
-| --interval_size | int | 1000 | No | the number of size to save while checking. |
-| --max_workers | int | 1 | No | the number of max workers to concurrent check. |
-| --batch_size | int | 1 | No | the number of max data for concurrent check. |
-| --dataset | str | "hugging_face" | Yes | dataset type, in ['hugging_face', 'local'] |
-| --data_format | str | "json" | Yes | data format, such as: ['json', 'jsonl', 'plaintext', 'listjson']. |
-| --huggingface_split | str | "" | No | Huggingface split, default is 'train' |
-| --huggingface_config_name | str | None | No | Huggingface config name |
-| --column_id | str | "" | Depends | Column name of id in the input file. If exists multiple levels, use '.' separate |
-| --column_prompt | str | "" | Depends | Column name of prompt in the input file. If exists multiple levels, use '.' separate |
-| --column_content | str | "" | Yes | Column name of content in the input file. If exists multiple levels, use '.' separate |
-| --column_image | str | "" | Depends | Column name of image in the input file. If exists multiple levels, use '.' separate |
-| --custom_config | str | None | Depends | Custom config file path |
-| --log_level | str | "WARNING" | No | printing level of logs, in ['DEBUG', 'INFO', 'WARNING', 'ERROR'] |
+| Parameter | Type | Default | Required | Description |
+|---------------------------|------|:--------------------------------:|:--------:|----------------------------------------------------------------------------------------------|
+| --task_name / -n | str | "dingo" | No | task name. |
+| --eval_group / -e | str | "" | Yes | Eval models, can be specified multiple times like '-e default' or '-e pretrain' |
+| --input_path / -i | str | "test/data/test_local_json.json" | Yes | file or directory path to check. |
+| --output_path | str | "outputs/" | No | output path of result. |
+| --save_data | bool | False | No | whether save results into files. |
+| --save_correct | bool | False | No | whether save correct data. |
+| --save_raw | bool | False | No | whether save raw data. |
+| --start_index | int | 0 | No | the number of data start to check. |
+| --end_index | int | -1 | No | the index of data at which to stop checking (exclusive); if negative, all data from start_index to the end is included. |
+| --interval_size | int | 1000 | No | the number of size to save while checking. |
+| --max_workers | int | 1 | No | the number of max workers to concurrent check. |
+| --batch_size | int | 1 | No | the number of max data for concurrent check. |
+| --dataset | str | "hugging_face" | Yes | dataset type, in ['hugging_face', 'local'] |
+| --data_format | str | "json" | Yes | data format, such as: ['json', 'jsonl', 'plaintext', 'listjson']. |
+| --huggingface_split | str | "" | No | Huggingface split, default is 'train' |
+| --huggingface_config_name | str | None | No | Huggingface config name |
+| --column_id | str | "" | Depends | Column name of id in the input file. If exists multiple levels, use '.' separate |
+| --column_prompt | str | "" | Depends | Column name of prompt in the input file. If exists multiple levels, use '.' separate |
+| --column_content | str | "" | Yes | Column name of content in the input file. If exists multiple levels, use '.' separate |
+| --column_image | str | "" | Depends | Column name of image in the input file. If exists multiple levels, use '.' separate |
+| --custom_config | str | None | Depends | Custom config file path |
+| --log_level | str | "WARNING" | No | printing level of logs, in ['DEBUG', 'INFO', 'WARNING', 'ERROR'] |

 ## SDK Config

 Parameters used when starting the project via the SDK, i.e. the fields of the `InputArgs` class:

-| Parameter | Type | Default | Required | Description |
-|-------------------------|-----------------------|:--------------------------------:|:--------:|---------------------------------------------------------------------------------------|
-| task_name | str | "dingo" | No | task name . |
-| eval_group | str | "" | Yes | eval model. |
-| input_path | str | "test/data/test_local_json.json" | Yes | file or directory path to check. |
-| output_path | str | "outputs/" | No | output path of result. |
-| save_data | bool | False | No | whether save results into files. |
-| save_correct | bool | False | No | whether save correct data. |
-| save_raw | bool | False | No | whether save raw data. |
-| start_index | int | 0 | No | the number of data start to check. |
-| interval_size | int | 1000 | No | the number of size to save while checking. |
-| max_workers | int | 1 | No | the number of max workers to concurrent check. |
-| batch_size | int | 1 | No | the number of max data for concurrent check. |
-| dataset | str | "hugging_face" | Yes | dataset type, in ['hugging_face', 'local'] |
-| data_format | str | "json" | Yes | data format, such as: ['json', 'jsonl', 'plaintext', 'listjson']. |
-| huggingface_split | str | "" | No | Huggingface split |
-| huggingface_config_name | Optional[str] | None | No | Huggingface config name |
-| column_id | str | "" | Depends | Column name of id in the input file. If exists multiple levels, use '.' separate |
-| column_prompt | str | "" | Depends | Column name of prompt in the input file. If exists multiple levels, use '.' separate |
-| column_content | str | "" | Yes | Column name of content in the input file. If exists multiple levels, use '.' separate |
-| column_image | str | "" | Depends | Column name of image in the input file. If exists multiple levels, use '.' separate |
-| custom_config | Optional[str \| dict] | None | Depends | custom config, file path or dict |
-| log_level | str | "WARNING" | No | printing level of logs, in ['DEBUG', 'INFO', 'WARNING', 'ERROR'] |
+| Parameter | Type | Default | Required | Description |
+|-------------------------|-----------------------|:--------------------------------:|:--------:|----------------------------------------------------------------------------------------------|
+| task_name | str | "dingo" | No | task name . |
+| eval_group | str | "" | Yes | eval model. |
+| input_path | str | "test/data/test_local_json.json" | Yes | file or directory path to check. |
+| output_path | str | "outputs/" | No | output path of result. |
+| save_data | bool | False | No | whether save results into files. |
+| save_correct | bool | False | No | whether save correct data. |
+| save_raw | bool | False | No | whether save raw data. |
+| start_index | int | 0 | No | the number of data start to check. |
+| end_index | int | -1 | No | the index of data at which to stop checking (exclusive); if negative, all data from start_index to the end is included. |
+| interval_size | int | 1000 | No | the number of size to save while checking. |
+| max_workers | int | 1 | No | the number of max workers to concurrent check. |
+| batch_size | int | 1 | No | the number of max data for concurrent check. |
+| dataset | str | "hugging_face" | Yes | dataset type, in ['hugging_face', 'local'] |
+| data_format | str | "json" | Yes | data format, such as: ['json', 'jsonl', 'plaintext', 'listjson']. |
+| huggingface_split | str | "" | No | Huggingface split |
+| huggingface_config_name | Optional[str] | None | No | Huggingface config name |
+| column_id | str | "" | Depends | Column name of id in the input file. If exists multiple levels, use '.' separate |
+| column_prompt | str | "" | Depends | Column name of prompt in the input file. If exists multiple levels, use '.' separate |
+| column_content | str | "" | Yes | Column name of content in the input file. If exists multiple levels, use '.' separate |
+| column_image | str | "" | Depends | Column name of image in the input file. If exists multiple levels, use '.' separate |
+| custom_config | Optional[str \| dict] | None | Depends | custom config, file path or dict |
+| log_level | str | "WARNING" | No | printing level of logs, in ['DEBUG', 'INFO', 'WARNING', 'ERROR'] |

 ## Custom Config
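
To make the semantics of the new parameter concrete, here is a minimal, self-contained sketch of the windowing logic this change introduces (the values are illustrative; the expression mirrors the patched `itertools.islice` call in `dingo/exec/local.py`): `start_index` is inclusive, `end_index` is exclusive, and a negative `end_index` means the data is read through to the end.

```python
import itertools

# Illustrative values for the two InputArgs fields used by the new logic.
start_index = 2
end_index = 5   # set to -1 to check everything after start_index

# Stand-in for the lazily loaded data iterator (self.load_data() in dingo).
data_iter = iter(range(10))

# Same windowing expression as the patched line in dingo/exec/local.py:
data_iter = itertools.islice(
    data_iter,
    start_index,
    end_index if end_index >= 0 else None,
)

print(list(data_iter))  # -> [2, 3, 4]
```

From the command line the same window is requested with `--start_index 2 --end_index 5`; the new `check_args` rule rejects a non-negative `end_index` smaller than `start_index` before evaluation begins.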