Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion dingo/exec/local.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,7 @@ def evaluate(self):
with concurrent.futures.ThreadPoolExecutor(max_workers=self.input_args.max_workers) as thread_executor, \
concurrent.futures.ProcessPoolExecutor(max_workers=self.input_args.max_workers) as process_executor:
data_iter = self.load_data()
data_iter = itertools.islice(data_iter, self.input_args.start_index, None)
data_iter = itertools.islice(data_iter, self.input_args.start_index, self.input_args.end_index if self.input_args.end_index >= 0 else None )
pbar = tqdm(total=None, unit='items')

def process_batch(batch: List):
Expand Down
4 changes: 4 additions & 0 deletions dingo/io/input/InputArgs.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ class InputArgs(BaseModel):

# Resume settings
start_index: int = 0
end_index: int = -1
interval_size: int = 1000

# Concurrent settings
Expand Down Expand Up @@ -85,6 +86,9 @@ def check_args(self):
if self.start_index < 0:
raise ValueError("start_index must be non negative.")

if self.end_index >= 0 and self.end_index < self.start_index:
raise ValueError("if end_index is non negative, end_index must be greater than start_index")

# check interval size
if self.interval_size <= 0:
raise ValueError("interval_size must be positive.")
Expand Down
4 changes: 4 additions & 0 deletions dingo/run/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,8 @@ def parse_args():
default=None, help="Save raw data in output path")
parser.add_argument("--start_index", type=int,
default=None, help="The number of data start to check.")
parser.add_argument("--end_index", type=int,
default=None, help="The number of data end to check.")
parser.add_argument("--interval_size", type=int,
default=None, help="The number of size to save while checking.")
parser.add_argument("--max_workers", type=int,
Expand Down Expand Up @@ -108,6 +110,8 @@ def parse_args():
input_data['save_raw'] = args.save_raw
if args.start_index:
input_data['start_index'] = args.start_index
if args.end_index:
input_data['end_index'] = args.end_index
if args.interval_size:
input_data['interval_size'] = args.interval_size
if args.max_workers:
Expand Down
94 changes: 48 additions & 46 deletions docs/config.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,57 +6,59 @@

用户在命令行输入指令启动项目时会使用到的参数,本质是为了实例化`InputArgs`类:

| Parameter | Type | Default | Required | Description |
|---------------------------|------|:--------------------------------:|:--------:|---------------------------------------------------------------------------------------|
| --task_name / -n | str | "dingo" | No | task name. |
| --eval_group / -e | str | "" | Yes | Eval models, can be specified multiple times like '-e default' or '-e pretrain' |
| --input_path / -i | str | "test/data/test_local_json.json" | Yes | file or directory path to check. |
| --output_path | str | "outputs/" | No | output path of result. |
| --save_data | bool | False | No | whether save results into files. |
| --save_correct | bool | False | No | whether save correct data. |
| --save_raw | bool | False | No | whether save raw data. |
| --start_index | int | 0 | No | the number of data start to check. |
| --interval_size | int | 1000 | No | the number of size to save while checking. |
| --max_workers | int | 1 | No | the number of max workers to concurrent check. |
| --batch_size | int | 1 | No | the number of max data for concurrent check. |
| --dataset | str | "hugging_face" | Yes | dataset type, in ['hugging_face', 'local'] |
| --data_format | str | "json" | Yes | data format, such as: ['json', 'jsonl', 'plaintext', 'listjson']. |
| --huggingface_split | str | "" | No | Huggingface split, default is 'train' |
| --huggingface_config_name | str | None | No | Huggingface config name |
| --column_id | str | "" | Depends | Column name of id in the input file. If exists multiple levels, use '.' separate |
| --column_prompt | str | "" | Depends | Column name of prompt in the input file. If exists multiple levels, use '.' separate |
| --column_content | str | "" | Yes | Column name of content in the input file. If exists multiple levels, use '.' separate |
| --column_image | str | "" | Depends | Column name of image in the input file. If exists multiple levels, use '.' separate |
| --custom_config | str | None | Depends | Custom config file path |
| --log_level | str | "WARNING" | No | printing level of logs, in ['DEBUG', 'INFO', 'WARNING', 'ERROR'] |
| Parameter | Type | Default | Required | Description |
|---------------------------|------|:--------------------------------:|:--------:|----------------------------------------------------------------------------------------------|
| --task_name / -n | str | "dingo" | No | task name. |
| --eval_group / -e | str | "" | Yes | Eval models, can be specified multiple times like '-e default' or '-e pretrain' |
| --input_path / -i | str | "test/data/test_local_json.json" | Yes | file or directory path to check. |
| --output_path | str | "outputs/" | No | output path of result. |
| --save_data | bool | False | No | whether save results into files. |
| --save_correct | bool | False | No | whether save correct data. |
| --save_raw | bool | False | No | whether save raw data. |
| --start_index | int | 0 | No | the number of data start to check. |
| --end_index               | int  |                -1                |    No    | the number of data end to check. If it is negative, all data from start_index to the end is included. |
| --interval_size | int | 1000 | No | the number of size to save while checking. |
| --max_workers | int | 1 | No | the number of max workers to concurrent check. |
| --batch_size | int | 1 | No | the number of max data for concurrent check. |
| --dataset | str | "hugging_face" | Yes | dataset type, in ['hugging_face', 'local'] |
| --data_format | str | "json" | Yes | data format, such as: ['json', 'jsonl', 'plaintext', 'listjson']. |
| --huggingface_split | str | "" | No | Huggingface split, default is 'train' |
| --huggingface_config_name | str | None | No | Huggingface config name |
| --column_id | str | "" | Depends | Column name of id in the input file. If exists multiple levels, use '.' separate |
| --column_prompt | str | "" | Depends | Column name of prompt in the input file. If exists multiple levels, use '.' separate |
| --column_content | str | "" | Yes | Column name of content in the input file. If exists multiple levels, use '.' separate |
| --column_image | str | "" | Depends | Column name of image in the input file. If exists multiple levels, use '.' separate |
| --custom_config | str | None | Depends | Custom config file path |
| --log_level | str | "WARNING" | No | printing level of logs, in ['DEBUG', 'INFO', 'WARNING', 'ERROR'] |

## SDK Config

用户通过SDK方式启动项目时会使用到的参数,即`InputArgs`类:

| Parameter | Type | Default | Required | Description |
|-------------------------|-----------------------|:--------------------------------:|:--------:|---------------------------------------------------------------------------------------|
| task_name | str | "dingo" | No | task name . |
| eval_group | str | "" | Yes | eval model. |
| input_path | str | "test/data/test_local_json.json" | Yes | file or directory path to check. |
| output_path | str | "outputs/" | No | output path of result. |
| save_data | bool | False | No | whether save results into files. |
| save_correct | bool | False | No | whether save correct data. |
| save_raw | bool | False | No | whether save raw data. |
| start_index | int | 0 | No | the number of data start to check. |
| interval_size | int | 1000 | No | the number of size to save while checking. |
| max_workers | int | 1 | No | the number of max workers to concurrent check. |
| batch_size | int | 1 | No | the number of max data for concurrent check. |
| dataset | str | "hugging_face" | Yes | dataset type, in ['hugging_face', 'local'] |
| data_format | str | "json" | Yes | data format, such as: ['json', 'jsonl', 'plaintext', 'listjson']. |
| huggingface_split | str | "" | No | Huggingface split |
| huggingface_config_name | Optional[str] | None | No | Huggingface config name |
| column_id | str | "" | Depends | Column name of id in the input file. If exists multiple levels, use '.' separate |
| column_prompt | str | "" | Depends | Column name of prompt in the input file. If exists multiple levels, use '.' separate |
| column_content | str | "" | Yes | Column name of content in the input file. If exists multiple levels, use '.' separate |
| column_image | str | "" | Depends | Column name of image in the input file. If exists multiple levels, use '.' separate |
| custom_config | Optional[str \| dict] | None | Depends | custom config, file path or dict |
| log_level | str | "WARNING" | No | printing level of logs, in ['DEBUG', 'INFO', 'WARNING', 'ERROR'] |
| Parameter | Type | Default | Required | Description |
|-------------------------|-----------------------|:--------------------------------:|:--------:|----------------------------------------------------------------------------------------------|
| task_name | str | "dingo" | No | task name . |
| eval_group | str | "" | Yes | eval model. |
| input_path | str | "test/data/test_local_json.json" | Yes | file or directory path to check. |
| output_path | str | "outputs/" | No | output path of result. |
| save_data | bool | False | No | whether save results into files. |
| save_correct | bool | False | No | whether save correct data. |
| save_raw | bool | False | No | whether save raw data. |
| start_index | int | 0 | No | the number of data start to check. |
| end_index               | int                   |                -1                |    No    | the number of data end to check. If it is negative, all data from start_index to the end is included. |
| interval_size | int | 1000 | No | the number of size to save while checking. |
| max_workers | int | 1 | No | the number of max workers to concurrent check. |
| batch_size | int | 1 | No | the number of max data for concurrent check. |
| dataset | str | "hugging_face" | Yes | dataset type, in ['hugging_face', 'local'] |
| data_format | str | "json" | Yes | data format, such as: ['json', 'jsonl', 'plaintext', 'listjson']. |
| huggingface_split | str | "" | No | Huggingface split |
| huggingface_config_name | Optional[str] | None | No | Huggingface config name |
| column_id | str | "" | Depends | Column name of id in the input file. If exists multiple levels, use '.' separate |
| column_prompt | str | "" | Depends | Column name of prompt in the input file. If exists multiple levels, use '.' separate |
| column_content | str | "" | Yes | Column name of content in the input file. If exists multiple levels, use '.' separate |
| column_image | str | "" | Depends | Column name of image in the input file. If exists multiple levels, use '.' separate |
| custom_config | Optional[str \| dict] | None | Depends | custom config, file path or dict |
| log_level | str | "WARNING" | No | printing level of logs, in ['DEBUG', 'INFO', 'WARNING', 'ERROR'] |

## Custom Config

Expand Down