|
1 | | -read: |
2 | | - input_file: resources/input_examples/csv_demo.csv # input file path, supports csv, json, jsonl, txt, pdf. See resources/input_examples for examples |
3 | | -split: |
4 | | - chunk_size: 1024 # chunk size for text splitting |
5 | | - chunk_overlap: 100 # chunk overlap for text splitting |
6 | | -search: # web search configuration |
7 | | - enabled: false # whether to enable web search |
8 | | - search_types: ["google"] # search engine types, support: google, bing, uniprot, wikipedia |
9 | | -quiz_and_judge: # quiz and judge whether the LLM has mastered the knowledge points |
10 | | - enabled: false |
11 | | - quiz_samples: 2 # number of quiz samples to generate |
12 | | - re_judge: false # whether to re-judge the existing quiz samples |
13 | | -partition: # graph partition configuration |
14 | | - method: ece # ece is a custom partition method based on comprehension loss |
15 | | - method_params: |
16 | | - max_units_per_community: 3 # max nodes and edges per community, for multi-hop, we recommend setting it to 3 |
17 | | - min_units_per_community: 3 # min nodes and edges per community, for multi-hop, we recommend setting it to 3 |
18 | | - max_tokens_per_community: 10240 # max tokens per community |
19 | | - unit_sampling: random # unit sampling strategy, support: random, max_loss, min_loss |
20 | | -generate: |
21 | | - mode: multi_hop # atomic, aggregated, multi_hop, cot, vqa |
22 | | - data_format: ChatML # Alpaca, Sharegpt, ChatML |
| 1 | +pipeline: |
| 2 | + - name: read |
| 3 | + params: |
| 4 | + input_file: resources/input_examples/csv_demo.csv # input file path, supports csv, json, jsonl, txt, pdf. See resources/input_examples for examples |
| 5 | + |
| 6 | + - name: chunk |
| 7 | + params: |
| 8 | + chunk_size: 1024 # chunk size for text splitting |
| 9 | + chunk_overlap: 100 # chunk overlap for text splitting |
| 10 | + |
| 11 | + - name: build_kg |
| 12 | + |
| 13 | + - name: partition |
| 14 | + params: |
| 15 | + method: ece # ece is a custom partition method based on comprehension loss |
| 16 | + method_params: |
| 17 | + max_units_per_community: 3 # max nodes and edges per community, for multi-hop, we recommend setting it to 3 |
| 18 | + min_units_per_community: 3 # min nodes and edges per community, for multi-hop, we recommend setting it to 3 |
| 19 | + max_tokens_per_community: 10240 # max tokens per community |
| 20 | + unit_sampling: random # unit sampling strategy, support: random, max_loss, min_loss |
| 21 | + |
| 22 | + - name: generate |
| 23 | + params: |
| 24 | + method: multi_hop # atomic, aggregated, multi_hop, cot, vqa |
| 25 | + data_format: ChatML # Alpaca, Sharegpt, ChatML |
0 commit comments