Skip to content

Commit 244deb4

Browse files
refactor: refactor op generate
1 parent ea1603b commit 244deb4

27 files changed

+182
-107
lines changed
Lines changed: 119 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,119 @@
1+
global_params:
2+
working_dir: cache
3+
4+
nodes:
5+
- id: read_files
6+
op_name: read
7+
type: source
8+
dependencies: []
9+
params:
10+
input_path:
11+
- examples/input_examples/jsonl_demo.jsonl
12+
13+
- id: chunk_documents
14+
op_name: chunk
15+
type: map_batch
16+
dependencies:
17+
- read_files
18+
execution_params:
19+
replicas: 4
20+
params:
21+
chunk_size: 1024
22+
chunk_overlap: 100
23+
24+
- id: build_kg
25+
op_name: build_kg
26+
type: map_batch
27+
dependencies:
28+
- chunk_documents
29+
execution_params:
30+
replicas: 1
31+
batch_size: 128
32+
33+
- id: quiz
34+
op_name: quiz
35+
type: aggregate
36+
dependencies:
37+
- build_kg
38+
execution_params:
39+
replicas: 1
40+
batch_size: 128
41+
params:
42+
quiz_samples: 2
43+
concurrency_limit: 200
44+
45+
- id: judge
46+
op_name: judge
47+
type: map_batch
48+
dependencies:
49+
- quiz
50+
execution_params:
51+
replicas: 1
52+
batch_size: 16
53+
54+
- id: partition
55+
op_name: partition
56+
type: aggregate
57+
dependencies:
58+
- judge
59+
params:
60+
method: ece
61+
method_params:
62+
max_units_per_community: 20
63+
min_units_per_community: 5
64+
max_tokens_per_community: 10240
65+
unit_sampling: max_loss
66+
67+
- id: generate
68+
op_name: generate
69+
type: map_batch
70+
dependencies:
71+
- partition
72+
execution_params:
73+
replicas: 1
74+
batch_size: 16
75+
params:
76+
method: aggregated
77+
data_format: ChatML
78+
79+
#pipeline:
80+
# - name: read_step # step name is unique in the pipeline, and can be referenced by other steps
81+
# op_key: read
82+
# params:
83+
# input_file: examples/input_examples/jsonl_demo.jsonl # input file path; supports json, jsonl, txt, pdf. See examples/input_examples for examples
84+
#
85+
# - name: chunk_step
86+
# op_key: chunk
87+
# deps: [read_step] # chunk_step depends on read_step
88+
# params:
89+
# chunk_size: 1024 # chunk size for text splitting
90+
# chunk_overlap: 100 # chunk overlap for text splitting
91+
#
92+
# - name: build_kg_step
93+
# op_key: build_kg
94+
# deps: [chunk_step] # build_kg_step depends on chunk_step
95+
#
96+
# - name: quiz_and_judge_step
97+
# op_key: quiz_and_judge
98+
# deps: [build_kg_step] # quiz_and_judge_step depends on build_kg_step
99+
# params:
100+
# quiz_samples: 2 # number of quiz samples to generate
101+
# re_judge: false # whether to re-judge the existing quiz samples
102+
#
103+
# - name: partition_step
104+
# op_key: partition
105+
# deps: [quiz_and_judge_step] # partition_step depends on quiz_and_judge_step
106+
# params:
107+
# method: ece # ece is a custom partition method based on comprehension loss
108+
# method_params:
109+
# max_units_per_community: 20 # max nodes and edges per community
110+
# min_units_per_community: 5 # min nodes and edges per community
111+
# max_tokens_per_community: 10240 # max tokens per community
112+
# unit_sampling: max_loss # unit sampling strategy; supported values: random, max_loss, min_loss
113+
#
114+
# - name: generate_step
115+
# op_key: generate
116+
# deps: [partition_step] # generate_step depends on partition_step
117+
# params:
118+
# method: aggregated # supported methods: atomic, aggregated, multi_hop, cot, vqa
119+
# data_format: ChatML # supported formats: Alpaca, Sharegpt, ChatML

scripts/generate/generate_aggregated.sh renamed to examples/generate/generate_aggregated_qa/generate_aggregated.sh

File renamed without changes.
File renamed without changes.

resources/input_examples/images/0f25783fdfa99042db274ba9f6b3064cf17c5435814edfbee42ae6b19aac37d2.jpg renamed to examples/input_examples/images/0f25783fdfa99042db274ba9f6b3064cf17c5435814edfbee42ae6b19aac37d2.jpg

File renamed without changes.

resources/input_examples/images/390516e39e77030092027ded523ee99e96ffa8b6df4476c9b12d7bb1dd20d635.jpg renamed to examples/input_examples/images/390516e39e77030092027ded523ee99e96ffa8b6df4476c9b12d7bb1dd20d635.jpg

File renamed without changes.

resources/input_examples/images/4abc534d1dea2b706e44aaac26fe2ae309fee014082db00bc2d87187a6bb5dca.jpg renamed to examples/input_examples/images/4abc534d1dea2b706e44aaac26fe2ae309fee014082db00bc2d87187a6bb5dca.jpg

File renamed without changes.

resources/input_examples/images/8fb93cfc0d6b0ebb3e5d5aaae237df02964c9c3da8d8e9567ea19240b14cc742.jpg renamed to examples/input_examples/images/8fb93cfc0d6b0ebb3e5d5aaae237df02964c9c3da8d8e9567ea19240b14cc742.jpg

File renamed without changes.

resources/input_examples/images/cc5b36e3c972b210d8b56d34fc7ffe56f793f287b3399345aea31cd20eed2824.jpg renamed to examples/input_examples/images/cc5b36e3c972b210d8b56d34fc7ffe56f793f287b3399345aea31cd20eed2824.jpg

File renamed without changes.

0 commit comments

Comments
 (0)