Skip to content

Commit 48d44c5

Browse files
Merge pull request #103 from open-sciencelab/feature/enable-complex-configs
feat: support complex configs
2 parents f8a3b9c + 676fb59 commit 48d44c5

File tree

11 files changed

+128
-66
lines changed

11 files changed

+128
-66
lines changed
Lines changed: 17 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,22 +1,30 @@
11
pipeline:
2-
- name: read
2+
- name: read_step # step name is unique in the pipeline, and can be referenced by other steps
3+
op_key: read
34
params:
45
input_file: resources/input_examples/jsonl_demo.jsonl # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples
56

6-
- name: chunk
7+
- name: chunk_step
8+
op_key: chunk
9+
deps: [read_step] # chunk_step depends on read_step
710
params:
811
chunk_size: 1024 # chunk size for text splitting
912
chunk_overlap: 100 # chunk overlap for text splitting
1013

11-
- name: build_kg
14+
- name: build_kg_step
15+
op_key: build_kg
16+
deps: [chunk_step] # build_kg_step depends on chunk_step
1217

13-
- name: quiz_and_judge
18+
- name: quiz_and_judge_step
19+
op_key: quiz_and_judge
20+
deps: [build_kg_step] # quiz_and_judge_step depends on build_kg_step
1421
params:
1522
quiz_samples: 2 # number of quiz samples to generate
1623
re_judge: false # whether to re-judge the existing quiz samples
1724

18-
- name: partition
19-
deps: [quiz_and_judge] # ece depends on quiz_and_judge steps
25+
- name: partition_step
26+
op_key: partition
27+
deps: [quiz_and_judge_step] # partition_step depends on quiz_and_judge_step
2028
params:
2129
method: ece # ece is a custom partition method based on comprehension loss
2230
method_params:
@@ -25,7 +33,9 @@ pipeline:
2533
max_tokens_per_community: 10240 # max tokens per community
2634
unit_sampling: max_loss # unit sampling strategy, support: random, max_loss, min_loss
2735

28-
- name: generate
36+
- name: generate_step
37+
op_key: generate
38+
deps: [partition_step] # generate_step depends on partition_step
2939
params:
3040
method: aggregated # atomic, aggregated, multi_hop, cot, vqa
3141
data_format: ChatML # Alpaca, Sharegpt, ChatML
Lines changed: 15 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,21 +1,31 @@
11
pipeline:
2-
- name: read
2+
- name: read_step
3+
op_key: read
34
params:
45
input_file: resources/input_examples/json_demo.json # input file path, support json, jsonl, txt, csv, pdf. See resources/input_examples for examples
56

6-
- name: chunk
7+
- name: chunk_step
8+
op_key: chunk
9+
deps: [read_step] # chunk_step depends on read_step
710
params:
811
chunk_size: 1024 # chunk size for text splitting
912
chunk_overlap: 100 # chunk overlap for text splitting
1013

11-
- name: build_kg
14+
- name: build_kg_step
15+
op_key: build_kg
16+
deps: [chunk_step] # build_kg_step depends on chunk_step
1217

13-
- name: partition
18+
- name: partition_step
19+
op_key: partition
20+
deps: [build_kg_step] # partition_step depends on build_kg_step
1421
params:
1522
method: dfs # partition method, support: dfs, bfs, ece, leiden
1623
method_params:
1724
max_units_per_community: 1 # atomic partition, one node or edge per community
18-
- name: generate
25+
26+
- name: generate_step
27+
op_key: generate
28+
deps: [partition_step] # generate_step depends on partition_step
1929
params:
2030
method: atomic # atomic, aggregated, multi_hop, cot, vqa
2131
data_format: Alpaca # Alpaca, Sharegpt, ChatML

graphgen/configs/cot_config.yaml

Lines changed: 14 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,24 +1,33 @@
11
pipeline:
2-
- name: read
2+
- name: read_step
3+
op_key: read
34
params:
45
input_file: resources/input_examples/txt_demo.txt # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples
56

6-
- name: chunk
7+
- name: chunk_step
8+
op_key: chunk
9+
deps: [read_step] # chunk_step depends on read_step
710
params:
811
chunk_size: 1024 # chunk size for text splitting
912
chunk_overlap: 100 # chunk overlap for text splitting
1013

11-
- name: build_kg
14+
- name: build_kg_step
15+
op_key: build_kg
16+
deps: [chunk_step] # build_kg_step depends on chunk_step
1217

13-
- name: partition
18+
- name: partition_step
19+
op_key: partition
20+
deps: [build_kg_step] # partition_step depends on build_kg_step
1421
params:
1522
method: leiden # leiden is a community detection algorithm
1623
method_params:
1724
max_size: 20 # Maximum size of communities
1825
use_lcc: false # whether to use the largest connected component
1926
random_seed: 42 # random seed for partitioning
2027

21-
- name: generate
28+
- name: generate_step
29+
op_key: generate
30+
deps: [partition_step] # generate_step depends on partition_step
2231
params:
2332
method: cot # atomic, aggregated, multi_hop, cot, vqa
2433
data_format: Sharegpt # Alpaca, Sharegpt, ChatML
Lines changed: 14 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,23 @@
11
pipeline:
2-
- name: read
2+
- name: read_step
3+
op_key: read
34
params:
45
input_file: resources/input_examples/csv_demo.csv # input file path, support json, jsonl, txt, csv, pdf. See resources/input_examples for examples
56

6-
- name: chunk
7+
- name: chunk_step
8+
op_key: chunk
9+
deps: [read_step] # chunk_step depends on read_step
710
params:
811
chunk_size: 1024 # chunk size for text splitting
912
chunk_overlap: 100 # chunk overlap for text splitting
1013

11-
- name: build_kg
14+
- name: build_kg_step
15+
op_key: build_kg
16+
deps: [chunk_step] # build_kg_step depends on chunk_step
1217

13-
- name: partition
18+
- name: partition_step
19+
op_key: partition
20+
deps: [build_kg_step] # partition_step depends on build_kg_step
1421
params:
1522
method: ece # ece is a custom partition method based on comprehension loss
1623
method_params:
@@ -19,7 +26,9 @@ pipeline:
1926
max_tokens_per_community: 10240 # max tokens per community
2027
unit_sampling: random # unit sampling strategy, support: random, max_loss, min_loss
2128

22-
- name: generate
29+
- name: generate_step
30+
op_key: generate
31+
deps: [partition_step] # generate_step depends on partition_step
2332
params:
2433
method: multi_hop # atomic, aggregated, multi_hop, cot, vqa
2534
data_format: ChatML # Alpaca, Sharegpt, ChatML
Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,20 @@
11
pipeline:
2-
- name: read
2+
- name: read_step
3+
op_key: read
34
params:
45
input_file: resources/input_examples/extract_demo.txt # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples
56

6-
- name: chunk
7+
- name: chunk_step
8+
op_key: chunk
9+
deps: [read_step] # chunk_step depends on read_step
710
params:
811
chunk_size: 20480
912
chunk_overlap: 2000
1013
separators: []
1114

12-
- name: extract
15+
- name: extract_step
16+
op_key: extract
17+
deps: [chunk_step] # extract_step depends on chunk_step
1318
params:
1419
method: schema_guided # extraction method, support: schema_guided
1520
schema_file: graphgen/templates/extraction/schemas/legal_contract.json # schema file path for schema_guided method

graphgen/configs/search_config.yaml

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,12 @@
11
pipeline:
2-
- name: read
2+
- name: read_step
3+
op_key: read
34
params:
45
input_file: resources/input_examples/search_demo.jsonl # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples
56

6-
- name: search
7+
- name: search_step
8+
op_key: search
9+
deps: [read_step] # search_step depends on read_step
710
params:
811
data_sources: [uniprot] # data source for searcher, support: wikipedia, google, uniprot
912
uniprot_params:

graphgen/configs/vqa_config.yaml

Lines changed: 14 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,23 +1,32 @@
11
pipeline:
2-
- name: read
2+
- name: read_step
3+
op_key: read
34
params:
45
input_file: resources/input_examples/vqa_demo.json # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples
56

6-
- name: chunk
7+
- name: chunk_step
8+
op_key: chunk
9+
deps: [read_step] # chunk_step depends on read_step
710
params:
811
chunk_size: 1024 # chunk size for text splitting
912
chunk_overlap: 100 # chunk overlap for text splitting
1013

11-
- name: build_kg
14+
- name: build_kg_step
15+
op_key: build_kg
16+
deps: [chunk_step] # build_kg_step depends on chunk_step
1217

13-
- name: partition
18+
- name: partition_step
19+
op_key: partition
20+
deps: [build_kg_step] # partition_step depends on build_kg_step
1421
params:
1522
method: anchor_bfs # partition method
1623
method_params:
1724
anchor_type: image # node type to select anchor nodes
1825
max_units_per_community: 10 # max units (nodes or edges) per community
1926

20-
- name: generate
27+
- name: generate_step
28+
op_key: generate
29+
deps: [partition_step] # generate_step depends on partition_step
2130
params:
2231
method: vqa # atomic, aggregated, multi_hop, cot, vqa
2332
data_format: ChatML # Alpaca, Sharegpt, ChatML

graphgen/engine.py

Lines changed: 28 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,6 @@
44

55
import threading
66
import traceback
7-
from functools import wraps
87
from typing import Any, Callable, List
98

109

@@ -27,25 +26,12 @@ def __init__(
2726
self.name, self.deps, self.func = name, deps, func
2827

2928

30-
def op(name: str, deps=None):
31-
deps = deps or []
32-
33-
def decorator(func):
34-
@wraps(func)
35-
def _wrapper(*args, **kwargs):
36-
return func(*args, **kwargs)
37-
38-
_wrapper.op_node = OpNode(name, deps, lambda self, ctx: func(self, **ctx))
39-
return _wrapper
40-
41-
return decorator
42-
43-
4429
class Engine:
4530
def __init__(self, max_workers: int = 4):
4631
self.max_workers = max_workers
4732

4833
def run(self, ops: List[OpNode], ctx: Context):
34+
self._validate(ops)
4935
name2op = {operation.name: operation for operation in ops}
5036

5137
# topological sort
@@ -81,7 +67,7 @@ def _exec(n: str):
8167
return
8268
try:
8369
name2op[n].func(name2op[n], ctx)
84-
except Exception: # pylint: disable=broad-except
70+
except Exception:
8571
exc[n] = traceback.format_exc()
8672
done[n].set()
8773

@@ -96,6 +82,20 @@ def _exec(n: str):
9682
+ "\n".join(f"---- {op} ----\n{tb}" for op, tb in exc.items())
9783
)
9884

85+
@staticmethod
86+
def _validate(ops: List[OpNode]):
87+
name_set = set()
88+
for op in ops:
89+
if op.name in name_set:
90+
raise ValueError(f"Duplicate operation name: {op.name}")
91+
name_set.add(op.name)
92+
for op in ops:
93+
for dep in op.deps:
94+
if dep not in name_set:
95+
raise ValueError(
96+
f"Operation {op.name} has unknown dependency: {dep}"
97+
)
98+
9999

100100
def collect_ops(config: dict, graph_gen) -> List[OpNode]:
101101
"""
@@ -106,16 +106,20 @@ def collect_ops(config: dict, graph_gen) -> List[OpNode]:
106106
ops: List[OpNode] = []
107107
for stage in config["pipeline"]:
108108
name = stage["name"]
109-
method = getattr(graph_gen, name)
110-
op_node = method.op_node
111-
112-
# if there are runtime dependencies, override them
113-
runtime_deps = stage.get("deps", op_node.deps)
114-
op_node.deps = runtime_deps
109+
method_name = stage.get("op_key")
110+
method = getattr(graph_gen, method_name)
111+
deps = stage.get("deps", [])
115112

116113
if "params" in stage:
117-
op_node.func = lambda self, ctx, m=method, sc=stage: m(sc.get("params", {}))
114+
115+
def func(self, ctx, _method=method, _params=stage.get("params", {})):
116+
return _method(_params)
117+
118118
else:
119-
op_node.func = lambda self, ctx, m=method: m()
119+
120+
def func(self, ctx, _method=method):
121+
return _method()
122+
123+
op_node = OpNode(name=name, deps=deps, func=func)
120124
ops.append(op_node)
121125
return ops

0 commit comments

Comments
 (0)