
Commit 9c6f678

feat: add more qa generation types (#159)
* feat: add multi choice qa generation
* feat: add multi answer qa generation
* feat: add fill-in-blank qa generation
* Update graphgen/bases/base_generator.py
* Update graphgen/models/generator/multi_answer_generator.py
* Update graphgen/models/generator/multi_answer_generator.py
* Update graphgen/models/generator/multi_choice_generator.py
* Update graphgen/templates/generation/multi_choice_generation.py
* Update graphgen/models/generator/multi_answer_generator.py
* fix: fix typo

---------

Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
1 parent 02dcafe commit 9c6f678

23 files changed (+954, −76 lines)
Lines changed: 3 additions & 0 deletions
@@ -0,0 +1,3 @@
# Generate Fill-in-blank QAs

Fill-in-blank question answering (QA) involves creating questions where a key piece of information is omitted, requiring the respondent to fill in the missing word or phrase. This format is commonly used in educational assessments to test knowledge and comprehension.
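
For illustration, here is what a single generated pair can look like in the Alpaca data format selected in the config below. The question and answer text here are hypothetical; actual pairs are produced by the LLM from your input documents.

# Hypothetical fill-in-blank pair in Alpaca format (instruction/input/output);
# real pairs are generated from knowledge-graph communities, not hard-coded.
example_pair = {
    "instruction": "The process by which green plants convert light energy "
    "into chemical energy is called ____.",
    "input": "",
    "output": "photosynthesis",
}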
examples/generate/generate_fill_in_blank_qa/fill_in_blank_config.yaml

Lines changed: 80 additions & 0 deletions
@@ -0,0 +1,80 @@
global_params:
  working_dir: cache
  graph_backend: kuzu # graph database backend, support: kuzu, networkx
  kv_backend: rocksdb # key-value store backend, support: rocksdb, json_kv

nodes:
  - id: read_files # id is unique in the pipeline, and can be referenced by other steps
    op_name: read
    type: source
    dependencies: []
    params:
      input_path:
        - examples/input_examples/jsonl_demo.jsonl # input file path, support json, jsonl, txt, pdf. See examples/input_examples for examples

  - id: chunk_documents
    op_name: chunk
    type: map_batch
    dependencies:
      - read_files
    execution_params:
      replicas: 4
    params:
      chunk_size: 1024 # chunk size for text splitting
      chunk_overlap: 100 # chunk overlap for text splitting

  - id: build_kg
    op_name: build_kg
    type: map_batch
    dependencies:
      - chunk_documents
    execution_params:
      replicas: 1
      batch_size: 128

  - id: quiz
    op_name: quiz
    type: map_batch
    dependencies:
      - build_kg
    execution_params:
      replicas: 1
      batch_size: 128
    params:
      quiz_samples: 2 # number of quiz samples to generate

  - id: judge
    op_name: judge
    type: map_batch
    dependencies:
      - quiz
    execution_params:
      replicas: 1
      batch_size: 128

  - id: partition
    op_name: partition
    type: aggregate
    dependencies:
      - judge
    params:
      method: ece # ece is a custom partition method based on comprehension loss
      method_params:
        max_units_per_community: 20 # max nodes and edges per community
        min_units_per_community: 5 # min nodes and edges per community
        max_tokens_per_community: 10240 # max tokens per community
        unit_sampling: max_loss # unit sampling strategy, support: random, max_loss, min_loss

  - id: generate
    op_name: generate
    type: map_batch
    dependencies:
      - partition
    execution_params:
      replicas: 1
      batch_size: 128
      save_output: true # save output
    params:
      method: fill_in_blank
      num_of_questions: 5
      data_format: Alpaca # Alpaca, Sharegpt, ChatML
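
The nodes above form a linear pipeline, read_files → chunk_documents → build_kg → quiz → judge → partition → generate, wired through their dependencies lists. As a minimal sketch (not GraphGen code), the execution order implied by those lists can be recovered with the standard library:

# Minimal sketch: derive the execution order from the "dependencies" entries
# in the config above; this mirrors, but is not, GraphGen's own scheduler.
from graphlib import TopologicalSorter

dependencies = {
    "read_files": [],
    "chunk_documents": ["read_files"],
    "build_kg": ["chunk_documents"],
    "quiz": ["build_kg"],
    "judge": ["quiz"],
    "partition": ["judge"],
    "generate": ["partition"],
}
print(list(TopologicalSorter(dependencies).static_order()))
# ['read_files', 'chunk_documents', 'build_kg', 'quiz', 'judge', 'partition', 'generate']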
Lines changed: 2 additions & 0 deletions
@@ -0,0 +1,2 @@
python3 -m graphgen.run \
    --config_file examples/generate/generate_fill_in_blank_qa/fill_in_blank_config.yaml
Lines changed: 3 additions & 0 deletions
@@ -0,0 +1,3 @@
# Generate Multi-Answer QAs

Multi-answer question answering (QA) involves generating questions that can have multiple valid answers. This is particularly useful in educational settings, surveys, and research where diverse perspectives are valuable.
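
For reference, a multi-answer pair rendered in the Sharegpt data format (one of the data_format options in the config below) could look like the following. The content is hypothetical; the "conversations" structure matches format_generation_results in graphgen/bases/base_generator.py.

# Hypothetical multi-answer pair in Sharegpt format.
example_pair = {
    "conversations": [
        {"from": "human", "value": "Which factors can affect enzyme activity?"},
        {
            "from": "gpt",
            "value": "Temperature, pH, substrate concentration, "
            "and the presence of inhibitors.",
        },
    ]
}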
Lines changed: 2 additions & 0 deletions
@@ -0,0 +1,2 @@
python3 -m graphgen.run \
    --config_file examples/generate/generate_multi_answer_qa/multi_answer_config.yaml
examples/generate/generate_multi_answer_qa/multi_answer_config.yaml

Lines changed: 80 additions & 0 deletions
@@ -0,0 +1,80 @@
global_params:
  working_dir: cache
  graph_backend: kuzu # graph database backend, support: kuzu, networkx
  kv_backend: rocksdb # key-value store backend, support: rocksdb, json_kv

nodes:
  - id: read_files # id is unique in the pipeline, and can be referenced by other steps
    op_name: read
    type: source
    dependencies: []
    params:
      input_path:
        - examples/input_examples/jsonl_demo.jsonl # input file path, support json, jsonl, txt, pdf. See examples/input_examples for examples

  - id: chunk_documents
    op_name: chunk
    type: map_batch
    dependencies:
      - read_files
    execution_params:
      replicas: 4
    params:
      chunk_size: 1024 # chunk size for text splitting
      chunk_overlap: 100 # chunk overlap for text splitting

  - id: build_kg
    op_name: build_kg
    type: map_batch
    dependencies:
      - chunk_documents
    execution_params:
      replicas: 1
      batch_size: 128

  - id: quiz
    op_name: quiz
    type: map_batch
    dependencies:
      - build_kg
    execution_params:
      replicas: 1
      batch_size: 128
    params:
      quiz_samples: 2 # number of quiz samples to generate

  - id: judge
    op_name: judge
    type: map_batch
    dependencies:
      - quiz
    execution_params:
      replicas: 1
      batch_size: 128

  - id: partition
    op_name: partition
    type: aggregate
    dependencies:
      - judge
    params:
      method: ece # ece is a custom partition method based on comprehension loss
      method_params:
        max_units_per_community: 20 # max nodes and edges per community
        min_units_per_community: 5 # min nodes and edges per community
        max_tokens_per_community: 10240 # max tokens per community
        unit_sampling: max_loss # unit sampling strategy, support: random, max_loss, min_loss

  - id: generate
    op_name: generate
    type: map_batch
    dependencies:
      - partition
    execution_params:
      replicas: 1
      batch_size: 128
      save_output: true # save output
    params:
      method: multi_answer
      num_of_questions: 5
      data_format: Alpaca # Alpaca, Sharegpt, ChatML
Lines changed: 3 additions & 0 deletions
@@ -0,0 +1,3 @@
# Generate Multi-Choice QAs

Multi-choice question answering (QA) tasks involve providing a question along with several answer options, where the goal is to select the correct answer from the given choices.
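
When a generator emits an options dict alongside the question, the updated format_generation_results (see the graphgen/bases/base_generator.py diff below) folds the options into the question text before formatting. A short sketch of that behavior, with hypothetical QA content:

# Sketch of the options formatting performed by format_generation_results;
# the question, options, and answer here are made up.
qa_data = {
    "question": "Which planet is known as the Red Planet?",
    "options": {"A": "Venus", "B": "Mars", "C": "Jupiter", "D": "Saturn"},
    "answer": "B",
}
options_str = "\n".join(
    f"{key}. {qa_data['options'][key]}" for key in sorted(qa_data["options"])
)
question = qa_data["question"] + f"\nOptions:\n{options_str}"
# question now carries the lettered options, and "B" remains the answer.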
Lines changed: 2 additions & 0 deletions
@@ -0,0 +1,2 @@
python3 -m graphgen.run \
    --config_file examples/generate/generate_multi_choice_qa/multi_choice_config.yaml
examples/generate/generate_multi_choice_qa/multi_choice_config.yaml

Lines changed: 80 additions & 0 deletions
@@ -0,0 +1,80 @@
global_params:
  working_dir: cache
  graph_backend: kuzu # graph database backend, support: kuzu, networkx
  kv_backend: rocksdb # key-value store backend, support: rocksdb, json_kv

nodes:
  - id: read_files # id is unique in the pipeline, and can be referenced by other steps
    op_name: read
    type: source
    dependencies: []
    params:
      input_path:
        - examples/input_examples/jsonl_demo.jsonl # input file path, support json, jsonl, txt, pdf. See examples/input_examples for examples

  - id: chunk_documents
    op_name: chunk
    type: map_batch
    dependencies:
      - read_files
    execution_params:
      replicas: 4
    params:
      chunk_size: 1024 # chunk size for text splitting
      chunk_overlap: 100 # chunk overlap for text splitting

  - id: build_kg
    op_name: build_kg
    type: map_batch
    dependencies:
      - chunk_documents
    execution_params:
      replicas: 1
      batch_size: 128

  - id: quiz
    op_name: quiz
    type: map_batch
    dependencies:
      - build_kg
    execution_params:
      replicas: 1
      batch_size: 128
    params:
      quiz_samples: 2 # number of quiz samples to generate

  - id: judge
    op_name: judge
    type: map_batch
    dependencies:
      - quiz
    execution_params:
      replicas: 1
      batch_size: 128

  - id: partition
    op_name: partition
    type: aggregate
    dependencies:
      - judge
    params:
      method: ece # ece is a custom partition method based on comprehension loss
      method_params:
        max_units_per_community: 20 # max nodes and edges per community
        min_units_per_community: 5 # min nodes and edges per community
        max_tokens_per_community: 10240 # max tokens per community
        unit_sampling: max_loss # unit sampling strategy, support: random, max_loss, min_loss

  - id: generate
    op_name: generate
    type: map_batch
    dependencies:
      - partition
    execution_params:
      replicas: 1
      batch_size: 128
      save_output: true # save output
    params:
      method: multi_choice
      num_of_questions: 5
      data_format: Alpaca # Alpaca, Sharegpt, ChatML

graphgen/bases/base_generator.py

Lines changed: 44 additions & 35 deletions
@@ -46,38 +46,47 @@ async def generate(
 def format_generation_results(
     results: list[dict], output_data_format: str
 ) -> list[dict[str, Any]]:
-    if output_data_format == "Alpaca":
-        results = [
-            {
-                "instruction": v["question"],
-                "input": "",
-                "output": v["answer"],
-            }
-            for item in results
-            for k, v in item.items()
-        ]
-    elif output_data_format == "Sharegpt":
-        results = [
-            {
-                "conversations": [
-                    {"from": "human", "value": v["question"]},
-                    {"from": "gpt", "value": v["answer"]},
-                ]
-            }
-            for item in results
-            for k, v in item.items()
-        ]
-    elif output_data_format == "ChatML":
-        results = [
-            {
-                "messages": [
-                    {"role": "user", "content": v["question"]},
-                    {"role": "assistant", "content": v["answer"]},
-                ]
-            }
-            for item in results
-            for k, v in item.items()
-        ]
-    else:
-        raise ValueError(f"Unknown output data format: {output_data_format}")
-    return results
+
+    flat_results = []
+    for item in results:
+        for _, qa_data in item.items():
+            question = qa_data.get("question", "")
+            answer = qa_data.get("answer", "")
+            if "options" in qa_data and qa_data["options"]:
+                options = qa_data["options"]
+                options_str = "\n".join(
+                    [f"{key}. {options[key]}" for key in sorted(options.keys())]
+                )
+                question += f"\nOptions:\n{options_str}"
+
+            if output_data_format == "Alpaca":
+                flat_results.append(
+                    {
+                        "instruction": question,
+                        "input": "",
+                        "output": answer,
+                    }
+                )
+            elif output_data_format == "Sharegpt":
+                flat_results.append(
+                    {
+                        "conversations": [
+                            {"from": "human", "value": question},
+                            {"from": "gpt", "value": answer},
+                        ]
+                    }
+                )
+            elif output_data_format == "ChatML":
+                flat_results.append(
+                    {
+                        "messages": [
+                            {"role": "user", "content": question},
+                            {"role": "assistant", "content": answer},
+                        ]
+                    }
+                )
+            else:
+                raise ValueError(
+                    f"Unknown output data format: {output_data_format}"
+                )
+    return flat_results
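
A minimal usage sketch of the rewritten format_generation_results. The input shape, a list of {key: qa_data} dicts, follows the nested loop in the new code; the QA content below is hypothetical:

# Usage sketch; input shape follows "for item in results: for _, qa_data in item.items()".
results = [
    {
        "qa_0": {
            "question": "Which backend stores the knowledge graph?",
            "options": {"A": "kuzu", "B": "networkx"},
            "answer": "A",
        }
    }
]
print(format_generation_results(results, "ChatML"))
# [{'messages': [{'role': 'user', 'content': 'Which backend stores the knowledge
#   graph?\nOptions:\nA. kuzu\nB. networkx'}, {'role': 'assistant', 'content': 'A'}]}]

Note that the options block is appended to the question for every output format, so multi-choice questions stay self-contained in Alpaca, Sharegpt, and ChatML alike.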
