
Commit 9c6f678

feat: add more qa generation types (#159)
* feat: add multi choice qa generation
* feat: add multi answer qa generation
* feat: add fill-in-blank qa generation
* Update graphgen/bases/base_generator.py
* Update graphgen/models/generator/multi_answer_generator.py
* Update graphgen/models/generator/multi_answer_generator.py
* Update graphgen/models/generator/multi_choice_generator.py
* Update graphgen/templates/generation/multi_choice_generation.py
* Update graphgen/models/generator/multi_answer_generator.py
* fix: fix typo

---------

Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
1 parent 02dcafe commit 9c6f678

23 files changed (+954, −76 lines)
Lines changed: 3 additions & 0 deletions
@@ -0,0 +1,3 @@
# Generate Fill-in-blank QAs

Fill-in-blank question answering (QA) involves creating questions where a key piece of information is omitted, requiring the respondent to fill in the missing word or phrase. This format is commonly used in educational assessments to test knowledge and comprehension.
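
For illustration, here is what a single generated pair can look like in the Alpaca data format selected in the config below. The question and answer text here are hypothetical; actual pairs are produced by the LLM from your input documents.

# Hypothetical fill-in-blank pair in Alpaca format (instruction/input/output);
# real pairs are generated from knowledge-graph communities, not hard-coded.
example_pair = {
    "instruction": "The process by which green plants convert light energy "
    "into chemical energy is called ____.",
    "input": "",
    "output": "photosynthesis",
}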
examples/generate/generate_fill_in_blank_qa/fill_in_blank_config.yaml

Lines changed: 80 additions & 0 deletions
@@ -0,0 +1,80 @@
global_params:
  working_dir: cache
  graph_backend: kuzu # graph database backend, support: kuzu, networkx
  kv_backend: rocksdb # key-value store backend, support: rocksdb, json_kv

nodes:
  - id: read_files # id is unique in the pipeline, and can be referenced by other steps
    op_name: read
    type: source
    dependencies: []
    params:
      input_path:
        - examples/input_examples/jsonl_demo.jsonl # input file path, support json, jsonl, txt, pdf. See examples/input_examples for examples

  - id: chunk_documents
    op_name: chunk
    type: map_batch
    dependencies:
      - read_files
    execution_params:
      replicas: 4
    params:
      chunk_size: 1024 # chunk size for text splitting
      chunk_overlap: 100 # chunk overlap for text splitting

  - id: build_kg
    op_name: build_kg
    type: map_batch
    dependencies:
      - chunk_documents
    execution_params:
      replicas: 1
      batch_size: 128

  - id: quiz
    op_name: quiz
    type: map_batch
    dependencies:
      - build_kg
    execution_params:
      replicas: 1
      batch_size: 128
    params:
      quiz_samples: 2 # number of quiz samples to generate

  - id: judge
    op_name: judge
    type: map_batch
    dependencies:
      - quiz
    execution_params:
      replicas: 1
      batch_size: 128

  - id: partition
    op_name: partition
    type: aggregate
    dependencies:
      - judge
    params:
      method: ece # ece is a custom partition method based on comprehension loss
      method_params:
        max_units_per_community: 20 # max nodes and edges per community
        min_units_per_community: 5 # min nodes and edges per community
        max_tokens_per_community: 10240 # max tokens per community
        unit_sampling: max_loss # unit sampling strategy, support: random, max_loss, min_loss

  - id: generate
    op_name: generate
    type: map_batch
    dependencies:
      - partition
    execution_params:
      replicas: 1
      batch_size: 128
      save_output: true # save output
    params:
      method: fill_in_blank
      num_of_questions: 5
      data_format: Alpaca # Alpaca, Sharegpt, ChatML
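
The nodes above form a linear pipeline, read_files → chunk_documents → build_kg → quiz → judge → partition → generate, wired through their dependencies lists. As a minimal sketch (not GraphGen code), the execution order implied by those lists can be recovered with the standard library:

# Minimal sketch: derive the execution order from the "dependencies" entries
# in the config above; this mirrors, but is not, GraphGen's own scheduler.
from graphlib import TopologicalSorter

dependencies = {
    "read_files": [],
    "chunk_documents": ["read_files"],
    "build_kg": ["chunk_documents"],
    "quiz": ["build_kg"],
    "judge": ["quiz"],
    "partition": ["judge"],
    "generate": ["partition"],
}
print(list(TopologicalSorter(dependencies).static_order()))
# ['read_files', 'chunk_documents', 'build_kg', 'quiz', 'judge', 'partition', 'generate']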
Lines changed: 2 additions & 0 deletions
@@ -0,0 +1,2 @@
python3 -m graphgen.run \
    --config_file examples/generate/generate_fill_in_blank_qa/fill_in_blank_config.yaml
Lines changed: 3 additions & 0 deletions
@@ -0,0 +1,3 @@
# Generate Multi-Answer QAs

Multi-answer question answering (QA) involves generating questions that can have multiple valid answers. This is particularly useful in educational settings, surveys, and research where diverse perspectives are valuable.
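
For reference, a multi-answer pair rendered in the Sharegpt data format (one of the data_format options in the config below) could look like the following. The content is hypothetical; the "conversations" structure matches format_generation_results in graphgen/bases/base_generator.py.

# Hypothetical multi-answer pair in Sharegpt format.
example_pair = {
    "conversations": [
        {"from": "human", "value": "Which factors can affect enzyme activity?"},
        {
            "from": "gpt",
            "value": "Temperature, pH, substrate concentration, "
            "and the presence of inhibitors.",
        },
    ]
}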
Lines changed: 2 additions & 0 deletions
@@ -0,0 +1,2 @@
python3 -m graphgen.run \
    --config_file examples/generate/generate_multi_answer_qa/multi_answer_config.yaml
examples/generate/generate_multi_answer_qa/multi_answer_config.yaml

Lines changed: 80 additions & 0 deletions
@@ -0,0 +1,80 @@
global_params:
  working_dir: cache
  graph_backend: kuzu # graph database backend, support: kuzu, networkx
  kv_backend: rocksdb # key-value store backend, support: rocksdb, json_kv

nodes:
  - id: read_files # id is unique in the pipeline, and can be referenced by other steps
    op_name: read
    type: source
    dependencies: []
    params:
      input_path:
        - examples/input_examples/jsonl_demo.jsonl # input file path, support json, jsonl, txt, pdf. See examples/input_examples for examples

  - id: chunk_documents
    op_name: chunk
    type: map_batch
    dependencies:
      - read_files
    execution_params:
      replicas: 4
    params:
      chunk_size: 1024 # chunk size for text splitting
      chunk_overlap: 100 # chunk overlap for text splitting

  - id: build_kg
    op_name: build_kg
    type: map_batch
    dependencies:
      - chunk_documents
    execution_params:
      replicas: 1
      batch_size: 128

  - id: quiz
    op_name: quiz
    type: map_batch
    dependencies:
      - build_kg
    execution_params:
      replicas: 1
      batch_size: 128
    params:
      quiz_samples: 2 # number of quiz samples to generate

  - id: judge
    op_name: judge
    type: map_batch
    dependencies:
      - quiz
    execution_params:
      replicas: 1
      batch_size: 128

  - id: partition
    op_name: partition
    type: aggregate
    dependencies:
      - judge
    params:
      method: ece # ece is a custom partition method based on comprehension loss
      method_params:
        max_units_per_community: 20 # max nodes and edges per community
        min_units_per_community: 5 # min nodes and edges per community
        max_tokens_per_community: 10240 # max tokens per community
        unit_sampling: max_loss # unit sampling strategy, support: random, max_loss, min_loss

  - id: generate
    op_name: generate
    type: map_batch
    dependencies:
      - partition
    execution_params:
      replicas: 1
      batch_size: 128
      save_output: true # save output
    params:
      method: multi_answer
      num_of_questions: 5
      data_format: Alpaca # Alpaca, Sharegpt, ChatML
Lines changed: 3 additions & 0 deletions
@@ -0,0 +1,3 @@
# Generate Multi-Choice QAs

Multi-choice question answering (QA) tasks involve providing a question along with several answer options, where the goal is to select the correct answer from the given choices.
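
When a generator emits an options dict alongside the question, the updated format_generation_results (see the graphgen/bases/base_generator.py diff below) folds the options into the question text before formatting. A short sketch of that behavior, with hypothetical QA content:

# Sketch of the options formatting performed by format_generation_results;
# the question, options, and answer here are made up.
qa_data = {
    "question": "Which planet is known as the Red Planet?",
    "options": {"A": "Venus", "B": "Mars", "C": "Jupiter", "D": "Saturn"},
    "answer": "B",
}
options_str = "\n".join(
    f"{key}. {qa_data['options'][key]}" for key in sorted(qa_data["options"])
)
question = qa_data["question"] + f"\nOptions:\n{options_str}"
# question now carries the lettered options, and "B" remains the answer.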
Lines changed: 2 additions & 0 deletions
@@ -0,0 +1,2 @@
python3 -m graphgen.run \
    --config_file examples/generate/generate_multi_choice_qa/multi_choice_config.yaml
examples/generate/generate_multi_choice_qa/multi_choice_config.yaml

Lines changed: 80 additions & 0 deletions
@@ -0,0 +1,80 @@
global_params:
  working_dir: cache
  graph_backend: kuzu # graph database backend, support: kuzu, networkx
  kv_backend: rocksdb # key-value store backend, support: rocksdb, json_kv

nodes:
  - id: read_files # id is unique in the pipeline, and can be referenced by other steps
    op_name: read
    type: source
    dependencies: []
    params:
      input_path:
        - examples/input_examples/jsonl_demo.jsonl # input file path, support json, jsonl, txt, pdf. See examples/input_examples for examples

  - id: chunk_documents
    op_name: chunk
    type: map_batch
    dependencies:
      - read_files
    execution_params:
      replicas: 4
    params:
      chunk_size: 1024 # chunk size for text splitting
      chunk_overlap: 100 # chunk overlap for text splitting

  - id: build_kg
    op_name: build_kg
    type: map_batch
    dependencies:
      - chunk_documents
    execution_params:
      replicas: 1
      batch_size: 128

  - id: quiz
    op_name: quiz
    type: map_batch
    dependencies:
      - build_kg
    execution_params:
      replicas: 1
      batch_size: 128
    params:
      quiz_samples: 2 # number of quiz samples to generate

  - id: judge
    op_name: judge
    type: map_batch
    dependencies:
      - quiz
    execution_params:
      replicas: 1
      batch_size: 128

  - id: partition
    op_name: partition
    type: aggregate
    dependencies:
      - judge
    params:
      method: ece # ece is a custom partition method based on comprehension loss
      method_params:
        max_units_per_community: 20 # max nodes and edges per community
        min_units_per_community: 5 # min nodes and edges per community
        max_tokens_per_community: 10240 # max tokens per community
        unit_sampling: max_loss # unit sampling strategy, support: random, max_loss, min_loss

  - id: generate
    op_name: generate
    type: map_batch
    dependencies:
      - partition
    execution_params:
      replicas: 1
      batch_size: 128
      save_output: true # save output
    params:
      method: multi_choice
      num_of_questions: 5
      data_format: Alpaca # Alpaca, Sharegpt, ChatML

graphgen/bases/base_generator.py

Lines changed: 44 additions & 35 deletions
@@ -46,38 +46,47 @@ async def generate(
 def format_generation_results(
     results: list[dict], output_data_format: str
 ) -> list[dict[str, Any]]:
-    if output_data_format == "Alpaca":
-        results = [
-            {
-                "instruction": v["question"],
-                "input": "",
-                "output": v["answer"],
-            }
-            for item in results
-            for k, v in item.items()
-        ]
-    elif output_data_format == "Sharegpt":
-        results = [
-            {
-                "conversations": [
-                    {"from": "human", "value": v["question"]},
-                    {"from": "gpt", "value": v["answer"]},
-                ]
-            }
-            for item in results
-            for k, v in item.items()
-        ]
-    elif output_data_format == "ChatML":
-        results = [
-            {
-                "messages": [
-                    {"role": "user", "content": v["question"]},
-                    {"role": "assistant", "content": v["answer"]},
-                ]
-            }
-            for item in results
-            for k, v in item.items()
-        ]
-    else:
-        raise ValueError(f"Unknown output data format: {output_data_format}")
-    return results
+
+    flat_results = []
+    for item in results:
+        for _, qa_data in item.items():
+            question = qa_data.get("question", "")
+            answer = qa_data.get("answer", "")
+            if "options" in qa_data and qa_data["options"]:
+                options = qa_data["options"]
+                options_str = "\n".join(
+                    [f"{key}. {options[key]}" for key in sorted(options.keys())]
+                )
+                question += f"\nOptions:\n{options_str}"
+
+            if output_data_format == "Alpaca":
+                flat_results.append(
+                    {
+                        "instruction": question,
+                        "input": "",
+                        "output": answer,
+                    }
+                )
+            elif output_data_format == "Sharegpt":
+                flat_results.append(
+                    {
+                        "conversations": [
+                            {"from": "human", "value": question},
+                            {"from": "gpt", "value": answer},
+                        ]
+                    }
+                )
+            elif output_data_format == "ChatML":
+                flat_results.append(
+                    {
+                        "messages": [
+                            {"role": "user", "content": question},
+                            {"role": "assistant", "content": answer},
+                        ]
+                    }
+                )
+            else:
+                raise ValueError(
+                    f"Unknown output data format: {output_data_format}"
+                )
+    return flat_results
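
A minimal usage sketch of the rewritten format_generation_results. The input shape, a list of {key: qa_data} dicts, follows the nested loop in the new code; the QA content below is hypothetical:

# Usage sketch; input shape follows "for item in results: for _, qa_data in item.items()".
results = [
    {
        "qa_0": {
            "question": "Which backend stores the knowledge graph?",
            "options": {"A": "kuzu", "B": "networkx"},
            "answer": "A",
        }
    }
]
print(format_generation_results(results, "ChatML"))
# [{'messages': [{'role': 'user', 'content': 'Which backend stores the knowledge
#   graph?\nOptions:\nA. kuzu\nB. networkx'}, {'role': 'assistant', 'content': 'A'}]}]

Note that the options block is appended to the question for every output format, so multi-choice questions stay self-contained in Alpaca, Sharegpt, and ChatML alike.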
