InternScience
diff --git a/‎.github/contributing.md‎
Lines changed: 34 additions & 0 deletions b/‎.github/contributing.md‎
Lines changed: 34 additions & 0 deletions
diff --git a/‎graphgen/bases/__init__.py‎
Lines changed: 2 additions & 0 deletions b/‎graphgen/bases/__init__.py‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎graphgen/bases/base_extractor.py‎
Lines changed: 22 additions & 0 deletions b/‎graphgen/bases/base_extractor.py‎
Lines changed: 22 additions & 0 deletions
diff --git a/‎graphgen/bases/base_llm_wrapper.py‎
Lines changed: 10 additions & 4 deletions b/‎graphgen/bases/base_llm_wrapper.py‎
Lines changed: 10 additions & 4 deletions
diff --git a/‎graphgen/bases/base_searcher.py‎
Lines changed: 18 additions & 0 deletions b/‎graphgen/bases/base_searcher.py‎
Lines changed: 18 additions & 0 deletions
diff --git a/‎graphgen/bases/base_storage.py‎
Lines changed: 3 additions & 0 deletions b/‎graphgen/bases/base_storage.py‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎graphgen/configs/aggregated_config.yaml‎
Lines changed: 31 additions & 22 deletions b/‎graphgen/configs/aggregated_config.yaml‎
Lines changed: 31 additions & 22 deletions
diff --git a/‎graphgen/configs/atomic_config.yaml‎
Lines changed: 21 additions & 19 deletions b/‎graphgen/configs/atomic_config.yaml‎
Lines changed: 21 additions & 19 deletions
diff --git a/‎graphgen/configs/cot_config.yaml‎
Lines changed: 24 additions & 19 deletions b/‎graphgen/configs/cot_config.yaml‎
Lines changed: 24 additions & 19 deletions
diff --git a/‎graphgen/configs/multi_hop_config.yaml‎
Lines changed: 25 additions & 22 deletions b/‎graphgen/configs/multi_hop_config.yaml‎
Lines changed: 25 additions & 22 deletions
@@ -0,0 +1,34 @@
+## Contribution Guide
+Here are the steps to contribute to this project:
+
+1. Star this repository.
+2. Fork this repository.
+   
+   Then clone your fork. Type the following command in a Git Bash console, replacing the URL with the URL of your own fork:
+   ```bash
+   git clone https://github.com/open-sciencelab/GraphGen.git
+   ```
+
+3. Create a new branch
+   
+    Now before making changes to the files, go to your terminal under the repo you just cloned, and type the following:
+    
+    ```bash
+    git checkout -b add-my-name
+    ```
+  
+    By running the above command, you created a new branch called `add-my-name` and checked it out. The new branch starts with the commit history of the master branch, or of whichever branch you were on previously.
+
+4. Make your changes and push your code.
+
+   ```bash
+   git add .
+   git commit -m "xxx"
+   git push -u origin add-my-name
+   ```
+
+   This will create a new commit with the changes you made and push your branch to the remote repository.
+
+5. Now create a pull request and add the title.
+
+     Sit back and relax while your pull request is being reviewed and merged.
@@ -1,8 +1,10 @@
+from .base_extractor import BaseExtractor
 from .base_generator import BaseGenerator
 from .base_kg_builder import BaseKGBuilder
 from .base_llm_wrapper import BaseLLMWrapper
 from .base_partitioner import BasePartitioner
 from .base_reader import BaseReader
+from .base_searcher import BaseSearcher
 from .base_splitter import BaseSplitter
 from .base_storage import (
     BaseGraphStorage,
 
@@ -0,0 +1,22 @@
+from abc import ABC, abstractmethod
+from typing import Any
+
+from graphgen.bases.base_llm_wrapper import BaseLLMWrapper
+
+
+class BaseExtractor(ABC):
+    """
+    Abstract base class for extractors that pull information out of text.
+    Stores the LLM client given at construction; subclasses implement `extract` and `build_prompt`.
+    """
+
+    def __init__(self, llm_client: BaseLLMWrapper):
+        self.llm_client = llm_client
+
+    @abstractmethod
+    async def extract(self, chunk: dict) -> Any:
+        """Extract information from the given chunk (a dict; its schema is not defined here)."""
+
+    @abstractmethod
+    def build_prompt(self, text: str) -> str:
+        """Build the prompt for the LLM from the given text."""
@@ -61,11 +61,17 @@ async def generate_inputs_prob(
     def filter_think_tags(text: str, think_tag: str = "think") -> str:
         """
         Remove <think> tags from the text.
-        If the text contains <think> and </think>, it removes everything between them and the tags themselves.
+        - If the text contains <think> and </think>, it removes everything between them and the tags themselves.
+        - If an unpaired </think> remains, it removes everything up to and including the first such tag.
         """
-        think_pattern = re.compile(rf"<{think_tag}>.*?</{think_tag}>", re.DOTALL)
-        filtered_text = think_pattern.sub("", text).strip()
-        return filtered_text if filtered_text else text.strip()
+        paired_pattern = re.compile(rf"<{think_tag}>.*?</{think_tag}>", re.DOTALL)  # NOTE(review): think_tag is interpolated unescaped
+        filtered = paired_pattern.sub("", text)
+
+        orphan_pattern = re.compile(rf"^.*?</{think_tag}>", re.DOTALL)  # anchored at the start; DOTALL lets "." span newlines
+        filtered = orphan_pattern.sub("", filtered)
+
+        filtered = filtered.strip()
+        return filtered if filtered else text.strip()  # fall back to the stripped input when nothing is left
 
     def shutdown(self) -> None:
         """Shutdown the LLM engine if applicable."""
 
@@ -0,0 +1,18 @@
+from abc import ABC, abstractmethod
+from typing import Any, Dict, List
+
+
+class BaseSearcher(ABC):
+    """
+    Abstract base class for searching and retrieving data.
+    """
+
+    @abstractmethod
+    async def search(self, query: str, **kwargs) -> List[Dict[str, Any]]:
+        """
+        Search for data based on the given query.
+
+        :param query: The search query.
+        :param kwargs: Additional keyword arguments for the search implementation.
+        :return: List of dictionaries containing the search results.
+        """
@@ -45,6 +45,9 @@ async def get_by_ids(
     ) -> list[Union[T, None]]:
         raise NotImplementedError
 
+    async def get_all(self) -> dict[str, T]:
+        """Return a dict keyed by str (presumably every stored entry); this base version only raises."""
+        raise NotImplementedError
+
     async def filter_keys(self, data: list[str]) -> set[str]:
         """return un-exist keys"""
         raise NotImplementedError
 
@@ -1,22 +1,31 @@
-read:
-  input_file: resources/input_examples/jsonl_demo.jsonl # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples
-split:
-  chunk_size: 1024 # chunk size for text splitting
-  chunk_overlap: 100 # chunk overlap for text splitting
-search: # web search configuration
-  enabled: false # whether to enable web search
-  search_types: ["google"] # search engine types, support: google, bing, uniprot, wikipedia
-quiz_and_judge: # quiz and test whether the LLM masters the knowledge points
-  enabled: true
-  quiz_samples: 2 # number of quiz samples to generate
-  re_judge: false # whether to re-judge the existing quiz samples
-partition: # graph partition configuration
-  method: ece # ece is a custom partition method based on comprehension loss
-  method_params:
-    max_units_per_community: 20 # max nodes and edges per community
-    min_units_per_community: 5 # min nodes and edges per community
-    max_tokens_per_community: 10240 # max tokens per community
-    unit_sampling: max_loss # unit sampling strategy, support: random, max_loss, min_loss
-generate:
-  mode: aggregated # atomic, aggregated, multi_hop, cot, vqa
-  data_format: ChatML # Alpaca, Sharegpt, ChatML
+pipeline:
+  - name: read
+    params:
+      input_file: resources/input_examples/jsonl_demo.jsonl # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples
+
+  - name: chunk
+    params:
+      chunk_size: 1024 # chunk size for text splitting
+      chunk_overlap: 100 # chunk overlap for text splitting
+
+  - name: build_kg
+
+  - name: quiz_and_judge
+    params:
+      quiz_samples: 2 # number of quiz samples to generate
+      re_judge: false # whether to re-judge the existing quiz samples
+
+  - name: partition
+    deps: [quiz_and_judge] # ece depends on quiz_and_judge steps
+    params:
+      method: ece # ece is a custom partition method based on comprehension loss
+      method_params:
+        max_units_per_community: 20 # max nodes and edges per community
+        min_units_per_community: 5 # min nodes and edges per community
+        max_tokens_per_community: 10240 # max tokens per community
+        unit_sampling: max_loss # unit sampling strategy, support: random, max_loss, min_loss
+
+  - name: generate
+    params:
+      method: aggregated # atomic, aggregated, multi_hop, cot, vqa
+      data_format: ChatML # Alpaca, Sharegpt, ChatML
@@ -1,19 +1,21 @@
-read:
-  input_file: resources/input_examples/json_demo.json # input file path, support json, jsonl, txt, csv, pdf. See resources/input_examples for examples
-split:
-  chunk_size: 1024 # chunk size for text splitting
-  chunk_overlap: 100 # chunk overlap for text splitting
-search: # web search configuration
-  enabled: false # whether to enable web search
-  search_types: ["google"] # search engine types, support: google, bing, uniprot, wikipedia
-quiz_and_judge: # quiz and test whether the LLM masters the knowledge points
-  enabled: true
-  quiz_samples: 2 # number of quiz samples to generate
-  re_judge: false # whether to re-judge the existing quiz samples
-partition: # graph partition configuration
-  method: dfs # partition method, support: dfs, bfs, ece, leiden
-  method_params:
-    max_units_per_community: 1 # atomic partition, one node or edge per community
-generate:
-  mode: atomic # atomic, aggregated, multi_hop, cot, vqa
-  data_format: Alpaca # Alpaca, Sharegpt, ChatML
+pipeline:
+  - name: read
+    params:
+      input_file: resources/input_examples/json_demo.json # input file path, support json, jsonl, txt, csv, pdf. See resources/input_examples for examples
+
+  - name: chunk
+    params:
+      chunk_size: 1024 # chunk size for text splitting
+      chunk_overlap: 100 # chunk overlap for text splitting
+
+  - name: build_kg
+
+  - name: partition
+    params:
+      method: dfs # partition method, support: dfs, bfs, ece, leiden
+      method_params:
+        max_units_per_community: 1 # atomic partition, one node or edge per community
+  - name: generate
+    params:
+      method: atomic # atomic, aggregated, multi_hop, cot, vqa
+      data_format: Alpaca # Alpaca, Sharegpt, ChatML
@@ -1,19 +1,24 @@
-read:
-  input_file: resources/input_examples/txt_demo.txt  # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples
-split:
-  chunk_size: 1024 # chunk size for text splitting
-  chunk_overlap: 100 # chunk overlap for text splitting
-search: # web search configuration
-  enabled: false # whether to enable web search
-  search_types: ["google"] # search engine types, support: google, bing, uniprot, wikipedia
-quiz_and_judge: # quiz and test whether the LLM masters the knowledge points
-  enabled: false
-partition: # graph partition configuration
-  method: leiden # leiden is a partitioner detection algorithm
-  method_params:
-    max_size: 20 # Maximum size of communities
-    use_lcc: false # whether to use the largest connected component
-    random_seed: 42 # random seed for partitioning
-generate:
-  mode: cot # atomic, aggregated, multi_hop, cot, vqa
-  data_format: Sharegpt # Alpaca, Sharegpt, ChatML
+pipeline:
+  - name: read
+    params:
+      input_file: resources/input_examples/txt_demo.txt  # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples
+
+  - name: chunk
+    params:
+      chunk_size: 1024 # chunk size for text splitting
+      chunk_overlap: 100 # chunk overlap for text splitting
+
+  - name: build_kg
+
+  - name: partition
+    params:
+      method: leiden # leiden is a community detection algorithm
+      method_params:
+        max_size: 20 # Maximum size of communities
+        use_lcc: false # whether to use the largest connected component
+        random_seed: 42 # random seed for partitioning
+
+  - name: generate
+    params:
+      method: cot # atomic, aggregated, multi_hop, cot, vqa
+      data_format: Sharegpt # Alpaca, Sharegpt, ChatML
@@ -1,22 +1,25 @@
-read:
-  input_file: resources/input_examples/csv_demo.csv # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples
-split:
-  chunk_size: 1024 # chunk size for text splitting
-  chunk_overlap: 100 # chunk overlap for text splitting
-search: # web search configuration
-  enabled: false # whether to enable web search
-  search_types: ["google"] # search engine types, support: google, bing, uniprot, wikipedia
-quiz_and_judge: # quiz and test whether the LLM masters the knowledge points
-  enabled: false
-  quiz_samples: 2 # number of quiz samples to generate
-  re_judge: false # whether to re-judge the existing quiz samples
-partition: # graph partition configuration
-  method: ece # ece is a custom partition method based on comprehension loss
-  method_params:
-    max_units_per_community: 3 # max nodes and edges per community, for multi-hop, we recommend setting it to 3
-    min_units_per_community: 3 # min nodes and edges per community, for multi-hop, we recommend setting it to 3
-    max_tokens_per_community: 10240 # max tokens per community
-    unit_sampling: random # unit sampling strategy, support: random, max_loss, min_loss
-generate:
-  mode: multi_hop # atomic, aggregated, multi_hop, cot, vqa
-  data_format: ChatML # Alpaca, Sharegpt, ChatML
+pipeline:
+  - name: read
+    params:
+      input_file: resources/input_examples/csv_demo.csv # input file path, support json, jsonl, txt, csv, pdf. See resources/input_examples for examples
+
+  - name: chunk
+    params:
+      chunk_size: 1024 # chunk size for text splitting
+      chunk_overlap: 100 # chunk overlap for text splitting
+
+  - name: build_kg
+
+  - name: partition
+    params:
+      method: ece # ece is a custom partition method based on comprehension loss
+      method_params:
+        max_units_per_community: 3 # max nodes and edges per community, for multi-hop, we recommend setting it to 3
+        min_units_per_community: 3 # min nodes and edges per community, for multi-hop, we recommend setting it to 3
+        max_tokens_per_community: 10240 # max tokens per community
+        unit_sampling: random # unit sampling strategy, support: random, max_loss, min_loss
+
+  - name: generate
+    params:
+      method: multi_hop # atomic, aggregated, multi_hop, cot, vqa
+      data_format: ChatML # Alpaca, Sharegpt, ChatML