Skip to content

Commit 607ef6a

Browse files
committed
Merge branch 'main' into fix/quiz-refactor
2 parents 0b0279c + 866dc7c commit 607ef6a

File tree

78 files changed

+1477
-918
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

78 files changed

+1477
-918
lines changed

.github/contributing.md

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
## Contribution Guide
2+
Here are the steps to contribute to this project:
3+
4+
1. Star this repository.
5+
2. Fork this repository.
6+
7+
Run the following command in the Git Bash console (use the URL of your own fork if you intend to push your changes to it):
8+
```bash
9+
git clone https://github.com/open-sciencelab/GraphGen.git
10+
```
11+
12+
3. Create a new branch
13+
14+
Before making any changes to the files, open a terminal in the directory of the repository you just cloned and run the following:
15+
16+
```bash
17+
git checkout -b add-my-name
18+
```
19+
20+
Running the above command creates a new branch called add-my-name and checks it out. The new branch starts with the commit history of the master branch, or of whichever branch you were on previously.
21+
22+
4. Make your changes and push your code.
23+
24+
```
25+
git add .
26+
git commit -m "xxx"
27+
git push
28+
```
29+
30+
This creates a new commit with the changes you made and pushes it to the remote repository.
31+
32+
5. Now create a pull request and give it a descriptive title.
33+
34+
Sit back and relax while your pull request is being reviewed and merged.

graphgen/bases/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,10 @@
1+
from .base_extractor import BaseExtractor
12
from .base_generator import BaseGenerator
23
from .base_kg_builder import BaseKGBuilder
34
from .base_llm_wrapper import BaseLLMWrapper
45
from .base_partitioner import BasePartitioner
56
from .base_reader import BaseReader
7+
from .base_searcher import BaseSearcher
68
from .base_splitter import BaseSplitter
79
from .base_storage import (
810
BaseGraphStorage,

graphgen/bases/base_extractor.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
from abc import ABC, abstractmethod
2+
from typing import Any
3+
4+
from graphgen.bases.base_llm_wrapper import BaseLLMWrapper
5+
6+
7+
class BaseExtractor(ABC):
8+
"""
9+
Extract information from the given text.
10+
11+
"""
12+
13+
def __init__(self, llm_client: BaseLLMWrapper):
14+
self.llm_client = llm_client
15+
16+
@abstractmethod
17+
async def extract(self, chunk: dict) -> Any:
18+
"""Extract information from the given text"""
19+
20+
@abstractmethod
21+
def build_prompt(self, text: str) -> str:
22+
"""Build prompt for LLM based on the given text"""

graphgen/bases/base_llm_wrapper.py

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -61,11 +61,17 @@ async def generate_inputs_prob(
6161
def filter_think_tags(text: str, think_tag: str = "think") -> str:
6262
"""
6363
Remove <think> tags from the text.
64-
If the text contains <think> and </think>, it removes everything between them and the tags themselves.
64+
- If the text contains <think> and </think>, it removes everything between them and the tags themselves.
65+
- If the text contains only </think>, it removes content before the tag.
6566
"""
66-
think_pattern = re.compile(rf"<{think_tag}>.*?</{think_tag}>", re.DOTALL)
67-
filtered_text = think_pattern.sub("", text).strip()
68-
return filtered_text if filtered_text else text.strip()
67+
paired_pattern = re.compile(rf"<{think_tag}>.*?</{think_tag}>", re.DOTALL)
68+
filtered = paired_pattern.sub("", text)
69+
70+
orphan_pattern = re.compile(rf"^.*?</{think_tag}>", re.DOTALL)
71+
filtered = orphan_pattern.sub("", filtered)
72+
73+
filtered = filtered.strip()
74+
return filtered if filtered else text.strip()
6975

7076
def shutdown(self) -> None:
7177
"""Shutdown the LLM engine if applicable."""

graphgen/bases/base_searcher.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
from abc import ABC, abstractmethod
2+
from typing import Any, Dict, List
3+
4+
5+
class BaseSearcher(ABC):
6+
"""
7+
Abstract base class for searching and retrieving data.
8+
"""
9+
10+
@abstractmethod
11+
async def search(self, query: str, **kwargs) -> List[Dict[str, Any]]:
12+
"""
13+
Search for data based on the given query.
14+
15+
:param query: The search query.
16+
:param kwargs: Additional keyword arguments for the searcher.
17+
:return: List of dictionaries containing the search results.
18+
"""

graphgen/bases/base_storage.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,9 @@ async def get_by_ids(
4545
) -> list[Union[T, None]]:
4646
raise NotImplementedError
4747

48+
async def get_all(self) -> dict[str, T]:
49+
raise NotImplementedError
50+
4851
async def filter_keys(self, data: list[str]) -> set[str]:
4952
"""Return the keys that do not exist."""
5053
raise NotImplementedError
Lines changed: 31 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -1,22 +1,31 @@
1-
read:
2-
input_file: resources/input_examples/jsonl_demo.jsonl # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples
3-
split:
4-
chunk_size: 1024 # chunk size for text splitting
5-
chunk_overlap: 100 # chunk overlap for text splitting
6-
search: # web search configuration
7-
enabled: false # whether to enable web search
8-
search_types: ["google"] # search engine types, support: google, bing, uniprot, wikipedia
9-
quiz_and_judge: # quiz and test whether the LLM masters the knowledge points
10-
enabled: true
11-
quiz_samples: 2 # number of quiz samples to generate
12-
re_judge: false # whether to re-judge the existing quiz samples
13-
partition: # graph partition configuration
14-
method: ece # ece is a custom partition method based on comprehension loss
15-
method_params:
16-
max_units_per_community: 20 # max nodes and edges per community
17-
min_units_per_community: 5 # min nodes and edges per community
18-
max_tokens_per_community: 10240 # max tokens per community
19-
unit_sampling: max_loss # unit sampling strategy, support: random, max_loss, min_loss
20-
generate:
21-
mode: aggregated # atomic, aggregated, multi_hop, cot, vqa
22-
data_format: ChatML # Alpaca, Sharegpt, ChatML
1+
pipeline:
2+
- name: read
3+
params:
4+
input_file: resources/input_examples/jsonl_demo.jsonl # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples
5+
6+
- name: chunk
7+
params:
8+
chunk_size: 1024 # chunk size for text splitting
9+
chunk_overlap: 100 # chunk overlap for text splitting
10+
11+
- name: build_kg
12+
13+
- name: quiz_and_judge
14+
params:
15+
quiz_samples: 2 # number of quiz samples to generate
16+
re_judge: false # whether to re-judge the existing quiz samples
17+
18+
- name: partition
19+
deps: [quiz_and_judge] # ece depends on the quiz_and_judge step
20+
params:
21+
method: ece # ece is a custom partition method based on comprehension loss
22+
method_params:
23+
max_units_per_community: 20 # max nodes and edges per community
24+
min_units_per_community: 5 # min nodes and edges per community
25+
max_tokens_per_community: 10240 # max tokens per community
26+
unit_sampling: max_loss # unit sampling strategy, support: random, max_loss, min_loss
27+
28+
- name: generate
29+
params:
30+
method: aggregated # atomic, aggregated, multi_hop, cot, vqa
31+
data_format: ChatML # Alpaca, Sharegpt, ChatML
Lines changed: 21 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1,19 +1,21 @@
1-
read:
2-
input_file: resources/input_examples/json_demo.json # input file path, support json, jsonl, txt, csv, pdf. See resources/input_examples for examples
3-
split:
4-
chunk_size: 1024 # chunk size for text splitting
5-
chunk_overlap: 100 # chunk overlap for text splitting
6-
search: # web search configuration
7-
enabled: false # whether to enable web search
8-
search_types: ["google"] # search engine types, support: google, bing, uniprot, wikipedia
9-
quiz_and_judge: # quiz and test whether the LLM masters the knowledge points
10-
enabled: true
11-
quiz_samples: 2 # number of quiz samples to generate
12-
re_judge: false # whether to re-judge the existing quiz samples
13-
partition: # graph partition configuration
14-
method: dfs # partition method, support: dfs, bfs, ece, leiden
15-
method_params:
16-
max_units_per_community: 1 # atomic partition, one node or edge per community
17-
generate:
18-
mode: atomic # atomic, aggregated, multi_hop, cot, vqa
19-
data_format: Alpaca # Alpaca, Sharegpt, ChatML
1+
pipeline:
2+
- name: read
3+
params:
4+
input_file: resources/input_examples/json_demo.json # input file path, support json, jsonl, txt, csv, pdf. See resources/input_examples for examples
5+
6+
- name: chunk
7+
params:
8+
chunk_size: 1024 # chunk size for text splitting
9+
chunk_overlap: 100 # chunk overlap for text splitting
10+
11+
- name: build_kg
12+
13+
- name: partition
14+
params:
15+
method: dfs # partition method, support: dfs, bfs, ece, leiden
16+
method_params:
17+
max_units_per_community: 1 # atomic partition, one node or edge per community
18+
- name: generate
19+
params:
20+
method: atomic # atomic, aggregated, multi_hop, cot, vqa
21+
data_format: Alpaca # Alpaca, Sharegpt, ChatML

graphgen/configs/cot_config.yaml

Lines changed: 24 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1,19 +1,24 @@
1-
read:
2-
input_file: resources/input_examples/txt_demo.txt # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples
3-
split:
4-
chunk_size: 1024 # chunk size for text splitting
5-
chunk_overlap: 100 # chunk overlap for text splitting
6-
search: # web search configuration
7-
enabled: false # whether to enable web search
8-
search_types: ["google"] # search engine types, support: google, bing, uniprot, wikipedia
9-
quiz_and_judge: # quiz and test whether the LLM masters the knowledge points
10-
enabled: false
11-
partition: # graph partition configuration
12-
method: leiden # leiden is a partitioner detection algorithm
13-
method_params:
14-
max_size: 20 # Maximum size of communities
15-
use_lcc: false # whether to use the largest connected component
16-
random_seed: 42 # random seed for partitioning
17-
generate:
18-
mode: cot # atomic, aggregated, multi_hop, cot, vqa
19-
data_format: Sharegpt # Alpaca, Sharegpt, ChatML
1+
pipeline:
2+
- name: read
3+
params:
4+
input_file: resources/input_examples/txt_demo.txt # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples
5+
6+
- name: chunk
7+
params:
8+
chunk_size: 1024 # chunk size for text splitting
9+
chunk_overlap: 100 # chunk overlap for text splitting
10+
11+
- name: build_kg
12+
13+
- name: partition
14+
params:
15+
method: leiden # leiden is a community detection algorithm
16+
method_params:
17+
max_size: 20 # Maximum size of communities
18+
use_lcc: false # whether to use the largest connected component
19+
random_seed: 42 # random seed for partitioning
20+
21+
- name: generate
22+
params:
23+
method: cot # atomic, aggregated, multi_hop, cot, vqa
24+
data_format: Sharegpt # Alpaca, Sharegpt, ChatML
Lines changed: 25 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -1,22 +1,25 @@
1-
read:
2-
input_file: resources/input_examples/csv_demo.csv # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples
3-
split:
4-
chunk_size: 1024 # chunk size for text splitting
5-
chunk_overlap: 100 # chunk overlap for text splitting
6-
search: # web search configuration
7-
enabled: false # whether to enable web search
8-
search_types: ["google"] # search engine types, support: google, bing, uniprot, wikipedia
9-
quiz_and_judge: # quiz and test whether the LLM masters the knowledge points
10-
enabled: false
11-
quiz_samples: 2 # number of quiz samples to generate
12-
re_judge: false # whether to re-judge the existing quiz samples
13-
partition: # graph partition configuration
14-
method: ece # ece is a custom partition method based on comprehension loss
15-
method_params:
16-
max_units_per_community: 3 # max nodes and edges per community, for multi-hop, we recommend setting it to 3
17-
min_units_per_community: 3 # min nodes and edges per community, for multi-hop, we recommend setting it to 3
18-
max_tokens_per_community: 10240 # max tokens per community
19-
unit_sampling: random # unit sampling strategy, support: random, max_loss, min_loss
20-
generate:
21-
mode: multi_hop # atomic, aggregated, multi_hop, cot, vqa
22-
data_format: ChatML # Alpaca, Sharegpt, ChatML
1+
pipeline:
2+
- name: read
3+
params:
4+
input_file: resources/input_examples/csv_demo.csv # input file path, support json, jsonl, txt, csv, pdf. See resources/input_examples for examples
5+
6+
- name: chunk
7+
params:
8+
chunk_size: 1024 # chunk size for text splitting
9+
chunk_overlap: 100 # chunk overlap for text splitting
10+
11+
- name: build_kg
12+
13+
- name: partition
14+
params:
15+
method: ece # ece is a custom partition method based on comprehension loss
16+
method_params:
17+
max_units_per_community: 3 # max nodes and edges per community, for multi-hop, we recommend setting it to 3
18+
min_units_per_community: 3 # min nodes and edges per community, for multi-hop, we recommend setting it to 3
19+
max_tokens_per_community: 10240 # max tokens per community
20+
unit_sampling: random # unit sampling strategy, support: random, max_loss, min_loss
21+
22+
- name: generate
23+
params:
24+
method: multi_hop # atomic, aggregated, multi_hop, cot, vqa
25+
data_format: ChatML # Alpaca, Sharegpt, ChatML

0 commit comments

Comments
 (0)