
Commit 9217306

Merge pull request #65 from open-sciencelab/feat/pdf-reader
feat: add pdf_reader & tests for MinerUParser
2 parents 9739775 + 6d6f160 · commit 9217306

32 files changed: +451 −70 lines

graphgen/configs/aggregated_config.yaml

Lines changed: 2 additions & 2 deletions

@@ -1,5 +1,5 @@
 read:
-  input_file: resources/input_examples/jsonl_demo.jsonl # input file path, support json, jsonl, txt. See resources/input_examples for examples
+  input_file: resources/input_examples/jsonl_demo.jsonl # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples
 split:
   chunk_size: 1024 # chunk size for text splitting
   chunk_overlap: 100 # chunk overlap for text splitting
@@ -18,5 +18,5 @@ partition: # graph partition configuration
   max_tokens_per_community: 10240 # max tokens per community
   unit_sampling: max_loss # unit sampling strategy, support: random, max_loss, min_loss
 generate:
-  mode: aggregated # atomic, aggregated, multi_hop, cot
+  mode: aggregated # atomic, aggregated, multi_hop, cot, vqa
   data_format: ChatML # Alpaca, Sharegpt, ChatML

graphgen/configs/atomic_config.yaml

Lines changed: 2 additions & 2 deletions

@@ -1,5 +1,5 @@
 read:
-  input_file: resources/input_examples/json_demo.json # input file path, support json, jsonl, txt, csv. See resources/input_examples for examples
+  input_file: resources/input_examples/json_demo.json # input file path, support json, jsonl, txt, csv, pdf. See resources/input_examples for examples
 split:
   chunk_size: 1024 # chunk size for text splitting
   chunk_overlap: 100 # chunk overlap for text splitting
@@ -15,5 +15,5 @@ partition: # graph partition configuration
   method_params:
     max_units_per_community: 1 # atomic partition, one node or edge per community
 generate:
-  mode: atomic # atomic, aggregated, multi_hop, cot
+  mode: atomic # atomic, aggregated, multi_hop, cot, vqa
   data_format: Alpaca # Alpaca, Sharegpt, ChatML

graphgen/configs/cot_config.yaml

Lines changed: 2 additions & 2 deletions

@@ -1,5 +1,5 @@
 read:
-  input_file: resources/input_examples/txt_demo.txt # input file path, support json, jsonl, txt. See resources/input_examples for examples
+  input_file: resources/input_examples/txt_demo.txt # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples
 split:
   chunk_size: 1024 # chunk size for text splitting
   chunk_overlap: 100 # chunk overlap for text splitting
@@ -15,5 +15,5 @@ partition: # graph partition configuration
   use_lcc: false # whether to use the largest connected component
   random_seed: 42 # random seed for partitioning
 generate:
-  mode: cot # atomic, aggregated, multi_hop, cot
+  mode: cot # atomic, aggregated, multi_hop, cot, vqa
   data_format: Sharegpt # Alpaca, Sharegpt, ChatML

graphgen/configs/multi_hop_config.yaml

Lines changed: 2 additions & 2 deletions

@@ -1,5 +1,5 @@
 read:
-  input_file: resources/input_examples/csv_demo.csv # input file path, support json, jsonl, txt. See resources/input_examples for examples
+  input_file: resources/input_examples/csv_demo.csv # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples
 split:
   chunk_size: 1024 # chunk size for text splitting
   chunk_overlap: 100 # chunk overlap for text splitting
@@ -18,5 +18,5 @@ partition: # graph partition configuration
   max_tokens_per_community: 10240 # max tokens per community
   unit_sampling: random # unit sampling strategy, support: random, max_loss, min_loss
 generate:
-  mode: multi_hop # strategy for generating multi-hop QA pairs
+  mode: multi_hop # atomic, aggregated, multi_hop, cot, vqa
   data_format: ChatML # Alpaca, Sharegpt, ChatML

graphgen/configs/vqa_config.yaml

Lines changed: 22 additions & 0 deletions

@@ -0,0 +1,22 @@
+read:
+  input_file: resources/input_examples/pdf_demo.pdf # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples
+split:
+  chunk_size: 1024 # chunk size for text splitting
+  chunk_overlap: 100 # chunk overlap for text splitting
+search: # web search configuration
+  enabled: false # whether to enable web search
+  search_types: ["google"] # search engine types, support: google, bing, uniprot, wikipedia
+quiz_and_judge: # quiz and test whether the LLM masters the knowledge points
+  enabled: true
+  quiz_samples: 2 # number of quiz samples to generate
+  re_judge: false # whether to re-judge the existing quiz samples
+partition: # graph partition configuration
+  method: ece # ece is a custom partition method based on comprehension loss
+  method_params:
+    max_units_per_community: 20 # max nodes and edges per community
+    min_units_per_community: 5 # min nodes and edges per community
+    max_tokens_per_community: 10240 # max tokens per community
+    unit_sampling: max_loss # unit sampling strategy, support: random, max_loss, min_loss
+generate:
+  mode: vqa # atomic, aggregated, multi_hop, cot, vqa
+  data_format: ChatML # Alpaca, Sharegpt, ChatML
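For orientation, a minimal sketch of loading this config with PyYAML and inspecting the fields the PR adds; the actual pipeline is driven by graphgen/generate.py, so this is illustrative only:

import yaml  # PyYAML, assumed available

# Minimal sketch: load the new VQA config and read the new mode.
with open("graphgen/configs/vqa_config.yaml", encoding="utf-8") as f:
    config = yaml.safe_load(f)

print(config["generate"]["mode"])    # -> vqa
print(config["read"]["input_file"])  # -> resources/input_examples/pdf_demo.pdf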

graphgen/generate.py

Lines changed: 5 additions & 18 deletions

@@ -72,24 +72,11 @@ def main():

     graph_gen.search(search_config=config["search"])

-    # Use pipeline according to the output data type
-    if mode in ["atomic", "aggregated", "multi_hop"]:
-        logger.info("Generation mode set to '%s'. Start generation.", mode)
-        if "quiz_and_judge" in config and config["quiz_and_judge"]["enabled"]:
-            graph_gen.quiz_and_judge(quiz_and_judge_config=config["quiz_and_judge"])
-        else:
-            logger.warning(
-                "Quiz and Judge strategy is disabled. Edge sampling falls back to random."
-            )
-            assert (
-                config["partition"]["method"] == "ece"
-                and "method_params" in config["partition"]
-            ), "Only ECE partition with edge sampling is supported."
-            config["partition"]["method_params"]["edge_sampling"] = "random"
-    elif mode == "cot":
-        logger.info("Generation mode set to 'cot'. Start generation.")
-    else:
-        raise ValueError(f"Unsupported output data type: {mode}")
+    if config.get("quiz_and_judge", {}).get("enabled"):
+        graph_gen.quiz_and_judge(quiz_and_judge_config=config["quiz_and_judge"])
+
+    # TODO: add data filtering step here in the future
+    # graph_gen.filter(filter_config=config["filter"])

     graph_gen.generate(
         partition_config=config["partition"],
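The replacement relies on chained dict.get calls so the quiz_and_judge section can be omitted from the config entirely; a plain-Python illustration of the pattern (no GraphGen imports needed):

config_a = {"quiz_and_judge": {"enabled": True}}
config_b = {}  # section omitted from the YAML

# .get("quiz_and_judge", {}) yields an empty dict when the key is missing,
# so the chained .get("enabled") returns None instead of raising KeyError.
assert config_a.get("quiz_and_judge", {}).get("enabled") is True
assert config_b.get("quiz_and_judge", {}).get("enabled") is None  # falsy -> step skipped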

graphgen/graphgen.py

Lines changed: 2 additions & 1 deletion

@@ -91,7 +91,7 @@ async def insert(self, read_config: Dict, split_config: Dict):
         insert chunks into the graph
         """
         # Step 1: Read files
-        data = read_files(read_config["input_file"])
+        data = read_files(read_config["input_file"], self.working_dir)
         if len(data) == 0:
             logger.warning("No data to process")
             return
@@ -105,6 +105,7 @@ async def insert(self, read_config: Dict, split_config: Dict):
                 "content": doc["content"]
             }
             for doc in data
+            if doc.get("type", "text") == "text"
         }
         _add_doc_keys = await self.full_docs_storage.filter_keys(list(new_docs.keys()))
         new_docs = {k: v for k, v in new_docs.items() if k in _add_doc_keys}
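The new type guard keeps non-text records (e.g. images extracted from a PDF) out of full-docs storage. A sketch under assumed record shapes, since the reader's output format is not shown in this diff:

# Hypothetical records; the exact shape read_files produces for PDFs is an
# assumption based on the filter above, not taken from this excerpt.
data = [
    {"type": "text", "content": "Section 1 ..."},
    {"type": "image", "content": "figures/fig1.png"},
    {"content": "record without a type field"},  # defaults to "text"
]

text_docs = [doc for doc in data if doc.get("type", "text") == "text"]
assert len(text_docs) == 2  # the image record is filtered out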

graphgen/models/__init__.py

Lines changed: 2 additions & 1 deletion

@@ -4,6 +4,7 @@
     AtomicGenerator,
     CoTGenerator,
     MultiHopGenerator,
+    VQAGenerator,
 )
 from .kg_builder import LightRAGKGBuilder
 from .llm.openai_client import OpenAIClient
@@ -14,7 +15,7 @@
     ECEPartitioner,
     LeidenPartitioner,
 )
-from .reader import CsvReader, JsonlReader, JsonReader, TxtReader
+from .reader import CSVReader, JSONLReader, JSONReader, PDFReader, TXTReader
 from .search.db.uniprot_search import UniProtSearch
 from .search.kg.wiki_search import WikiSearch
 from .search.web.bing_search import BingSearch
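With the renames, downstream imports change from CsvReader/JsonReader/JsonlReader/TxtReader to the upper-cased names, and PDFReader becomes importable. A hedged usage sketch; PDFReader's constructor and method signature are assumptions, since its implementation (backed by MinerUParser per the commit message) is not part of this excerpt:

from graphgen.models import PDFReader  # new export added by this PR

# Assumed interface (not shown in this diff): readers expose a read() method
# returning a list of {"type": ..., "content": ...} records.
reader = PDFReader()
docs = reader.read("resources/input_examples/pdf_demo.pdf")
print(len(docs))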

graphgen/models/generator/__init__.py

Lines changed: 1 addition & 0 deletions

@@ -2,3 +2,4 @@
 from .atomic_generator import AtomicGenerator
 from .cot_generator import CoTGenerator
 from .multi_hop_generator import MultiHopGenerator
+from .vqa_generator import VQAGenerator
graphgen/models/generator/vqa_generator.py

Lines changed: 23 additions & 0 deletions

@@ -0,0 +1,23 @@
+from dataclasses import dataclass
+from typing import Any
+
+from graphgen.bases import BaseGenerator
+
+
+@dataclass
+class VQAGenerator(BaseGenerator):
+    @staticmethod
+    def build_prompt(
+        batch: tuple[list[tuple[str, dict]], list[tuple[Any, Any, dict]]]
+    ) -> str:
+        raise NotImplementedError(
+            "VQAGenerator.build_prompt is not implemented. "
+            "Please provide an implementation for VQA prompt construction."
+        )
+
+    @staticmethod
+    def parse_response(response: str) -> Any:
+        raise NotImplementedError(
+            "VQAGenerator.parse_response is not implemented. "
+            "Please provide an implementation for VQA response parsing."
+        )
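Both methods are stubs, so the vqa mode is wired up but not yet functional. A sketch of how a concrete subclass could fill them in; the batch layout is inferred from the type hint above, and the Q/A response format is an assumption:

from typing import Any

# Hypothetical subclass; (nodes, edges) layout and the "Q: ...\nA: ..."
# response format are assumptions, not GraphGen's documented behavior.
class SimpleVQAGenerator(VQAGenerator):
    @staticmethod
    def build_prompt(batch) -> str:
        nodes, edges = batch  # list[(name, attrs)], list[(src, dst, attrs)]
        facts = [f"{name}: {attrs.get('description', '')}" for name, attrs in nodes]
        return "Generate a visual question-answer pair from:\n" + "\n".join(facts)

    @staticmethod
    def parse_response(response: str) -> Any:
        question, _, answer = response.partition("\nA:")
        return {
            "question": question.removeprefix("Q:").strip(),
            "answer": answer.strip(),
        }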
