Skip to content

Commit 37cbfcf

Browse files
feat: refactor schema_guided_extraction & add examples
1 parent 73f70a5 commit 37cbfcf

File tree

13 files changed

+85
-73
lines changed

13 files changed

+85
-73
lines changed

examples/configs/README.md

Lines changed: 0 additions & 1 deletion
This file was deleted.
Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1 @@
1+
# Extract Schema-Guided Information from Documents
Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,3 +1,3 @@
11
python3 -m graphgen.run \
2-
--config_file graphgen/configs/schema_guided_extraction_config.yaml \
2+
--config_file examples/extract/extract_schema_guided/schema_guided_extraction_config.yaml \
33
--output_dir cache/
Lines changed: 28 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -1,20 +1,34 @@
1-
pipeline:
2-
- name: read_step
3-
op_key: read
1+
global_params:
2+
working_dir: cache
3+
4+
nodes:
5+
- id: read
6+
op_name: read
7+
type: source
8+
dependencies: []
49
params:
5-
input_file: resources/input_examples/extract_demo.txt # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples
10+
input_path:
11+
- examples/input_examples/extract_demo.txt
612

7-
- name: chunk_step
8-
op_key: chunk
9-
deps: [read_step] # chunk_step depends on read_step
13+
- id: chunk
14+
op_name: chunk
15+
type: map_batch
16+
dependencies:
17+
- read
18+
execution_params:
19+
replicas: 4
1020
params:
11-
chunk_size: 20480
21+
chunk_size: 20480 # larger chunk size for better context
1222
chunk_overlap: 2000
13-
separators: []
1423

15-
- name: extract_step
16-
op_key: extract
17-
deps: [chunk_step] # extract_step depends on chunk_step
24+
- id: extract
25+
op_name: extract
26+
type: map_batch
27+
dependencies:
28+
- chunk
29+
execution_params:
30+
replicas: 1
31+
batch_size: 128
1832
params:
19-
method: schema_guided # extraction method, support: schema_guided
20-
schema_file: graphgen/templates/extraction/schemas/legal_contract.json # schema file path for schema_guided method
33+
method: schema_guided
34+
schema_path: graphgen/templates/extraction/schemas/legal_contract.json
File renamed without changes.

graphgen/models/extractor/schema_guided_extractor.py

Lines changed: 3 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -60,8 +60,8 @@ def build_prompt(self, text: str) -> str:
6060
return prompt
6161

6262
async def extract(self, chunk: dict) -> dict:
63-
_chunk_id = list(chunk.keys())[0]
64-
text = chunk[_chunk_id].get("content", "")
63+
_chunk_id = chunk.get("_chunk_id", "")
64+
text = chunk.get("content", "")
6565

6666
prompt = self.build_prompt(text)
6767
response = await self.llm_client.generate_answer(prompt)
@@ -88,9 +88,7 @@ async def extract(self, chunk: dict) -> dict:
8888
return {}
8989

9090
@staticmethod
91-
async def merge_extractions(
92-
extraction_list: List[Dict[str, dict]]
93-
) -> Dict[str, dict]:
91+
def merge_extractions(extraction_list: List[Dict[str, dict]]) -> Dict[str, dict]:
9492
"""
9593
Merge multiple extraction results based on their hashes.
9694
:param extraction_list: List of extraction results, each is a dict with hash as key and record as value.

graphgen/models/reader/txt_reader.py

Lines changed: 5 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -16,12 +16,15 @@ def read(
1616
:param input_path: Path to the input text file or list of text files.
1717
:return: Ray Dataset containing the read text data.
1818
"""
19-
docs_ds = ray.data.read_text(input_path, encoding="utf-8")
19+
docs_ds = ray.data.read_binary_files(
20+
input_path,
21+
include_paths=False,
22+
)
2023

2124
docs_ds = docs_ds.map(
2225
lambda row: {
2326
"type": "text",
24-
self.text_column: row["text"],
27+
self.text_column: row["bytes"].decode("utf-8"),
2528
}
2629
)
2730

graphgen/operators/__init__.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,6 @@
11
from .build_kg import BuildKGService
22
from .chunk import ChunkService
3-
from .extract import extract
3+
from .extract import ExtractService
44
from .generate import GenerateService
55
from .judge import JudgeService
66
from .partition import PartitionService
@@ -14,7 +14,7 @@
1414
"build_kg": BuildKGService,
1515
"quiz": QuizService,
1616
"judge": JudgeService,
17-
"extract": extract,
17+
"extract": ExtractService,
1818
"search": search_all,
1919
"partition": PartitionService,
2020
"generate": GenerateService,

0 commit comments

Comments
 (0)