InternScience
diff --git a/‎examples/configs/README.md‎
Lines changed: 0 additions & 1 deletion b/‎examples/configs/README.md‎
Lines changed: 0 additions & 1 deletion
diff --git a/‎examples/extract/extract_schema_guided/README.md‎
Lines changed: 1 addition & 0 deletions b/‎examples/extract/extract_schema_guided/README.md‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎examples/extract/extract_schema_guided/extract_schema_guided.sh‎
Lines changed: 1 addition & 1 deletion b/‎examples/extract/extract_schema_guided/extract_schema_guided.sh‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎examples/extract/extract_schema_guided/schema_guided_extraction_config.yaml‎
Lines changed: 28 additions & 14 deletions b/‎examples/extract/extract_schema_guided/schema_guided_extraction_config.yaml‎
Lines changed: 28 additions & 14 deletions
diff --git a/examples/configs/search_dna_config.yaml b/examples/search/search_dna_config.yaml (examples/configs/search_dna_config.yaml renamed to examples/search/search_dna_config.yaml)
diff --git a/examples/configs/search_protein_config.yaml b/examples/search/search_protein_config.yaml (examples/configs/search_protein_config.yaml renamed to examples/search/search_protein_config.yaml)
diff --git a/examples/configs/search_rna_config.yaml b/examples/search/search_rna_config.yaml (examples/configs/search_rna_config.yaml renamed to examples/search/search_rna_config.yaml)
diff --git a/‎graphgen/models/extractor/schema_guided_extractor.py‎
Lines changed: 3 additions & 5 deletions b/‎graphgen/models/extractor/schema_guided_extractor.py‎
Lines changed: 3 additions & 5 deletions
diff --git a/‎graphgen/models/reader/txt_reader.py‎
Lines changed: 5 additions & 2 deletions b/‎graphgen/models/reader/txt_reader.py‎
Lines changed: 5 additions & 2 deletions
diff --git a/‎graphgen/operators/__init__.py‎
Lines changed: 2 additions & 2 deletions b/‎graphgen/operators/__init__.py‎
Lines changed: 2 additions & 2 deletions
@@ -0,0 +1 @@
+# Extract Schema-Guided Information from Documents
@@ -1,3 +1,3 @@
 python3 -m graphgen.run \
---config_file graphgen/configs/schema_guided_extraction_config.yaml \
+--config_file examples/extract/extract_schema_guided/schema_guided_extraction_config.yaml \
 --output_dir cache/
@@ -1,20 +1,34 @@
-pipeline:
-  - name: read_step
-    op_key: read
+global_params:
+  working_dir: cache
+
+nodes:
+  - id: read
+    op_name: read
+    type: source
+    dependencies: []
     params:
-      input_file: resources/input_examples/extract_demo.txt # input file path, support json, jsonl, txt, pdf. See resources/input_examples for examples
+      input_path:
+        - examples/input_examples/extract_demo.txt
 
-  - name: chunk_step
-    op_key: chunk
-    deps: [read_step] # chunk_step depends on read_step
+  - id: chunk
+    op_name: chunk
+    type: map_batch
+    dependencies:
+      - read
+    execution_params:
+      replicas: 4
     params:
-      chunk_size: 20480
+      chunk_size: 20480 # larger chunk size for better context
       chunk_overlap: 2000
-      separators: []
 
-  - name: extract_step
-    op_key: extract
-    deps: [chunk_step] # extract_step depends on chunk_step
+  - id: extract
+    op_name: extract
+    type: map_batch
+    dependencies:
+      - chunk
+    execution_params:
+      replicas: 1
+      batch_size: 128
     params:
-      method: schema_guided # extraction method, support: schema_guided
-      schema_file: graphgen/templates/extraction/schemas/legal_contract.json # schema file path for schema_guided method
+      method: schema_guided
+      schema_path: graphgen/templates/extraction/schemas/legal_contract.json
@@ -60,8 +60,8 @@ def build_prompt(self, text: str) -> str:
         return prompt
 
     async def extract(self, chunk: dict) -> dict:
-        _chunk_id = list(chunk.keys())[0]
-        text = chunk[_chunk_id].get("content", "")
+        _chunk_id = chunk.get("_chunk_id", "")
+        text = chunk.get("content", "")
 
         prompt = self.build_prompt(text)
         response = await self.llm_client.generate_answer(prompt)
@@ -88,9 +88,7 @@ async def extract(self, chunk: dict) -> dict:
             return {}
 
     @staticmethod
-    async def merge_extractions(
-        extraction_list: List[Dict[str, dict]]
-    ) -> Dict[str, dict]:
+    def merge_extractions(extraction_list: List[Dict[str, dict]]) -> Dict[str, dict]:
         """
         Merge multiple extraction results based on their hashes.
         :param extraction_list: List of extraction results, each is a dict with hash as key and record as value.
 
@@ -16,12 +16,15 @@ def read(
         :param input_path: Path to the input text file or list of text files.
         :return: Ray Dataset containing the read text data.
         """
-        docs_ds = ray.data.read_text(input_path, encoding="utf-8")
+        docs_ds = ray.data.read_binary_files(
+            input_path,
+            include_paths=False,
+        )
 
         docs_ds = docs_ds.map(
             lambda row: {
                 "type": "text",
-                self.text_column: row["text"],
+                self.text_column: row["bytes"].decode("utf-8"),
             }
         )
 
 
@@ -1,6 +1,6 @@
 from .build_kg import BuildKGService
 from .chunk import ChunkService
-from .extract import extract
+from .extract import ExtractService
 from .generate import GenerateService
 from .judge import JudgeService
 from .partition import PartitionService
@@ -14,7 +14,7 @@
     "build_kg": BuildKGService,
     "quiz": QuizService,
     "judge": JudgeService,
-    "extract": extract,
+    "extract": ExtractService,
     "search": search_all,
     "partition": PartitionService,
     "generate": GenerateService,
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	`+# Extract Schema-Guided Information from Documents`