refactor: rewrite TXTReader on top of Ray Data

ChenZiHong-Gavin · ChenZiHong-Gavin · commit bd2f7c471b16 · 2025-11-21T20:26:32.000+08:00
diff --git a/graphgen/bases/base_reader.py b/graphgen/bases/base_reader.py
@@ -1,8 +1,9 @@
 import os
 from abc import ABC, abstractmethod
-from typing import Any, Dict, List
+from typing import Any, Dict, List, Union
 
 import requests
+from ray.data import Dataset
 
 
 class BaseReader(ABC):
@@ -14,52 +15,58 @@ def __init__(self, text_column: str = "content"):
         self.text_column = text_column
 
     @abstractmethod
-    def read(self, file_path: str) -> List[Dict[str, Any]]:
+    def read(self, input_path: Union[str, List[str]]) -> Dataset:
         """
         Read data from the specified file path.
 
-        :param file_path: Path to the input file.
-        :return: List of dictionaries containing the data.
+        :param input_path: Path to the input file or list of file paths.
+        :return: Ray Dataset containing the read data.
         """
 
-    @staticmethod
-    def filter(data: List[dict]) -> List[dict]:
+    def _should_keep_item(self, item: Dict[str, Any]) -> bool:
         """
-        Filter out entries with empty or missing text in the specified column.
+        Decide whether to keep the given item.
+
+        Text items are kept only when ``self.text_column`` holds a non-blank
+        string; every other supported type is kept unconditionally.
 
-        :param data: List of dictionaries containing the data.
-        :return: Filtered list of dictionaries.
+        :param item: Dictionary representing a data entry.
+        :return: True if the item should be kept, False otherwise.
+        :raises ValueError: If the item's "type" is not a supported type.
         """
+        item_type = item.get("type")
+        if item_type not in ("text", "image", "table", "equation", "protein"):
+            # Raise explicitly: an `assert` would be stripped under `python -O`
+            # and unsupported rows would then pass the filter unnoticed.
+            raise ValueError(f"Unsupported item type: {item_type}")
+        if item_type != "text":
+            # NOTE(review): the former `filter` also required `img_path` to
+            # exist for image/table/equation items; that check is no longer
+            # applied here -- confirm this is intended.
+            return True
+        # The column may be absent, None, or non-string; drop such rows rather
+        # than crash on `.strip()`.
+        content = item.get(self.text_column)
+        return isinstance(content, str) and bool(content.strip())
 
-        def _image_exists(path_or_url: str, timeout: int = 3) -> bool:
-            """
-            Check if an image exists at the given local path or URL.
-            :param path_or_url: Local file path or remote URL of the image.
-            :param timeout: Timeout for remote URL requests in seconds.
-            :return: True if the image exists, False otherwise.
-            """
-            if not path_or_url:
-                return False
-            if not path_or_url.startswith(("http://", "https://", "ftp://")):
-                path = path_or_url.replace("file://", "", 1)
-                path = os.path.abspath(path)
-                return os.path.isfile(path)
-            try:
-                resp = requests.head(path_or_url, allow_redirects=True, timeout=timeout)
-                return resp.status_code == 200
-            except requests.RequestException:
-                return False
-
-        filtered_data = []
-        for item in data:
-            if item.get("type") == "text":
-                content = item.get("content", "").strip()
-                if content:
-                    filtered_data.append(item)
-            elif item.get("type") in ("image", "table", "equation"):
-                img_path = item.get("img_path")
-                if _image_exists(img_path):
-                    filtered_data.append(item)
-            else:
-                filtered_data.append(item)
-        return filtered_data
+    @staticmethod
+    def _image_exists(path_or_url: str, timeout: int = 3) -> bool:
+        """
+        Check if an image exists at the given local path or URL.
+        :param path_or_url: Local file path or remote URL of the image.
+        :param timeout: Timeout for remote URL requests in seconds.
+        :return: True if the image exists, False otherwise.
+        """
+        if not path_or_url:
+            return False
+        if not path_or_url.startswith(("http://", "https://", "ftp://")):
+            # Anything without a remote scheme is treated as a local path.
+            path = path_or_url.replace("file://", "", 1)
+            path = os.path.abspath(path)
+            return os.path.isfile(path)
+        try:
+            # Only a final 200 (after following redirects) counts as present.
+            resp = requests.head(path_or_url, allow_redirects=True, timeout=timeout)
+            return resp.status_code == 200
+        except requests.RequestException:
+            return False
diff --git a/graphgen/models/reader/txt_reader.py b/graphgen/models/reader/txt_reader.py
@@ -1,10 +1,38 @@
-from typing import Any, Dict, List
+from typing import List, Union
+
+import ray
+from ray.data import Dataset
 
 from graphgen.bases.base_reader import BaseReader
 
 
 class TXTReader(BaseReader):
-    def read(self, file_path: str) -> List[Dict[str, Any]]:
-        with open(file_path, "r", encoding="utf-8") as f:
-            docs = [{"type": "text", self.text_column: f.read()}]
-        return self.filter(docs)
+    def read(
+        self,
+        input_path: Union[str, List[str]],
+        override_num_blocks: int = 4,
+    ) -> Dataset:
+        """
+        Read text files into a Ray Dataset, one row per file.
+
+        :param input_path: Path to the input text file or list of text files.
+        :param override_num_blocks: Number of output blocks requested from the
+            Ray Data read.
+        :return: Ray Dataset of ``{"type": "text", <text_column>: <file text>}``
+            rows, with blank documents filtered out.
+        """
+        # `read_text` would emit one row per *line*; reading the raw bytes
+        # keeps each file a single document, as the pre-Ray reader did.
+        docs_ds = ray.data.read_binary_files(
+            input_path, override_num_blocks=override_num_blocks
+        )
+
+        docs_ds = docs_ds.map(
+            lambda row: {
+                "type": "text",
+                self.text_column: row["bytes"].decode("utf-8"),
+            }
+        )
+
+        docs_ds = docs_ds.filter(self._should_keep_item)
+        return docs_ds