|
52 | 52 | DEFAULT_CHUNK_SIZE = int(os.getenv("FILE_PARSER_CHUNK_SIZE", "1000")) |
53 | 53 | DEFAULT_CHUNK_OVERLAP = int(os.getenv("FILE_PARSER_CHUNK_OVERLAP", "200")) |
54 | 54 |
|
| 55 | + |
| 56 | +def _simple_split_text(text: str, chunk_size: int, chunk_overlap: int) -> list[str]: |
| 57 | + """ |
| 58 | + Simple text splitter as fallback when langchain is not available. |
| 59 | +
|
| 60 | + Args: |
| 61 | + text: Text to split |
| 62 | + chunk_size: Maximum size of chunks |
| 63 | + chunk_overlap: Overlap between chunks |
| 64 | +
|
| 65 | + Returns: |
| 66 | + List of text chunks |
| 67 | + """ |
| 68 | + if not text or len(text) <= chunk_size: |
| 69 | + return [text] if text.strip() else [] |
| 70 | + |
| 71 | + chunks = [] |
| 72 | + start = 0 |
| 73 | + text_len = len(text) |
| 74 | + |
| 75 | + while start < text_len: |
| 76 | + # Calculate end position |
| 77 | + end = min(start + chunk_size, text_len) |
| 78 | + |
| 79 | + # If not the last chunk, try to break at a good position |
| 80 | + if end < text_len: |
| 81 | + # Try to break at newline, sentence end, or space |
| 82 | + for separator in ["\n\n", "\n", "。", "!", "?", ". ", "! ", "? ", " "]: |
| 83 | + last_sep = text.rfind(separator, start, end) |
| 84 | + if last_sep != -1: |
| 85 | + end = last_sep + len(separator) |
| 86 | + break |
| 87 | + |
| 88 | + chunk = text[start:end].strip() |
| 89 | + if chunk: |
| 90 | + chunks.append(chunk) |
| 91 | + |
| 92 | + # Move start position with overlap |
| 93 | + start = max(start + 1, end - chunk_overlap) |
| 94 | + |
| 95 | + return chunks |
| 96 | + |
| 97 | + |
55 | 98 | # Initialize parser instance |
56 | 99 | file_parser = None |
57 | 100 | try: |
|
64 | 107 |
|
65 | 108 | # Initialize text splitter instance |
66 | 109 | text_splitter = None |
| 110 | +_use_simple_splitter = False |
| 111 | + |
67 | 112 | try: |
68 | 113 | try: |
69 | 114 | from langchain.text_splitter import RecursiveCharacterTextSplitter |
|
82 | 127 | separators=["\n\n", "\n", "。", "!", "?", ". ", "! ", "? ", " ", ""], |
83 | 128 | ) |
84 | 129 | logger.debug( |
85 | | - f"[FileContentParser] Initialized text splitter with chunk_size={DEFAULT_CHUNK_SIZE}, " |
| 130 | + f"[FileContentParser] Initialized langchain text splitter with chunk_size={DEFAULT_CHUNK_SIZE}, " |
86 | 131 | f"chunk_overlap={DEFAULT_CHUNK_OVERLAP}" |
87 | 132 | ) |
88 | 133 | except ImportError as e: |
89 | 134 | logger.warning( |
90 | | - f"[FileContentParser] langchain not available, text splitting will be disabled: {e}. " |
| 135 | + f"[FileContentParser] langchain not available, using simple text splitter as fallback: {e}. " |
91 | 136 | "Install with: pip install langchain or pip install langchain-text-splitters" |
92 | 137 | ) |
93 | 138 | text_splitter = None |
| 139 | + _use_simple_splitter = True |
94 | 140 | except Exception as e: |
95 | | - logger.error(f"[FileContentParser] Failed to initialize text splitter: {e}") |
| 141 | + logger.error( |
| 142 | + f"[FileContentParser] Failed to initialize text splitter: {e}, using simple splitter as fallback" |
| 143 | + ) |
96 | 144 | text_splitter = None |
| 145 | + _use_simple_splitter = True |
| 146 | + |
| 147 | + |
def get_parser() -> Any:
    """
    Return the module-level file parser.

    Returns:
        The current value of the module-level ``file_parser`` variable, which
        is ``None`` when no parser instance is available.
    """
    # Plain accessor: hands back whatever module initialisation stored.
    return file_parser
| 156 | + |
| 157 | + |
def get_text_splitter(chunk_size: int | None = None, chunk_overlap: int | None = None) -> Any:
    """
    Return an object exposing ``split_text(text) -> list[str]``.

    Args:
        chunk_size: Maximum chunk size for the simple fallback splitter;
            a falsy value selects ``DEFAULT_CHUNK_SIZE``.
        chunk_overlap: Chunk overlap for the simple fallback splitter;
            a falsy value selects ``DEFAULT_CHUNK_OVERLAP``.

    Returns:
        The module-level ``text_splitter`` when one was initialised (both
        arguments are ignored in that case), otherwise a wrapper around
        ``_simple_split_text`` when the simple fallback is enabled, otherwise
        ``None``.
    """
    # Preferred path: the pre-built module-level splitter.
    if text_splitter is not None:
        return text_splitter

    # Neither a real splitter nor the fallback is available.
    if not _use_simple_splitter:
        return None

    class SimpleTextSplitter:
        """Simple text splitter wrapper."""

        def __init__(self, chunk_size: int, chunk_overlap: int):
            self.chunk_size = chunk_size
            self.chunk_overlap = chunk_overlap

        def split_text(self, text: str) -> list[str]:
            # Delegate to the module-level fallback with the stored settings.
            return _simple_split_text(text, self.chunk_size, self.chunk_overlap)

    # `or` is intentional: None and 0 both fall back to the module defaults.
    return SimpleTextSplitter(
        chunk_size or DEFAULT_CHUNK_SIZE,
        chunk_overlap or DEFAULT_CHUNK_OVERLAP,
    )
97 | 190 |
|
98 | 191 |
|
99 | 192 | def extract_role(message: dict[str, Any]) -> str: |
|
0 commit comments