
Commit 802d2bf

text-splitters: Add ruff rule UP (pyupgrade) (#31841)
See https://docs.astral.sh/ruff/rules/#pyupgrade-up. All changes were auto-fixed except the `typing.AbstractSet` -> `collections.abc.Set` rename.
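
For context, a minimal before/after sketch of the kind of rewrite the UP rules perform; the module and function names below are hypothetical and not taken from this commit, and the command in the comment assumes ruff is run with rule selection and autofix enabled.

# Hypothetical example, not code from this commit: what pyupgrade-style fixes look like.
# Something like `ruff check --select UP --fix` applies these rewrites automatically;
# the typing.AbstractSet -> collections.abc.Set rename was the one change done by hand.

# Old spellings (pre-UP):
#   from typing import AbstractSet, List
#   def keep_allowed(items: List[str], allowed: AbstractSet[str]) -> List[str]: ...

# Modernized spellings (post-UP):
from collections.abc import Set


def keep_allowed(items: list[str], allowed: Set[str]) -> list[str]:
    """Keep only the items present in `allowed` (illustration only)."""
    return [item for item in items if item in allowed]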
1 parent 911b0b6 commit 802d2bf

13 files changed · +106 −115 lines changed


libs/text-splitters/langchain_text_splitters/base.py

Lines changed: 16 additions & 21 deletions
@@ -3,19 +3,14 @@
 import copy
 import logging
 from abc import ABC, abstractmethod
+from collections.abc import Collection, Iterable, Sequence, Set
 from dataclasses import dataclass
 from enum import Enum
 from typing import (
-    AbstractSet,
     Any,
     Callable,
-    Collection,
-    Iterable,
-    List,
     Literal,
     Optional,
-    Sequence,
-    Type,
     TypeVar,
     Union,
 )
@@ -64,12 +59,12 @@ def __init__(
         self._strip_whitespace = strip_whitespace

     @abstractmethod
-    def split_text(self, text: str) -> List[str]:
+    def split_text(self, text: str) -> list[str]:
         """Split text into multiple components."""

     def create_documents(
         self, texts: list[str], metadatas: Optional[list[dict[Any, Any]]] = None
-    ) -> List[Document]:
+    ) -> list[Document]:
         """Create documents from a list of texts."""
         _metadatas = metadatas or [{}] * len(texts)
         documents = []
@@ -87,15 +82,15 @@ def create_documents(
             documents.append(new_doc)
         return documents

-    def split_documents(self, documents: Iterable[Document]) -> List[Document]:
+    def split_documents(self, documents: Iterable[Document]) -> list[Document]:
         """Split documents."""
         texts, metadatas = [], []
         for doc in documents:
             texts.append(doc.page_content)
             metadatas.append(doc.metadata)
         return self.create_documents(texts, metadatas=metadatas)

-    def _join_docs(self, docs: List[str], separator: str) -> Optional[str]:
+    def _join_docs(self, docs: list[str], separator: str) -> Optional[str]:
         text = separator.join(docs)
         if self._strip_whitespace:
             text = text.strip()
@@ -104,13 +99,13 @@ def _join_docs(self, docs: List[str], separator: str) -> Optional[str]:
         else:
             return text

-    def _merge_splits(self, splits: Iterable[str], separator: str) -> List[str]:
+    def _merge_splits(self, splits: Iterable[str], separator: str) -> list[str]:
         # We now want to combine these smaller pieces into medium size
         # chunks to send to the LLM.
         separator_len = self._length_function(separator)

         docs = []
-        current_doc: List[str] = []
+        current_doc: list[str] = []
         total = 0
         for d in splits:
             _len = self._length_function(d)
@@ -169,10 +164,10 @@ def _huggingface_tokenizer_length(text: str) -> int:

     @classmethod
     def from_tiktoken_encoder(
-        cls: Type[TS],
+        cls: type[TS],
         encoding_name: str = "gpt2",
         model_name: Optional[str] = None,
-        allowed_special: Union[Literal["all"], AbstractSet[str]] = set(),
+        allowed_special: Union[Literal["all"], Set[str]] = set(),
         disallowed_special: Union[Literal["all"], Collection[str]] = "all",
         **kwargs: Any,
     ) -> TS:
@@ -225,7 +220,7 @@ def __init__(
         self,
         encoding_name: str = "gpt2",
         model_name: Optional[str] = None,
-        allowed_special: Union[Literal["all"], AbstractSet[str]] = set(),
+        allowed_special: Union[Literal["all"], Set[str]] = set(),
         disallowed_special: Union[Literal["all"], Collection[str]] = "all",
         **kwargs: Any,
     ) -> None:
@@ -248,7 +243,7 @@ def __init__(
         self._allowed_special = allowed_special
         self._disallowed_special = disallowed_special

-    def split_text(self, text: str) -> List[str]:
+    def split_text(self, text: str) -> list[str]:
         """Splits the input text into smaller chunks based on tokenization.

         This method uses a custom tokenizer configuration to encode the input text
@@ -264,7 +259,7 @@ def split_text(self, text: str) -> List[str]:
             of the input text based on the tokenization and chunking rules.
         """

-        def _encode(_text: str) -> List[int]:
+        def _encode(_text: str) -> list[int]:
             return self._tokenizer.encode(
                 _text,
                 allowed_special=self._allowed_special,
@@ -320,15 +315,15 @@ class Tokenizer:
     """Overlap in tokens between chunks"""
     tokens_per_chunk: int
     """Maximum number of tokens per chunk"""
-    decode: Callable[[List[int]], str]
+    decode: Callable[[list[int]], str]
     """ Function to decode a list of token ids to a string"""
-    encode: Callable[[str], List[int]]
+    encode: Callable[[str], list[int]]
     """ Function to encode a string to a list of token ids"""


-def split_text_on_tokens(*, text: str, tokenizer: Tokenizer) -> List[str]:
+def split_text_on_tokens(*, text: str, tokenizer: Tokenizer) -> list[str]:
     """Split incoming text and return chunks using tokenizer."""
-    splits: List[str] = []
+    splits: list[str] = []
     input_ids = tokenizer.encode(text)
     start_idx = 0
     cur_idx = min(start_idx + tokenizer.tokens_per_chunk, len(input_ids))
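
A hedged aside on the allowed_special change above: typing.AbstractSet has long been an alias for collections.abc.Set, and the built-in set type is registered against that ABC, so the set() default in the diff continues to satisfy the new annotation. The snippet below is an illustrative check, not part of the commit.

# Illustrative check (not from the commit): built-in sets already implement the
# collections.abc.Set interface, so swapping typing.AbstractSet for it preserves behavior.
from collections.abc import Set

allowed_special: Set[str] = set()            # same default value as in the diff
print(isinstance(allowed_special, Set))      # True: set implements the Set ABC
print(isinstance(frozenset({"all"}), Set))   # True: frozenset does as well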

libs/text-splitters/langchain_text_splitters/character.py

Lines changed: 7 additions & 7 deletions
@@ -1,7 +1,7 @@
 from __future__ import annotations

 import re
-from typing import Any, List, Literal, Optional, Union
+from typing import Any, Literal, Optional, Union

 from langchain_text_splitters.base import Language, TextSplitter

@@ -17,7 +17,7 @@ def __init__(
         self._separator = separator
         self._is_separator_regex = is_separator_regex

-    def split_text(self, text: str) -> List[str]:
+    def split_text(self, text: str) -> list[str]:
         """Split into chunks without re-inserting lookaround separators."""
         # 1. Determine split pattern: raw regex or escaped literal
         sep_pattern = (
@@ -46,7 +46,7 @@ def split_text(self, text: str) -> List[str]:

 def _split_text_with_regex(
     text: str, separator: str, keep_separator: Union[bool, Literal["start", "end"]]
-) -> List[str]:
+) -> list[str]:
     # Now that we have the separator, split the text
     if separator:
         if keep_separator:
@@ -80,7 +80,7 @@ class RecursiveCharacterTextSplitter(TextSplitter):

     def __init__(
         self,
-        separators: Optional[List[str]] = None,
+        separators: Optional[list[str]] = None,
         keep_separator: Union[bool, Literal["start", "end"]] = True,
         is_separator_regex: bool = False,
         **kwargs: Any,
@@ -90,7 +90,7 @@ def __init__(
         self._separators = separators or ["\n\n", "\n", " ", ""]
         self._is_separator_regex = is_separator_regex

-    def _split_text(self, text: str, separators: List[str]) -> List[str]:
+    def _split_text(self, text: str, separators: list[str]) -> list[str]:
         """Split incoming text and return chunks."""
         final_chunks = []
         # Get appropriate separator to use
@@ -130,7 +130,7 @@ def _split_text(self, text: str, separators: List[str]) -> List[str]:
             final_chunks.extend(merged_text)
         return final_chunks

-    def split_text(self, text: str) -> List[str]:
+    def split_text(self, text: str) -> list[str]:
         """Split the input text into smaller chunks based on predefined separators.

         Args:
@@ -161,7 +161,7 @@ def from_language(
         return cls(separators=separators, is_separator_regex=True, **kwargs)

     @staticmethod
-    def get_separators_for_language(language: Language) -> List[str]:
+    def get_separators_for_language(language: Language) -> list[str]:
         """Retrieve a list of separators specific to the given language.

         Args:
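
One more hedged note on character.py: the retained `from __future__ import annotations` means annotations in that module are stored as strings rather than evaluated, so spellings like list[str] are safe in annotation position even on interpreters that predate subscriptable built-ins. The snippet below is a standalone illustration, not code from the commit.

# Standalone illustration (not from the commit): with the future import, annotations are
# not evaluated at function-definition time, so PEP 585 generics like list[str] incur no
# runtime work where typing.List would otherwise have been required.
from __future__ import annotations


def split_words(text: str) -> list[str]:
    # The return annotation above is kept as a string and never evaluated here.
    return text.split()


print(split_words("foo bar baz"))  # ['foo', 'bar', 'baz']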
