import copy
import logging
from abc import ABC, abstractmethod
+from collections.abc import Collection, Iterable, Sequence, Set
from dataclasses import dataclass
from enum import Enum
from typing import (
-    AbstractSet,
    Any,
    Callable,
-    Collection,
-    Iterable,
-    List,
    Literal,
    Optional,
-    Sequence,
-    Type,
    TypeVar,
    Union,
)
@@ -64,12 +59,12 @@ def __init__(
        self._strip_whitespace = strip_whitespace

    @abstractmethod
-    def split_text(self, text: str) -> List[str]:
+    def split_text(self, text: str) -> list[str]:
        """Split text into multiple components."""

    def create_documents(
        self, texts: list[str], metadatas: Optional[list[dict[Any, Any]]] = None
-    ) -> List[Document]:
+    ) -> list[Document]:
        """Create documents from a list of texts."""
        _metadatas = metadatas or [{}] * len(texts)
        documents = []
@@ -87,15 +82,15 @@ def create_documents(
            documents.append(new_doc)
        return documents

-    def split_documents(self, documents: Iterable[Document]) -> List[Document]:
+    def split_documents(self, documents: Iterable[Document]) -> list[Document]:
        """Split documents."""
        texts, metadatas = [], []
        for doc in documents:
            texts.append(doc.page_content)
            metadatas.append(doc.metadata)
        return self.create_documents(texts, metadatas=metadatas)

-    def _join_docs(self, docs: List[str], separator: str) -> Optional[str]:
+    def _join_docs(self, docs: list[str], separator: str) -> Optional[str]:
        text = separator.join(docs)
        if self._strip_whitespace:
            text = text.strip()
@@ -104,13 +99,13 @@ def _join_docs(self, docs: List[str], separator: str) -> Optional[str]:
        else:
            return text

-    def _merge_splits(self, splits: Iterable[str], separator: str) -> List[str]:
+    def _merge_splits(self, splits: Iterable[str], separator: str) -> list[str]:
        # We now want to combine these smaller pieces into medium size
        # chunks to send to the LLM.
        separator_len = self._length_function(separator)

        docs = []
-        current_doc: List[str] = []
+        current_doc: list[str] = []
        total = 0
        for d in splits:
            _len = self._length_function(d)
@@ -169,10 +164,10 @@ def _huggingface_tokenizer_length(text: str) -> int:

    @classmethod
    def from_tiktoken_encoder(
-        cls: Type[TS],
+        cls: type[TS],
        encoding_name: str = "gpt2",
        model_name: Optional[str] = None,
-        allowed_special: Union[Literal["all"], AbstractSet[str]] = set(),
+        allowed_special: Union[Literal["all"], Set[str]] = set(),
        disallowed_special: Union[Literal["all"], Collection[str]] = "all",
        **kwargs: Any,
    ) -> TS:
@@ -225,7 +220,7 @@ def __init__(
        self,
        encoding_name: str = "gpt2",
        model_name: Optional[str] = None,
-        allowed_special: Union[Literal["all"], AbstractSet[str]] = set(),
+        allowed_special: Union[Literal["all"], Set[str]] = set(),
        disallowed_special: Union[Literal["all"], Collection[str]] = "all",
        **kwargs: Any,
    ) -> None:
@@ -248,7 +243,7 @@ def __init__(
        self._allowed_special = allowed_special
        self._disallowed_special = disallowed_special

-    def split_text(self, text: str) -> List[str]:
+    def split_text(self, text: str) -> list[str]:
        """Splits the input text into smaller chunks based on tokenization.

        This method uses a custom tokenizer configuration to encode the input text
@@ -264,7 +259,7 @@ def split_text(self, text: str) -> List[str]:
            of the input text based on the tokenization and chunking rules.
        """

-        def _encode(_text: str) -> List[int]:
+        def _encode(_text: str) -> list[int]:
            return self._tokenizer.encode(
                _text,
                allowed_special=self._allowed_special,
@@ -320,15 +315,15 @@ class Tokenizer:
    """Overlap in tokens between chunks"""
    tokens_per_chunk: int
    """Maximum number of tokens per chunk"""
-    decode: Callable[[List[int]], str]
+    decode: Callable[[list[int]], str]
    """ Function to decode a list of token ids to a string"""
-    encode: Callable[[str], List[int]]
+    encode: Callable[[str], list[int]]
    """ Function to encode a string to a list of token ids"""


-def split_text_on_tokens(*, text: str, tokenizer: Tokenizer) -> List[str]:
+def split_text_on_tokens(*, text: str, tokenizer: Tokenizer) -> list[str]:
    """Split incoming text and return chunks using tokenizer."""
-    splits: List[str] = []
+    splits: list[str] = []
    input_ids = tokenizer.encode(text)
    start_idx = 0
    cur_idx = min(start_idx + tokenizer.tokens_per_chunk, len(input_ids))
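
Below is a minimal usage sketch, not part of the diff, showing how the modernized signatures might be exercised. It assumes the Tokenizer dataclass and split_text_on_tokens function above are importable from langchain_text_splitters.base, that tiktoken is installed, and uses illustrative chunk sizes.

# Hypothetical usage sketch; import path and settings are assumptions, not from the diff.
import tiktoken

from langchain_text_splitters.base import Tokenizer, split_text_on_tokens

enc = tiktoken.get_encoding("gpt2")

tokenizer = Tokenizer(
    chunk_overlap=10,      # tokens shared between consecutive chunks
    tokens_per_chunk=100,  # maximum tokens per chunk
    decode=enc.decode,     # Callable[[list[int]], str]
    encode=lambda t: enc.encode(t, allowed_special="all"),  # Callable[[str], list[int]]
)

# Builtin-generic annotations (list[str]) now match the updated signatures.
chunks: list[str] = split_text_on_tokens(text="some long document ...", tokenizer=tokenizer)
print(len(chunks))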