
Commit c371b11

Manul from Pathway, embe-pw, janchorowski, XGendre, and dxtrous committed
Release 0.7.7
Co-authored-by: Michał Bartoszkiewicz <embe@pathway.com>
Co-authored-by: Jan Chorowski <janek@pathway.com>
Co-authored-by: Xavier Gendre <xavier@pathway.com>
Co-authored-by: Adrian Kosowski <adrian@pathway.com>
Co-authored-by: Jakub Kowalski <kuba@pathway.com>
Co-authored-by: Sergey Kulik <sergey@pathway.com>
Co-authored-by: Mateusz Lewandowski <mateusz@pathway.com>
Co-authored-by: Mohamed Malhou <mohamed@pathway.com>
Co-authored-by: Krzysztof Nowicki <krzysiek@pathway.com>
Co-authored-by: Richard Pelgrim <richard.pelgrim@pathway.com>
Co-authored-by: Kamil Piechowiak <kamil@pathway.com>
Co-authored-by: Paweł Podhajski <pawel.podhajski@pathway.com>
Co-authored-by: Olivier Ruas <olivier@pathway.com>
Co-authored-by: Przemysław Uznański <przemek@pathway.com>
Co-authored-by: Sebastian Włudzik <sebastian.wludzik@pathway.com>
GitOrigin-RevId: 312344420a55f049c50addb049b77b403a5ce194
1 parent 06c1ad0 commit c371b11

File tree

4 files changed: +84, -2 lines changed


CHANGELOG.md

Lines changed: 5 additions & 0 deletions
@@ -5,6 +5,11 @@ All notable changes to this project will be documented in this file.
 This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 ## [Unreleased]
 
+## [0.7.7] - 2023-12-27
+
+### Added
+- pathway.xpacks.llm.splitter.TokenCountSplitter.
+
 ## [0.7.6] - 2023-12-22
 
 ## New Features
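The `TokenCountSplitter` added in this release windows text into chunks of at most `max_tokens` tokens and prefers to cut at punctuation once a chunk is long enough. A minimal sketch of that windowing logic, with a toy whitespace tokenizer standing in for `tiktoken` (the `split_by_token_count` function and the toy tokenizer are illustrative, not part of Pathway's API):

```python
from typing import Dict, List, Tuple

PUNCTUATION = [".", "?", "!", "\n"]


def encode(text: str) -> List[str]:
    # Toy "tokenizer": one token per whitespace-separated word.
    return text.split()


def decode(tokens: List[str]) -> str:
    return " ".join(tokens)


def split_by_token_count(
    text: str, min_tokens: int, max_tokens: int, chars_per_token: int = 3
) -> List[Tuple[str, Dict]]:
    tokens = encode(text)
    output: List[Tuple[str, Dict]] = []
    i = 0
    while i < len(tokens):
        # Take a window of at most max_tokens tokens.
        chunk = decode(tokens[i : i + max_tokens])
        # Prefer to end the chunk at punctuation, provided the chunk keeps
        # roughly min_tokens worth of characters.
        last_punct = max((chunk.rfind(p) for p in PUNCTUATION), default=-1)
        if last_punct != -1 and last_punct > chars_per_token * min_tokens:
            chunk = chunk[: last_punct + 1]
        # Advance by however many tokens the (possibly trimmed) chunk used.
        i += len(encode(chunk))
        output.append((chunk, {}))
    return output


chunks = split_by_token_count("one two. three four five.", min_tokens=1, max_tokens=3)
# -> [("one two.", {}), ("three four five.", {})]
```

The real splitter follows the same loop, but tokenizes with `tiktoken` and measures chunk length in encoder tokens rather than words.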

Cargo.lock

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default.

Cargo.toml

Lines changed: 1 addition & 1 deletion
@@ -1,6 +1,6 @@
 [package]
 name = "pathway"
-version = "0.7.6"
+version = "0.7.7"
 edition = "2021"
 publish = false
 rust-version = "1.72.0"

python/pathway/xpacks/llm/splitter.py

Lines changed: 77 additions & 0 deletions
@@ -2,6 +2,7 @@
 A library of text splitters - routines which split a long text into smaller chunks.
 """
 
+import unicodedata
 from typing import Dict, List, Tuple
 
 
@@ -17,3 +18,79 @@ def null_splitter(txt: str) -> List[Tuple[str, Dict]]:
     The null splitter always returns a list of length one containing the full text and empty metadata.
     """
     return [(txt, {})]
+
+
+def _normalize_unicode(text: str):
+    """
+    Get rid of ligatures.
+    """
+    return unicodedata.normalize("NFKC", text)
+
+
+class TokenCountSplitter:
+    """
+    Splits a given string or a list of strings into chunks based on token count.
+
+    This splitter tokenizes the input texts and splits them into smaller parts
+    ("chunks"), ensuring that each chunk has a token count between `min_tokens`
+    and `max_tokens`. It also attempts to break chunks at sensible points, such
+    as punctuation marks.
+
+    Arguments:
+        min_tokens: minimum number of tokens in a chunk of text.
+        max_tokens: maximum size of a chunk in tokens.
+        encoding_name: name of the encoding from `tiktoken`.
+
+    Example:
+
+    # >>> from pathway.xpacks.llm.splitter import TokenCountSplitter
+    # >>> import pathway as pw
+    # >>> t = pw.debug.table_from_markdown(
+    # ...     '''| text
+    # ... 1| cooltext'''
+    # ... )
+    # >>> splitter = TokenCountSplitter(min_tokens=1, max_tokens=1)
+    # >>> t += t.select(chunks = pw.apply(splitter, pw.this.text))
+    # >>> pw.debug.compute_and_print(t, include_id=False)
+    # text     | chunks
+    # cooltext | (('cool', pw.Json({})), ('text', pw.Json({})))
+    """
+
+    CHARS_PER_TOKEN = 3
+    PUNCTUATION = [".", "?", "!", "\n"]
+
+    def __init__(
+        self,
+        min_tokens: int = 50,
+        max_tokens: int = 500,
+        encoding_name: str = "cl100k_base",
+    ):
+        self.min_tokens = min_tokens
+        self.max_tokens = max_tokens
+        self.encoding_name = encoding_name
+
+    def __call__(self, txt: str) -> List[Tuple[str, Dict]]:
+        import tiktoken
+
+        tokenizer = tiktoken.get_encoding(self.encoding_name)
+        text = _normalize_unicode(txt)
+        tokens = tokenizer.encode_ordinary(text)
+        output: List[Tuple[str, Dict]] = []
+        i = 0
+        while i < len(tokens):
+            chunk_tokens = tokens[i : i + self.max_tokens]
+            chunk = tokenizer.decode(chunk_tokens)
+            last_punctuation = max(
+                [chunk.rfind(p) for p in self.PUNCTUATION], default=-1
+            )
+            if (
+                last_punctuation != -1
+                and last_punctuation > self.CHARS_PER_TOKEN * self.min_tokens
+            ):
+                chunk = chunk[: last_punctuation + 1]
+
+            i += len(tokenizer.encode_ordinary(chunk))
+
+            output.append((chunk, {}))
+        return output
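The `_normalize_unicode` helper in the diff above relies on Unicode NFKC ("compatibility composition") normalization, which replaces compatibility characters such as typographic ligatures with their plain equivalents before the text is tokenized. A quick standard-library demonstration (the sample string is illustrative):

```python
import unicodedata

# The single-codepoint ligature "ﬁ" (U+FB01) expands to the two letters "fi"
# under NFKC, so "ﬁle" becomes the ordinary string "file".
normalized = unicodedata.normalize("NFKC", "\ufb01le")
print(normalized)  # -> file
```

This matters for token-count splitting because ligatures would otherwise tokenize differently from their plain-letter spellings.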
