
Commit 5bff598

PalmPalm7 authored and oindrillac committed
Updated chunking_document.

1. Applied a document-specific text splitter from Langchain in place of the original naive version.
2. Made heuristic changes for markdown files, especially using regex to trim markdown tables in an attempt to fit a whole table within the limited context window.
3. For the updated chunk_document() function, see Chunking_Demo.ipynb on chunking with server_ctx_size=4096 and chunk_word_count=1024. Granite 7b has a 4k context window.

Signed-off-by: Andy Xie <anxie@redhat.com>
1 parent c2e19a4 commit 5bff598
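
To make the demo parameters quoted above concrete, here is a minimal usage sketch, assuming the module path implied by this commit's file location (src/instructlab/sdg/utils/chunking.py); the sample document string is made up:

from instructlab.sdg.utils.chunking import chunk_document

# A made-up markdown document standing in for one retrieved from git.
doc = "# Title\n\nSome markdown body text that should be split into chunks.\n"

# Granite 7b serves a 4k context window, hence server_ctx_size=4096.
# chunk_word_count=1024 words maps to int(1024 * 1.3) = 1331 tokens, which
# must stay at or under server_ctx_size - 1024 = 3072 tokens (presumably
# leaving the rest of the window for the prompt), so no ValueError is raised.
chunks = chunk_document(
    documents=[doc],
    server_ctx_size=4096,
    chunk_word_count=1024,
)
print(len(chunks))

The 1024-token reserve is why the ValueError check in the diff below compares the per-chunk token estimate against server_ctx_size - 1024 rather than the full window.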

File tree

1 file changed: +32 -7 lines changed

src/instructlab/sdg/utils/chunking.py

Lines changed: 32 additions & 7 deletions
@@ -2,12 +2,16 @@
 
 # Standard
 from typing import List
+import logging
+import re
 
 # Third Party
-from langchain_text_splitters import RecursiveCharacterTextSplitter
+from langchain_text_splitters import Language, RecursiveCharacterTextSplitter
 
 _DEFAULT_CHUNK_OVERLAP = 100
 
+logger = logging.getLogger(__name__)
+
 
 def _num_tokens_from_words(num_words) -> int:
     return int(num_words * 1.3)  # 1 word ~ 1.3 tokens
@@ -21,12 +25,24 @@ def chunk_document(documents: List, server_ctx_size, chunk_word_count) -> List[str]:
     """
     Iterates over the documents and splits them into chunks based on the word count provided by the user.
     Args:
-        documents (dict): List of documents retrieved from git (can also consist of a single document).
+        documents (list): List of documents retrieved from git (can also consist of a single document).
         server_ctx_size (int): Context window size of server.
         chunk_word_count (int): Maximum number of words to chunk a document.
     Returns:
         List[str]: List of chunked documents.
     """
+
+    # Check for input type errors
+    if isinstance(documents, str):
+        documents = [documents]
+        logger.info(
+            "Converted the single string into a list of one string, assuming the string passed in is the document. Normally, chunk_document() should take a list as input."
+        )
+    elif not isinstance(documents, list):
+        raise TypeError(
+            "Expected: documents to be a list, but got {}".format(type(documents))
+        )
+
     no_tokens_per_doc = _num_tokens_from_words(chunk_word_count)
     if no_tokens_per_doc > int(server_ctx_size - 1024):
         raise ValueError(
@@ -36,15 +52,24 @@ def chunk_document(documents: List, server_ctx_size, chunk_word_count) -> List[str]:
                 )
             )
         )
+    # Placeholders for the splitter params
     content = []
-    text_splitter = RecursiveCharacterTextSplitter(
-        separators=["\n\n", "\n", " "],
-        chunk_size=_num_chars_from_tokens(no_tokens_per_doc),
-        chunk_overlap=_DEFAULT_CHUNK_OVERLAP,
+    chunk_size = _num_chars_from_tokens(no_tokens_per_doc)
+    chunk_overlap = _DEFAULT_CHUNK_OVERLAP
+
+    # Use Markdown as the default; document-specific chunking will be implemented in a separate PR.
+    text_splitter = RecursiveCharacterTextSplitter.from_language(
+        language=Language.MARKDOWN,
+        chunk_size=chunk_size,
+        chunk_overlap=chunk_overlap,
     )
 
+    # Determine the file type for heuristics; default to Markdown.
     for docs in documents:
+        # Use regex to remove unnecessary dashes in front of pipe characters in a markdown table.
+        docs = re.sub(r"-{2,}\|", "-|", docs)
+        # Remove unnecessary spaces in front of pipe characters in a markdown table.
+        docs = re.sub(r"\ +\|", " |", docs)
         temp = text_splitter.create_documents([docs])
         content.extend([item.page_content for item in temp])
-
     return content
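
As a quick illustration of the new table-trimming heuristics, the following sketch (the two-column table is made up; the two re.sub calls are copied from the diff above) shows what happens to a markdown table before the text reaches the splitter:

import re

table = (
    "| Name    | Qty |\n"
    "|---------|-----|\n"
    "| widget  | 2   |\n"
)

# Collapse runs of dashes before a pipe: "---------|" -> "-|"
table = re.sub(r"-{2,}\|", "-|", table)
# Collapse runs of spaces before a pipe: "widget  |" -> "widget |"
table = re.sub(r"\ +\|", " |", table)

print(table)
# | Name | Qty |
# |-|-|
# | widget | 2 |

Collapsing the dash and space runs shortens every separator row to a few characters, so a wide table spends far fewer of the 4k available tokens and has a better chance of fitting inside a single chunk.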
