
Commit e1e76ba

examples: add chunker job to support configurable chunking (#3093)
What: Extend the pgvector-embedder example by adding a configurable chunking mechanism.

Why: Until now, whole documents were embedded and ingested into the database, but their size sometimes exceeds the token limit imposed by the LLM used for inference. This change introduces a configurable document chunking mechanism to overcome this problem.

Testing Done: Ran the pipeline jobs locally.

Closes #3084

Signed-off-by: Yoan Salambashev <[email protected]>
---------
Signed-off-by: dependabot[bot] <[email protected]>
Co-authored-by: Yoan Salambashev <[email protected]>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
1 parent b566d4f commit e1e76ba

File tree

11 files changed, +337 -52 lines changed


examples/chunker/00_properties.py

Lines changed: 22 additions & 0 deletions
@@ -0,0 +1,22 @@
# Copyright 2021-2024 VMware, Inc.
# SPDX-License-Identifier: Apache-2.0
import os

from config import CHUNKS_JSON_FILE
from config import DOCUMENTS_JSON_FILE
from vdk.api.job_input import IJobInput


def run(job_input: IJobInput):
    properties = job_input.get_all_properties()

    data_file = os.path.join(job_input.get_job_directory(), DOCUMENTS_JSON_FILE)
    chunks_file = os.path.join(job_input.get_job_directory(), CHUNKS_JSON_FILE)
    properties.update(
        dict(
            data_file=data_file,
            chunks_file=chunks_file,
            chunking_strategy="fixed",
        )
    )
    job_input.set_all_properties(properties)
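
As a hedged sketch (not part of this step), the same property update could select the section-based strategy instead; "wiki" is the other value accepted by ChunkerFactory in 10_chunk_data.py, and wiki_example.json ships with the example:

```python
# Illustrative only: point the job at the wiki sample and chunk by section
properties.update(
    dict(
        data_file=os.path.join(job_input.get_job_directory(), "wiki_example.json"),
        chunks_file=chunks_file,
        chunking_strategy="wiki",
    )
)
job_input.set_all_properties(properties)
```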

examples/chunker/10_chunk_data.py

Lines changed: 156 additions & 0 deletions
@@ -0,0 +1,156 @@
# Copyright 2021-2024 VMware, Inc.
# SPDX-License-Identifier: Apache-2.0
import json
import logging
import pathlib
import re
import string

from config import CHUNK_OVERLAP
from config import CHUNK_SIZE
from config import CHUNKS_JSON_FILE
from config import DOCUMENTS_JSON_FILE
from nltk.tokenize import word_tokenize
from vdk.api.job_input import IJobInput

log = logging.getLogger(__name__)


def custom_join(tokens):
    """
    Joins a list of tokens into a string, adding a space between words
    but not between a word and following punctuation.
    """
    result = ""
    for i, token in enumerate(tokens):
        if i == 0:
            result += token
        elif token in string.punctuation:
            result += token
        else:
            result += " " + token
    return result


class ChunkerFactory:
    @staticmethod
    def get_chunker(strategy_name: str, **kwargs):
        chunkers = {
            "fixed": FixedSizeChunker,
            "wiki": WikiSectionChunker,
        }
        if strategy_name in chunkers:
            return (
                chunkers[strategy_name](**kwargs)
                if strategy_name == "fixed"
                else chunkers[strategy_name]()
            )
        else:
            raise ValueError(
                f"Unknown chunking strategy: {strategy_name}. "
                f"Supported strategies: {list(chunkers.keys())}"
            )


class Chunker:
    """
    Splits text into chunks. One of the provided options must be chosen.
    """

    def chunk(self, documents: dict):
        raise NotImplementedError("The chunking strategy is not supported.")


class FixedSizeChunker(Chunker):
    """
    Splits text into chunks of fixed size with overlap between neighbouring ones.
    """

    def __init__(self, chunk_size, chunk_overlap):
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap

    def chunk(self, documents):
        chunked_documents = []
        for doc in documents:
            tokens = word_tokenize(doc["data"])
            for i in range(0, len(tokens), self.chunk_size - self.chunk_overlap):
                chunk_id = f"{doc['metadata']['id']}_{i // (self.chunk_size - self.chunk_overlap)}"
                chunk_metadata = doc["metadata"].copy()
                chunk_metadata["id"] = chunk_id
                chunked_documents.append(
                    {
                        "metadata": chunk_metadata,
                        "data": custom_join(tokens[i : i + self.chunk_size]),
                    }
                )
        return chunked_documents


class WikiSectionChunker(Chunker):
    """
    Splits Wiki articles into chunks.
    """

    def __init__(self):
        pass

    def chunk(self, documents):
        chunked_documents = []
        for doc in documents:
            sections = re.split(
                r"==+ [^=]+ ==", doc["data"]
            )  # Wiki section headers are identified by ==
            for i, section in enumerate(sections):
                chunk_id = f"{doc['metadata']['id']}_{i}"
                chunk_metadata = doc["metadata"].copy()
                chunk_metadata["id"] = chunk_id
                chunked_documents.append(
                    {
                        "metadata": chunk_metadata,
                        "data": section.strip(),
                    }
                )
        return chunked_documents


def load_documents(json_file_path: str):
    """
    Loads documents from JSON file.

    :param json_file_path: Path to the JSON file containing documents.
    :return: List of documents.
    """
    with open(json_file_path, encoding="utf-8") as file:
        return json.load(file)


def store(name, content):
    json_data = json.dumps(content, indent=4)
    with open(name, "w") as file:
        file.write(json_data)


def run(job_input: IJobInput):
    log.info(f"Starting job step {__name__}")

    data_job_dir = pathlib.Path(job_input.get_job_directory())
    input_json = job_input.get_property("data_file", data_job_dir / DOCUMENTS_JSON_FILE)
    output_json = job_input.get_property("chunks_file", data_job_dir / CHUNKS_JSON_FILE)
    chunking_strategy = job_input.get_property("chunking_strategy", "fixed")
    chunk_size = CHUNK_SIZE
    chunk_overlap = CHUNK_OVERLAP

    documents = load_documents(input_json)
    print(documents)
    chunker = ChunkerFactory.get_chunker(
        chunking_strategy, chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )
    chunked_documents = chunker.chunk(documents)
    print(chunked_documents)
    if chunked_documents:
        log.info(
            f"{len(chunked_documents)} documents chunks created using the {chunking_strategy} chunking strategy."
        )
        store(output_json, chunked_documents)
        log.info(f"Chunks saved to {output_json}")

examples/chunker/README.md

Lines changed: 39 additions & 0 deletions
@@ -0,0 +1,39 @@
# Chunker Data Job Example

The following Versatile Data Kit example allows you to chunk document data in the format described below.

# Expected input format

```json
[
    {
        "metadata": {
            "title": "Page (or chunk) title",
            "id": "Content page ID",
            "source": "Source URL",
            "deleted": <is the content being deleted in the source>
        },
        "data": "Content Text"
    },
]
```

# Output format

The output format is the same as the input one. The only difference is the "data" field: it now contains a chunk of a document instead of the whole document.
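
For instance, with the default fixed-size strategy a single input document can produce several entries like the following (the values are illustrative; the chunker appends the chunk index to the original id):

```json
[
    {
        "metadata": {
            "title": "Getting Started",
            "id": "123213312_0",
            "source": "https://github.com/vmware/versatile-data-kit/wiki/Getting-Started",
            "deleted": false
        },
        "data": "First chunk of the document text ..."
    }
]
```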

# Chunking the data

The chunking_strategy property controls which type of chunking is applied to the documents.
It defaults to "fixed", which performs fixed-size chunking with overlap.
The fixed-size strategy is configurable: CHUNK_SIZE and CHUNK_OVERLAP are set in config.py
and determine the chunk size (in tokens) and the overlap between neighbouring chunks.
The other chunking strategy is "wiki", which splits Wikipedia articles into their sections,
as illustrated in the sketch below.
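
For example, here is a minimal sketch of how the "wiki" strategy separates sections, using the same regular expression as 10_chunk_data.py (the sample text is made up):

```python
import re

text = "Intro text. == History == Some history. == Preparation == Some steps."
sections = [s.strip() for s in re.split(r"==+ [^=]+ ==", text)]
print(sections)
# ['Intro text.', 'Some history.', 'Some steps.']
```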

# Run the example

To run the data job locally:
```bash
vdk run chunker
```

examples/chunker/config.ini

Lines changed: 2 additions & 0 deletions
@@ -0,0 +1,2 @@
[owner]
team = my-team

examples/chunker/config.py

Lines changed: 7 additions & 0 deletions
@@ -0,0 +1,7 @@
# Copyright 2021-2024 VMware, Inc.
# SPDX-License-Identifier: Apache-2.0

DOCUMENTS_JSON_FILE = "fixed_size_example.json"
CHUNKS_JSON_FILE = "chunks_example.json"
CHUNK_SIZE = 2048
CHUNK_OVERLAP = 64

examples/chunker/fixed_size_example.json

Lines changed: 56 additions & 0 deletions
@@ -0,0 +1,56 @@
[
    {
        "metadata": {
            "title": "Getting Started",
            "id": "123213312",
            "source": "https://github.com/vmware/versatile-data-kit/wiki/Getting-Started",
            "deleted": false
        },
        "data": "VDK Getting Started guide"
    },
    {
        "metadata": {
            "title": "VDK Wiki",
            "id": "747124724",
            "source": "https://github.com/vmware/versatile-data-kit/wiki",
            "deleted": false
        },
        "data": "VDK Wiki"
    },
    {
        "metadata": {
            "title": "VDK Issues",
            "id": "721295269",
            "source": "https://github.com/vmware/versatile-data-kit/issues",
            "deleted": false
        },
        "data": "VDK Issues"
    },
    {
        "metadata": {
            "title": "VDK PRs",
            "id": "1323122133",
            "source": "https://github.com/vmware/versatile-data-kit/pulls",
            "deleted": false
        },
        "data": "VDK Pull Requests"
    },
    {
        "metadata": {
            "title": "VDK Main Page",
            "id": "312343243",
            "source": "https://github.com/vmware/versatile-data-kit/tree/main",
            "deleted": false
        },
        "data": "VDK: One framework to develop, deploy and operate data workflows with Python and SQL."
    },
    {
        "metadata": {
            "title": "VDK VEP",
            "id": "747124725",
            "source": "https://github.com/vmware/versatile-data-kit/tree/main/specs/vep-milestone-25-vector-database-ingestion",
            "deleted": false
        },
        "data": "VDK VEP milestone 25 vector database ingestion. Summary: With the rise in popularity of LLMs and RAG we see VDK as a core component to getting the data where we need it to be. VDK's strengths are ETL tasks. We see that its very well suited to populating the databases needed for RAG."
    }
]

examples/chunker/requirements.txt

Lines changed: 3 additions & 0 deletions
@@ -0,0 +1,3 @@
nltk
numpy
sentence-transformers
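
Note that word_tokenize, used by the fixed-size chunker, relies on NLTK's punkt tokenizer data, which is downloaded separately from the nltk package itself. If it is missing locally, a one-time download along these lines should suffice (a hedged sketch; the exact resource name may vary across NLTK versions):

```python
import nltk

nltk.download("punkt")
```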

examples/chunker/wiki_example.json

Lines changed: 20 additions & 0 deletions
@@ -0,0 +1,20 @@
[
    {
        "metadata": {
            "title": "Ajvar",
            "id": "123",
            "source": "https://en.wikipedia.org/wiki/Ajvar#:~:text=Ajvar%20(pronounced%3A%20%2F%CB%88a%C9%AA,is%20popular%20in%20Southeast%20Europe.&text=Homemade%20ajvar%20is%20made%20of%20roasted%20peppers.",
            "deleted": false
        },
"data": "Ajvar is a condiment made principally from sweet bell peppers and eggplants. The relish became a popular side dish throughout Yugoslavia after World War II and is popular in Southeast Europe. Homemade ajvar is made of roasted peppers. Depending on the capsaicin content in bell peppers and the amount of added chili peppers, it can be sweet , piquant , or very hot. Ajvar can be consumed as a bread spread or as a side dish. Ajvar has a few variations. One variation contains tomato and eggplant. Another is made with green bell peppers and oregano. Homemade Leskovac Ajvar and Macedonian Ajvar are registered with the World Intellectual Property Organization in order to protect their brand names. == Etymology and origin == The name ajvar comes from the Turkish word havyar, which means salted roe, caviar and shares an etymology with caviar, coming from the Persian word xaviyar. Before the 20th century, significant local production of caviar occurred on the Danube, with sturgeon swimming from the Black Sea up to Belgrade. Domestic ajvar, meaning caviar,” used to be a very popular dish in Belgrade homes and restaurants, but the domestic production of caviar became unsteady in the 1890s because of labor disputes. Eventually a special pepper salad was offered as a substitute in Belgrade restaurants under the name red ajvar or Serbian ajvar . == Preparation == Homemade ajvar is made of roasted, minced, and then cooked peppers, while some industrial producers use fresh minced peppers, cooked with sunflower oil afterwards, which leads to a lower quality. Ajvar preparation is somewhat difficult, because it requires considerable manual labour, particularly for peeling the roasted peppers. Traditionally, people prepare it in mid-autumn, when bell peppers are most abundant, and preserve it in glass jars for consumption throughout the year. Anecdotally, most households' stocks do not last until the spring, when fresh vegetables become available, so it is usually enjoyed as a winter food. Often, the whole family or neighbours gather to prepare the bell peppers. The traditional cultivar of pepper used is called roga . Roga is large, red, horn-shaped, with thick flesh and relatively easy to peel. It typically ripens in late September.To produce ajvar, bell peppers are roasted whole on a plate on an open fire, a plate of wood in a stove, or in an oven. The baked peppers must briefly cool to allow the flesh to separate from the skin. Next, the skin is carefully peeled off and the seeds are removed. The peppers are then ground in a mill or chopped into tiny pieces . Finally, the resulting mash is stewed for several hours in large pots. Sunflower oil is added at this stage to condense and reduce the water, and to enhance later preservation. Salt is added at the end and the hot mush is poured directly into sterilized glass jars, which are sealed immediately. == Production == Ajvar is produced in most Balkan countries, including Albania, Bosnia, Croatia, North Macedonia, Slovenia and Serbia. Serbia's reported annual production is 640 tons.Ajvar is one of the so-called zimnica , which include pickled chili peppers, pickled tomatoes, and anything else that can be preserved in a jar just before winter. 
== See also == Ljutenica – dishPages displaying wikidata descriptions as a fallbackPages displaying short descriptions with no spaces, a similar relish in Bulgarian, Macedonian, and Serbian cuisines Pindjur – relish formPages displaying wikidata descriptions as a fallback, a similar relish in Bosnian, Macedonian, and Serbian cuisines Zacuscă – Romanian-Moldovan dish, a similar relish in Romanian cuisine Kyopolou – Bulgarian-Turkish dish, an eggplant-based relish in Bulgarian and Turkish cuisines Malidzano, a similar relish in Macedonian cuisine Biber salçası – Paste made from peppers or tomato and salt, originating in TurkeyPages displaying short descriptions of redirect targets, a Turkish paste made from red peppers alone Lecso – Hungarian dishPages displaying short descriptions of redirect targets, a similar Hungarian stewed red pepper, onion, and garlic dish List of spreads Achar – Pickled varieties of vegetable and fruit, a similar relish of Indo-European origin in South Asian cuisines == References == == External links == Fall Brings Red Peppers and Ajvar, 'Serbian Salsa'. NPR. 8 November 2006. Ajvar srpski kavijar . Novosti. 2013. Leskovčanka po čijem receptu je brendiran srpski ajvar ušla u biznis kad je ostala bez posla. Blic . 2012. Ajvar - Top-notch gastronomic delight, vegan soul food, recipes and origin. Ajvar.com. 2017."
    },
    {
        "metadata": {
            "title": "Bob chorba",
            "id": "124",
            "source": "https://en.wikipedia.org/wiki/Bob_chorba",
            "deleted": false
        },
"data": "Bob chorba is a chorba, a Bulgarian soup. It is made from dry beans, onions, tomatoes, chubritza or dzhodzhen and carrots.Local variations may also exclude the carrots or include paprika, potatoes or even some kind of meat. Historically, it has been a common soup and staple food at Bulgarian monasteries. == See also == Bulgarian cuisine List of bean soups List of soups == References =="
    }
]

examples/pgvector-embedder/20_clean_and_embed_json_data.py renamed to examples/pgvector-embedder/20_embed_data.py

Lines changed: 8 additions & 29 deletions
@@ -2,7 +2,6 @@
 # SPDX-License-Identifier: Apache-2.0
 import json
 import logging
-import re

 from common.database_storage import DatabaseStorage
 from config import get_value
@@ -12,26 +11,9 @@
 log = logging.getLogger(__name__)


-def clean_text(text):
-    """
-    Prepares text for NLP tasks (embedding and RAG) by standardizing its form.
-    :param text: A string containing the text to be processed.
-    :return: The processed text as a string.
-    """
-    return text
-
-
-def load_and_clean_documents(content):
-    cleaned_documents = []
-    documents = json.loads(content)
-
-    for doc in documents:
-        if "data" in doc:
-            cleaned_text = clean_text(doc["data"])
-            cleaned_documents.append([cleaned_text])
-
-    print(len(cleaned_documents))
-    return cleaned_documents
+def load_documents(json_file_path):
+    with open(json_file_path, encoding="utf-8") as file:
+        return json.load(file)


 def embed_documents_in_batches(documents):
@@ -42,7 +24,7 @@ def embed_documents_in_batches(documents):
     embeddings = []
     for start_index in range(0, total):
         # the resources are not enough to batch 2 documents at a time, so the batch = 1 doc
-        batch = documents[start_index]
+        batch = [documents[start_index]]
         log.info(f"BATCH: {len(batch)}.")
         embeddings.extend(model.encode(batch, show_progress_bar=True))

@@ -54,16 +36,13 @@ def run(job_input: IJobInput):
     log.info(f"Starting job step {__name__}")

     output_embeddings = get_value(job_input, "output_embeddings")
-
     storage = DatabaseStorage(get_value(job_input, "storage_connection_string"))
     storage_name = get_value(job_input, "storage_name", "confluence_data")

-    cleaned_documents = load_and_clean_documents(storage.retrieve(storage_name))
-    if cleaned_documents:
-        log.info(
-            f"{len(cleaned_documents)} documents loaded and cleaned for embedding."
-        )
-        embeddings = embed_documents_in_batches(cleaned_documents)
+    documents = load_documents(storage.retrieve(storage_name))
+    if documents:
+        log.info(f"{len(documents)} chunks loaded and cleaned for embedding.")
+        embeddings = embed_documents_in_batches(documents)
     with open(output_embeddings, "wb") as file:
         import pickle

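For context on how the chunked "data" fields feed into the embedding step, here is a minimal sketch that mirrors the batch-of-one loop above using sentence-transformers; the model name is an illustrative choice and is not taken from this commit:

```python
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("all-MiniLM-L6-v2")  # assumption: any sentence-transformers model works here

chunks = ["VDK Getting Started guide", "VDK Wiki"]  # e.g. the "data" fields of the produced chunks
embeddings = []
for text in chunks:
    # batch of one document, matching the loop in 20_embed_data.py
    embeddings.extend(model.encode([text], show_progress_bar=False))

print(len(embeddings), len(embeddings[0]))  # number of vectors and their dimensionality
```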