Skip to content

Commit 86cde08

Browse files
committed
Add New Text Chunker
1 parent 041af68 commit 86cde08

File tree

8 files changed

+484
-66
lines changed

8 files changed

+484
-66
lines changed

adi_function_app/function_app.py

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
from adi_2_ai_search import process_adi_2_ai_search
99
from pre_embedding_cleaner import process_pre_embedding_cleaner
1010
from key_phrase_extraction import process_key_phrase_extraction
11+
from semantic_text_chunker import process_semantic_text_chunker, SemanticTextChunker
1112

1213
logging.basicConfig(level=logging.DEBUG)
1314
app = func.FunctionApp(http_auth_level=func.AuthLevel.FUNCTION)
@@ -87,6 +88,62 @@ async def pre_embedding_cleaner(req: func.HttpRequest) -> func.HttpResponse:
8788
)
8889

8990

91+
@app.route(route="semantic_text_chunker", methods=[func.HttpMethod.POST])
async def semantic_text_chunker(req: func.HttpRequest) -> func.HttpResponse:
    """HTTP trigger for the semantic text chunking function.

    Args:
        req (func.HttpRequest): The HTTP request object. The JSON body must
            contain a "values" list of records; chunker settings
            (num_surrounding_sentences, similarity_threshold,
            max_chunk_tokens) may be supplied as request headers.

    Returns:
        func.HttpResponse: A JSON response with one processed result per
        input record, or a 400 response when the payload is invalid."""
    logging.info("Python HTTP trigger text chunking function processed a request.")

    try:
        req_body = req.get_json()
        values = req_body.get("values")

        semantic_text_chunker_config = req.headers

        # Header values always arrive as strings; coerce them to the numeric
        # types the chunker expects so a supplied header behaves like the
        # numeric defaults. A malformed number falls through to the 400 path
        # via ValueError.
        num_surrounding_sentences = int(
            semantic_text_chunker_config.get("num_surrounding_sentences", 1)
        )
        similarity_threshold = float(
            semantic_text_chunker_config.get("similarity_threshold", 0.8)
        )
        max_chunk_tokens = int(
            semantic_text_chunker_config.get("max_chunk_tokens", 500)
        )
    except ValueError:
        return func.HttpResponse(
            "Please provide a valid Custom Skill Payload in the request body",
            status_code=400,
        )
    else:
        logging.debug("Input Values: %s", values)

        # One chunker instance is shared across all records in this request.
        # Renamed from `semantic_text_chunker` to avoid shadowing this
        # function's own name.
        chunker = SemanticTextChunker(
            num_surrounding_sentences=num_surrounding_sentences,
            similarity_threshold=similarity_threshold,
            max_chunk_tokens=max_chunk_tokens,
        )

        # Fan out one task per record so all records are processed concurrently.
        record_tasks = [
            asyncio.create_task(process_semantic_text_chunker(value, chunker))
            for value in values
        ]

        results = await asyncio.gather(*record_tasks)
        logging.debug("Results: %s", results)
        cleaned_tasks = {"values": results}

        # Bug fix: json.dump writes to a file object; json.dumps returns the
        # string needed for the HTTP response body.
        return func.HttpResponse(
            json.dumps(cleaned_tasks), status_code=200, mimetype="application/json"
        )
145+
146+
90147
@app.route(route="key_phrase_extractor", methods=[func.HttpMethod.POST])
91148
async def key_phrase_extractor(req: func.HttpRequest) -> func.HttpResponse:
92149
"""HTTP trigger for data cleanup function.

adi_function_app/pre_embedding_cleaner.py

Lines changed: 3 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -5,30 +5,6 @@
55
import re
66

77

def get_sections(cleaned_text: str) -> list:
    """Extract section headings from the given text.

    Matches both setext-style ("Heading\\n===") and ATX-style ("# Heading")
    Markdown headings, then cleans the markers off each match.

    Args:
        cleaned_text: The input text.

    Returns:
        list: The cleaned section headings found in the text.
    """
    heading_pattern = r"(.*?)\n===|\n#+\s*(.*?)\n"
    raw_matches = re.findall(heading_pattern, cleaned_text, re.DOTALL)

    # findall returns one tuple per match with one empty alternative group;
    # keep only the non-empty capture from each tuple.
    headings = []
    for groups in raw_matches:
        for candidate in groups:
            if candidate:
                headings.append(candidate)

    return clean_sections(headings)


def clean_sections(sections: list) -> list:
    """Strip '='/'#' heading markers and surrounding whitespace from each section."""
    stripped = []
    for section in sections:
        stripped.append(re.sub(r"[=#]", "", section).strip())

    return stripped
31-
328
def remove_markdown_tags(text: str, tag_patterns: dict) -> str:
339
"""
3410
Remove specified Markdown tags from the text, keeping the contents of the tags.
@@ -52,7 +28,7 @@ def remove_markdown_tags(text: str, tag_patterns: dict) -> str:
5228
return text
5329

5430

55-
def clean_text_with_section_extraction(src_text: str) -> tuple[str, str]:
31+
def clean_text(src_text: str) -> str:
5632
"""This function performs following cleanup activities on the text, remove all unicode characters
5733
remove line spacing,remove stop words, normalize characters
5834
@@ -77,8 +53,6 @@ def clean_text_with_section_extraction(src_text: str) -> tuple[str, str]:
7753
}
7854
cleaned_text = remove_markdown_tags(src_text, tag_patterns)
7955

80-
sections = get_sections(cleaned_text)
81-
8256
# Updated regex to keep Unicode letters, punctuation, whitespace, currency symbols, and percentage signs,
8357
# while also removing non-printable characters
8458
cleaned_text = re.sub(r"[^\p{L}\p{P}\s\p{Sc}%\x20-\x7E]", "", cleaned_text)
@@ -90,7 +64,7 @@ def clean_text_with_section_extraction(src_text: str) -> tuple[str, str]:
9064
except Exception as e:
9165
logging.error(f"An error occurred in clean_text: {e}")
9266
return ""
93-
return cleaned_text, sections
67+
return cleaned_text
9468

9569

9670
async def process_pre_embedding_cleaner(record: dict) -> dict:
@@ -114,19 +88,7 @@ async def process_pre_embedding_cleaner(record: dict) -> dict:
11488
"warnings": None,
11589
}
11690

117-
# scenarios when page by chunking is enabled
118-
if isinstance(record["data"]["chunk"], dict):
119-
(
120-
cleaned_record["data"]["cleanedChunk"],
121-
cleaned_record["data"]["sections"],
122-
) = clean_text_with_section_extraction(record["data"]["chunk"]["content"])
123-
cleaned_record["data"]["chunk"] = record["data"]["chunk"]["content"]
124-
else:
125-
(
126-
cleaned_record["data"]["cleanedChunk"],
127-
cleaned_record["data"]["sections"],
128-
) = clean_text_with_section_extraction(record["data"]["chunk"])
129-
cleaned_record["data"]["chunk"] = record["data"]["chunk"]
91+
cleaned_record["data"]["cleaned_chunk"] = clean_text(record["data"]["content"])
13092

13193
except Exception as e:
13294
logging.error("string cleanup Error: %s", e)

adi_function_app/requirements.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,3 +19,6 @@ azure-ai-vision-imageanalysis
1919
PyMuPDF
2020
aiohttp
2121
Pillow
22+
spacy
23+
en-core-web-md @ https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.7.1/en_core_web_md-3.7.1.tar.gz
24+
tiktoken

0 commit comments

Comments
 (0)