Skip to content

Commit 2bea14d

Browse files
committed
Update chunking mechanism
1 parent 86cde08 commit 2bea14d

File tree

7 files changed

+177
-138
lines changed

7 files changed

+177
-138
lines changed

adi_function_app/function_app.py

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
import asyncio
77

88
from adi_2_ai_search import process_adi_2_ai_search
9-
from pre_embedding_cleaner import process_pre_embedding_cleaner
9+
from adi_function_app.mark_up_cleaner import process_mark_up_cleaner
1010
from key_phrase_extraction import process_key_phrase_extraction
1111
from semantic_text_chunker import process_semantic_text_chunker, SemanticTextChunker
1212

@@ -51,8 +51,8 @@ async def adi_2_ai_search(req: func.HttpRequest) -> func.HttpResponse:
5151
)
5252

5353

54-
@app.route(route="pre_embedding_cleaner", methods=[func.HttpMethod.POST])
55-
async def pre_embedding_cleaner(req: func.HttpRequest) -> func.HttpResponse:
54+
@app.route(route="mark_up_cleaner", methods=[func.HttpMethod.POST])
55+
async def mark_up_cleaner(req: func.HttpRequest) -> func.HttpResponse:
5656
"""HTTP trigger for data cleanup function.
5757
5858
Args:
@@ -75,9 +75,7 @@ async def pre_embedding_cleaner(req: func.HttpRequest) -> func.HttpResponse:
7575
record_tasks = []
7676

7777
for value in values:
78-
record_tasks.append(
79-
asyncio.create_task(process_pre_embedding_cleaner(value))
80-
)
78+
record_tasks.append(asyncio.create_task(process_mark_up_cleaner(value)))
8179

8280
results = await asyncio.gather(*record_tasks)
8381
logging.debug("Results: %s", results)
Lines changed: 39 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,30 @@
55
import re
66

77

8+
# Matches a markdown header at the start of any line: one or more '#',
# optional spaces/tabs, then the rest of that line as the title.
# NOTE: a look-behind such as (?<=\n|^) cannot be used here. Python's `re`
# requires look-behind alternatives to have the same width and raises
# "look-behind requires fixed-width pattern" otherwise; re.MULTILINE with
# ^ and $ anchors expresses the same intent.
_SECTION_HEADER_PATTERN = re.compile(r"^#+[ \t]*(.*)$", re.MULTILINE)


def get_sections(text: str) -> list[str]:
    """
    Returns the section details from the content.

    A section is any line that starts with one or more ``#`` characters
    (markdown headers like ``### Header``). The header text is taken from
    that single line only, and a header on the last line is found even when
    the text does not end with a newline.

    Args:
        text: The input text

    Returns:
        list: The cleaned section titles, in the order they appear in text
    """
    raw_headers = _SECTION_HEADER_PATTERN.findall(text)
    return clean_sections(raw_headers)


def clean_sections(sections: list[str]) -> list[str]:
    """
    Cleans the sections by removing special characters and extra white spaces.

    Every ``=`` and ``#`` character is removed from each entry (this also
    drops closing hashes such as ``## Title ##``), then leading and trailing
    white space is stripped.

    Args:
        sections: The raw section titles

    Returns:
        list: The cleaned section titles, one per input entry
    """
    return [re.sub(r"[=#]", "", section).strip() for section in sections]
30+
31+
832
def remove_markdown_tags(text: str, tag_patterns: dict) -> str:
933
"""
1034
Remove specified Markdown tags from the text, keeping the contents of the tags.
@@ -28,7 +52,7 @@ def remove_markdown_tags(text: str, tag_patterns: dict) -> str:
2852
return text
2953

3054

31-
def clean_text(src_text: str) -> str:
55+
def clean_text_and_extract_metadata(src_text: str) -> tuple[str, str]:
3256
"""This function performs following cleanup activities on the text, remove all unicode characters
3357
remove line spacing,remove stop words, normalize characters
3458
@@ -38,16 +62,21 @@ def clean_text(src_text: str) -> str:
3862
Returns:
3963
str: The clean text."""
4064

65+
return_record = {}
66+
4167
try:
4268
logging.info(f"Input text: {src_text}")
4369
if len(src_text) == 0:
4470
logging.error("Input text is empty")
4571
raise ValueError("Input text is empty")
4672

73+
return_record["marked_up_chunk"] = src_text
74+
return_record["sections"] = get_sections(src_text)
75+
4776
# Define specific patterns for each tag
4877
tag_patterns = {
4978
"figurecontent": r"<!-- FigureContent=(.*?)-->",
50-
"figure": r"<figure>(.*?)</figure>",
79+
"figure": r"<figure(?:\s+FigureId=\"[^\"]*\")?>(.*?)</figure>",
5180
"figures": r"\(figures/\d+\)(.*?)\(figures/\d+\)",
5281
"figcaption": r"<figcaption>(.*?)</figcaption>",
5382
}
@@ -61,13 +90,15 @@ def clean_text(src_text: str) -> str:
6190
if len(cleaned_text) == 0:
6291
logging.error("Cleaned text is empty")
6392
raise ValueError("Cleaned text is empty")
93+
else:
94+
return_record["cleaned_chunk"] = cleaned_text
6495
except Exception as e:
65-
logging.error(f"An error occurred in clean_text: {e}")
96+
logging.error(f"An error occurred in clean_text_and_extract_metadata: {e}")
6697
return ""
67-
return cleaned_text
98+
return return_record
6899

69100

70-
async def process_pre_embedding_cleaner(record: dict) -> dict:
101+
async def process_mark_up_cleaner(record: dict) -> dict:
71102
"""Cleanup the data using standard python libraries.
72103
73104
Args:
@@ -88,7 +119,9 @@ async def process_pre_embedding_cleaner(record: dict) -> dict:
88119
"warnings": None,
89120
}
90121

91-
cleaned_record["data"]["cleaned_chunk"] = clean_text(record["data"]["content"])
122+
cleaned_record["data"] = clean_text_and_extract_metadata(
123+
record["data"]["content"]
124+
)
92125

93126
except Exception as e:
94127
logging.error("string cleanup Error: %s", e)

adi_function_app/requirements.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ azure-ai-vision-imageanalysis
1919
PyMuPDF
2020
aiohttp
2121
Pillow
22+
numpy
2223
spacy
2324
en-core-web-md @ https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.7.1/en_core_web_md-3.7.1.tar.gz
2425
tiktoken

0 commit comments

Comments
 (0)