
Commit c10bc02

Bug fixes in key phrase and pre embedding cleaner (#47)
1 parent c5a6f2f commit c10bc02

3 files changed: 120 additions & 99 deletions
Lines changed: 106 additions & 69 deletions
@@ -1,84 +1,107 @@
 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT License.
-
 import logging
 import json
 import os
 from azure.ai.textanalytics.aio import TextAnalyticsClient
 from azure.core.exceptions import HttpResponseError
-from azure.core.credentials import AzureKeyCredential
-import asyncio
 from azure.identity import DefaultAzureCredential
-from environment import IdentityType, get_identity_type
+from tenacity import retry
+from tenacity.stop import stop_after_attempt
+from tenacity.wait import wait_exponential
+import asyncio
 
 MAX_TEXT_ELEMENTS = 5120
 
 
-def split_document(document: str, max_size: int) -> list[str]:
-    """Split a document into chunks of max_size.
+def split_document(document, max_size):
+    """Split a document into chunks of max_size and filter out any empty strings
 
     Args:
         document (str): The document to split.
-        max_size (int): The maximum size of each chunk."""
-    return [document[i : i + max_size] for i in range(0, len(document), max_size)]
+        max_size (int): The maximum size of each chunk.
 
-
-async def extract_key_phrases_from_text(
-    data: list[str], max_key_phrase_count: int, retries_left: int = 3
+    Returns:
+        list: The list of document chunks."""
+    return [
+        document[i : i + max_size]
+        for i in range(0, len(document), max_size)
+        if len(document[i : i + max_size]) > 0
+    ]
+
+
+@retry(
+    reraise=True,
+    stop=stop_after_attempt(3),
+    wait=wait_exponential(multiplier=1, min=1, max=10),
+)
+async def extract_key_phrases_from_batch(
+    batch_data: list[str], max_key_phrase_count: int
 ) -> list[str]:
-    """Extract key phrases from the text.
+    """Extract key phrases from text using Azure AI services.
 
     Args:
-        data (list[str]): The text data.
-        max_key_phrase_count (int): The maximum number of key phrases to return.
+        batch_data (list[str]): The list of text to process.
+        max_key_phrase_count(int): no of keywords to return
 
     Returns:
-        list[str]: The key phrases extracted from the text."""
-    logging.info("Python HTTP trigger function processed a request.")
+        list: The list of key phrases."""
 
     key_phrase_list = []
 
-    if get_identity_type() == IdentityType.SYSTEM_ASSIGNED:
-        credential = DefaultAzureCredential()
-    elif get_identity_type() == IdentityType.USER_ASSIGNED:
-        credential = DefaultAzureCredential(
-            managed_identity_client_id=os.environ.get("FunctionApp__ClientId")
-        )
-    else:
-        credential = AzureKeyCredential(os.environ.get("AIService__Language__Key"))
     text_analytics_client = TextAnalyticsClient(
-        endpoint=os.environ.get("AIService__Language__Endpoint"),
-        credential=credential,
+        endpoint=os.environ["AIService__Services__Endpoint"],
+        credential=DefaultAzureCredential(
+            managed_identity_client_id=os.environ.get("FunctionApp__ClientId")
+        ),
     )
 
     async with text_analytics_client:
         try:
-            # Split large documents
-            split_documents = []
-            for doc in data:
-                if len(doc) > MAX_TEXT_ELEMENTS:
-                    split_documents.extend(split_document(doc, MAX_TEXT_ELEMENTS))
-                else:
-                    split_documents.append(doc)
-
-            result = await text_analytics_client.extract_key_phrases(split_documents)
-            for idx, doc in enumerate(result):
+            result = await text_analytics_client.extract_key_phrases(batch_data)
+            for doc in result:
                 if not doc.is_error:
                     key_phrase_list.extend(doc.key_phrases[:max_key_phrase_count])
                 else:
-                    raise Exception(f"Document {idx} error: {doc.error}")
+                    raise Exception(f"Document error: {doc.error}")
         except HttpResponseError as e:
-            if e.status_code == 429 and retries_left > 0:  # Rate limiting error
-                wait_time = 2**retries_left  # Exponential backoff
-                logging.info(
-                    "%s Rate limit exceeded. Retrying in %s seconds...", e, wait_time
-                )
-                await asyncio.sleep(wait_time)
-                return await extract_key_phrases_from_text(
-                    data, max_key_phrase_count, retries_left - 1
-                )
-            else:
-                raise Exception(f"An error occurred: {e}") from e
+            logging.error("An error occurred: %s", e)
+            raise e
+
+    return key_phrase_list
+
+
+async def extract_key_phrases_from_text(
+    data: list[str], max_key_phrase_count: int
+) -> list[str]:
+    """Extract key phrases from text using Azure AI services.
+
+    Args:
+        data (list[str]): The list of text to process.
+        max_key_phrase_count(int): no of keywords to return"""
+    logging.info("Python HTTP trigger function processed a request.")
+    key_phrase_list = []
+
+    split_documents = []
+    for doc in data:
+        if len(doc) > MAX_TEXT_ELEMENTS:
+            split_documents.extend(split_document(doc, MAX_TEXT_ELEMENTS))
+        elif len(doc) > 0:
+            split_documents.append(doc)
+
+    # Filter out any empty documents
+    split_documents = [doc for doc in split_documents if len(doc) > 0]
+
+    for i in range(0, len(split_documents), 10):
+        key_phrase_list.extend(
+            await extract_key_phrases_from_batch(
+                split_documents[i : i + 10], max_key_phrase_count
+            )
+        )
+
+        if len(key_phrase_list) > max_key_phrase_count:
+            key_phrase_list = key_phrase_list[:max_key_phrase_count]
+            break
 
     return key_phrase_list

@@ -105,26 +128,40 @@ async def process_key_phrase_extraction(
             "errors": None,
             "warnings": None,
         }
-        extracted_record["data"]["key_phrases"] = await extract_key_phrases_from_text(
+        extracted_record["data"]["keyPhrases"] = await extract_key_phrases_from_text(
             [record["data"]["text"]], max_key_phrase_count
         )
-    except Exception as inner_e:
-        logging.error("key phrase extraction Error: %s", inner_e)
-        logging.error(
-            "Failed to extract key phrase. Check function app logs for more details of exact failure."
-        )
-        return {
-            "recordId": record["recordId"],
-            "data": {},
-            "errors": [
-                {
-                    "message": "Failed to extract key phrase. Check function app logs for more details of exact failure."
-                }
-            ],
-            "warnings": None,
-        }
-    else:
-        json_str = json.dumps(extracted_record, indent=4)
-
-        logging.info(f"key phrase extraction output: {json_str}")
-        return extracted_record
+    except Exception as e:
+        logging.error("key phrase extraction Error: %s", e)
+        await asyncio.sleep(10)
+        try:
+            extracted_record = {
+                "recordId": record["recordId"],
+                "data": {},
+                "errors": None,
+                "warnings": None,
+            }
+            extracted_record["data"][
+                "keyPhrases"
+            ] = await extract_key_phrases_from_text(
+                [record["data"]["text"]], max_key_phrase_count
+            )
+        except Exception as inner_e:
+            logging.error("key phrase extraction Error: %s", inner_e)
+            logging.error(
+                "Failed to extract key phrase. Check function app logs for more details of exact failure."
+            )
+            return {
+                "recordId": record["recordId"],
+                "data": {},
+                "errors": [
+                    {
+                        "message": "Failed to extract key phrase. Check function app logs for more details of exact failure."
+                    }
+                ],
+                "warnings": None,
+            }
+    json_str = json.dumps(extracted_record, indent=4)
+
+    logging.info(f"key phrase extraction output: {json_str}")
+    return extracted_record

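The headline change in this file swaps the hand-rolled 429 handling (recursive calls with a retries_left counter and a 2**n sleep) for tenacity's declarative @retry decorator on a new per-batch helper. Below is a minimal, self-contained sketch of how that decorator behaves; flaky_call and its attempt counter are hypothetical stand-ins for the Text Analytics request, not code from this commit.

```python
import asyncio

from tenacity import retry
from tenacity.stop import stop_after_attempt
from tenacity.wait import wait_exponential

attempts = {"count": 0}


@retry(
    reraise=True,  # re-raise the last exception instead of wrapping it in RetryError
    stop=stop_after_attempt(3),  # give up after three attempts in total
    wait=wait_exponential(multiplier=1, min=1, max=10),  # roughly 1s, 2s, 4s..., capped at 10s
)
async def flaky_call() -> str:
    """Hypothetical stand-in for the extract_key_phrases batch request."""
    attempts["count"] += 1
    if attempts["count"] < 3:
        raise RuntimeError("simulated transient failure (e.g. HTTP 429)")
    return f"succeeded on attempt {attempts['count']}"


print(asyncio.run(flaky_call()))  # fails twice, then prints the success message
```

Unlike the old recursion, the decorator retries on any exception raised from the body, and reraise=True hands the original error back to the caller after the final attempt, which is why extract_key_phrases_from_batch can simply log and re-raise inside its except block.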
adi_function_app/pre_embedding_cleaner.py

Lines changed: 13 additions & 29 deletions
@@ -2,13 +2,7 @@
 # Licensed under the MIT License.
 import logging
 import json
-import nltk
 import re
-from nltk.tokenize import word_tokenize
-
-nltk.download("punkt")
-nltk.download("stopwords")
-nltk.download("punkt_tab")
 
 
 def get_section(cleaned_text: str) -> list:
@@ -69,38 +63,28 @@ def clean_text(src_text: str) -> str:
         str: The clean text."""
 
     try:
+        logging.info(f"Input text: {src_text}")
+        if len(src_text) == 0:
+            logging.error("Input text is empty")
+            raise ValueError("Input text is empty")
+
         # Define specific patterns for each tag
         tag_patterns = {
-            "figurecontent": r"<!--.*?FigureContent=(.*?)-->",
+            "figurecontent": r"<!-- FigureContent=(.*?)-->",
             "figure": r"<figure>(.*?)</figure>",
             "figures": r"\(figures/\d+\)(.*?)\(figures/\d+\)",
             "figcaption": r"<figcaption>(.*?)</figcaption>",
         }
         cleaned_text = remove_markdown_tags(src_text, tag_patterns)
 
-        # remove html tags
-        cleaned_text = re.sub(r"<.*?>", "", cleaned_text)
-
-        # Replace newline characters with spaces
-        cleaned_text = re.sub(r"\n", " ", cleaned_text)
-
-        # Replace multiple whitespace characters with a single space
-        cleaned_text = re.sub(r"\s+", " ", cleaned_text)
-
-        # remove stopwords
-        tokens = word_tokenize(cleaned_text, "english")
-        stop_words = nltk.corpus.stopwords.words("english")
-        filtered_tokens = [word for word in tokens if word not in stop_words]
-        cleaned_text = " ".join(filtered_tokens)
-
-        # remove special characters
-        cleaned_text = re.sub(r"[^a-zA-Z\s]", "", cleaned_text)
-
-        # remove extra white spaces
-        cleaned_text = " ".join([word for word in cleaned_text.split()])
+        # Updated regex to keep Unicode letters, punctuation, whitespace, currency symbols, and percentage signs,
+        # while also removing non-printable characters
+        cleaned_text = re.sub(r"[^\p{L}\p{P}\s\p{Sc}%\x20-\x7E]", "", cleaned_text)
 
-        # case normalization
-        cleaned_text = cleaned_text.lower()
+        logging.info(f"Cleaned text: {cleaned_text}")
+        if len(cleaned_text) == 0:
+            logging.error("Cleaned text is empty")
+            raise ValueError("Cleaned text is empty")
     except Exception as e:
         logging.error(f"An error occurred in clean_text: {e}")
         return ""

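One caveat on the new cleaning rule: \p{L}, \p{P} and \p{Sc} are Unicode property classes that the standard library re module rejects (bad escape \p); they are supported by the third-party regex package pinned in requirements.txt below. A small sketch of the intended filtering under that assumption, with an illustrative sample string:

```python
# Sketch of the Unicode-preserving cleanup rule, using the third-party
# "regex" package because stdlib re rejects \p{...} property classes.
import regex

# Keep Unicode letters, punctuation, whitespace, currency symbols, '%',
# and printable ASCII; strip everything else (e.g. control characters).
PATTERN = r"[^\p{L}\p{P}\s\p{Sc}%\x20-\x7E]"

sample = "Caf\u00e9 price: \u20ac5 (50%)\x00\x07"
print(regex.sub(PATTERN, "", sample))
# -> 'Café price: €5 (50%)'  (the control characters are gone)
```

As committed, pre_embedding_cleaner.py still imports stdlib re, so the substitution only works as described if the call is routed through the regex package (for example, import regex as re); otherwise clean_text falls into its except branch and returns an empty string.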
adi_function_app/requirements.txt

Lines changed: 1 addition & 1 deletion
@@ -9,7 +9,7 @@ pandas
 azure-identity
 openpyxl
 regex
-nltk==3.9.1
+tenacity
 bs4
 azure-search
 azure-search-documents

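The requirements change matches the code: nltk (and its punkt/stopwords downloads at import time) drops out with the deleted tokenization pipeline, and tenacity arrives for the retry decorator. A quick smoke test, assuming the updated file has been installed with pip install -r requirements.txt:

```python
# Verify the swapped dependencies resolve in the current environment.
from importlib.metadata import version

for pkg in ("tenacity", "regex"):  # regex backs the \p{...} cleanup pattern
    print(pkg, version(pkg))
```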