microsoft
diff --git a/‎adi_function_app/function_app.py‎
Lines changed: 57 additions & 0 deletions b/‎adi_function_app/function_app.py‎
Lines changed: 57 additions & 0 deletions
diff --git a/‎adi_function_app/pre_embedding_cleaner.py‎
Lines changed: 3 additions & 41 deletions b/‎adi_function_app/pre_embedding_cleaner.py‎
Lines changed: 3 additions & 41 deletions
diff --git a/‎adi_function_app/requirements.txt‎
Lines changed: 3 additions & 0 deletions b/‎adi_function_app/requirements.txt‎
Lines changed: 3 additions & 0 deletions
@@ -8,6 +8,7 @@
 from adi_2_ai_search import process_adi_2_ai_search
 from pre_embedding_cleaner import process_pre_embedding_cleaner
 from key_phrase_extraction import process_key_phrase_extraction
+from semantic_text_chunker import process_semantic_text_chunker, SemanticTextChunker
 
 logging.basicConfig(level=logging.DEBUG)
 app = func.FunctionApp(http_auth_level=func.AuthLevel.FUNCTION)
@@ -87,6 +88,62 @@ async def pre_embedding_cleaner(req: func.HttpRequest) -> func.HttpResponse:
         )
 
 
+@app.route(route="semantic_text_chunker", methods=[func.HttpMethod.POST])
+async def semantic_text_chunker(req: func.HttpRequest) -> func.HttpResponse:
+    """HTTP trigger for text chunking function.
+
+    Chunker settings are read from the request headers, falling back to
+    defaults: num_surrounding_sentences (1), similarity_threshold (0.8) and
+    max_chunk_tokens (500). Header values are converted to numbers here.
+
+    Args:
+        req (func.HttpRequest): The HTTP request object.
+
+    Returns:
+        func.HttpResponse: JSON of the form {"values": [...]} with status 200,
+        or a 400 response when the body or a setting header cannot be parsed."""
+    logging.info("Python HTTP trigger text chunking function processed a request.")
+
+    try:
+        req_body = req.get_json()
+        values = req_body.get("values")
+        config = req.headers
+
+        # Header values arrive as strings, so convert explicitly; a malformed
+        # value raises ValueError and is reported as a 400 below.
+        num_surrounding_sentences = int(config.get("num_surrounding_sentences", 1))
+        similarity_threshold = float(config.get("similarity_threshold", 0.8))
+        max_chunk_tokens = int(config.get("max_chunk_tokens", 500))
+
+    except ValueError:
+        return func.HttpResponse(
+            "Please provide a valid Custom Skill Payload in the request body",
+            status_code=400,
+        )
+    else:
+        logging.debug("Input Values: %s", values)
+
+        # Named `chunker` so the local does not shadow this handler's own name.
+        chunker = SemanticTextChunker(
+            num_surrounding_sentences=num_surrounding_sentences,
+            similarity_threshold=similarity_threshold,
+            max_chunk_tokens=max_chunk_tokens,
+        )
+
+        record_tasks = [
+            asyncio.create_task(process_semantic_text_chunker(value, chunker))
+            for value in values
+        ]
+
+        results = await asyncio.gather(*record_tasks)
+        logging.debug("Results: %s", results)
+        cleaned_tasks = {"values": results}
+
+        return func.HttpResponse(
+            json.dumps(cleaned_tasks), status_code=200, mimetype="application/json"
+        )
+
+
 @app.route(route="key_phrase_extractor", methods=[func.HttpMethod.POST])
 async def key_phrase_extractor(req: func.HttpRequest) -> func.HttpResponse:
     """HTTP trigger for data cleanup function.
 
@@ -5,30 +5,6 @@
 import re
 
 
-def get_sections(cleaned_text: str) -> list:
-    """
-    Returns the section details from the content
-
-    Args:
-        cleaned_text: The input text
-
-    Returns:
-        list: The sections related to text
-
-    """
-    combined_pattern = r"(.*?)\n===|\n#+\s*(.*?)\n"
-    doc_metadata = re.findall(combined_pattern, cleaned_text, re.DOTALL)
-    doc_metadata = [match for group in doc_metadata for match in group if match]
-    return clean_sections(doc_metadata)
-
-
-def clean_sections(sections: list) -> list:
-    """Cleans the sections by removing special characters and extra white spaces."""
-    cleanedSections = [re.sub(r"[=#]", "", match).strip() for match in sections]
-
-    return cleanedSections
-
-
 def remove_markdown_tags(text: str, tag_patterns: dict) -> str:
     """
     Remove specified Markdown tags from the text, keeping the contents of the tags.
@@ -52,7 +28,7 @@ def remove_markdown_tags(text: str, tag_patterns: dict) -> str:
     return text
 
 
-def clean_text_with_section_extraction(src_text: str) -> tuple[str, str]:
+def clean_text(src_text: str) -> str:
     """This function performs following cleanup activities on the text, remove all unicode characters
     remove line spacing,remove stop words, normalize characters
 
@@ -77,8 +53,6 @@ def clean_text_with_section_extraction(src_text: str) -> tuple[str, str]:
         }
         cleaned_text = remove_markdown_tags(src_text, tag_patterns)
 
-        sections = get_sections(cleaned_text)
-
         # Updated regex to keep Unicode letters, punctuation, whitespace, currency symbols, and percentage signs,
         # while also removing non-printable characters
         cleaned_text = re.sub(r"[^\p{L}\p{P}\s\p{Sc}%\x20-\x7E]", "", cleaned_text)
@@ -90,7 +64,7 @@ def clean_text_with_section_extraction(src_text: str) -> tuple[str, str]:
     except Exception as e:
         logging.error(f"An error occurred in clean_text: {e}")
         return ""
-    return cleaned_text, sections
+    return cleaned_text
 
 
 async def process_pre_embedding_cleaner(record: dict) -> dict:
@@ -114,19 +88,7 @@ async def process_pre_embedding_cleaner(record: dict) -> dict:
             "warnings": None,
         }
 
-        # scenarios when page by chunking is enabled
-        if isinstance(record["data"]["chunk"], dict):
-            (
-                cleaned_record["data"]["cleanedChunk"],
-                cleaned_record["data"]["sections"],
-            ) = clean_text_with_section_extraction(record["data"]["chunk"]["content"])
-            cleaned_record["data"]["chunk"] = record["data"]["chunk"]["content"]
-        else:
-            (
-                cleaned_record["data"]["cleanedChunk"],
-                cleaned_record["data"]["sections"],
-            ) = clean_text_with_section_extraction(record["data"]["chunk"])
-            cleaned_record["data"]["chunk"] = record["data"]["chunk"]
+        cleaned_record["data"]["cleaned_chunk"] = clean_text(record["data"]["content"])
 
     except Exception as e:
         logging.error("string cleanup Error: %s", e)
 
@@ -19,3 +19,6 @@ azure-ai-vision-imageanalysis
 PyMuPDF
 aiohttp
 Pillow
+spacy
+en-core-web-md @ https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.7.1/en_core_web_md-3.7.1.tar.gz
+tiktoken