revise for comments

chienyuanchang · chienyuanchang · commit 32f4a3e48649 · 2025-07-16T22:48:32.000Z
diff --git a/python/content_understanding_client.py b/python/content_understanding_client.py
@@ -12,6 +12,9 @@
 from azure.storage.blob.aio import ContainerClient
 
 
+POLL_TIMEOUT_SECONDS = 120
+
+
 class AzureContentUnderstandingClient:
 
     PREBUILT_DOCUMENT_ANALYZER_ID: str = "prebuilt-documentAnalyzer"
@@ -127,43 +130,39 @@ def _get_headers(
         return headers
     
     @staticmethod
-    def is_supported_type_by_file_path(file_path: Path, is_pro_mode: bool=False) -> bool:
+    def is_supported_type_by_file_ext(file_ext: str, is_pro_mode: bool=False) -> bool:
         """
-        Checks if the given file path has a supported file type.
+        Checks if the given file extension is supported.
 
         Args:
-            file_path (Path): The path to the file to check.
+            file_ext (str): The file extension to check.
             is_pro_mode (bool): If True, checks against Pro mode supported file types.
 
         Returns:
             bool: True if the file type is supported, False otherwise.
         """
-        if not file_path.is_file():
-            return False
-        file_ext = file_path.suffix.lower()
         supported_types = (
             AzureContentUnderstandingClient.SUPPORTED_FILE_TYPES_PRO_MODE
             if is_pro_mode else AzureContentUnderstandingClient.SUPPORTED_FILE_TYPES
         )
-        return file_ext in supported_types
-
+        return file_ext.lower() in supported_types
+    
     @staticmethod
-    def is_supported_type_by_file_ext(file_ext: str, is_pro_mode: bool=False) -> bool:
+    def is_supported_type_by_file_path(file_path: Path, is_pro_mode: bool=False) -> bool:
         """
-        Checks if the given file extension is supported.
+        Checks if the given file path has a supported file type.
 
         Args:
-            file_ext (str): The file extension to check.
+            file_path (Path): The path to the file to check.
             is_pro_mode (bool): If True, checks against Pro mode supported file types.
 
         Returns:
             bool: True if the file type is supported, False otherwise.
         """
-        supported_types = (
-            AzureContentUnderstandingClient.SUPPORTED_FILE_TYPES_PRO_MODE
-            if is_pro_mode else AzureContentUnderstandingClient.SUPPORTED_FILE_TYPES
-        )
-        return file_ext.lower() in supported_types
+        if not file_path.is_file():
+            return False
+        file_ext = file_path.suffix.lower()
+        return AzureContentUnderstandingClient.is_supported_type_by_file_ext(file_ext, is_pro_mode)
 
     def get_all_analyzers(self) -> Dict[str, Any]:
         """
@@ -315,10 +314,10 @@ def begin_analyze(self, analyzer_id: str, file_location: str) -> Response:
                 data = {
                     "inputs": [
                         {
-                            "name": f.name,
+                            "name": "_".join(f.relative_to(file_path).parts),  # flatten the relative file path into a single string using underscores
                             "data": base64.b64encode(f.read_bytes()).decode("utf-8")
                         }
-                        for f in file_path.iterdir()
+                        for f in file_path.rglob("*")
                         if f.is_file() and self.is_supported_type_by_file_path(f, is_pro_mode=True)
                     ]
                 }
@@ -365,7 +364,7 @@ def get_prebuilt_document_analyze_result(self, file_location: str) -> Dict[str,
             file_location=file_location,
         )
         
-        return self.poll_result(response, timeout_seconds=360)
+        return self.poll_result(response, timeout_seconds=POLL_TIMEOUT_SECONDS)
     
     async def _upload_file_to_blob(
         self, container_client: ContainerClient, file_path: str, target_blob_path: str
@@ -397,40 +396,40 @@ async def generate_knowledge_base_on_blob(
         storage_container_path_prefix: str,
         skip_analyze: bool = False,
     ) -> None:
-        container_client = ContainerClient.from_container_url(storage_container_sas_url)
+        if not storage_container_path_prefix.endswith("/"):
+            storage_container_path_prefix += "/"
         resources = []
-        for dirpath, _, filenames in os.walk(reference_docs_folder):
-            for filename in filenames:
-                filename_no_ext, file_ext = os.path.splitext(filename)
-                if self.is_supported_type_by_file_ext(file_ext, is_pro_mode=True):
-                    file_path = os.path.join(dirpath, filename)
-                    result_file_name = filename_no_ext + self.OCR_RESULT_FILE_SUFFIX
-                    result_file_blob_path = storage_container_path_prefix + result_file_name
-                    # Get and upload result.json
-                    if not skip_analyze:
-                        self._logger.info(f"Analyzing result for {filename}")
-                        try:
-                            analyze_result = self.get_prebuilt_document_analyze_result(file_path)
-                        except Exception as e:
-                            self._logger.error(f"Error of getting analyze result of {filename}: {e}")
-                            continue
-                        await self._upload_json_to_blob(container_client, analyze_result, result_file_blob_path)
-                    else:
-                        self._logger.info(f"Using existing result.json for {filename}")
-                        result_file_path = os.path.join(dirpath, result_file_name)
-                        if not os.path.exists(result_file_path):
-                            self._logger.warning(f"Result file {result_file_name} does not exist, skipping.")
-                            continue
-                        await self._upload_file_to_blob(container_client, result_file_path, result_file_blob_path)
-                    # Upload the original file
-                    file_blob_path = storage_container_path_prefix + filename
-                    await self._upload_file_to_blob(container_client, file_path, file_blob_path)
-                    resources.append({"file": filename, "resultFile": result_file_name})
-        # Upload sources.jsonl
-        await self.upload_jsonl_to_blob(
-            container_client, resources, storage_container_path_prefix + self.KNOWLEDGE_SOURCE_LIST_FILE_NAME)
-        await container_client.close()
-
+        async with ContainerClient.from_container_url(storage_container_sas_url) as container_client:
+            for dirpath, _, filenames in os.walk(reference_docs_folder):
+                for filename in filenames:
+                    filename_no_ext, file_ext = os.path.splitext(filename)
+                    if self.is_supported_type_by_file_ext(file_ext, is_pro_mode=True):
+                        file_path = os.path.join(dirpath, filename)
+                        result_file_name = filename_no_ext + self.OCR_RESULT_FILE_SUFFIX
+                        result_file_blob_path = storage_container_path_prefix + result_file_name
+                        # Get and upload result.json
+                        if not skip_analyze:
+                            self._logger.info(f"Analyzing result for {filename}")
+                            try:
+                                analyze_result = self.get_prebuilt_document_analyze_result(file_path)
+                            except Exception as e:
+                                self._logger.error(f"Error of getting analyze result of {filename}: {e}")
+                                continue
+                            await self._upload_json_to_blob(container_client, analyze_result, result_file_blob_path)
+                        else:
+                            self._logger.info(f"Using existing result.json for {filename}")
+                            result_file_path = os.path.join(dirpath, result_file_name)
+                            if not os.path.exists(result_file_path):
+                                self._logger.warning(f"Result file {result_file_name} does not exist, skipping.")
+                                continue
+                            await self._upload_file_to_blob(container_client, result_file_path, result_file_blob_path)
+                        # Upload the original file
+                        file_blob_path = storage_container_path_prefix + filename
+                        await self._upload_file_to_blob(container_client, file_path, file_blob_path)
+                        resources.append({"file": filename, "resultFile": result_file_name})
+            # Upload sources.jsonl
+            await self.upload_jsonl_to_blob(
+                container_client, resources, storage_container_path_prefix + self.KNOWLEDGE_SOURCE_LIST_FILE_NAME)
 
     def get_image_from_analyze_operation(
         self, analyze_response: Response, image_id: str
@@ -552,7 +551,7 @@ def begin_classify(self, classifier_id: str, file_location: str) -> Response:
     def poll_result(
         self,
         response: Response,
-        timeout_seconds: int = 120,
+        timeout_seconds: int = POLL_TIMEOUT_SECONDS,
         polling_interval_seconds: int = 2,
     ) -> Dict[str, Any]:
         """