Azure-Samples
diff --git a/‎.azdo/pipelines/azure-dev.yml‎
Lines changed: 1 addition & 0 deletions b/‎.azdo/pipelines/azure-dev.yml‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎.github/workflows/azure-dev.yml‎
Lines changed: 1 addition & 0 deletions b/‎.github/workflows/azure-dev.yml‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎CONTRIBUTING.md‎
Lines changed: 3 additions & 1 deletion b/‎CONTRIBUTING.md‎
Lines changed: 3 additions & 1 deletion
diff --git a/‎README.md‎
Lines changed: 3 additions & 1 deletion b/‎README.md‎
Lines changed: 3 additions & 1 deletion
diff --git a/‎app/backend/prepdocs.py‎
Lines changed: 5 additions & 5 deletions b/‎app/backend/prepdocs.py‎
Lines changed: 5 additions & 5 deletions
diff --git a/‎app/backend/prepdocslib/cu_image.py‎
Lines changed: 28 additions & 33 deletions b/‎app/backend/prepdocslib/cu_image.py‎
Lines changed: 28 additions & 33 deletions
@@ -120,6 +120,7 @@ steps:
       DEPLOYMENT_TARGET: $(DEPLOYMENT_TARGET)
       AZURE_CONTAINER_APPS_WORKLOAD_PROFILE: $(AZURE_CONTAINER_APPS_WORKLOAD_PROFILE)
       USE_CHAT_HISTORY_BROWSER: $(USE_CHAT_HISTORY_BROWSER)
+      USE_MEDIA_DESCRIBER_AZURE_CU: $(USE_MEDIA_DESCRIBER_AZURE_CU)
   - task: AzureCLI@2
     displayName: Deploy Application
     inputs:
 
@@ -103,6 +103,7 @@ jobs:
       DEPLOYMENT_TARGET: ${{ vars.DEPLOYMENT_TARGET }}
       AZURE_CONTAINER_APPS_WORKLOAD_PROFILE: ${{ vars.AZURE_CONTAINER_APPS_WORKLOAD_PROFILE }}
       USE_CHAT_HISTORY_BROWSER: ${{ vars.USE_CHAT_HISTORY_BROWSER }}
+      USE_MEDIA_DESCRIBER_AZURE_CU: ${{ vars.USE_MEDIA_DESCRIBER_AZURE_CU }}
     steps:
       - name: Checkout
         uses: actions/checkout@v4
 
@@ -22,7 +22,7 @@ contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additio
 - [Running unit tests](#running-unit-tests)
 - [Running E2E tests](#running-e2e-tests)
 - [Code Style](#code-style)
-- [Adding new azd environment variables](#add-new-azd-environment-variables)
+- [Adding new azd environment variables](#adding-new-azd-environment-variables)
 
 ## Code of Conduct
 
@@ -166,6 +166,8 @@ If you followed the steps above to install the pre-commit hooks, then you can ju
 
 When adding new azd environment variables, please remember to update:
 
+1. [main.parameters.json](./main.parameters.json)
+1. [appEnvVariables in main.bicep](./main.bicep)
 1. App Service's [azure.yaml](./azure.yaml)
 1. [ADO pipeline](.azdo/pipelines/azure-dev.yml)
 1. [GitHub workflows](.github/workflows/azure-dev.yml)
@@ -92,7 +92,9 @@ However, you can try the [Azure pricing calculator](https://azure.com/e/e3490de2
 - Azure AI Document Intelligence: S0 (Standard) tier using pre-built layout. Pricing per document page, sample documents have 261 pages total. [Pricing](https://azure.microsoft.com/pricing/details/form-recognizer/)
 - Azure AI Search: Basic tier, 1 replica, free level of semantic search. Pricing per hour. [Pricing](https://azure.microsoft.com/pricing/details/search/)
 - Azure Blob Storage: Standard tier with ZRS (Zone-redundant storage). Pricing per storage and read operations. [Pricing](https://azure.microsoft.com/pricing/details/storage/blobs/)
-- Azure Cosmos DB: Serverless tier. Pricing per request unit and storage. [Pricing](https://azure.microsoft.com/pricing/details/cosmos-db/)
+- Azure Cosmos DB: Only provisioned if you enabled [chat history with Cosmos DB](docs/deploy_features.md#enabling-persistent-chat-history-with-azure-cosmos-db). Serverless tier. Pricing per request unit and storage. [Pricing](https://azure.microsoft.com/pricing/details/cosmos-db/)
+- Azure AI Vision: Only provisioned if you enabled [GPT-4 with vision](docs/gpt4v.md). Pricing per 1K transactions. [Pricing](https://azure.microsoft.com/pricing/details/cognitive-services/computer-vision/)
+- Azure AI Content Understanding: Only provisioned if you enabled [media description](docs/deploy_features.md#enabling-media-description-with-azure-content-understanding). Pricing per 1K images. [Pricing](https://azure.microsoft.com/pricing/details/content-understanding/)
 - Azure Monitor: Pay-as-you-go tier. Costs based on data ingested. [Pricing](https://azure.microsoft.com/pricing/details/monitor/)
 
 To reduce costs, you can switch to free SKUs for various services, but those SKUs have limitations.
 
@@ -7,6 +7,7 @@
 from azure.core.credentials import AzureKeyCredential
 from azure.core.credentials_async import AsyncTokenCredential
 from azure.identity.aio import AzureDeveloperCliCredential, get_bearer_token_provider
+from rich.logging import RichHandler
 
 from load_azd_env import load_azd_env
 from prepdocslib.blobmanager import BlobManager
@@ -161,7 +162,7 @@ def setup_file_processors(
     use_content_understanding: bool = False,
     content_understanding_endpoint: Union[str, None] = None,
 ):
-    sentence_text_splitter = SentenceTextSplitter(has_image_embeddings=search_images)
+    sentence_text_splitter = SentenceTextSplitter()
 
     doc_int_parser: Optional[DocumentAnalysisParser] = None
     # check if Azure Document Intelligence credentials are provided
@@ -245,8 +246,7 @@ async def main(strategy: Strategy, setup_index: bool = True):
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(
-        description="Prepare documents by extracting content from PDFs, splitting content into sections, uploading to blob storage, and indexing in a search index.",
-        epilog="Example: prepdocs.py '.\\data\*' -v",
+        description="Prepare documents by extracting content from PDFs, splitting content into sections, uploading to blob storage, and indexing in a search index."
     )
     parser.add_argument("files", nargs="?", help="Files to be processed")
 
@@ -299,7 +299,7 @@ async def main(strategy: Strategy, setup_index: bool = True):
     args = parser.parse_args()
 
     if args.verbose:
-        logging.basicConfig(format="%(message)s")
+        logging.basicConfig(format="%(message)s", datefmt="[%X]", handlers=[RichHandler(rich_tracebacks=True)])
         # We only set the level to DEBUG for our own logger (not the root logger),
         # to avoid seeing the noisy INFO level logs from the Azure SDKs
         logger.setLevel(logging.DEBUG)
@@ -310,7 +310,7 @@ async def main(strategy: Strategy, setup_index: bool = True):
     use_gptvision = os.getenv("USE_GPT4V", "").lower() == "true"
     use_acls = os.getenv("AZURE_ADLS_GEN2_STORAGE_ACCOUNT") is not None
     dont_use_vectors = os.getenv("USE_VECTORS", "").lower() == "false"
-    use_content_understanding = os.getenv("USE_CONTENT_UNDERSTANDING", "").lower() == "true"
+    use_content_understanding = os.getenv("USE_MEDIA_DESCRIBER_AZURE_CU", "").lower() == "true"
 
     # Use the current user identity to connect to Azure services. See infra/main.bicep for role assignments.
     if tenant_id := os.getenv("AZURE_TENANT_ID"):
 
@@ -4,6 +4,7 @@
 import aiohttp
 from azure.core.credentials_async import AsyncTokenCredential
 from azure.identity.aio import get_bearer_token_provider
+from rich.progress import Progress
 from tenacity import retry, retry_if_exception_type, stop_after_attempt, wait_fixed
 
 logger = logging.getLogger("scripts")
@@ -44,7 +45,23 @@ def __init__(self, endpoint: str, credential: Union[AsyncTokenCredential, str]):
         self.endpoint = endpoint
         self.credential = credential
 
+    async def poll_api(self, session, poll_url, headers):
+        """Poll an operation URL until it no longer reports a "Running" status.
+
+        Args:
+            session: HTTP session used for the GET requests (the callers in
+                this file pass an ``aiohttp.ClientSession``).
+            poll_url: URL to poll; callers pass the ``Operation-Location``
+                header value returned by the service.
+            headers: Request headers sent with every poll.
+
+        Returns:
+            The parsed JSON body of the first response whose ``status`` is
+            neither ``"Failed"`` nor ``"Running"``.
+
+        Raises:
+            Exception: If the response reports ``status == "Failed"``. This is
+                not a ``ValueError``, so it is not retried.
+            Whatever ``response.raise_for_status()`` raises for an HTTP error
+            status; it is retried only if it is a ``ValueError``.
+        """
+
+        # Only ValueError triggers a retry: at most 60 attempts, 2 seconds apart.
+        # NOTE(review): once the attempts are exhausted tenacity raises its own
+        # error (RetryError by default, since reraise is not set) - confirm
+        # callers are fine with that.
+        @retry(stop=stop_after_attempt(60), wait=wait_fixed(2), retry=retry_if_exception_type(ValueError))
+        async def poll():
+            async with session.get(poll_url, headers=headers) as response:
+                response.raise_for_status()
+                response_json = await response.json()
+                if response_json["status"] == "Failed":
+                    raise Exception("Failed")
+                if response_json["status"] == "Running":
+                    # ValueError is the signal for the retry decorator to poll again.
+                    raise ValueError("Running")
+                # NOTE(review): any other status is treated as final and returned
+                # as-is - confirm the service cannot report another non-terminal
+                # state here (e.g. a "not started" state).
+                return response_json
+
+        return await poll()
+
     async def create_analyzer(self):
+        logger.info("Creating analyzer '%s'...", image_schema["analyzerId"])
 
         token_provider = get_bearer_token_provider(self.credential, "https://cognitiveservices.azure.com/.default")
         token = await token_provider()
@@ -55,33 +72,21 @@ async def create_analyzer(self):
         async with aiohttp.ClientSession() as session:
             async with session.put(url=cu_endpoint, params=params, headers=headers, json=image_schema) as response:
                 if response.status == 409:
-                    print(f"Analyzer '{analyzer_id}' already exists.")
+                    logger.info("Analyzer '%s' already exists.", analyzer_id)
                     return
                 elif response.status != 201:
                     data = await response.text()
-                    # TODO: log it
-                    print(data)
+                    logger.error("Error creating analyzer: %s", data)
                     response.raise_for_status()
                 else:
                     poll_url = response.headers.get("Operation-Location")
 
-            @retry(stop=stop_after_attempt(60), wait=wait_fixed(2))
-            async def poll():
-                async with session.get(poll_url, headers=headers) as response:
-                    response.raise_for_status()
-                    response_json = await response.json()
-                    if response_json["status"] != "Succeeded":
-                        raise ValueError("Retry")
-
-            await poll()
+            with Progress() as progress:
+                progress.add_task("Creating analyzer...", total=None, start=False)
+                await self.poll_api(session, poll_url, headers)
 
-    def run_cu_image(self, analyzer_name, image):
-        result = self.run_inference(analyzer_name, image)
-        model_output = result["result"]["contents"][0]["fields"]
-        model_output_raw = str(model_output)
-        return model_output, model_output_raw
-
-    async def verbalize_figure(self, image_bytes) -> str:
+    async def describe_image(self, image_bytes) -> str:
+        logger.info("Sending image to Azure Content Understanding service...")
         async with aiohttp.ClientSession() as session:
             token = await self.credential.get_token("https://cognitiveservices.azure.com/.default")
             headers = {"Authorization": "Bearer " + token.token}
@@ -96,19 +101,9 @@ async def verbalize_figure(self, image_bytes) -> str:
                 response.raise_for_status()
                 poll_url = response.headers["Operation-Location"]
 
-                @retry(stop=stop_after_attempt(60), wait=wait_fixed(2), retry=retry_if_exception_type(ValueError))
-                async def poll():
-                    async with session.get(poll_url, headers=headers) as response:
-                        response.raise_for_status()
-                        response_json = await response.json()
-                        print(response_json)
-                        # rich.print it all pretty progress-y
-                        if response_json["status"] == "Failed":
-                            raise Exception("Failed")
-                        if response_json["status"] == "Running":
-                            raise ValueError("Running")
-                        return response_json
-
-                results = await poll()
+                with Progress() as progress:
+                    progress.add_task("Processing...", total=None, start=False)
+                    results = await self.poll_api(session, poll_url, headers)
+
                 fields = results["result"]["contents"][0]["fields"]
                 return fields["DescriptionHTML"]["valueString"]