diff --git a/notebooks/content_extraction.ipynb b/notebooks/content_extraction.ipynb index 24174cb..bda4209 100644 --- a/notebooks/content_extraction.ipynb +++ b/notebooks/content_extraction.ipynb @@ -11,7 +11,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "This notebook demonstrate you can use Content Understanding API to extract semantic content from multimodal files." + "This notebook demonstrates how to use the Content Understanding API to extract semantic content from multimodal files." ] }, { @@ -19,8 +19,8 @@ "metadata": {}, "source": [ "## Prerequisites\n", - "1. Ensure Azure AI service is configured following [steps](../README.md#configure-azure-ai-service-resource)\n", - "2. Install the required packages to run the sample." + "1. Ensure your Azure AI service is configured by following the [configuration steps](../README.md#configure-azure-ai-service-resource).\n", + "2. Install the required packages to run this sample." ] }, { @@ -38,14 +38,16 @@ "source": [ "## Create Azure AI Content Understanding Client\n", "\n", - "> The [AzureContentUnderstandingClient](../python/content_understanding_client.py) is a utility class containing functions to interact with the Content Understanding API. Before the official release of the Content Understanding SDK, it can be regarded as a lightweight SDK. Fill the constant **AZURE_AI_ENDPOINT**, **AZURE_AI_API_VERSION**, **AZURE_AI_API_KEY** with the information from your Azure AI Service.\n", + "> The [AzureContentUnderstandingClient](../python/content_understanding_client.py) is a utility class that provides functions to interact with the Content Understanding API. Prior to the official release of the Content Understanding SDK, it serves as a lightweight SDK.\n", + ">\n", + "> Fill in the constants **AZURE_AI_ENDPOINT**, **AZURE_AI_API_VERSION**, and **AZURE_AI_API_KEY** with the details from your Azure AI Service.\n", "\n", "> ⚠️ Important:\n", - "You must update the code below to match your Azure authentication method.\n", - "Look for the `# IMPORTANT` comments and modify those sections accordingly.\n", - "If you skip this step, the sample may not run correctly.\n", + "You must update the code below to use your preferred Azure authentication method.\n", + "Look for the `# IMPORTANT` comments in the code and modify those sections accordingly.\n", + "Skipping this step may cause the sample to not run correctly.\n", "\n", - "> ⚠️ Note: Using a subscription key works, but using a token provider with Azure Active Directory (AAD) is much safer and is highly recommended for production environments." + "> ⚠️ Note: While using a subscription key is supported, it is strongly recommended to use a token provider with Azure Active Directory (AAD) for enhanced security in production environments." 
] }, { @@ -66,9 +68,9 @@ "load_dotenv(find_dotenv())\n", "logging.basicConfig(level=logging.INFO)\n", "\n", - "# For authentication, you can use either token-based auth or subscription key, and only one of them is required\n", + "# For authentication, you can use either token-based auth or subscription key; only one is required\n", "AZURE_AI_ENDPOINT = os.getenv(\"AZURE_AI_ENDPOINT\")\n", - "# IMPORTANT: Replace with your actual subscription key or set up in \".env\" file if not using token auth\n", + "# IMPORTANT: Replace with your actual subscription key or set it in your \".env\" file if not using token authentication\n", "AZURE_AI_API_KEY = os.getenv(\"AZURE_AI_API_KEY\")\n", "AZURE_AI_API_VERSION = os.getenv(\"AZURE_AI_API_VERSION\", \"2025-05-01-preview\")\n", "\n", @@ -85,9 +87,9 @@ " api_version=AZURE_AI_API_VERSION,\n", " # IMPORTANT: Comment out token_provider if using subscription key\n", " token_provider=token_provider,\n", - " # IMPORTANT: Uncomment this if using subscription key\n", + " # IMPORTANT: Uncomment the following line if using subscription key\n", " # subscription_key=AZURE_AI_API_KEY,\n", - " x_ms_useragent=\"azure-ai-content-understanding-python/content_extraction\", # This header is used for sample usage telemetry, please comment out this line if you want to opt out.\n", + " x_ms_useragent=\"azure-ai-content-understanding-python/content_extraction\", # This header is used for sample usage telemetry; comment out if you want to opt out.\n", ")\n", "\n", "# Utility function to save images\n", @@ -100,6 +102,7 @@ " image_id=image_id\n", " )\n", " image = Image.open(BytesIO(raw_image))\n", + " # To display the image, uncomment the following line:\n", " # image.show()\n", " Path(\".cache\").mkdir(exist_ok=True)\n", " image.save(f\".cache/{image_id}.jpg\", \"JPEG\")\n" @@ -111,8 +114,7 @@ "source": [ "## Document Content\n", "\n", - "Content Understanding API is designed to extract all textual content from a specified document file. In addition to text extraction, it conducts a comprehensive layout analysis to identify and categorize tables and figures within the document. The output is then presented in a structured markdown format, ensuring clarity and ease of interpretation.\n", - "\n" + "The Content Understanding API extracts all textual content from a given document file. In addition to text extraction, it performs a thorough layout analysis to identify and categorize tables and figures within the document. The output is presented in a structured markdown format, ensuring clarity and ease of use." ] }, { @@ -124,7 +126,7 @@ "ANALYZER_SAMPLE_FILE = '../data/invoice.pdf'\n", "ANALYZER_ID = 'prebuilt-documentAnalyzer'\n", "\n", - "# Analyzer file\n", + "# Analyze document file\n", "response = client.begin_analyze(ANALYZER_ID, file_location=ANALYZER_SAMPLE_FILE)\n", "result_json = client.poll_result(response)\n", "\n", @@ -135,7 +137,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "> The markdown output contains layout information, which is very useful for Retrieval-Augmented Generation (RAG) scenarios. You can paste the markdown into a viewer such as Visual Studio Code and preview the layout structure." + "> The markdown output contains detailed layout information, which is especially useful for Retrieval-Augmented Generation (RAG) scenarios. You can paste the markdown into a viewer such as Visual Studio Code to preview the layout structure." 
] }, { @@ -144,14 +146,14 @@ "metadata": {}, "outputs": [], "source": [ - "print(result_json[\"result\"][\"contents\"][0][\"markdown\"])" + "print(result_json[\"result\"][\"contents\"][0][\"markdown\"])\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "> You can get the layout information, including ```words/lines``` in the pagesnode and paragraphs info in ```paragraphs```, and ```tables``` in the table." + "> You can access layout information including `words` and `lines` within the `pages` node, paragraph details under `paragraphs`, and tables listed in the `tables` section." ] }, { @@ -167,7 +169,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "> This statement allows you to get structural information of the tables in the documents." + "> This output helps you retrieve structural information about the tables embedded within the document." ] }, { @@ -184,17 +186,15 @@ "metadata": {}, "source": [ "## Audio Content\n", - "Our API output facilitates detailed analysis of spoken language, allowing developers to utilize the data for various applications, such as voice recognition, customer service analytics, and conversational AI. The structure of the output makes it easy to extract and analyze different components of the conversation for further processing or insights.\n", + "The API provides detailed analysis of spoken language, enabling developers to build applications such as voice recognition, customer service analytics, and conversational AI. The output structure facilitates extraction and analysis of different conversation components for further processing or insights.\n", "\n", - "1. Speaker Identification: Each phrase is attributed to a specific speaker (in this case, \"Speaker 2\"). This allows for clarity in conversations with multiple participants.\n", - "1. Timing Information: Each transcription includes precise timing data:\n", - " - startTimeMs: The time (in milliseconds) when the phrase begins.\n", - " - endTimeMs: The time (in milliseconds) when the phrase ends.\n", - " This information is crucial for applications like video subtitles, allowing synchronization between the audio and the text.\n", - "1. Text Content: The actual spoken text is provided, which in this instance is \"Thank you for calling Woodgrove Travel.\" This is the main content of the transcription.\n", - "1. Confidence Score: Each transcription phrase includes a confidence score (0.933 in this case), indicating the likelihood that the transcription is accurate. A higher score suggests greater reliability.\n", - "1. Word-Level Breakdown: The transcription is further broken down into individual words, each with its own timing information. This allows for detailed analysis of speech patterns and can be useful for applications such as language processing or speech recognition improvement.\n", - "1. Locale Specification: The locale is specified as \"en-US,\" indicating that the transcription is in American English. This is important for ensuring that the transcription algorithms account for regional dialects and pronunciations.\n" + "Key features include:\n", + "1. **Speaker Identification:** Each phrase is linked to a specific speaker (e.g., \"Speaker 2\"), enabling clear differentiation in multi-participant conversations.\n", + "2. **Timing Information:** Each transcription includes precise start and end times (in milliseconds), crucial for applications like video subtitles and audio-text synchronization.\n", + "3. 
**Text Content:** The actual spoken text, such as \"Thank you for calling Woodgrove Travel,\" representing the main transcription.\n", + "4. **Confidence Score:** Each phrase has a confidence score (e.g., 0.933) indicating transcription reliability.\n", + "5. **Word-Level Breakdown:** Detailed timing for each word supports advanced speech analysis and improvements in speech recognition.\n", + "6. **Locale Specification:** The locale (e.g., \"en-US\") informs the transcription process of regional dialects and pronunciation nuances." ] }, { @@ -206,7 +206,7 @@ "ANALYZER_SAMPLE_FILE = '../data/audio.wav'\n", "ANALYZER_ID = 'prebuilt-audioAnalyzer'\n", "\n", - "# Analyzer file\n", + "# Analyze audio file\n", "response = client.begin_analyze(ANALYZER_ID, file_location=ANALYZER_SAMPLE_FILE)\n", "result_json = client.poll_result(response)\n", "\n", @@ -218,14 +218,14 @@ "metadata": {}, "source": [ "## Video Content\n", - "Video output provides detailed information about audiovisual content, specifically video shots. Here are the key features it offers:\n", + "The video output provides detailed metadata about audiovisual content, specifically video shots. Key features include:\n", "\n", - "1. Shot Information: Each shot is defined by a start and end time, along with a unique identifier. For example, Shot 0:0.0 to 0:2.800 includes a transcript and key frames.\n", - "1. Transcript: The API includes a transcript of the audio, formatted in WEBVTT, which allows for easy synchronization with the video. It captures spoken content and specifies the timing of the dialogue.\n", - "1. Key Frames: It provides a series of key frames (images) that represent important moments in the video shot, allowing users to visualize the content at specific timestamps.\n", - "1. Description: Each shot is accompanied by a description, providing context about the visuals presented. This helps in understanding the scene or subject matter without watching the video.\n", - "1. Audio Visual Metadata: Details about the video such as dimensions (width and height), type (audiovisual), and the presence of key frame timestamps are included.\n", - "1. Transcript Phrases: The output includes specific phrases from the transcript, along with timing and speaker information, enhancing the usability for applications like closed captioning or search functionalities." + "1. **Shot Information:** Each shot has a start and end time with a unique identifier. For example, Shot 0 from 0:0.0 to 0:2.800 includes a transcript and key frames.\n", + "2. **Transcript:** Audio transcripts formatted in WEBVTT facilitate synchronization with video playback.\n", + "3. **Key Frames:** A collection of key frames (images) represent important moments in the video, allowing visualization of specific timestamps.\n", + "4. **Description:** Each shot includes a descriptive summary, providing context about the visuals.\n", + "5. **AudioVisual Metadata:** Information such as video dimensions (width, height), type (audiovisual), and key frame timestamps.\n", + "6. **Transcript Phrases:** Specific dialog phrases with timing and speaker attribution enhance usability for applications like closed captioning and search." 
] }, { @@ -237,7 +237,7 @@ "ANALYZER_SAMPLE_FILE = '../data/FlightSimulator.mp4'\n", "ANALYZER_ID = 'prebuilt-videoAnalyzer'\n", "\n", - "# Analyzer file\n", + "# Analyze video file\n", "response = client.begin_analyze(ANALYZER_ID, file_location=ANALYZER_SAMPLE_FILE)\n", "result_json = client.poll_result(response)\n", "\n", @@ -248,14 +248,13 @@ "result_data = result_json.get(\"result\", {})\n", "contents = result_data.get(\"contents\", [])\n", "\n", - "# Iterate over contents to find keyframes if available\n", + "# Extract keyframe IDs from markdown content\n", "for content in contents:\n", - " # Extract keyframe IDs from \"markdown\" if it exists and is a string\n", " markdown_content = content.get(\"markdown\", \"\")\n", " if isinstance(markdown_content, str):\n", " keyframe_ids.update(re.findall(r\"(keyFrame\\.\\d+)\\.jpg\", markdown_content))\n", "\n", - "# Output the results\n", + "# Output unique keyframe IDs\n", "print(\"Unique Keyframe IDs:\", keyframe_ids)\n", "\n", "# Save all keyframe images\n", @@ -267,8 +266,10 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Video Content with Face\n", - "This is a gated feature, please go through process [Azure AI Resource Face Gating](https://learn.microsoft.com/en-us/legal/cognitive-services/computer-vision/limited-access-identity?context=%2Fazure%2Fai-services%2Fcomputer-vision%2Fcontext%2Fcontext#registration-process) Select `[Video Indexer] Facial Identification (1:N or 1:1 matching) to search for a face in a media or entertainment video archive to find a face within a video and generate metadata for media or entertainment use cases only` in the registration form." + "## Video Content with Face Recognition\n", + "This is a gated feature. To enable it, please follow the registration process outlined in [Azure AI Resource Face Gating](https://learn.microsoft.com/en-us/legal/cognitive-services/computer-vision/limited-access-identity?context=%2Fazure%2Fai-services%2Fcomputer-vision%2Fcontext%2Fcontext#registration-process).\n", + "In the registration form, select:\n", + "`[Video Indexer] Facial Identification (1:N or 1:1 matching)` to search for faces within media or entertainment video archives and generate metadata for these use cases." 
] }, { @@ -280,7 +281,7 @@ "ANALYZER_SAMPLE_FILE = '../data/FlightSimulator.mp4'\n", "ANALYZER_ID = 'prebuilt-videoAnalyzer'\n", "\n", - "# Analyzer file\n", + "# Analyze video file with face recognition\n", "response = client.begin_analyze(ANALYZER_ID, file_location=ANALYZER_SAMPLE_FILE)\n", "result_json = client.poll_result(response)\n", "\n", @@ -291,7 +292,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Get and Save Key Frames and Face Thumbnails" + "### Retrieve and Save Key Frames and Face Thumbnails" ] }, { @@ -300,17 +301,16 @@ "metadata": {}, "outputs": [], "source": [ - "# Initialize sets for unique face IDs and keyframe IDs\n", + "# Initialize sets to store unique face IDs and keyframe IDs\n", "face_ids = set()\n", "keyframe_ids = set()\n", "\n", - "# Extract unique face IDs safely\n", + "# Safely extract face IDs and keyframe IDs from content\n", "result_data = result_json.get(\"result\", {})\n", "contents = result_data.get(\"contents\", [])\n", "\n", - "# Iterate over contents to find faces and keyframes if available\n", "for content in contents:\n", - " # Safely retrieve face IDs if \"faces\" exists and is a list\n", + " # Extract face IDs if \"faces\" field exists and is a list\n", " faces = content.get(\"faces\", [])\n", " if isinstance(faces, list):\n", " for face in faces:\n", @@ -318,12 +318,12 @@ " if face_id:\n", " face_ids.add(f\"face.{face_id}\")\n", "\n", - " # Extract keyframe IDs from \"markdown\" if it exists and is a string\n", + " # Extract keyframe IDs from \"markdown\" if present and a string\n", " markdown_content = content.get(\"markdown\", \"\")\n", " if isinstance(markdown_content, str):\n", " keyframe_ids.update(re.findall(r\"(keyFrame\\.\\d+)\\.jpg\", markdown_content))\n", "\n", - "# Output the results\n", + "# Display unique face and keyframe IDs\n", "print(\"Unique Face IDs:\", face_ids)\n", "print(\"Unique Keyframe IDs:\", keyframe_ids)\n", "\n", @@ -358,4 +358,4 @@ }, "nbformat": 4, "nbformat_minor": 2 -} +} \ No newline at end of file
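The client-construction cell in the diff above tells readers to switch between token-based authentication and a subscription key. As a companion, here is a minimal sketch of the subscription-key variant. It reuses the constructor parameters visible in the notebook (`api_version`, `subscription_key`, `x_ms_useragent`) and the same `.env` variables; the `endpoint` keyword and the `sys.path` import of the client module are assumptions based on the repository layout referenced by the notebook, not code shown in this diff.

```python
import os
import sys

from dotenv import load_dotenv, find_dotenv

# Assumption: the utility client lives in ../python, as referenced by the notebook's markdown.
sys.path.append("../python")
from content_understanding_client import AzureContentUnderstandingClient

load_dotenv(find_dotenv())

# Subscription-key variant of the client setup shown in the diff.
# Only one authentication method is required, so token_provider is omitted here.
client = AzureContentUnderstandingClient(
    endpoint=os.getenv("AZURE_AI_ENDPOINT"),
    api_version=os.getenv("AZURE_AI_API_VERSION", "2025-05-01-preview"),
    subscription_key=os.getenv("AZURE_AI_API_KEY"),
    x_ms_useragent="azure-ai-content-understanding-python/content_extraction",  # telemetry header; remove to opt out
)
```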
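The document section notes that layout details are exposed under the `pages` node (with `words` and `lines`), under `paragraphs`, and under `tables`. Below is a minimal sketch of walking those fields after the document cell's `result_json` has been populated. The top-level nesting follows the notebook's own `result_json["result"]["contents"][0]` pattern; the per-item field names (`pageNumber`, `rowCount`, `columnCount`, `cells`, `rowIndex`, `columnIndex`, `content`) are assumptions to verify against the actual payload.

```python
content = result_json["result"]["contents"][0]

# Page-level layout: words and lines per page (per the notebook's description of the "pages" node).
for page in content.get("pages", []):
    print(f"Page {page.get('pageNumber')}: "
          f"{len(page.get('words', []))} words, {len(page.get('lines', []))} lines")

# Paragraph details.
for paragraph in content.get("paragraphs", []):
    print(paragraph.get("content", ""))

# Table structure and cell contents (cell field names are assumptions).
for table in content.get("tables", []):
    print(f"Table: {table.get('rowCount')} rows x {table.get('columnCount')} columns")
    for cell in table.get("cells", []):
        print(f"  [{cell.get('rowIndex')}, {cell.get('columnIndex')}] {cell.get('content', '')}")
```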
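The audio section describes per-phrase speaker attribution, start/end times in milliseconds, text, a confidence score, a word-level breakdown, and a locale. The sketch below shows how those fields could be read from the audio cell's `result_json`; the container name `transcriptPhrases` and the exact field names are assumptions inferred from that description rather than from code in this diff, so adjust them to match the real response.

```python
content = result_json["result"]["contents"][0]

# Iterate transcript phrases (field names assumed from the notebook's description of the audio output).
for phrase in content.get("transcriptPhrases", []):
    speaker = phrase.get("speaker", "Unknown speaker")
    start_ms = phrase.get("startTimeMs")
    end_ms = phrase.get("endTimeMs")
    confidence = phrase.get("confidence")
    text = phrase.get("text", "")
    print(f"[{start_ms}-{end_ms} ms] {speaker} (confidence {confidence}): {text}")

    # Word-level breakdown with per-word timing, useful for subtitle alignment.
    for word in phrase.get("words", []):
        print(f"    {word.get('startTimeMs')}-{word.get('endTimeMs')} ms: {word.get('content', '')}")
```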