Merge pull request #50490 from sherzyang/main

denrea · web-flow · commit 4b9e968503c9 · 2025-05-16T12:36:31.000-07:00
Add new module with acrolinx fixes.
diff --git a/learn-pr/wwl-data-ai/introduction-information-extraction/1-introduction.yml b/learn-pr/wwl-data-ai/introduction-information-extraction/1-introduction.yml
@@ -0,0 +1,15 @@
+### YamlMime:ModuleUnit
+uid: learn.wwl.introduction-information-extraction.introduction
+title: Introduction
+metadata:
+  title: Introduction
+  description: "Introduction"
+  ms.date: 5/9/2025
+  author: wwlpublish
+  ms.author: sheryang
+  ms.topic: unit
+  ms.custom:
+  - N/A
+durationInMinutes: 1
+content: |
+  [!include[](includes/1-introduction.md)]
diff --git a/learn-pr/wwl-data-ai/introduction-information-extraction/2-overview.yml b/learn-pr/wwl-data-ai/introduction-information-extraction/2-overview.yml
@@ -0,0 +1,15 @@
+### YamlMime:ModuleUnit
+uid: learn.wwl.introduction-information-extraction.overview
+title: Overview
+metadata:
+  title: Overview
+  description: "Overview"
+  ms.date: 5/9/2025
+  author: wwlpublish
+  ms.author: sheryang
+  ms.topic: unit
+  ms.custom:
+  - N/A
+durationInMinutes: 2
+content: |
+  [!include[](includes/2-overview.md)]
diff --git a/learn-pr/wwl-data-ai/introduction-information-extraction/3-vision-extraction.yml b/learn-pr/wwl-data-ai/introduction-information-extraction/3-vision-extraction.yml
@@ -0,0 +1,15 @@
+### YamlMime:ModuleUnit
+uid: learn.wwl.introduction-information-extraction.vision-extraction
+title: Understand the extraction of data from images  
+metadata:
+  title: Understand the extraction of data from images    
+  description: "Understand how machine learning enables the extraction of data from images."
+  ms.date: 5/9/2025
+  author: wwlpublish
+  ms.author: sheryang
+  ms.topic: unit
+  ms.custom:
+  - N/A
+durationInMinutes: 3
+content: |
+  [!include[](includes/3-vision-extraction.md)]
diff --git a/learn-pr/wwl-data-ai/introduction-information-extraction/4-form-extraction.yml b/learn-pr/wwl-data-ai/introduction-information-extraction/4-form-extraction.yml
@@ -0,0 +1,15 @@
+### YamlMime:ModuleUnit
+uid: learn.wwl.introduction-information-extraction.form-extraction
+title: Understand the extraction of data from forms
+metadata:
+  title: Understand the extraction of data from forms
+  description: "Understand how machine learning enables data extraction from forms."
+  ms.date: 5/9/2025
+  author: wwlpublish
+  ms.author: sheryang
+  ms.topic: unit
+  ms.custom:
+  - N/A
+durationInMinutes: 3
+content: |
+  [!include[](includes/4-form-extraction.md)]
diff --git a/learn-pr/wwl-data-ai/introduction-information-extraction/5-multimodal-extraction.yml b/learn-pr/wwl-data-ai/introduction-information-extraction/5-multimodal-extraction.yml
@@ -0,0 +1,15 @@
+### YamlMime:ModuleUnit
+uid: learn.wwl.introduction-information-extraction.multimodal-extraction
+title: Understand multimodal data extraction 
+metadata:
+  title: Understand multimodal data extraction
+  description: "Understand different techniques that enable multimodal data extraction."
+  ms.date: 5/9/2025
+  author: wwlpublish
+  ms.author: sheryang
+  ms.topic: unit
+  ms.custom:
+  - N/A
+durationInMinutes: 3
+content: |
+  [!include[](includes/5-multimodal-extraction.md)]
diff --git a/learn-pr/wwl-data-ai/introduction-information-extraction/6-knowledge-mining.yml b/learn-pr/wwl-data-ai/introduction-information-extraction/6-knowledge-mining.yml
@@ -0,0 +1,15 @@
+### YamlMime:ModuleUnit
+uid: learn.wwl.introduction-information-extraction.knowledge-mining
+title: Understand data extraction for knowledge mining 
+metadata:
+  title: Understand data extraction for knowledge mining 
+  description: "Understand data extraction for knowledge mining."
+  ms.date: 5/9/2025
+  author: wwlpublish
+  ms.author: sheryang
+  ms.topic: unit
+  ms.custom:
+  - N/A
+durationInMinutes: 3
+content: |
+  [!include[](includes/6-knowledge-mining.md)]
diff --git a/learn-pr/wwl-data-ai/introduction-information-extraction/7-knowledge-check.yml b/learn-pr/wwl-data-ai/introduction-information-extraction/7-knowledge-check.yml
@@ -0,0 +1,49 @@
+### YamlMime:ModuleUnit
+uid: learn.wwl.introduction-information-extraction.knowledge-check
+title: Module assessment
+metadata:
+  title: Module assessment
+  description: "Knowledge check"
+  ms.date: 5/9/2025
+  author: wwlpublish
+  ms.author: sheryang
+  ms.topic: unit
+  ms.custom:
+  - N/A
+durationInMinutes: 3
+quiz:
+  title: "Check your knowledge"
+  questions:
+  - content: "What is the primary role of machine learning in information extraction?"
+    choices:
+    - content: "To store extracted data in a database."
+      isCorrect: false
+      explanation: ""
+    - content: "To convert structured data into unstructured formats."
+      isCorrect: false
+      explanation: ""
+    - content: "To transform content into numerical data and predict fields and values."
+      isCorrect: true
+      explanation: ""
+  - content: "Which of the following best describes a “field” in the context of data extraction?"
+    choices:
+    - content: "A visual marker used to highlight important text"
+      isCorrect: false
+      explanation: ""
+    - content: "A key that identifies the type of data being extracted"
+      isCorrect: true
+      explanation: ""
+    - content: "A storage location for raw content"
+      isCorrect: false
+      explanation: ""
+  - content: "How does generative AI enhance the data extraction process?"
+    choices:
+    - content: "By allowing users to define custom fields and generate values from unstructured content"
+      isCorrect: true
+      explanation: ""
+    - content: "By converting JSON data into images"
+      isCorrect: false
+      explanation: ""
+    - content: "By generating new documents from scratch"
+      isCorrect: false
+      explanation: ""
diff --git a/learn-pr/wwl-data-ai/introduction-information-extraction/8-summary.yml b/learn-pr/wwl-data-ai/introduction-information-extraction/8-summary.yml
@@ -0,0 +1,15 @@
+### YamlMime:ModuleUnit
+uid: learn.wwl.introduction-information-extraction.summary
+title: Summary
+metadata:
+  title: Summary
+  description: "Summary"
+  ms.date: 5/9/2025
+  author: wwlpublish
+  ms.author: sheryang
+  ms.topic: unit
+  ms.custom:
+  - N/A
+durationInMinutes: 1
+content: |
+  [!include[](includes/8-summary.md)]
diff --git a/learn-pr/wwl-data-ai/introduction-information-extraction/includes/1-introduction.md b/learn-pr/wwl-data-ai/introduction-information-extraction/includes/1-introduction.md
@@ -0,0 +1,9 @@
+Today's organizations deal with all kinds of content such as documents, video, audio, images, and text. A common task in these organizations includes identifying and storing key information from the content into databases.
+
+Consider some of these use cases: 
+- A manufacturer has images of each of its products. The images need to be analyzed for defects and anomalies.  
+- A business works with a high volume of invoices, contracts, and reports with charts. Key data and summaries from the documents need to be extracted and logged. 
+- Many hours of customer calls are recorded for quality purposes. The audio needs to be transcribed, summarized, and analyzed for sentiment.  
+- A streaming catalog contains a large volume of video. Important moments in each video need to be tagged with metadata based on their content.
+
+Manually processing such content can be slow and potentially error-prone. **AI-powered information extraction** encompasses capabilities that extract meaning from content. In this module, you explore core concepts related to information extraction.
diff --git a/learn-pr/wwl-data-ai/introduction-information-extraction/includes/2-overview.md b/learn-pr/wwl-data-ai/introduction-information-extraction/includes/2-overview.md
@@ -0,0 +1,21 @@
+**AI-powered information extraction** and analysis enables organizations to gain actionable insights from data that might otherwise be locked up in documents, images, audio files, or other assets. Insights can come from structured and unstructured content. **Structured content** is information stored in a consistent format. Some examples include invoices, tax forms, and tables. **Unstructured content** is information that isn't in a predefined format. Some examples include emails, audio recordings, images, and videos. 
+
+## Information extraction processes 
+
+In general, information extraction processes follow these steps:
+
+|**Step** | **Description** |
+|-|-|
+| **Source Identification** | Determine where the information resides and if it needs to be digitized.|
+| **Extraction** | Leverages many techniques based on machine learning to understand and extract data from digitized content. |
+| **Transformation & Structuring** | Extracted data is transformed into structured formats like JSON or tables.|
+| **Storage & Integration**| The processed data is then stored in databases, data lakes, or analytics platforms for further use.|
+
+Both the type of content and type of insights needed from that content inform which techniques are necessary for information extraction. In this module, we look at the extraction of information with AI:
+
+- From images 
+- From forms
+- From multiple modalities 
+- For knowledge mining 
+
+In many ways, the techniques used for images, forms, multiple modalities, and knowledge mining build upon each other.
diff --git a/learn-pr/wwl-data-ai/introduction-information-extraction/includes/3-vision-extraction.md b/learn-pr/wwl-data-ai/introduction-information-extraction/includes/3-vision-extraction.md
@@ -0,0 +1,12 @@
+AI-powered information extraction replaces the need to manually inspect each piece of content for insights. Computer vision can extract insights from images to describe the people, places, things, and words they depict. 
+
+Computer vision is made possible by machine learning models that are trained to recognize features based on large volumes of existing images. Machine learning models process images by transforming the images into numerical information. At their core, vision models perform calculations on the numerical information, which result in predictions of what's in the images.
+
+![Screenshot of an envelope with the address handwritten. The address is digitized next to the image, showing an example of OCR.](../media/sample-mail.jpg) 
+
+**Optical Character Recognition (OCR)** helps computers recognize that an element in an image contains text. OCR is the foundation of processing text in images and uses machine learning models that are trained to recognize individual shapes as letters, numerals, punctuation, or other elements of text. Much of the early work on implementing this kind of capability was performed by postal services to support automatic sorting of mail based on postal codes. Since then, the state-of-the-art for reading text has moved on, and we have models that detect printed or handwritten text in an image and digitize it line-by-line and word-by-word.
+
+> [!NOTE]
+> The machine learning concepts associated with vision are covered in-depth in [Introduction to computer vision concepts](/training/modules/analyze-images-computer-vision/).  
+
+Next, let's see how data is extracted from forms with techniques that build upon OCR. 
diff --git a/learn-pr/wwl-data-ai/introduction-information-extraction/includes/4-form-extraction.md b/learn-pr/wwl-data-ai/introduction-information-extraction/includes/4-form-extraction.md
@@ -0,0 +1,24 @@
+Forms and other documents have text data with *semantic meaning*. Semantic meaning refers to the intended meaning or interpretation of words, phrases, or symbols in a given context. Semantic meaning goes beyond just the literal definition of a word (syntax) and focuses on what the word or sentence actually conveys.
+
+*Document intelligence* describes AI capabilities that process text and attach semantic meaning to the extracted text. As an extension of optical character recognition (OCR), document intelligence automates the process of extracting and understanding information. 
+
+Consider an organization that needs to process large numbers of receipts for expenses claims, project costs, and other accounting purposes. Using document intelligence, the company can take a scanned image of a receipt, digitize the text with OCR, and extract semantic meaning. The semantic meaning of data in forms can be described in field-value pairs. 
+
+- The **field name** is the key or type of data entry.   
+- The **field description** is the definition of what the field name represents.  
+- The **value** corresponds with the field name and is the data specific to the content.  
+
+For example, in an invoice, the fields recognized may include:
+
+- Name, address, and telephone number of the merchant
+- Date and time of the purchase
+- Name, quantity, and price of each item purchased
+- Total, subtotals, and tax values
+
+The data in forms is recognized with *bounding boxes*.
+
+![A screenshot of a scanned receipt for the purchase of a Surface Pro and a Surface Pen.](../media/contoso-receipt.png)
+
+For example, the address information on the receipt is saved as a `field name`, `address` and a `value`, `123 Main Street` with coordinates [4.1, 2.2], [4.3, 2.2], [4.3, 2.4], [4.1, 2.4]. Machine learning models can interpret the data in a document or form because they're trained to recognize patterns in bounding box coordinate locations.
+
+The results of data extraction are associated with **confidence levels** for each field and data pair. This *confidence level* is a value between 0 and 1, indicating the likely level of accuracy. Data extracted with a high confidence score (closer to 1) could be relied on more confidently to actually represent what is in the original content.
diff --git a/learn-pr/wwl-data-ai/introduction-information-extraction/includes/5-multimodal-extraction.md b/learn-pr/wwl-data-ai/introduction-information-extraction/includes/5-multimodal-extraction.md
@@ -0,0 +1,22 @@
+AI-powered information extraction techniques can be combined to perform data extraction on multiple modalities of content, from documents to video and audio. Using multimodal data extraction can help with digital asset management, workflow automation, generating further insights, and more. 
+
+The orchestration of extraction techniques can include vision and document intelligence, and others including:
+
+- **Natural language processing** can be used to find key phrases, entities, sentiment, etc. in written or spoken language.
+
+> [!NOTE] 
+> The machine learning concepts associated with NLP are covered in-depth in [Introduction to natural language processing concepts](/training/modules/analyze-text-with-text-analytics-service).
+
+- **Speech recognition** takes the spoken word and converts it into data that can be processed - often by transcribing it into text. The spoken words can be in the form of a recorded voice in an audio file, or live audio from a microphone. 
+
+> [!NOTE] 
+> Speech recognition is covered in [Get started with speech on Azure](/training/modules/recognize-synthesize-speech).
+
+- **Generative AI** can add to the data extraction process by allowing users to identify their own fields and field descriptions. It can be particularly useful when dealing with unstructured content. One example is the user-added *field* of "summary". The *value* associated with the field must be generated based on the data in the content.
+
+>[!NOTE]
+> Generative AI concepts are covered in-depth in [Introduction to generative AI on Azure](/training/modules/fundamentals-azure-ai-services/).
+
+The content processing pipeline for multimodal information extraction can include layers of these extraction techniques. One example of the pipeline's output is structured insights and additional generated content.
+
+![Screenshot of the possible components of multimodal information extraction.](../media/component-overview.png)
diff --git a/learn-pr/wwl-data-ai/introduction-information-extraction/includes/6-knowledge-mining.md b/learn-pr/wwl-data-ai/introduction-information-extraction/includes/6-knowledge-mining.md
@@ -0,0 +1,21 @@
+Knowledge mining solutions provide automated information extraction from large volumes of often unstructured data. A foundational knowledge mining solution is search, the process of retrieving relevant information from a large dataset in response to a user query. AI-powered information extraction supports improvements in what is searchable in a search index.
+
+In AI-powered information extraction for search, content first moves through **Document cracking**. Document cracking describes opening document formats like PDFs to extract the contents as ASCII text for analysis and indexing.
+
+The contents then move through **AI enrichment**, which implements AI on your original content to extract more information. Examples of AI enrichment include adding captions to a photo and evaluating text sentiment. AI enriched content can be sent to a **knowledge store**, which persists output from an AI enrichment pipeline for independent analysis or downstream processing.  
+
+The resulting data is serialized as JSON data. The JSON populates the *search index*. The populated **search index**  can be explored through queries. When users make a search query such as "coffee", the search engine looks for that information in the search index. A search index has a structure similar to a table, known as the index *schema*. A typical search index schema contains *fields*, the field's data type (such as string), and *field attributes*. The fields store searchable text, and the field attributes allow for actions such as filtering and sorting. Below is an example of a search index schema: 
+
+![A screenshot of the structure of an index schema in json including key phrases and image tags.](../media/json-index-example.png)
+
+The result is a search solution that typically includes the following components:
+
+|**Component** | **Function**|
+|-|-|
+|API Layer | Accepts user queries and routes them to the search engine.|
+|Query Processor| Parses and interprets the query.|
+|Search Strategies| Determines how to search—e.g., keyword, semantic, vector, or hybrid.|
+|Execution Engine| Executes the query across the search index. AI-powered information extraction adds to the data that is searchable.|
+|Result Aggregator| Combines results from multiple sources into a unified list.|
+|Ranking Engine| Sorts results based on relevance, freshness, popularity, or AI signals.|
+|Response Formatter| Formats the results for display in the user interface.|
diff --git a/learn-pr/wwl-data-ai/introduction-information-extraction/includes/8-summary.md b/learn-pr/wwl-data-ai/introduction-information-extraction/includes/8-summary.md
@@ -0,0 +1,6 @@
+
+In this module, you learned how modern organizations can process content using AI-powered information extraction. Extracting key information from this content and storing it in structured formats (like databases) is essential but often time-consuming and error-prone. AI-powered information extraction uses machine learning to automate the identification and structuring of data. Generative AI further allows users to define custom fields (like “summary”) and generate values from unstructured content such as emails or videos. All of this extracted data can be used to improve the searchable content in a search index.
+
+## Learn more
+
+Apply these concepts in [Get started with AI-powered information extraction in Azure](/training/modules/ai-information-extraction/).
diff --git a/learn-pr/wwl-data-ai/introduction-information-extraction/index.yml b/learn-pr/wwl-data-ai/introduction-information-extraction/index.yml
@@ -0,0 +1,37 @@
+### YamlMime:Module
+uid: learn.wwl.introduction-information-extraction
+metadata:
+  title: Introduction to AI-powered information extraction concepts
+  description: "Introduction to AI-powered information extraction concepts"
+  ms.date: 5/9/2025
+  author: wwlpublish
+  ms.author: sheryang
+  ms.topic: module
+  ai-usage: ai-assisted
+  ms.service: azure
+title: Introduction to AI-powered information extraction concepts
+summary: Introduction to AI-powered information extraction
+abstract: Learn concepts of information extraction and search. 
+prerequisites: Ability to navigate the Azure portal
+iconUrl: /training/achievements/generic-badge.svg
+levels:
+- beginner
+roles:
+- ai-engineer
+- data-scientist
+- developer
+- solution-architect
+- student
+products:
+- ai-services
+units:
+- learn.wwl.introduction-information-extraction.introduction
+- learn.wwl.introduction-information-extraction.overview
+- learn.wwl.introduction-information-extraction.vision-extraction
+- learn.wwl.introduction-information-extraction.form-extraction
+- learn.wwl.introduction-information-extraction.multimodal-extraction
+- learn.wwl.introduction-information-extraction.knowledge-mining
+- learn.wwl.introduction-information-extraction.knowledge-check
+- learn.wwl.introduction-information-extraction.summary
+badge:
+  uid: learn.wwl.introduction-information-extraction.badge
diff --git a/learn-pr/wwl-data-ai/introduction-information-extraction/media/ai-portal-content-understanding.png b/learn-pr/wwl-data-ai/introduction-information-extraction/media/ai-portal-content-understanding.png
diff --git a/learn-pr/wwl-data-ai/introduction-information-extraction/media/component-overview.png b/learn-pr/wwl-data-ai/introduction-information-extraction/media/component-overview.png
diff --git a/learn-pr/wwl-data-ai/introduction-information-extraction/media/contoso-receipt.png b/learn-pr/wwl-data-ai/introduction-information-extraction/media/contoso-receipt.png
diff --git a/learn-pr/wwl-data-ai/introduction-information-extraction/media/data-indexing-process-1.png b/learn-pr/wwl-data-ai/introduction-information-extraction/media/data-indexing-process-1.png
diff --git a/learn-pr/wwl-data-ai/introduction-information-extraction/media/json-index-example.png b/learn-pr/wwl-data-ai/introduction-information-extraction/media/json-index-example.png
diff --git a/learn-pr/wwl-data-ai/introduction-information-extraction/media/launch-exercise.png b/learn-pr/wwl-data-ai/introduction-information-extraction/media/launch-exercise.png
diff --git a/learn-pr/wwl-data-ai/introduction-information-extraction/media/sample-mail.jpg b/learn-pr/wwl-data-ai/introduction-information-extraction/media/sample-mail.jpg