Add include category feature

nb-a153 · nb-a153 · commit dc44d6989f13 · 2024-10-05T18:32:56.000+09:00
- Modified approach.py to include category logic
- Updated models.ts with category types
- Added translations for the include-category label, help text, and "All" option in en, es, fr, and ja locales
- Updated Ask.tsx and Chat.tsx to handle category
- Updated data ingestion documentation
diff --git a/app/backend/approaches/approach.py b/app/backend/approaches/approach.py
@@ -123,9 +123,12 @@ def __init__(
         self.vision_token_provider = vision_token_provider
 
     def build_filter(self, overrides: dict[str, Any], auth_claims: dict[str, Any]) -> Optional[str]:
+        include_category = overrides.get("include_category")
         exclude_category = overrides.get("exclude_category")
         security_filter = self.auth_helper.build_security_filters(overrides, auth_claims)
         filters = []
+        if include_category:
+            filters.append("category eq '{}'".format(include_category.replace("'", "''")))
         if exclude_category:
             filters.append("category ne '{}'".format(exclude_category.replace("'", "''")))
         if security_filter:
diff --git a/app/frontend/src/api/models.ts b/app/frontend/src/api/models.ts
@@ -20,6 +20,7 @@ export type ChatAppRequestOverrides = {
     retrieval_mode?: RetrievalMode;
     semantic_ranker?: boolean;
     semantic_captions?: boolean;
+    include_category?: string;
     exclude_category?: string;
     seed?: number;
     top?: number;
diff --git a/app/frontend/src/locales/en/translation.json b/app/frontend/src/locales/en/translation.json
@@ -81,6 +81,10 @@
         "minimumSearchScore": "Minimum search score",
         "minimumRerankerScore": "Minimum reranker score",
         "retrieveCount": "Retrieve this many search results:",
+        "includeCategory": "Include category",
+        "includeCategoryOptions": {
+            "all": "All"
+        },
         "excludeCategory": "Exclude category",
         "useSemanticRanker": "Use semantic ranker for retrieval",
         "useSemanticCaptions": "Use semantic captions",
@@ -127,6 +131,8 @@
              "Sets a minimum score for search results coming back from the semantic reranker. The score always ranges between 0-4. The higher the score, the more semantically relevant the result is to the question.",
         "retrieveNumber":
              "Sets the number of search results to retrieve from Azure AI search. More results may increase the likelihood of finding the correct answer, but may lead to the model getting 'lost in the middle'.",
+        "includeCategory":
+            "Specifies a category to include in the search results. There are no categories used in the default data set.",  
          "excludeCategory":
             "Specifies a category to exclude from the search results. There are no categories used in the default data set.",
          "useSemanticReranker":
diff --git a/app/frontend/src/locales/es/translation.json b/app/frontend/src/locales/es/translation.json
@@ -81,6 +81,10 @@
         "minimumSearchScore": "Puntaje mínimo de búsqueda",
         "minimumRerankerScore": "Puntaje mínimo de re-clasificación",
         "retrieveCount": "Obtén éste número resultados de búsqueda:",
+        "includeCategory": "Incluir categoría",
+        "includeCategoryOptions": {
+            "all": "Todos"
+        },
         "excludeCategory": "Excluir categoría",
         "useSemanticRanker": "Usar clasificador semántico para la recuperación",
         "useSemanticCaptions": "Usar subtítulos semánticos",
@@ -128,6 +132,8 @@
             "Establece una puntuación mínima para los resultados de búsqueda que vuelven del re-clasificador semántico. La puntuación siempre varía entre 0-4. Cuanto mayor es la puntuación, más relevante es semánticamente el resultado a la pregunta.",
         "retrieveNumber":
             "Establece el número de resultados de búsqueda para recuperar de Azure AI search. Más resultados pueden aumentar la probabilidad de encontrar la respuesta correcta, pero pueden provocar que el modelo se 'pierda en el medio'.",
+        "includeCategory":
+            "Especifica una categoría para incluir en los resultados de búsqueda. No se utilizan categorías en el conjunto de datos predeterminado.", 
         "excludeCategory":
             "Especifica una categoría para excluir de los resultados de búsqueda. No se utilizan categorías en el conjunto de datos predeterminado.",
         "useSemanticReranker":
diff --git a/app/frontend/src/locales/fr/translation.json b/app/frontend/src/locales/fr/translation.json
@@ -81,6 +81,10 @@
         "minimumSearchScore": "Score de recherche minimum",
         "minimumRerankerScore": "Score minimum du reclasseur sémantique",
         "retrieveCount": "Récupérer ce nombre de résultats de recherche :",
+        "includeCategory": "Inclure la catégorie",
+        "includeCategoryOptions": {
+            "all": "Tous"
+        },
         "excludeCategory": "Exclure la catégorie",
         "useSemanticRanker": "Utiliser le reclasseur sémantique",
         "useSemanticCaptions": "Utiliser les titres sémantiques",
@@ -128,6 +132,8 @@
             "Définit un score minimum pour les résultats de recherche provenant du reranker sémantique. Le score varie toujours entre 0 et 4. Plus le score est élevé, plus le résultat est sémantiquement pertinent par rapport à la question.",
         "retrieveNumber":
             "Définit le nombre de résultats de recherche à récupérer d'Azure AI Search. Plus de résultats peuvent augmenter la probabilité de trouver la bonne réponse, mais peuvent amener le modèle à se 'perdre au milieu'.",
+        "includeCategory":
+            "Spécifie une catégorie à inclure dans les résultats de recherche. Il n'y a aucune catégorie utilisée dans l'ensemble de données par défaut.", 
         "excludeCategory":
             "Spécifie une catégorie à exclure des résultats de recherche. Il n'y a aucune catégorie utilisée dans l'ensemble de données par défaut.",
         "useSemanticReranker":
diff --git a/app/frontend/src/locales/ja/translation.json b/app/frontend/src/locales/ja/translation.json
@@ -81,6 +81,10 @@
         "minimumSearchScore": "最小検索スコア",
         "minimumRerankerScore": "最小リランキング・スコア",
         "retrieveCount": "ここで指定する検索結果数を取得：",
+        "includeCategory": "カテゴリを指定",
+        "includeCategoryOptions": {
+            "all": "全て"
+        },
         "excludeCategory": "カテゴリを除外",
         "useSemanticRanker": "取得にセマンティック・ランカーを使用",
         "useSemanticCaptions": "セマンティック・キャプションを使用",
@@ -127,6 +131,7 @@
             "セマンティック・リランカーから返される検索結果の最小スコアを設定します。スコアの値は0から4の範囲で変更できます。スコアの値が大きいほど、質問に対する結果の意味的な関連性が高まります。",
         "retrieveNumber":
             "Azure AI Searchの検索結果から取得する数を設定します。結果が多ければ多いほど、正しい答えを見つける可能性は高まるかもしれませんが、モデルが「途中で迷子になる」可能性もあります。",
+        "includeCategory": "検索結果に含めるカテゴリを指定します。デフォルトのデータセットはカテゴリを使用していません。",
         "excludeCategory": "検索結果から除外するカテゴリを指定します。デフォルトのデータセットはカテゴリを使用していません。",
         "useSemanticReranker":
             "Azure AI Searchのセマンティック・ランカーを有効にします(ユーザーのクエリに対するセマンティック類似性に基づいて検索結果をリランク付けするモデル)。",
diff --git a/app/frontend/src/pages/ask/Ask.tsx b/app/frontend/src/pages/ask/Ask.tsx
@@ -1,7 +1,18 @@
 import { useContext, useEffect, useRef, useState } from "react";
 import { useTranslation } from "react-i18next";
 import { Helmet } from "react-helmet-async";
-import { Checkbox, Panel, DefaultButton, Spinner, TextField, ICheckboxProps, ITextFieldProps } from "@fluentui/react";
+import {
+    Checkbox,
+    Panel,
+    DefaultButton,
+    Spinner,
+    TextField,
+    ICheckboxProps,
+    ITextFieldProps,
+    Dropdown,
+    IDropdownOption,
+    IDropdownProps
+} from "@fluentui/react";
 import { useId } from "@fluentui/react-hooks";
 
 import styles from "./Ask.module.css";
@@ -38,6 +49,7 @@ export function Component(): JSX.Element {
     const [useSemanticCaptions, setUseSemanticCaptions] = useState<boolean>(false);
     const [useGPT4V, setUseGPT4V] = useState<boolean>(false);
     const [gpt4vInput, setGPT4VInput] = useState<GPT4VInput>(GPT4VInput.TextAndImages);
+    const [includeCategory, setIncludeCategory] = useState<string>("");
     const [excludeCategory, setExcludeCategory] = useState<string>("");
     const [question, setQuestion] = useState<string>("");
     const [vectorFieldList, setVectorFieldList] = useState<VectorFieldOptions[]>([VectorFieldOptions.Embedding, VectorFieldOptions.ImageEmbedding]);
@@ -120,6 +132,7 @@ export function Component(): JSX.Element {
                         prompt_template: promptTemplate.length === 0 ? undefined : promptTemplate,
                         prompt_template_prefix: promptTemplatePrefix.length === 0 ? undefined : promptTemplatePrefix,
                         prompt_template_suffix: promptTemplateSuffix.length === 0 ? undefined : promptTemplateSuffix,
+                        include_category: includeCategory.length === 0 ? undefined : includeCategory,
                         exclude_category: excludeCategory.length === 0 ? undefined : excludeCategory,
                         top: retrieveCount,
                         temperature: temperature,
@@ -181,6 +194,10 @@ export function Component(): JSX.Element {
         setUseSemanticCaptions(!!checked);
     };
 
+    const onIncludeCategoryChanged = (_ev?: React.FormEvent<HTMLElement | HTMLInputElement>, option?: IDropdownOption) => {
+        setIncludeCategory((option?.key as string) || "");
+    };
+
     const onExcludeCategoryChanged = (_ev?: React.FormEvent, newValue?: string) => {
         setExcludeCategory(newValue || "");
     };
@@ -228,6 +245,8 @@ export function Component(): JSX.Element {
     const rerankerScoreFieldId = useId("rerankerScoreField");
     const retrieveCountId = useId("retrieveCount");
     const retrieveCountFieldId = useId("retrieveCountField");
+    const includeCategoryId = useId("includeCategory");
+    const includeCategoryFieldId = useId("includeCategoryField");
     const excludeCategoryId = useId("excludeCategory");
     const excludeCategoryFieldId = useId("excludeCategoryField");
     const semanticRankerId = useId("semanticRanker");
@@ -407,6 +426,26 @@ export function Component(): JSX.Element {
                     )}
                 />
 
+                <Dropdown
+                        id={includeCategoryFieldId}
+                        className={styles.chatSettingsSeparator}
+                        label={t("labels.includeCategory")}
+                        selectedKey={includeCategory}
+                        onChange={onIncludeCategoryChanged}
+                        aria-labelledby={includeCategoryId}
+                        options={[
+                            { key: '', text: t("labels.includeCategoryOptions.all") }
+                        ]}
+                        onRenderLabel={(props: IDropdownProps | undefined) => (
+                            <HelpCallout
+                                labelId={includeCategoryId}
+                                fieldId={includeCategoryFieldId}
+                                helpText={t("helpTexts.includeCategory")}
+                                label={props?.label}
+                            />
+                        )}
+                />
+
                 <TextField
                     id={excludeCategoryFieldId}
                     className={styles.chatSettingsSeparator}
diff --git a/app/frontend/src/pages/chat/Chat.tsx b/app/frontend/src/pages/chat/Chat.tsx
@@ -1,7 +1,17 @@
 import { useRef, useState, useEffect, useContext } from "react";
 import { useTranslation } from "react-i18next";
 import { Helmet } from "react-helmet-async";
-import { Checkbox, Panel, DefaultButton, TextField, ITextFieldProps, ICheckboxProps } from "@fluentui/react";
+import {
+    Checkbox,
+    Panel,
+    DefaultButton,
+    TextField,
+    ITextFieldProps,
+    ICheckboxProps,
+    Dropdown,
+    IDropdownOption,
+    IDropdownProps
+} from "@fluentui/react";
 import { SparkleFilled } from "@fluentui/react-icons";
 import { useId } from "@fluentui/react-hooks";
 import readNDJSONStream from "ndjson-readablestream";
@@ -53,6 +63,7 @@ const Chat = () => {
     const [useSemanticRanker, setUseSemanticRanker] = useState<boolean>(true);
     const [shouldStream, setShouldStream] = useState<boolean>(true);
     const [useSemanticCaptions, setUseSemanticCaptions] = useState<boolean>(false);
+    const [includeCategory, setIncludeCategory] = useState<string>("");
     const [excludeCategory, setExcludeCategory] = useState<string>("");
     const [useSuggestFollowupQuestions, setUseSuggestFollowupQuestions] = useState<boolean>(false);
     const [vectorFieldList, setVectorFieldList] = useState<VectorFieldOptions[]>([VectorFieldOptions.Embedding]);
@@ -184,6 +195,7 @@ const Chat = () => {
                 context: {
                     overrides: {
                         prompt_template: promptTemplate.length === 0 ? undefined : promptTemplate,
+                        include_category: includeCategory.length === 0 ? undefined : includeCategory,
                         exclude_category: excludeCategory.length === 0 ? undefined : excludeCategory,
                         top: retrieveCount,
                         temperature: temperature,
@@ -291,6 +303,10 @@ const Chat = () => {
         setShouldStream(!!checked);
     };
 
+    const onIncludeCategoryChanged = (_ev?: React.FormEvent<HTMLElement | HTMLInputElement>, option?: IDropdownOption) => {
+        setIncludeCategory((option?.key as string) || "");
+    };
+
     const onExcludeCategoryChanged = (_ev?: React.FormEvent, newValue?: string) => {
         setExcludeCategory(newValue || "");
     };
@@ -345,6 +361,8 @@ const Chat = () => {
     const rerankerScoreFieldId = useId("rerankerScoreField");
     const retrieveCountId = useId("retrieveCount");
     const retrieveCountFieldId = useId("retrieveCountField");
+    const includeCategoryId = useId("includeCategory");
+    const includeCategoryFieldId = useId("includeCategoryField");
     const excludeCategoryId = useId("excludeCategory");
     const excludeCategoryFieldId = useId("excludeCategoryField");
     const semanticRankerId = useId("semanticRanker");
@@ -607,6 +625,30 @@ const Chat = () => {
                         )}
                     />
 
+                    <Dropdown
+                        id={includeCategoryFieldId}
+                        className={styles.chatSettingsSeparator}
+                        label={t("labels.includeCategory")}
+                        selectedKey={includeCategory}
+                        onChange={onIncludeCategoryChanged}
+                        aria-labelledby={includeCategoryId}
+                        options={[
+                            { key: '', text: t("labels.includeCategoryOptions.all") },
+                            // You can add a category key here for ingested data like below:
+                            // { key: 'categoryName', text: 'Meaningful Category Name' }
+                            // Alternatively, display the key to guide the user on what to type
+                            // in the "Exclude category" field (e.g., 'Meaningful Category Name(categoryName)').
+                        ]}
+                        onRenderLabel={(props: IDropdownProps | undefined) => (
+                            <HelpCallout
+                                labelId={includeCategoryId}
+                                fieldId={includeCategoryFieldId}
+                                helpText={t("helpTexts.includeCategory")}
+                                label={props?.label}
+                            />
+                        )}
+                    />
+
                     <TextField
                         id={excludeCategoryFieldId}
                         className={styles.chatSettingsSeparator}
diff --git a/docs/data_ingestion.md b/docs/data_ingestion.md
@@ -5,6 +5,7 @@ This guide provides more details for using the `prepdocs` script to index docume
 - [Supported document formats](#supported-document-formats)
 - [Overview of the manual indexing process](#overview-of-the-manual-indexing-process)
   - [Chunking](#chunking)
+  - [Categorizing data for enhanced search](#enhancing-search-functionality-with-data-categorization)
   - [Indexing additional documents](#indexing-additional-documents)
   - [Removing documents](#removing-documents)
 - [Overview of Integrated Vectorization](#overview-of-integrated-vectorization)
@@ -41,6 +42,12 @@ The script uses the following steps to index documents:
 3. Split the PDFs into chunks of text.
 4. Upload the chunks to Azure AI Search. If using vectors (the default), also compute the embeddings and upload those alongside the text.
 
+### Enhancing search functionality with data categorization
+
+To enhance search functionality, categorize data during the ingestion process with the `--category` argument, for example `scripts/prepdocs.ps1 --category ExampleCategoryName`. This argument specifies the category to which the data belongs, enabling you to filter search results based on these categories.
+
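+After running the script with the desired category, add that category to the `options` list of the "Include category" dropdown in the developer settings (defined in `app/frontend/src/pages/chat/Chat.tsx` and `app/frontend/src/pages/ask/Ask.tsx`), using the category name as the option `key`. The default option for this dropdown is "All", which applies no category filter. By including specific categories, you can refine your search results more effectively.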
+
 ### Chunking
 
 We're often asked why we need to break up the PDFs into chunks when Azure AI Search supports searching large documents.