Add document upload

jelledruyts · jelledruyts · commit 8ed1ebaca0e1 · 2023-09-02T15:38:24.000+02:00
diff --git a/azuredeploy-webapp.json b/azuredeploy-webapp.json
@@ -54,7 +54,9 @@
         "textEmbedderMinChunkSize": 10,
         "searchIndexNameBlobDocuments": "blob-documents",
         "searchIndexNameBlobChunks": "blob-chunks",
-        "searchIndexerScheduleMinutes": 5
+        "searchIndexerScheduleMinutes": 5,
+        "disableUploadDocuments": false,
+        "disableResetSearchConfiguration": false
     },
     "resources": [
         {
@@ -361,6 +363,14 @@
                         {
                             "name": "InitialDocumentUrls",
                             "value": "[parameters('initialDocumentUrls')]"
+                        },
+                        {
+                            "name": "DisableUploadDocuments",
+                            "value": "[variables('disableUploadDocuments')]"
+                        },
+                        {
+                            "name": "DisableResetSearchConfiguration",
+                            "value": "[variables('disableResetSearchConfiguration')]"
                         }
                     ]
                 }
@@ -448,6 +458,14 @@
             "type": "string",
             "value": "[parameters('initialDocumentUrls')]"
         },
+        "disableUploadDocuments": {
+            "type": "bool",
+            "value": "[variables('disableUploadDocuments')]"
+        },
+        "disableResetSearchConfiguration": {
+            "type": "bool",
+            "value": "[variables('disableResetSearchConfiguration')]"
+        },
         "webAppUrl": {
             "type": "string",
             "value": "[concat('https://', reference(resourceId('Microsoft.Web/sites', variables('webAppName')), '2022-09-01').defaultHostName)]"
diff --git a/src/Azure.AISearch.WebApp/AppSettings.cs b/src/Azure.AISearch.WebApp/AppSettings.cs
@@ -22,4 +22,6 @@ public class AppSettings
     public string? SearchIndexNameBlobChunks { get; set; }
     public int? SearchIndexerScheduleMinutes { get; set; } // If unspecified, will be set to 5 minutes.
     public string? InitialDocumentUrls { get; set; }
+    public bool DisableUploadDocuments { get; set; } // If true, the Upload Documents functionality will be disabled.
+    public bool DisableResetSearchConfiguration { get; set; } // If true, the Reset Search Configuration functionality will be disabled.
 }
diff --git a/src/Azure.AISearch.WebApp/Pages/Manage.cshtml b/src/Azure.AISearch.WebApp/Pages/Manage.cshtml
@@ -14,6 +14,24 @@
 
 <h2 class="display-6 mb-3">@ViewData["Title"]</h2>
 
+@if (!settings.DisableUploadDocuments)
+{
+    <h3 class="mb-3">Upload documents</h3>
+
+    <form method="post" enctype="multipart/form-data">
+        <div class="alert alert-info">
+            This allows you to upload new documents to the <code>Documents</code> container in the configured Azure Storage account.
+            The documents will automatically be added to the <code>Documents</code> search index by the indexer infrastructure.
+            The indexer and associated skillset will also split the document content into smaller chunks, generate embeddings for
+            these chunks, and then store the chunk contents along with their embedding vector representations in the <code>Chunks</code>
+            search index.
+        </div>
+        <input type="file" class="form-control" name="documents" multiple>
+        <input type="hidden" name="action" value="@ManageModel.UploadDocument">
+        <button type="submit" class="btn btn-primary my-3">Upload</button>
+    </form>
+}
+
 @if (Model.SearchIndexStatuses != null && Model.SearchIndexStatuses.Any())
 {
     <h3 class="mb-3">Search index status</h3>
@@ -49,48 +67,51 @@
     </table>
 }
 
-<h3 class="mb-3">Reset search configuration</h3>
+@if (!settings.DisableResetSearchConfiguration)
+{
+    <h3 class="mb-3">Reset search configuration</h3>
 
-<form method="post">
-    <div class="alert alert-danger">
-        <p>
-            This will delete all indexes in Azure Cognitive Search along with their supporting infrastructure
-            like indexers, data sources and skillset definitions.
-        </p>
-        <p>
-            It will also delete all content in the <code>Chunks</code> blob container in the configured Azure Storage
-            account (as the chunks will be recreated from the source data, optionally with the new settings below).
-        </p>
-        <p>
-            However, it will <b><i>not</i></b> delete any data in the <code>Documents</code> container, so all your
-            previously uploaded documents will remain available and will be re-indexed after the configuration is reset.
-        </p>
-    </div>
-    <div class="card mb-3">
-        <div class="card-header">Options</div>
-        <div class="card-body">
-            <div class="mb-2">
-                <label class="form-label" for="appSettingsOverride-TextEmbedderNumTokens">Number of tokens per chunk</label>
-                <span class="info-tip" data-bs-toggle="popover" data-bs-content="The source data will be split up into smaller chunks of approximately the token size you specify here. Embeddings are generated per chunk so the larger the chunks, the more likely you will hit token limits and the more likely the vector representation will be less specific (as it's generated from a larger body of content). Experiment with this value based on the type of content being chunked and the kinds of recall performance required for retrieval scenarios."><i class="bi bi-info-circle"></i></span>
-                <input type="number" class="form-control" name="@nameof(AppSettingsOverride.TextEmbedderNumTokens)" id="appSettingsOverride-TextEmbedderNumTokens" value="@settings.TextEmbedderNumTokens">
-            </div>
-            <div class="mb-2">
-                <label class="form-label" for="appSettingsOverride-TextEmbedderTokenOverlap">Token overlap between chunks</label>
-                <span class="info-tip" data-bs-toggle="popover" data-bs-content="The number of tokens to overlap between consecutive chunks. This is useful to maintain context continuity between chunks. By including some overlapping tokens, you can ensure that a small portion of context is shared between adjacent chunks, which can help with preserving the meaning and coherence when processing the text with language models."><i class="bi bi-info-circle"></i></span>
-                <input type="number" class="form-control" name="@nameof(AppSettingsOverride.TextEmbedderTokenOverlap)" id="appSettingsOverride-TextEmbedderTokenOverlap" value="@settings.TextEmbedderTokenOverlap">
-            </div>
-            <div class="mb-2">
-                <label class="form-label" for="appSettingsOverride-TextEmbedderMinChunkSize">Minimum chunk size</label>
-                <span class="info-tip" data-bs-toggle="popover" data-bs-content="The minimum number of tokens that a chunk should contain in order to be included in the <code>Chunks</code> index. This helps avoid that small chunks (with only a few words for example) have a disproportional impact on search results."><i class="bi bi-info-circle"></i></span>
-                <input type="number" class="form-control" name="@nameof(AppSettingsOverride.TextEmbedderMinChunkSize)" id="appSettingsOverride-TextEmbedderMinChunkSize" value="@settings.TextEmbedderMinChunkSize">
-            </div>
-            <div class="mb-2">
-                <label class="form-label" for="appSettingsOverride-SearchIndexerScheduleMinutes">Search indexer interval in minutes (minimum <code>5</code>)</label>
-                <span class="info-tip" data-bs-toggle="popover" data-bs-content="The number of minutes between indexer executions. If you upload new documents, it can take up to this amount of time for the data to be included in the <code>Documents</code> index. It can again take the same amount of time after that for the data to be included in the <code>Chunks</code> index, as the chunks are created while the indexer for the <code>Documents</code> index runs."><i class="bi bi-info-circle"></i></span>
-                <input type="number" class="form-control" name="@nameof(AppSettingsOverride.SearchIndexerScheduleMinutes)" id="appSettingsOverride-SearchIndexerScheduleMinutes" value="@settings.SearchIndexerScheduleMinutes">
+    <form method="post">
+        <div class="alert alert-warning">
+            <p>
+                This will delete all indexes in Azure Cognitive Search along with their supporting infrastructure
+                like indexers, data sources and skillset definitions.
+            </p>
+            <p>
+                It will also delete all content in the <code>Chunks</code> blob container in the configured Azure Storage
+                account (as the chunks will be recreated from the source data, optionally with the new settings below).
+            </p>
+            <p>
+                However, it will <b><i>not</i></b> delete any data in the <code>Documents</code> container, so all your
+                previously uploaded documents will remain available and will be re-indexed after the configuration is reset.
+            </p>
+        </div>
+        <div class="card mb-3">
+            <div class="card-header">Options</div>
+            <div class="card-body">
+                <div class="mb-2">
+                    <label class="form-label" for="appSettingsOverride-TextEmbedderNumTokens">Number of tokens per chunk</label>
+                    <span class="info-tip" data-bs-toggle="popover" data-bs-content="The source data will be split up into smaller chunks of approximately the token size you specify here. Embeddings are generated per chunk so the larger the chunks, the more likely you will hit token limits and the more likely the vector representation will be less specific (as it's generated from a larger body of content). Experiment with this value based on the type of content being chunked and the kinds of recall performance required for retrieval scenarios."><i class="bi bi-info-circle"></i></span>
+                    <input type="number" class="form-control" name="@nameof(AppSettingsOverride.TextEmbedderNumTokens)" id="appSettingsOverride-TextEmbedderNumTokens" value="@settings.TextEmbedderNumTokens">
+                </div>
+                <div class="mb-2">
+                    <label class="form-label" for="appSettingsOverride-TextEmbedderTokenOverlap">Token overlap between chunks</label>
+                    <span class="info-tip" data-bs-toggle="popover" data-bs-content="The number of tokens to overlap between consecutive chunks. This is useful to maintain context continuity between chunks. By including some overlapping tokens, you can ensure that a small portion of context is shared between adjacent chunks, which can help with preserving the meaning and coherence when processing the text with language models."><i class="bi bi-info-circle"></i></span>
+                    <input type="number" class="form-control" name="@nameof(AppSettingsOverride.TextEmbedderTokenOverlap)" id="appSettingsOverride-TextEmbedderTokenOverlap" value="@settings.TextEmbedderTokenOverlap">
+                </div>
+                <div class="mb-2">
+                    <label class="form-label" for="appSettingsOverride-TextEmbedderMinChunkSize">Minimum chunk size</label>
+                    <span class="info-tip" data-bs-toggle="popover" data-bs-content="The minimum number of tokens that a chunk should contain in order to be included in the <code>Chunks</code> index. This helps avoid that small chunks (with only a few words for example) have a disproportional impact on search results."><i class="bi bi-info-circle"></i></span>
+                    <input type="number" class="form-control" name="@nameof(AppSettingsOverride.TextEmbedderMinChunkSize)" id="appSettingsOverride-TextEmbedderMinChunkSize" value="@settings.TextEmbedderMinChunkSize">
+                </div>
+                <div class="mb-2">
+                    <label class="form-label" for="appSettingsOverride-SearchIndexerScheduleMinutes">Search indexer interval in minutes (minimum <code>5</code>)</label>
+                    <span class="info-tip" data-bs-toggle="popover" data-bs-content="The number of minutes between indexer executions. If you upload new documents, it can take up to this amount of time for the data to be included in the <code>Documents</code> index. It can again take the same amount of time after that for the data to be included in the <code>Chunks</code> index, as the chunks are created while the indexer for the <code>Documents</code> index runs."><i class="bi bi-info-circle"></i></span>
+                    <input type="number" class="form-control" name="@nameof(AppSettingsOverride.SearchIndexerScheduleMinutes)" id="appSettingsOverride-SearchIndexerScheduleMinutes" value="@settings.SearchIndexerScheduleMinutes">
+                </div>
             </div>
         </div>
-    </div>
-    <input type="hidden" name="action" value="@ManageModel.ResetSearchConfiguration">
-    <button type="submit" class="btn btn-danger">Reset configuration</button>
-</form>
+        <input type="hidden" name="action" value="@ManageModel.ResetSearchConfiguration">
+        <button type="submit" class="btn btn-danger">Reset configuration</button>
+    </form>
+}
diff --git a/src/Azure.AISearch.WebApp/Pages/Manage.cshtml.cs b/src/Azure.AISearch.WebApp/Pages/Manage.cshtml.cs
@@ -5,18 +5,24 @@
 
 namespace Azure.AISearch.WebApp.Pages;
 
+[RequestFormLimits(MultipartBodyLengthLimit = MaxDocumentUploadSize)]
+[RequestSizeLimit(MaxDocumentUploadSize)]
 public class ManageModel : PageModel
 {
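+    private const int MaxDocumentUploadSize = 209715200; // 200 MiB (200 * 1024 * 1024 bytes); used for both the multipart body limit and the request size limit above.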
+    public const string UploadDocument = nameof(UploadDocument);
     public const string RunSearchIndexer = nameof(RunSearchIndexer);
     public const string ResetSearchConfiguration = nameof(ResetSearchConfiguration);
 
+    private readonly AppSettings settings;
     private readonly AzureCognitiveSearchConfigurationService azureCognitiveSearchConfigurationService;
     private readonly AzureStorageConfigurationService azureStorageConfigurationService;
 
     public IList<SearchIndexStatus>? SearchIndexStatuses { get; set; }
 
-    public ManageModel(AzureCognitiveSearchConfigurationService azureCognitiveSearchConfigurationService, AzureStorageConfigurationService azureStorageConfigurationService)
+    public ManageModel(AppSettings settings, AzureCognitiveSearchConfigurationService azureCognitiveSearchConfigurationService, AzureStorageConfigurationService azureStorageConfigurationService)
     {
+        this.settings = settings;
         this.azureCognitiveSearchConfigurationService = azureCognitiveSearchConfigurationService;
         this.azureStorageConfigurationService = azureStorageConfigurationService;
     }
@@ -26,13 +32,21 @@ public async Task OnGet()
         this.SearchIndexStatuses = await this.azureCognitiveSearchConfigurationService.GetSearchIndexStatusesAsync();
     }
 
-    public async Task<IActionResult> OnPost(string action, string? searchIndexName, AppSettingsOverride? settingsOverride)
+    public async Task<IActionResult> OnPost(string action, IList<IFormFile>? documents, string? searchIndexName, AppSettingsOverride? settingsOverride)
     {
-        if (action == RunSearchIndexer && !string.IsNullOrEmpty(searchIndexName))
+        if (action == UploadDocument && !settings.DisableUploadDocuments && documents != null && documents.Any())
+        {
+            foreach (var document in documents)
+            {
+                using var fileStream = document.OpenReadStream();
+                await this.azureStorageConfigurationService.UploadDocumentAsync(fileStream, document.FileName);
+            }
+        }
+        else if (action == RunSearchIndexer && !string.IsNullOrEmpty(searchIndexName))
         {
             await this.azureCognitiveSearchConfigurationService.RunSearchIndexerAsync(searchIndexName);
         }
-        else if (action == ResetSearchConfiguration)
+        else if (action == ResetSearchConfiguration && !this.settings.DisableResetSearchConfiguration)
         {
             await this.azureCognitiveSearchConfigurationService.UninitializeAsync();
             await this.azureStorageConfigurationService.UninitializeAsync();
diff --git a/src/Azure.AISearch.WebApp/Services/AzureStorageConfigurationService.cs b/src/Azure.AISearch.WebApp/Services/AzureStorageConfigurationService.cs
diff --git a/src/Azure.AISearch.WebApp/appsettings.json b/src/Azure.AISearch.WebApp/appsettings.json

Original file line number	Diff line number	Diff line change
`@@ -22,4 +22,6 @@ public class AppSettings`
`22`	`22`	`public string? SearchIndexNameBlobChunks { get; set; }`
`23`	`23`	`public int? SearchIndexerScheduleMinutes { get; set; } // If unspecified, will be set to 5 minutes.`
`24`	`24`	`public string? InitialDocumentUrls { get; set; }`
	`25`	`+ public bool DisableUploadDocuments { get; set; } // If true, the Upload Documents functionality will be disabled.`
	`26`	`+ public bool DisableResetSearchConfiguration { get; set; } // If true, the Reset Search Configuration functionality will be disabled.`
`25`	`27`	`}`
Original file line number	Diff line number	Diff line change
`@@ -5,18 +5,24 @@`
`5`	`5`
`6`	`6`	`namespace Azure.AISearch.WebApp.Pages;`
`7`	`7`
	`8`	`+[RequestFormLimits(MultipartBodyLengthLimit = MaxDocumentUploadSize)]`
	`9`	`+[RequestSizeLimit(MaxDocumentUploadSize)]`
`8`	`10`	`public class ManageModel : PageModel`
`9`	`11`	`{`
	`12`	`+ private const int MaxDocumentUploadSize = 209715200; // 200 MiB (200 * 1024 * 1024 bytes)`
	`13`	`+ public const string UploadDocument = nameof(UploadDocument);`
`10`	`14`	`public const string RunSearchIndexer = nameof(RunSearchIndexer);`
`11`	`15`	`public const string ResetSearchConfiguration = nameof(ResetSearchConfiguration);`
`12`	`16`
	`17`	`+ private readonly AppSettings settings;`
`13`	`18`	`private readonly AzureCognitiveSearchConfigurationService azureCognitiveSearchConfigurationService;`
`14`	`19`	`private readonly AzureStorageConfigurationService azureStorageConfigurationService;`
`15`	`20`
`16`	`21`	`public IList<SearchIndexStatus>? SearchIndexStatuses { get; set; }`
`17`	`22`
`18`		`- public ManageModel(AzureCognitiveSearchConfigurationService azureCognitiveSearchConfigurationService, AzureStorageConfigurationService azureStorageConfigurationService)`
	`23`	`+ public ManageModel(AppSettings settings, AzureCognitiveSearchConfigurationService azureCognitiveSearchConfigurationService, AzureStorageConfigurationService azureStorageConfigurationService)`
`19`	`24`	`{`
	`25`	`+ this.settings = settings;`
`20`	`26`	`this.azureCognitiveSearchConfigurationService = azureCognitiveSearchConfigurationService;`
`21`	`27`	`this.azureStorageConfigurationService = azureStorageConfigurationService;`
`22`	`28`	`}`
`@@ -26,13 +32,21 @@ public async Task OnGet()`
`26`	`32`	`this.SearchIndexStatuses = await this.azureCognitiveSearchConfigurationService.GetSearchIndexStatusesAsync();`
`27`	`33`	`}`
`28`	`34`
`29`		`- public async Task<IActionResult> OnPost(string action, string? searchIndexName, AppSettingsOverride? settingsOverride)`
	`35`	`+ public async Task<IActionResult> OnPost(string action, IList<IFormFile>? documents, string? searchIndexName, AppSettingsOverride? settingsOverride)`
`30`	`36`	`{`
`31`		`- if (action == RunSearchIndexer && !string.IsNullOrEmpty(searchIndexName))`
	`37`	`+ if (action == UploadDocument && !settings.DisableUploadDocuments && documents != null && documents.Any())`
	`38`	`+ {`
	`39`	`+ foreach (var document in documents)`
	`40`	`+ {`
	`41`	`+ using var fileStream = document.OpenReadStream();`
	`42`	`+ await this.azureStorageConfigurationService.UploadDocumentAsync(fileStream, document.FileName);`
	`43`	`+ }`
	`44`	`+ }`
	`45`	`+ else if (action == RunSearchIndexer && !string.IsNullOrEmpty(searchIndexName))`
`32`	`46`	`{`
`33`	`47`	`await this.azureCognitiveSearchConfigurationService.RunSearchIndexerAsync(searchIndexName);`
`34`	`48`	`}`
`35`		`- else if (action == ResetSearchConfiguration)`
	`49`	`+ else if (action == ResetSearchConfiguration && !this.settings.DisableResetSearchConfiguration)`
`36`	`50`	`{`
`37`	`51`	`await this.azureCognitiveSearchConfigurationService.UninitializeAsync();`
`38`	`52`	`await this.azureStorageConfigurationService.UninitializeAsync();`