Skip to content

Commit 8ed1eba

Browse files
committed
Add document upload
1 parent d4adf6b commit 8ed1eba

File tree

6 files changed

+120
-81
lines changed

6 files changed

+120
-81
lines changed

azuredeploy-webapp.json

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,9 @@
5454
"textEmbedderMinChunkSize": 10,
5555
"searchIndexNameBlobDocuments": "blob-documents",
5656
"searchIndexNameBlobChunks": "blob-chunks",
57-
"searchIndexerScheduleMinutes": 5
57+
"searchIndexerScheduleMinutes": 5,
58+
"disableUploadDocuments": false,
59+
"disableResetSearchConfiguration": false
5860
},
5961
"resources": [
6062
{
@@ -361,6 +363,14 @@
361363
{
362364
"name": "InitialDocumentUrls",
363365
"value": "[parameters('initialDocumentUrls')]"
366+
},
367+
{
368+
"name": "DisableUploadDocuments",
369+
"value": "[variables('disableUploadDocuments')]"
370+
},
371+
{
372+
"name": "DisableResetSearchConfiguration",
373+
"value": "[variables('disableResetSearchConfiguration')]"
364374
}
365375
]
366376
}
@@ -448,6 +458,14 @@
448458
"type": "string",
449459
"value": "[parameters('initialDocumentUrls')]"
450460
},
461+
"disableUploadDocuments": {
462+
"type": "bool",
463+
"value": "[variables('disableUploadDocuments')]"
464+
},
465+
"disableResetSearchConfiguration": {
466+
"type": "bool",
467+
"value": "[variables('disableResetSearchConfiguration')]"
468+
},
451469
"webAppUrl": {
452470
"type": "string",
453471
"value": "[concat('https://', reference(resourceId('Microsoft.Web/sites', variables('webAppName')), '2022-09-01').defaultHostName)]"

src/Azure.AISearch.WebApp/AppSettings.cs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,4 +22,6 @@ public class AppSettings
2222
public string? SearchIndexNameBlobChunks { get; set; }
2323
public int? SearchIndexerScheduleMinutes { get; set; } // If unspecified, will be set to 5 minutes.
2424
public string? InitialDocumentUrls { get; set; }
25+
public bool DisableUploadDocuments { get; set; } // If true, the Upload Documents functionality will be disabled.
26+
public bool DisableResetSearchConfiguration { get; set; } // If true, the Reset Search Configuration functionality will be disabled.
2527
}

src/Azure.AISearch.WebApp/Pages/Manage.cshtml

Lines changed: 63 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,24 @@
1414

1515
<h2 class="display-6 mb-3">@ViewData["Title"]</h2>
1616

17+
@if (!settings.DisableUploadDocuments)
18+
{
19+
<h3 class="mb-3">Upload documents</h3>
20+
21+
<form method="post" enctype="multipart/form-data">
22+
<div class="alert alert-info">
23+
This allows you to upload new documents to the <code>Documents</code> container in the configured Azure Storage account.
24+
The documents will automatically be added to the <code>Documents</code> search index by the indexer infrastructure.
25+
The indexer and associated skillset will also split the document content into smaller chunks, generate embeddings for
26+
these chunks, and then store the chunk contents along with their embedding vector representations in the <code>Chunks</code>
27+
search index.
28+
</div>
29+
<input type="file" class="form-control" name="documents" multiple>
30+
<input type="hidden" name="action" value="@ManageModel.UploadDocument">
31+
<button type="submit" class="btn btn-primary my-3">Upload</button>
32+
</form>
33+
}
34+
1735
@if (Model.SearchIndexStatuses != null && Model.SearchIndexStatuses.Any())
1836
{
1937
<h3 class="mb-3">Search index status</h3>
@@ -49,48 +67,51 @@
4967
</table>
5068
}
5169

52-
<h3 class="mb-3">Reset search configuration</h3>
70+
@if (!settings.DisableResetSearchConfiguration)
71+
{
72+
<h3 class="mb-3">Reset search configuration</h3>
5373

54-
<form method="post">
55-
<div class="alert alert-danger">
56-
<p>
57-
This will delete all indexes in Azure Cognitive Search along with their supporting infrastructure
58-
like indexers, data sources and skillset definitions.
59-
</p>
60-
<p>
61-
It will also delete all content in the <code>Chunks</code> blob container in the configured Azure Storage
62-
account (as the chunks will be recreated from the source data, optionally with the new settings below).
63-
</p>
64-
<p>
65-
However, it will <b><i>not</i></b> delete any data in the <code>Documents</code> container, so all your
66-
previously uploaded documents will remain available and will be re-indexed after the configuration is reset.
67-
</p>
68-
</div>
69-
<div class="card mb-3">
70-
<div class="card-header">Options</div>
71-
<div class="card-body">
72-
<div class="mb-2">
73-
<label class="form-label" for="appSettingsOverride-TextEmbedderNumTokens">Number of tokens per chunk</label>
74-
<span class="info-tip" data-bs-toggle="popover" data-bs-content="The source data will be split up into smaller chunks of approximately the token size you specify here. Embeddings are generated per chunk so the larger the chunks, the more likely you will hit token limits and the more likely the vector representation will be less specific (as it's generated from a larger body of content). Experiment with this value based on the type of content being chunked and the kinds of recall performance required for retrieval scenarios."><i class="bi bi-info-circle"></i></span>
75-
<input type="number" class="form-control" name="@nameof(AppSettingsOverride.TextEmbedderNumTokens)" id="appSettingsOverride-TextEmbedderNumTokens" value="@settings.TextEmbedderNumTokens">
76-
</div>
77-
<div class="mb-2">
78-
<label class="form-label" for="appSettingsOverride-TextEmbedderTokenOverlap">Token overlap between chunks</label>
79-
<span class="info-tip" data-bs-toggle="popover" data-bs-content="The number of tokens to overlap between consecutive chunks. This is useful to maintain context continuity between chunks. By including some overlapping tokens, you can ensure that a small portion of context is shared between adjacent chunks, which can help with preserving the meaning and coherence when processing the text with language models."><i class="bi bi-info-circle"></i></span>
80-
<input type="number" class="form-control" name="@nameof(AppSettingsOverride.TextEmbedderTokenOverlap)" id="appSettingsOverride-TextEmbedderTokenOverlap" value="@settings.TextEmbedderTokenOverlap">
81-
</div>
82-
<div class="mb-2">
83-
<label class="form-label" for="appSettingsOverride-TextEmbedderMinChunkSize">Minimum chunk size</label>
84-
<span class="info-tip" data-bs-toggle="popover" data-bs-content="The minimum number of tokens that a chunk should contain in order to be included in the <code>Chunks</code> index. This helps avoid that small chunks (with only a few words for example) have a disproportional impact on search results."><i class="bi bi-info-circle"></i></span>
85-
<input type="number" class="form-control" name="@nameof(AppSettingsOverride.TextEmbedderMinChunkSize)" id="appSettingsOverride-TextEmbedderMinChunkSize" value="@settings.TextEmbedderMinChunkSize">
86-
</div>
87-
<div class="mb-2">
88-
<label class="form-label" for="appSettingsOverride-SearchIndexerScheduleMinutes">Search indexer interval in minutes (minimum <code>5</code>)</label>
89-
<span class="info-tip" data-bs-toggle="popover" data-bs-content="The number of minutes between indexer executions. If you upload new documents, it can take up to this amount of time for the data to be included in the <code>Documents</code> index. It can again take the same amount of time after that for the data to be included in the <code>Chunks</code> index, as the chunks are created while the indexer for the <code>Documents</code> index runs."><i class="bi bi-info-circle"></i></span>
90-
<input type="number" class="form-control" name="@nameof(AppSettingsOverride.SearchIndexerScheduleMinutes)" id="appSettingsOverride-SearchIndexerScheduleMinutes" value="@settings.SearchIndexerScheduleMinutes">
74+
<form method="post">
75+
<div class="alert alert-warning">
76+
<p>
77+
This will delete all indexes in Azure Cognitive Search along with their supporting infrastructure
78+
like indexers, data sources and skillset definitions.
79+
</p>
80+
<p>
81+
It will also delete all content in the <code>Chunks</code> blob container in the configured Azure Storage
82+
account (as the chunks will be recreated from the source data, optionally with the new settings below).
83+
</p>
84+
<p>
85+
However, it will <b><i>not</i></b> delete any data in the <code>Documents</code> container, so all your
86+
previously uploaded documents will remain available and will be re-indexed after the configuration is reset.
87+
</p>
88+
</div>
89+
<div class="card mb-3">
90+
<div class="card-header">Options</div>
91+
<div class="card-body">
92+
<div class="mb-2">
93+
<label class="form-label" for="appSettingsOverride-TextEmbedderNumTokens">Number of tokens per chunk</label>
94+
<span class="info-tip" data-bs-toggle="popover" data-bs-content="The source data will be split up into smaller chunks of approximately the token size you specify here. Embeddings are generated per chunk so the larger the chunks, the more likely you will hit token limits and the more likely the vector representation will be less specific (as it's generated from a larger body of content). Experiment with this value based on the type of content being chunked and the kinds of recall performance required for retrieval scenarios."><i class="bi bi-info-circle"></i></span>
95+
<input type="number" class="form-control" name="@nameof(AppSettingsOverride.TextEmbedderNumTokens)" id="appSettingsOverride-TextEmbedderNumTokens" value="@settings.TextEmbedderNumTokens">
96+
</div>
97+
<div class="mb-2">
98+
<label class="form-label" for="appSettingsOverride-TextEmbedderTokenOverlap">Token overlap between chunks</label>
99+
<span class="info-tip" data-bs-toggle="popover" data-bs-content="The number of tokens to overlap between consecutive chunks. This is useful to maintain context continuity between chunks. By including some overlapping tokens, you can ensure that a small portion of context is shared between adjacent chunks, which can help with preserving the meaning and coherence when processing the text with language models."><i class="bi bi-info-circle"></i></span>
100+
<input type="number" class="form-control" name="@nameof(AppSettingsOverride.TextEmbedderTokenOverlap)" id="appSettingsOverride-TextEmbedderTokenOverlap" value="@settings.TextEmbedderTokenOverlap">
101+
</div>
102+
<div class="mb-2">
103+
<label class="form-label" for="appSettingsOverride-TextEmbedderMinChunkSize">Minimum chunk size</label>
104+
<span class="info-tip" data-bs-toggle="popover" data-bs-content="The minimum number of tokens that a chunk should contain in order to be included in the <code>Chunks</code> index. This helps avoid that small chunks (with only a few words for example) have a disproportional impact on search results."><i class="bi bi-info-circle"></i></span>
105+
<input type="number" class="form-control" name="@nameof(AppSettingsOverride.TextEmbedderMinChunkSize)" id="appSettingsOverride-TextEmbedderMinChunkSize" value="@settings.TextEmbedderMinChunkSize">
106+
</div>
107+
<div class="mb-2">
108+
<label class="form-label" for="appSettingsOverride-SearchIndexerScheduleMinutes">Search indexer interval in minutes (minimum <code>5</code>)</label>
109+
<span class="info-tip" data-bs-toggle="popover" data-bs-content="The number of minutes between indexer executions. If you upload new documents, it can take up to this amount of time for the data to be included in the <code>Documents</code> index. It can again take the same amount of time after that for the data to be included in the <code>Chunks</code> index, as the chunks are created while the indexer for the <code>Documents</code> index runs."><i class="bi bi-info-circle"></i></span>
110+
<input type="number" class="form-control" name="@nameof(AppSettingsOverride.SearchIndexerScheduleMinutes)" id="appSettingsOverride-SearchIndexerScheduleMinutes" value="@settings.SearchIndexerScheduleMinutes">
111+
</div>
91112
</div>
92113
</div>
93-
</div>
94-
<input type="hidden" name="action" value="@ManageModel.ResetSearchConfiguration">
95-
<button type="submit" class="btn btn-danger">Reset configuration</button>
96-
</form>
114+
<input type="hidden" name="action" value="@ManageModel.ResetSearchConfiguration">
115+
<button type="submit" class="btn btn-danger">Reset configuration</button>
116+
</form>
117+
}

src/Azure.AISearch.WebApp/Pages/Manage.cshtml.cs

Lines changed: 18 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5,18 +5,24 @@
55

66
namespace Azure.AISearch.WebApp.Pages;
77

8+
[RequestFormLimits(MultipartBodyLengthLimit = MaxDocumentUploadSize)]
9+
[RequestSizeLimit(MaxDocumentUploadSize)]
810
public class ManageModel : PageModel
911
{
12+
private const int MaxDocumentUploadSize = 209715200; // 200 MiB (200 * 1024 * 1024 bytes)
13+
public const string UploadDocument = nameof(UploadDocument);
1014
public const string RunSearchIndexer = nameof(RunSearchIndexer);
1115
public const string ResetSearchConfiguration = nameof(ResetSearchConfiguration);
1216

17+
private readonly AppSettings settings;
1318
private readonly AzureCognitiveSearchConfigurationService azureCognitiveSearchConfigurationService;
1419
private readonly AzureStorageConfigurationService azureStorageConfigurationService;
1520

1621
public IList<SearchIndexStatus>? SearchIndexStatuses { get; set; }
1722

18-
public ManageModel(AzureCognitiveSearchConfigurationService azureCognitiveSearchConfigurationService, AzureStorageConfigurationService azureStorageConfigurationService)
23+
public ManageModel(AppSettings settings, AzureCognitiveSearchConfigurationService azureCognitiveSearchConfigurationService, AzureStorageConfigurationService azureStorageConfigurationService)
1924
{
25+
this.settings = settings;
2026
this.azureCognitiveSearchConfigurationService = azureCognitiveSearchConfigurationService;
2127
this.azureStorageConfigurationService = azureStorageConfigurationService;
2228
}
@@ -26,13 +32,21 @@ public async Task OnGet()
2632
this.SearchIndexStatuses = await this.azureCognitiveSearchConfigurationService.GetSearchIndexStatusesAsync();
2733
}
2834

29-
public async Task<IActionResult> OnPost(string action, string? searchIndexName, AppSettingsOverride? settingsOverride)
35+
public async Task<IActionResult> OnPost(string action, IList<IFormFile>? documents, string? searchIndexName, AppSettingsOverride? settingsOverride)
3036
{
31-
if (action == RunSearchIndexer && !string.IsNullOrEmpty(searchIndexName))
37+
if (action == UploadDocument && !settings.DisableUploadDocuments && documents != null && documents.Any())
38+
{
39+
foreach (var document in documents)
40+
{
41+
using var fileStream = document.OpenReadStream();
42+
await this.azureStorageConfigurationService.UploadDocumentAsync(fileStream, document.FileName);
43+
}
44+
}
45+
else if (action == RunSearchIndexer && !string.IsNullOrEmpty(searchIndexName))
3246
{
3347
await this.azureCognitiveSearchConfigurationService.RunSearchIndexerAsync(searchIndexName);
3448
}
35-
else if (action == ResetSearchConfiguration)
49+
else if (action == ResetSearchConfiguration && !this.settings.DisableResetSearchConfiguration)
3650
{
3751
await this.azureCognitiveSearchConfigurationService.UninitializeAsync();
3852
await this.azureStorageConfigurationService.UninitializeAsync();

0 commit comments

Comments
 (0)