Skip to content

Commit a16ae58

Browse files
Fix prepdoc source not found (#235)
## Purpose <!-- Describe the intention of the changes being proposed. What problem does it solve or functionality does it add? --> * ... ## Does this introduce a breaking change? <!-- Mark one with an "x". --> ``` [ ] Yes [ ] No ``` ## Pull Request Type What kind of change does this Pull Request introduce? <!-- Please check the one that applies to this PR using "x". --> ``` [ ] Bugfix [ ] Feature [ ] Code style update (formatting, local variables) [ ] Refactoring (no functional changes, no api changes) [ ] Documentation content changes [ ] Other... Please describe: ``` ## How to Test * Get the code ``` git clone [repo-address] cd [repo-name] git checkout [branch-name] npm install ``` * Test the code <!-- Add steps to run the tests suite and/or manually test --> ``` ``` ## What to Check Verify that the following are valid * ... ## Other Information <!-- Add any other helpful information that may be needed here. -->
1 parent 1e2b031 commit a16ae58

File tree

2 files changed

+13
-12
lines changed

2 files changed

+13
-12
lines changed

app/functions/EmbedFunctions/Services/AzureSearchEmbedService.cs

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ public async Task<bool> EmbedBlobAsync(Stream blobStream, string blobName)
2424
try
2525
{
2626
await EnsureSearchIndexAsync(searchIndexName);
27-
27+
Console.WriteLine($"Embedding blob '{blobName}'");
2828
var pageMap = await GetDocumentTextAsync(blobStream, blobName);
2929

3030
var fileNameWithoutExtension = Path.GetFileNameWithoutExtension(blobName);
@@ -128,6 +128,7 @@ private async Task<IReadOnlyList<PageDetail>> GetDocumentTextAsync(Stream blobSt
128128
logger?.LogInformation(
129129
"Extracting text from '{Blob}' using Azure Form Recognizer", blobName);
130130

131+
Console.WriteLine($"Extracting text from '{blobName}' using Azure Form Recognizer");
131132
using var ms = new MemoryStream();
132133
blobStream.CopyTo(ms);
133134
ms.Position = 0;
@@ -184,7 +185,7 @@ private async Task<IReadOnlyList<PageDetail>> GetDocumentTextAsync(Stream blobSt
184185
pageMap.Add(new PageDetail(i, offset, pageText.ToString()));
185186
offset += pageText.Length;
186187
}
187-
188+
Console.WriteLine($"Extracted {pageMap.Count} pages from '{blobName}'");
188189
return pageMap.AsReadOnly();
189190
}
190191

@@ -374,9 +375,7 @@ private static int FindPage(IReadOnlyList<PageDetail> pageMap, int offset)
374375
return length - 1;
375376
}
376377

377-
private static string BlobNameFromFilePage(string blobName, int page = 0) => Path.GetExtension(blobName).ToLower() is ".pdf"
378-
? $"{Path.GetFileNameWithoutExtension(blobName)}-{page}.pdf"
379-
: Path.GetFileName(blobName);
378+
/// <summary>
/// Resolves the blob name to use for a document (and, nominally, one of its pages).
/// </summary>
/// <param name="blobName">Name of the blob as supplied by the caller.</param>
/// <param name="page">Page index; accepted for call-site compatibility but not used here.</param>
/// <returns><paramref name="blobName"/> exactly as it was passed in.</returns>
private static string BlobNameFromFilePage(string blobName, int page = 0)
{
    // The name is passed through untouched; no per-page suffix is derived.
    return blobName;
}
380379

381380
private async Task IndexSectionsAsync(string searchIndexName, IEnumerable<Section> sections, string blobName)
382381
{

app/prepdocs/PrepareDocs/Program.cs

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -59,11 +59,7 @@ static async Task ProcessSingleFileAsync(AppOptions options, string fileName, IE
5959
return;
6060
}
6161

62-
await UploadBlobsAsync(options, fileName);
63-
using (var stream = File.OpenRead(fileName))
64-
{
65-
await embedService.EmbedBlobAsync(stream, fileName);
66-
}
62+
await UploadBlobsAndCreateIndexAsync(options, fileName, embedService);
6763
}
6864
}
6965
});
@@ -159,8 +155,8 @@ Removing sections from '{fileName ?? "all"}' from search index '{options.SearchI
159155
}
160156
}
161157

162-
static async ValueTask UploadBlobsAsync(
163-
AppOptions options, string fileName)
158+
static async ValueTask UploadBlobsAndCreateIndexAsync(
159+
AppOptions options, string fileName, IEmbedService embeddingService)
164160
{
165161
var container = await GetBlobContainerClientAsync(options);
166162

@@ -190,6 +186,11 @@ static async ValueTask UploadBlobsAsync(
190186
{
191187
ContentType = "application/pdf"
192188
});
189+
190+
// revert stream position
191+
stream.Position = 0;
192+
193+
await embeddingService.EmbedBlobAsync(stream, documentName);
193194
}
194195
finally
195196
{
@@ -201,6 +202,7 @@ static async ValueTask UploadBlobsAsync(
201202
{
202203
var blobName = BlobNameFromFilePage(fileName);
203204
await UploadBlobAsync(fileName, blobName, container);
205+
await embeddingService.EmbedBlobAsync(File.OpenRead(fileName), blobName);
204206
}
205207
}
206208

0 commit comments

Comments
 (0)