Skip to content

Commit f6494f0

Browse files
committed
Make the file strategy process files concurrently, with a default limit of 10 files at a time
1 parent c4ef3be commit f6494f0

File tree

2 files changed: 19 additions and 3 deletions

app/backend/prepdocs.py

Lines changed: 7 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -311,6 +311,12 @@ async def main(strategy: Strategy, setup_index: bool = True):
311311
required=False,
312312
help="Search service system assigned Identity (Managed identity) (used for integrated vectorization)",
313313
)
314+
parser.add_argument(
315+
"--concurrency",
316+
type=int,
317+
default=10,
318+
help="Max. number of concurrent tasks to run for processing files (file strategy only) (default: 10)",
319+
)
314320

315321
parser.add_argument("--verbose", "-v", action="store_true", help="Verbose output")
316322
args = parser.parse_args()
@@ -467,6 +473,7 @@ async def main(strategy: Strategy, setup_index: bool = True):
467473
category=args.category,
468474
use_content_understanding=use_content_understanding,
469475
content_understanding_endpoint=os.getenv("AZURE_CONTENTUNDERSTANDING_ENDPOINT"),
476+
concurrency=args.concurrency,
470477
)
471478

472479
loop.run_until_complete(main(ingestion_strategy, setup_index=not args.remove and not args.removeall))

app/backend/prepdocslib/filestrategy.py

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import asyncio
12
import logging
23
from typing import Optional
34

@@ -56,6 +57,7 @@ def __init__(
5657
category: Optional[str] = None,
5758
use_content_understanding: bool = False,
5859
content_understanding_endpoint: Optional[str] = None,
60+
concurrency: int = 10,
5961
):
6062
self.list_file_strategy = list_file_strategy
6163
self.blob_manager = blob_manager
@@ -70,6 +72,7 @@ def __init__(
7072
self.category = category
7173
self.use_content_understanding = use_content_understanding
7274
self.content_understanding_endpoint = content_understanding_endpoint
75+
self.concurrency = concurrency
7376

7477
def setup_search_manager(self):
7578
self.search_manager = SearchManager(
@@ -98,9 +101,9 @@ async def setup(self):
98101

99102
async def run(self):
100103
self.setup_search_manager()
101-
if self.document_action == DocumentAction.Add:
102-
files = self.list_file_strategy.list()
103-
async for file in files:
104+
105+
async def process_file_worker(semaphore: asyncio.Semaphore, file: File):
106+
async with semaphore:
104107
try:
105108
sections = await parse_file(file, self.file_processors, self.category, self.image_embeddings)
106109
if sections:
@@ -112,6 +115,12 @@ async def run(self):
112115
finally:
113116
if file:
114117
file.close()
118+
119+
if self.document_action == DocumentAction.Add:
120+
files = self.list_file_strategy.list()
121+
semaphore = asyncio.Semaphore(self.concurrency)
122+
tasks = [process_file_worker(semaphore, file) async for file in files]
123+
await asyncio.gather(*tasks)
115124
elif self.document_action == DocumentAction.Remove:
116125
paths = self.list_file_strategy.list_paths()
117126
async for path in paths:

0 commit comments

Comments
 (0)