Skip to content

Commit 886f445

Browse files
authored
Allow for subdirectory structure within data directory (#569)
* Allow for subdirectory structure within data directory
  - Recursively read the data directory
  - @TODO/SUGGEST - it may be worth considering using the subdirectory information to enrich the index and/or for access control (e.g., subdirectories could be named after the Azure AD group used for access control)
* Update prepdocs.py
* Convert the comment for the read_files function to a docstring
* Pass the `use_vectors` variable in the recursive call
* Update prepdocs.py - Removed the unreachable condition within the read_files function
* Update prepdocs.py
* Update prepdocs.py
* Fix syntax error - `filename` is no longer global because of the newly introduced recursive function, so it has to be passed explicitly to the split_text function
1 parent e48b0d1 commit 886f445

File tree

1 file changed

+26
-16
lines changed

1 file changed

+26
-16
lines changed

scripts/prepdocs.py

Lines changed: 26 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -147,7 +147,7 @@ def get_document_text(filename):
147147

148148
return page_map
149149

150-
def split_text(page_map):
150+
def split_text(page_map, filename):
151151
SENTENCE_ENDINGS = [".", "!", "?"]
152152
WORDS_BREAKS = [",", ";", ":", " ", "(", ")", "[", "]", "{", "}", "\t", "\n"]
153153
if args.verbose: print(f"Splitting '{filename}' into sections")
@@ -214,7 +214,7 @@ def filename_to_id(filename):
214214

215215
def create_sections(filename, page_map, use_vectors):
216216
file_id = filename_to_id(filename)
217-
for i, (content, pagenum) in enumerate(split_text(page_map)):
217+
for i, (content, pagenum) in enumerate(split_text(page_map, filename)):
218218
section = {
219219
"id": f"{file_id}-page-{i}",
220220
"content": content,
@@ -314,6 +314,29 @@ def refresh_openai_token():
314314
openai.api_key = token_cred.get_token("https://cognitiveservices.azure.com/.default").token
315315
open_ai_token_cache[CACHE_KEY_CREATED_TIME] = time.time()
316316

317+
def read_files(path_pattern: str, use_vectors: bool):
    """
    Recursively process every path matching `path_pattern`.

    Directories are descended into. Each remaining path is either removed
    (when `args.remove` is set) or uploaded, parsed, split into sections
    and indexed.

    Args:
        path_pattern: glob pattern selecting the files/directories to process.
        use_vectors: passed through unchanged to `create_sections`.

    Errors raised while uploading/parsing/indexing a single file are printed
    and that file is skipped; processing continues with the next match.
    """
    for filename in glob.glob(path_pattern):
        if args.verbose: print(f"Processing '{filename}'")

        # A directory is not a document: descend into it *before* choosing
        # between removal and indexing, so that --remove walks the same tree
        # as the upload path instead of being handed a directory name.
        if os.path.isdir(filename):
            # Escape the concrete directory name so characters such as
            # "[" or "*" in it are not re-interpreted as glob syntax.
            read_files(os.path.join(glob.escape(filename), "*"), use_vectors)
            continue

        if args.remove:
            remove_blobs(filename)
            remove_from_index(filename)
            continue

        try:
            if not args.skipblobs:
                upload_blobs(filename)
            page_map = get_document_text(filename)
            # NOTE(review): only the base name is passed on, so identically
            # named files in different subdirectories presumably cannot be
            # told apart downstream - confirm against filename_to_id/index.
            base_name = os.path.basename(filename)
            sections = create_sections(base_name, page_map, use_vectors)
            index_sections(base_name, sections)
        except Exception as e:
            # Deliberate best-effort: one bad file must not abort the whole run.
            print(f"\tGot an error while reading {filename} -> {e} --> skipping file")
339+
317340
if __name__ == "__main__":
318341

319342
parser = argparse.ArgumentParser(
@@ -379,17 +402,4 @@ def refresh_openai_token():
379402
create_search_index()
380403

381404
print("Processing files...")
382-
for filename in glob.glob(args.files):
383-
if args.verbose: print(f"Processing '{filename}'")
384-
if args.remove:
385-
remove_blobs(filename)
386-
remove_from_index(filename)
387-
elif args.removeall:
388-
remove_blobs(None)
389-
remove_from_index(None)
390-
else:
391-
if not args.skipblobs:
392-
upload_blobs(filename)
393-
page_map = get_document_text(filename)
394-
sections = create_sections(os.path.basename(filename), page_map, use_vectors)
395-
index_sections(os.path.basename(filename), sections)
405+
read_files(args.files, use_vectors)

0 commit comments

Comments
 (0)