Allow for subdirectory structure within data directory (#569)

PfisterAn · web-flow · commit 886f445370a9 · 2023-08-31T05:58:27.000-07:00
* Allow for subdirectory structure within data directory

- recursively read the data directory
  - @TODO/SUGGEST - it may be worth considering using the subdirectory information to enrich the index and/or for access control (e.g., subdirectories could be named after the Azure AD group that should have access)

* Update prepdocs.py

* Convert comment for read_files function to DocString
* Pass the `use_vectors` variable through in the recursive call

* Update prepdocs.py

- Removed unreachable condition within read_files function

* Update prepdocs.py

* Update prepdocs.py

* Fixing undefined-name error - `filename` is no longer a global now that the file loop lives in the newly introduced recursive function, so it has to be passed explicitly to the split_text function
diff --git a/scripts/prepdocs.py b/scripts/prepdocs.py
@@ -147,7 +147,7 @@ def get_document_text(filename):
 
     return page_map
 
-def split_text(page_map):
+def split_text(page_map, filename):
     SENTENCE_ENDINGS = [".", "!", "?"]
     WORDS_BREAKS = [",", ";", ":", " ", "(", ")", "[", "]", "{", "}", "\t", "\n"]
     if args.verbose: print(f"Splitting '{filename}' into sections")
@@ -214,7 +214,7 @@ def filename_to_id(filename):
 
 def create_sections(filename, page_map, use_vectors):
     file_id = filename_to_id(filename)
-    for i, (content, pagenum) in enumerate(split_text(page_map)):
+    for i, (content, pagenum) in enumerate(split_text(page_map, filename)):
         section = {
             "id": f"{file_id}-page-{i}",
             "content": content,
@@ -314,6 +314,29 @@ def refresh_openai_token():
         openai.api_key = token_cred.get_token("https://cognitiveservices.azure.com/.default").token
         open_ai_token_cache[CACHE_KEY_CREATED_TIME] = time.time()
 
+def read_files(path_pattern: str, use_vectors: bool):
+    """
+    Recursively read directory structure under `path_pattern`
+    and execute indexing for the individual files
+    """
+    for filename in glob.glob(path_pattern):
+        if args.verbose: print(f"Processing '{filename}'")
+        if args.remove:
+            remove_blobs(filename)
+            remove_from_index(filename)
+        else:
+            if os.path.isdir(filename):
+                read_files(filename + "/*", use_vectors)
+                continue
+            try:
+                if not args.skipblobs:
+                    upload_blobs(filename)
+                page_map = get_document_text(filename)
+                sections = create_sections(os.path.basename(filename), page_map, use_vectors)
+                index_sections(os.path.basename(filename), sections)
+            except Exception as e:
+                print(f"\tGot an error while reading {filename} -> {e} --> skipping file")
+
 if __name__ == "__main__":
 
     parser = argparse.ArgumentParser(
@@ -379,17 +402,4 @@ def refresh_openai_token():
             create_search_index()
 
         print("Processing files...")
-        for filename in glob.glob(args.files):
-            if args.verbose: print(f"Processing '{filename}'")
-            if args.remove:
-                remove_blobs(filename)
-                remove_from_index(filename)
-            elif args.removeall:
-                remove_blobs(None)
-                remove_from_index(None)
-            else:
-                if not args.skipblobs:
-                    upload_blobs(filename)
-                page_map = get_document_text(filename)
-                sections = create_sections(os.path.basename(filename), page_map, use_vectors)
-                index_sections(os.path.basename(filename), sections)
+        read_files(args.files, use_vectors)