Merge branch 'dev'

thiswillbeyourgithub · thiswillbeyourgithub · commit e38ef9e95438 · 2024-06-29T14:24:44.000+02:00
diff --git a/DocToolsLLM/DocToolsLLM.py b/DocToolsLLM/DocToolsLLM.py
@@ -30,7 +30,7 @@
     ankiconnect, debug_chain, model_name_matcher,
     average_word_length, wpm, get_splitter,
     check_docs_tkn_length, get_tkn_length,
-    extra_args_keys, disable_internet)
+    extra_args_keys, disable_internet, loaders_temp_dir_file)
 from .utils.prompts import PR_CONDENSE_QUESTION, PR_EVALUATE_DOC, PR_ANSWER_ONE_DOC, PR_COMBINE_INTERMEDIATE_ANSWERS
 from .utils.tasks.query import format_chat_history, refilter_docs, check_intermediate_answer, parse_eval_output, query_eval_cache
 
@@ -75,7 +75,7 @@
 class DocToolsLLM_class:
     "This docstring is dynamically replaced by the content of DocToolsLLM/docs/USAGE.md"
 
-    VERSION: str = "0.44"
+    VERSION: str = "0.45"
 
     #@optional_typecheck
     @typechecked
@@ -154,6 +154,9 @@ def p(message: str) -> None:
         red(pyfiglet.figlet_format("DocToolsLLM"))
         log.info("Starting DocToolsLLM")
 
+        # at startup, empty the file that records the path of the loaders' temporary directory
+        loaders_temp_dir_file.write_text("")
+
         # make sure the extra args are valid
         for k in cli_kwargs:
             if k not in self.allowed_extra_keys:
diff --git a/DocToolsLLM/utils/batch_file_loader.py b/DocToolsLLM/utils/batch_file_loader.py
@@ -26,7 +26,7 @@
 from .misc import doc_loaders_cache, file_hasher, min_token, get_tkn_length, unlazyload_modules, doc_kwargs_keys, cache_dir
 from .typechecker import optional_typecheck
 from .logger import red, whi, log
-from .loaders import load_one_doc, yt_link_regex, load_youtube_playlist, markdownlink_regex, global_temp_dir
+from .loaders import load_one_doc, yt_link_regex, load_youtube_playlist, markdownlink_regex, loaders_temp_dir_file
 from .flags import is_debug
 
 
@@ -274,7 +274,7 @@ def load_one_doc_wrapped(**doc_kwargs):
             shutil.rmtree(f)
     temp_dir = cache_dir / load_temp_name
     temp_dir.mkdir(exist_ok=False)
-    global_temp_dir[0] = temp_dir
+    loaders_temp_dir_file.write_text(str(temp_dir.absolute().resolve()))
 
     docs = []
     t_load = time.time()
diff --git a/DocToolsLLM/utils/loaders.py b/DocToolsLLM/utils/loaders.py
@@ -45,7 +45,7 @@
 
 from .misc import (doc_loaders_cache, html_to_text, hasher,
                    file_hasher, get_splitter, check_docs_tkn_length,
-                   average_word_length, wpm)
+                   average_word_length, wpm, loaders_temp_dir_file)
 from .typechecker import optional_typecheck
 from .logger import whi, yel, red, log
 from .flags import is_verbose, is_linux
@@ -173,8 +173,6 @@ def load(self):
     ["norm"],
 ]
 
-global_temp_dir = [None]  # will be replaced when load_one_doc is called
-
 
 @optional_typecheck
 def load_one_doc(
@@ -191,7 +189,11 @@ def load_one_doc(
     The loader is cached"""
     text_splitter = get_splitter(task)
 
-    assert global_temp_dir[0] is temp_dir
+    expected_global_dir = loaders_temp_dir_file.read_text().strip()
+    assert expected_global_dir, f"Empty loaders_temp_dir_file at {loaders_temp_dir_file}"
+    expected_global_dir = Path(expected_global_dir)
+    assert expected_global_dir.exists(), f"File loaders_temp_dir_file not found in {loaders_temp_dir_file} pointing at '{expected_global_dir}'"
+    assert expected_global_dir == temp_dir, f"Error handling temp dir: temp_dir is {temp_dir} but loaders_temp_dir is {expected_global_dir}"
 
     if filetype == "youtube":
         docs = load_youtube_video(**kwargs)
@@ -397,7 +399,7 @@ def load_youtube_video(
         )
     else:
         whi(f"Downloading audio from url: '{path}'")
-        file_name = global_temp_dir[0] / f"youtube_audio_{uuid.uuid4()}"  # without extension!
+        file_name = load_temp_dir / f"youtube_audio_{uuid.uuid4()}"  # without extension!
         ydl_opts = {
             'format': 'bestaudio/best',
             'postprocessors': [{
@@ -412,7 +414,7 @@ def load_youtube_video(
         with youtube_dl.YoutubeDL(ydl_opts) as ydl:
             ydl.download([path])
         candidate = []
-        for f in global_temp_dir[0].iterdir():
+        for f in load_temp_dir.iterdir():
             if file_name.name in f.name:
                 candidate.append(f)
         assert len(candidate), f"Audio file of {path} failed to download?"
@@ -532,7 +534,7 @@ def load_anki(
     original_db = akp.find_db(user=anki_profile)
     name = f"{anki_profile}".replace(" ", "_")
     random_val = str(uuid.uuid4()).split("-")[-1]
-    new_db_path = global_temp_dir[0] / f"anki_collection_{name.replace('/', '_')}_{random_val}"
+    new_db_path = load_temp_dir / f"anki_collection_{name.replace('/', '_')}_{random_val}"
     assert not Path(new_db_path).exists(
     ), f"{new_db_path} already existing!"
     shutil.copy(original_db, new_db_path)
@@ -924,8 +926,8 @@ def load_local_audio(
         )
         red(f"Removed silence from {path.name}: {dur:.1f} -> {new_dur:.1f} in {elapsed:.1f}s")
 
-        unsilenced_path_wav = global_temp_dir[0] / f"unsilenced_audio_{uuid.uuid4()}.wav"
-        unsilenced_path_ogg = global_temp_dir[0] / f"unsilenced_audio_{uuid.uuid4()}.ogg"
+        unsilenced_path_wav = load_temp_dir / f"unsilenced_audio_{uuid.uuid4()}.wav"
+        unsilenced_path_ogg = load_temp_dir / f"unsilenced_audio_{uuid.uuid4()}.ogg"
         assert not unsilenced_path_wav.exists()
         assert not unsilenced_path_ogg.exists()
         torchaudio.save(
@@ -1009,7 +1011,7 @@ def load_local_video(
     ) -> List[Document]:
     assert Path(path).exists(), f"file not found: '{path}'"
 
-    audio_path = global_temp_dir[0] / f"audio_from_video_{uuid.uuid4()}.mp3"
+    audio_path = load_temp_dir / f"audio_from_video_{uuid.uuid4()}.mp3"
     assert not audio_path.exists()
 
     # extract audio from video
diff --git a/DocToolsLLM/utils/misc.py b/DocToolsLLM/utils/misc.py
@@ -31,6 +31,9 @@
 TextSplitter = lazy_import.lazy_class('langchain.text_splitter.TextSplitter')
 RecursiveCharacterTextSplitter = lazy_import.lazy_class('langchain.text_splitter.RecursiveCharacterTextSplitter')
 
+# path to a text file whose content is the path of the directory where the loaders can store temporary files (written by batch_file_loader.py, checked by load_one_doc)
+loaders_temp_dir_file = cache_dir / "loaders_temp_dir.txt"
+
 try:
     import ftlangdetect
 except Exception as err:
diff --git a/bumpver.toml b/bumpver.toml
@@ -1,5 +1,5 @@
 [bumpver]
-current_version = "0.44"
+current_version = "0.45"
 version_pattern = "MAJOR.MINOR"
 commit_message = "bump version {old_version} -> {new_version}"
 tag_message = "{new_version}"
diff --git a/setup.py b/setup.py
@@ -28,7 +28,7 @@ def run(self):
 
 setup(
     name="DocToolsLLM",
-    version="0.44",
+    version="0.45",
     description="A perfect AI powered RAG for document query and summary. Supports ~all LLM and ~all filetypes (url, pdf, epub, youtube (incl playlist), audio, anki, md, docx, pptx, oe any combination!)",
     long_description=long_description,
     long_description_content_type="text/markdown",