Skip to content

Commit e38ef9e

Browse files
Merge branch 'dev'
2 parents e8128b9 + 23800fb commit e38ef9e

File tree

6 files changed

+24
-16
lines changed

6 files changed

+24
-16
lines changed

DocToolsLLM/DocToolsLLM.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@
3030
ankiconnect, debug_chain, model_name_matcher,
3131
average_word_length, wpm, get_splitter,
3232
check_docs_tkn_length, get_tkn_length,
33-
extra_args_keys, disable_internet)
33+
extra_args_keys, disable_internet, loaders_temp_dir_file)
3434
from .utils.prompts import PR_CONDENSE_QUESTION, PR_EVALUATE_DOC, PR_ANSWER_ONE_DOC, PR_COMBINE_INTERMEDIATE_ANSWERS
3535
from .utils.tasks.query import format_chat_history, refilter_docs, check_intermediate_answer, parse_eval_output, query_eval_cache
3636

@@ -75,7 +75,7 @@
7575
class DocToolsLLM_class:
7676
"This docstring is dynamically replaced by the content of DocToolsLLM/docs/USAGE.md"
7777

78-
VERSION: str = "0.44"
78+
VERSION: str = "0.45"
7979

8080
#@optional_typecheck
8181
@typechecked
@@ -154,6 +154,9 @@ def p(message: str) -> None:
154154
red(pyfiglet.figlet_format("DocToolsLLM"))
155155
log.info("Starting DocToolsLLM")
156156

157+
# erases content that links to the loaders temporary files at startup
158+
loaders_temp_dir_file.write_text("")
159+
157160
# make sure the extra args are valid
158161
for k in cli_kwargs:
159162
if k not in self.allowed_extra_keys:

DocToolsLLM/utils/batch_file_loader.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@
2626
from .misc import doc_loaders_cache, file_hasher, min_token, get_tkn_length, unlazyload_modules, doc_kwargs_keys, cache_dir
2727
from .typechecker import optional_typecheck
2828
from .logger import red, whi, log
29-
from .loaders import load_one_doc, yt_link_regex, load_youtube_playlist, markdownlink_regex, global_temp_dir
29+
from .loaders import load_one_doc, yt_link_regex, load_youtube_playlist, markdownlink_regex, loaders_temp_dir_file
3030
from .flags import is_debug
3131

3232

@@ -274,7 +274,7 @@ def load_one_doc_wrapped(**doc_kwargs):
274274
shutil.rmtree(f)
275275
temp_dir = cache_dir / load_temp_name
276276
temp_dir.mkdir(exist_ok=False)
277-
global_temp_dir[0] = temp_dir
277+
loaders_temp_dir_file.write_text(str(temp_dir.absolute().resolve()))
278278

279279
docs = []
280280
t_load = time.time()

DocToolsLLM/utils/loaders.py

Lines changed: 12 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@
4545

4646
from .misc import (doc_loaders_cache, html_to_text, hasher,
4747
file_hasher, get_splitter, check_docs_tkn_length,
48-
average_word_length, wpm)
48+
average_word_length, wpm, loaders_temp_dir_file)
4949
from .typechecker import optional_typecheck
5050
from .logger import whi, yel, red, log
5151
from .flags import is_verbose, is_linux
@@ -173,8 +173,6 @@ def load(self):
173173
["norm"],
174174
]
175175

176-
global_temp_dir = [None] # will be replaced when load_one_doc is called
177-
178176

179177
@optional_typecheck
180178
def load_one_doc(
@@ -191,7 +189,11 @@ def load_one_doc(
191189
The loader is cached"""
192190
text_splitter = get_splitter(task)
193191

194-
assert global_temp_dir[0] is temp_dir
192+
expected_global_dir = loaders_temp_dir_file.read_text().strip()
193+
assert expected_global_dir, f"Empty loaders_temp_dir_file at {loaders_temp_dir_file}"
194+
expected_global_dir = Path(expected_global_dir)
195+
assert expected_global_dir.exists(), f"File loaders_temp_dir_file not found in {loaders_temp_dir_file} pointing at '{expected_global_dir}'"
196+
assert expected_global_dir == temp_dir, f"Error handling temp dir: temp_dir is {temp_dir} but loaders_temp_dir is {expected_global_dir}"
195197

196198
if filetype == "youtube":
197199
docs = load_youtube_video(**kwargs)
@@ -397,7 +399,7 @@ def load_youtube_video(
397399
)
398400
else:
399401
whi(f"Downloading audio from url: '{path}'")
400-
file_name = global_temp_dir[0] / f"youtube_audio_{uuid.uuid4()}" # without extension!
402+
file_name = load_temp_dir / f"youtube_audio_{uuid.uuid4()}" # without extension!
401403
ydl_opts = {
402404
'format': 'bestaudio/best',
403405
'postprocessors': [{
@@ -412,7 +414,7 @@ def load_youtube_video(
412414
with youtube_dl.YoutubeDL(ydl_opts) as ydl:
413415
ydl.download([path])
414416
candidate = []
415-
for f in global_temp_dir[0].iterdir():
417+
for f in load_temp_dir.iterdir():
416418
if file_name.name in f.name:
417419
candidate.append(f)
418420
assert len(candidate), f"Audio file of {path} failed to download?"
@@ -532,7 +534,7 @@ def load_anki(
532534
original_db = akp.find_db(user=anki_profile)
533535
name = f"{anki_profile}".replace(" ", "_")
534536
random_val = str(uuid.uuid4()).split("-")[-1]
535-
new_db_path = global_temp_dir[0] / f"anki_collection_{name.replace('/', '_')}_{random_val}"
537+
new_db_path = load_temp_dir / f"anki_collection_{name.replace('/', '_')}_{random_val}"
536538
assert not Path(new_db_path).exists(
537539
), f"{new_db_path} already existing!"
538540
shutil.copy(original_db, new_db_path)
@@ -924,8 +926,8 @@ def load_local_audio(
924926
)
925927
red(f"Removed silence from {path.name}: {dur:.1f} -> {new_dur:.1f} in {elapsed:.1f}s")
926928

927-
unsilenced_path_wav = global_temp_dir[0] / f"unsilenced_audio_{uuid.uuid4()}.wav"
928-
unsilenced_path_ogg = global_temp_dir[0] / f"unsilenced_audio_{uuid.uuid4()}.ogg"
929+
unsilenced_path_wav = load_temp_dir / f"unsilenced_audio_{uuid.uuid4()}.wav"
930+
unsilenced_path_ogg = load_temp_dir / f"unsilenced_audio_{uuid.uuid4()}.ogg"
929931
assert not unsilenced_path_wav.exists()
930932
assert not unsilenced_path_ogg.exists()
931933
torchaudio.save(
@@ -1009,7 +1011,7 @@ def load_local_video(
10091011
) -> List[Document]:
10101012
assert Path(path).exists(), f"file not found: '{path}'"
10111013

1012-
audio_path = global_temp_dir[0] / f"audio_from_video_{uuid.uuid4()}.mp3"
1014+
audio_path = load_temp_dir / f"audio_from_video_{uuid.uuid4()}.mp3"
10131015
assert not audio_path.exists()
10141016

10151017
# extract audio from video

DocToolsLLM/utils/misc.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,9 @@
3131
TextSplitter = lazy_import.lazy_class('langchain.text_splitter.TextSplitter')
3232
RecursiveCharacterTextSplitter = lazy_import.lazy_class('langchain.text_splitter.RecursiveCharacterTextSplitter')
3333

34+
# path to a text file whose content is emptied at startup and then overwritten, before documents are loaded, with the absolute path of the directory where the loaders can store temporary files
35+
loaders_temp_dir_file = cache_dir / "loaders_temp_dir.txt"
36+
3437
try:
3538
import ftlangdetect
3639
except Exception as err:

bumpver.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
[bumpver]
2-
current_version = "0.44"
2+
current_version = "0.45"
33
version_pattern = "MAJOR.MINOR"
44
commit_message = "bump version {old_version} -> {new_version}"
55
tag_message = "{new_version}"

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ def run(self):
2828

2929
setup(
3030
name="DocToolsLLM",
31-
version="0.44",
31+
version="0.45",
3232
description="A perfect AI powered RAG for document query and summary. Supports ~all LLM and ~all filetypes (url, pdf, epub, youtube (incl playlist), audio, anki, md, docx, pptx, or any combination!)",
3333
long_description=long_description,
3434
long_description_content_type="text/markdown",

0 commit comments

Comments
 (0)