Skip to content

Commit ec44e0e

Browse files
Merge branch 'dev'
2 parents ea8b65b + 0199312 commit ec44e0e

File tree

10 files changed

+246
-148
lines changed

10 files changed

+246
-148
lines changed

DocToolsLLM/DocToolsLLM.py

Lines changed: 9 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -75,7 +75,7 @@
7575
class DocToolsLLM_class:
7676
"This docstring is dynamically replaced by the content of DocToolsLLM/docs/USAGE.md"
7777

78-
VERSION: str = "0.49"
78+
VERSION: str = "0.52"
7979

8080
#@optional_typecheck
8181
@typechecked
@@ -84,7 +84,8 @@ def __init__(
8484
task: str,
8585
filetype: str = "infer",
8686

87-
modelname: str = "openai/gpt-4o",
87+
modelname: str = "openrouter/anthropic/claude-3.5-sonnet",
88+
# modelname: str = "openai/gpt-4o",
8889
# modelname: str = "openai/gpt-3.5-turbo-0125",
8990
# modelname: str = "mistral/mistral-large-latest",
9091

@@ -102,10 +103,11 @@ def __init__(
102103

103104
query: Optional[str] = None,
104105
query_retrievers: str = "default",
105-
query_eval_modelname: Optional[str] = "openai/gpt-3.5-turbo",
106+
query_eval_modelname: Optional[str] = "openrouter/anthropic/claude-3.5-sonnet",
107+
# query_eval_modelname: Optional[str] = "openai/gpt-3.5-turbo",
106108
# query_eval_modelname: str = "mistral/open-mixtral-8x7b",
107109
# query_eval_modelname: str = "mistral/open-small",
108-
query_eval_check_number: int = 3,
110+
query_eval_check_number: int = 1,
109111
query_relevancy: float = 0.1,
110112
query_condense_question: Union[bool, int] = True,
111113

@@ -402,45 +404,6 @@ def ntfy(text: str) -> str:
402404
task=self.task,
403405
backend=self.file_loader_parallel_backend,
404406
**self.cli_kwargs)
405-
406-
# check that the hash are unique
407-
if len(self.loaded_docs) > 1:
408-
ids = [id(d.metadata) for d in self.loaded_docs]
409-
assert len(ids) == len(set(ids)), (
410-
"Same metadata object is used to store information on "
411-
"multiple documents!")
412-
413-
hashes = [d.metadata["hash"] for d in self.loaded_docs]
414-
uniq_hashes = list(set(hashes))
415-
removed_paths = []
416-
removed_docs = []
417-
counter = {h: hashes.count(h) for h in uniq_hashes}
418-
if len(hashes) != len(uniq_hashes):
419-
red("Found duplicate hashes after loading documents:")
420-
421-
for i, doc in enumerate(tqdm(self.loaded_docs, desc="Looking for duplicates")):
422-
h = doc.metadata['hash']
423-
n = counter[h]
424-
if n > 1:
425-
removed_docs.append(self.loaded_docs[i])
426-
self.loaded_docs[i] = None
427-
counter[h] -= 1
428-
assert counter[h] > 0
429-
red(f"Removed {len(removed_docs)}/{len(hashes)} documents because they had the same hash")
430-
431-
# check if deduplication likely amputated documents
432-
self.loaded_docs = [d for d in self.loaded_docs if d is not None]
433-
present_path = [d.metadata["path"] for d in self.loaded_docs]
434-
435-
intersect = set(removed_paths).intersection(set(present_path))
436-
if intersect:
437-
red(f"Found {len(intersect)} documents that were only partially removed, this results in incomplete documents.")
438-
for i, inte in enumerate(intersect):
439-
red(f" * #{i + 1}: {inte}")
440-
raise Exception()
441-
else:
442-
red(f"Removed {len(removed_paths)}/{len(hashes)} documents because they had the same hash")
443-
444407
else:
445408
self.loaded_docs = None # will be loaded when embeddings are loaded
446409

@@ -1128,7 +1091,7 @@ def query_task(self, query: Optional[str]) -> Optional[str]:
11281091
base_compressor=pipeline, base_retriever=retriever
11291092
)
11301093

1131-
if " >>>> " in query:
1094+
if ">>>>" in query:
11321095
sp = query.split(">>>>")
11331096
assert len(sp) == 2, "The query must contain a maximum of 1 occurence of '>>>>'"
11341097
query_fe = sp[0].strip()
@@ -1208,7 +1171,7 @@ def evaluate_doc_chain(
12081171
reasons = [gen.generation_info["finish_reason"] for gen in out.generations]
12091172
outputs = [gen.text for gen in out.generations]
12101173
# don't crash if finish_reason is not stop, because it can sometimes still be parsed.
1211-
if not all(r in ["stop", "lenghth"] for r in reasons):
1174+
if not all(r in ["stop", "length"] for r in reasons):
12121175
red(f"Unexpected generation finish_reason: '{reasons}' for generations: '{outputs}'")
12131176
assert outputs, "No generations found by query eval llm"
12141177
outputs = [parse_eval_output(o) for o in outputs]
@@ -1254,7 +1217,7 @@ async def do_eval(inputs):
12541217
return outputs
12551218

12561219
# used in most places to increase concurrency limit
1257-
multi = {"max_concurrency": 50 if not self.debug else 1}
1220+
multi = {"max_concurrency": 10 if not self.debug else 1}
12581221

12591222
if self.task == "search":
12601223
if self.query_eval_modelname:

DocToolsLLM/__init__.py

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -23,10 +23,10 @@ def fire_wrapper(
2323
*args,
2424
**kwargs,
2525
) -> dict:
26-
"used to catch --help arg to display it better then fire does on its own"
26+
"used to catch --help arg to display it better than fire would do"
2727

28-
# --help or similar
29-
if ("help" in args and len(args) == 1) or ("help" in kwargs and kwargs["help"]):
28+
# --help but not caught by sys.argv
29+
if "help" in kwargs and kwargs["help"]:
3030
print("Showing help")
3131
md = Markdown(DocToolsLLM.__doc__)
3232
console = Console()
@@ -73,6 +73,12 @@ def fire_wrapper(
7373

7474
def cli_launcher() -> None:
7575
sys_args = sys.argv
76+
if "--help" in sys_args:
77+
print("Showing help")
78+
md = Markdown(DocToolsLLM.__doc__)
79+
console = Console()
80+
console.print(md, style=None)
81+
raise SystemExit()
7682
if "--completion" in sys_args:
7783
return fire.Fire(DocToolsLLM)
7884

DocToolsLLM/docs/USAGE.md

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@
4545

4646
---
4747

48-
* `--modelname`: str, default `"openai/gpt-4o"`
48+
* `--modelname`: str, default `"openrouter/anthropic/claude-3.5-sonnet"`
4949
* Keep in mind that given that the default backend used is litellm
5050
the part of modelname before the slash (/) is the backend name (also called provider).
5151
If the backend is 'testing/' then a fake LLM will be used
@@ -110,14 +110,14 @@
110110
if contains `hyde` but modelname contains `testing` then `hyde` will
111111
be removed.
112112

113-
* `--query_eval_modelname`: str, default `"openai/gpt-3.5-turbo"`
113+
* `--query_eval_modelname`: str, default `"openrouter/anthropic/claude-3.5-sonnet"`
114114
* Cheaper and quicker model than modelname. Used for intermediate
115115
steps in the RAG, not used in other tasks.
116116
If the value is not part of the model list of litellm, will use
117117
fuzzy matching to find the best match.
118118
None to disable.
119119

120-
* `--query_eval_check_number`: int, default `3`
120+
* `--query_eval_check_number`: int, default `1`
121121
* number of pass to do with the eval llm to check if the document
122122
is indeed relevant to the question. The document will not
123123
be processed if all answers from the eval llm are 0, and will
@@ -381,6 +381,14 @@
381381
BeautifulSoup. Useful to decode html stored in .js files.
382382
Do tell me if you want more of this.
383383

384+
* `--min_lang_prob`: float, default `0.5`
385+
* float between 0 and 1 that sets the threshold below which a
386+
document is considered invalid: if fasttext's langdetect
387+
estimates no language with at least that probability, the document is discarded.
388+
For example, setting it to 0.9 means that only documents that
389+
fasttext thinks have at least 90% probability of being a
390+
language are valid.
391+
384392
* `--source_tag`: str, default `None`
385393
* a string that will be added to the document metadata at the
386394
key `source_tag`. Useful when using filetype combination.

DocToolsLLM/utils/batch_file_loader.py

Lines changed: 51 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,8 @@
99
import shutil
1010
import uuid
1111
import re
12+
import sys
13+
import traceback
1214
from tqdm import tqdm
1315
from functools import cache as memoizer
1416
import time
@@ -247,11 +249,16 @@ def load_one_doc_wrapped(**doc_kwargs):
247249
return out
248250
except Exception as err:
249251
filetype = doc_kwargs["filetype"]
250-
red(f"Error when loading doc with filetype {filetype}: '{err}'. Arguments: {doc_kwargs}")
252+
exc_type, exc_obj, exc_tb = sys.exc_info()
253+
formatted_tb = '\n'.join(traceback.format_tb(exc_tb))
254+
red(f"Error when loading doc with filetype {filetype}: '{err}'. "
255+
f"Arguments: {doc_kwargs}"
256+
f"\nLine number: {exc_tb.tb_lineno}"
257+
f"\nFull traceback:\n{formatted_tb}")
251258
if loading_failure == "crash" or is_debug:
252259
raise
253260
elif loading_failure == "warn":
254-
return None
261+
return err
255262
else:
256263
raise ValueError(loading_failure)
257264

@@ -298,11 +305,13 @@ def load_one_doc_wrapped(**doc_kwargs):
298305
red(f"Done loading all {len(to_load)} documents in {time.time()-t_load:.2f}s")
299306
missing_docargs = []
300307
for idoc, d in tqdm(enumerate(doc_lists), total=len(doc_lists), desc="Concatenating results"):
301-
if d is not None:
308+
if isinstance(d, list):
302309
docs.extend(d)
303310
else:
311+
assert isinstance(d, str)
304312
missing_docargs.append(to_load[idoc])
305-
assert None not in docs
313+
missing_docargs[-1]["error_message"] = d
314+
assert not any(isinstance(d, str) for d in docs)
306315

307316
if missing_docargs:
308317
missing_docargs = sorted(missing_docargs, key=lambda x: json.dumps(x))
@@ -337,6 +346,44 @@ def load_one_doc_wrapped(**doc_kwargs):
337346
shutil.rmtree(temp_dir)
338347
assert not temp_dir.exists()
339348

349+
# check that the hash are unique
350+
if len(docs) > 1:
351+
ids = [id(d.metadata) for d in docs]
352+
assert len(ids) == len(set(ids)), (
353+
"Same metadata object is used to store information on "
354+
"multiple documents!")
355+
356+
hashes = [d.metadata["all_hash"] for d in docs]
357+
uniq_hashes = list(set(hashes))
358+
removed_paths = []
359+
removed_docs = []
360+
counter = {h: hashes.count(h) for h in uniq_hashes}
361+
if len(hashes) != len(uniq_hashes):
362+
red("Found duplicate hashes after loading documents:")
363+
364+
for i, doc in enumerate(tqdm(docs, desc="Looking for duplicates")):
365+
h = doc.metadata['all_hash']
366+
n = counter[h]
367+
if n > 1:
368+
removed_docs.append(docs[i])
369+
docs[i] = None
370+
counter[h] -= 1
371+
assert counter[h] > 0
372+
red(f"Removed {len(removed_docs)}/{len(hashes)} documents because they had the same hash")
373+
374+
# check if deduplication likely amputated documents
375+
docs = [d for d in docs if d is not None]
376+
present_path = [d.metadata["path"] for d in docs]
377+
378+
intersect = set(removed_paths).intersection(set(present_path))
379+
if intersect:
380+
red(f"Found {len(intersect)} documents that were only partially removed, this results in incomplete documents.")
381+
for i, inte in enumerate(intersect):
382+
red(f" * #{i + 1}: {inte}")
383+
raise Exception()
384+
else:
385+
red(f"Removed {len(removed_paths)}/{len(hashes)} documents because they had the same hash")
386+
340387
return docs
341388

342389
@optional_typecheck

0 commit comments

Comments
 (0)