Skip to content

Commit ec44e0e

Browse files
Merge branch 'dev'
2 parents ea8b65b + 0199312 commit ec44e0e

File tree

10 files changed

+246
-148
lines changed

10 files changed

+246
-148
lines changed

DocToolsLLM/DocToolsLLM.py

Lines changed: 9 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -75,7 +75,7 @@
7575
class DocToolsLLM_class:
7676
"This docstring is dynamically replaced by the content of DocToolsLLM/docs/USAGE.md"
7777

78-
VERSION: str = "0.49"
78+
VERSION: str = "0.52"
7979

8080
#@optional_typecheck
8181
@typechecked
@@ -84,7 +84,8 @@ def __init__(
8484
task: str,
8585
filetype: str = "infer",
8686

87-
modelname: str = "openai/gpt-4o",
87+
modelname: str = "openrouter/anthropic/claude-3.5-sonnet",
88+
# modelname: str = "openai/gpt-4o",
8889
# modelname: str = "openai/gpt-3.5-turbo-0125",
8990
# modelname: str = "mistral/mistral-large-latest",
9091

@@ -102,10 +103,11 @@ def __init__(
102103

103104
query: Optional[str] = None,
104105
query_retrievers: str = "default",
105-
query_eval_modelname: Optional[str] = "openai/gpt-3.5-turbo",
106+
query_eval_modelname: Optional[str] = "openrouter/anthropic/claude-3.5-sonnet",
107+
# query_eval_modelname: Optional[str] = "openai/gpt-3.5-turbo",
106108
# query_eval_modelname: str = "mistral/open-mixtral-8x7b",
107109
# query_eval_modelname: str = "mistral/open-small",
108-
query_eval_check_number: int = 3,
110+
query_eval_check_number: int = 1,
109111
query_relevancy: float = 0.1,
110112
query_condense_question: Union[bool, int] = True,
111113

@@ -402,45 +404,6 @@ def ntfy(text: str) -> str:
402404
task=self.task,
403405
backend=self.file_loader_parallel_backend,
404406
**self.cli_kwargs)
405-
406-
# check that the hash are unique
407-
if len(self.loaded_docs) > 1:
408-
ids = [id(d.metadata) for d in self.loaded_docs]
409-
assert len(ids) == len(set(ids)), (
410-
"Same metadata object is used to store information on "
411-
"multiple documents!")
412-
413-
hashes = [d.metadata["hash"] for d in self.loaded_docs]
414-
uniq_hashes = list(set(hashes))
415-
removed_paths = []
416-
removed_docs = []
417-
counter = {h: hashes.count(h) for h in uniq_hashes}
418-
if len(hashes) != len(uniq_hashes):
419-
red("Found duplicate hashes after loading documents:")
420-
421-
for i, doc in enumerate(tqdm(self.loaded_docs, desc="Looking for duplicates")):
422-
h = doc.metadata['hash']
423-
n = counter[h]
424-
if n > 1:
425-
removed_docs.append(self.loaded_docs[i])
426-
self.loaded_docs[i] = None
427-
counter[h] -= 1
428-
assert counter[h] > 0
429-
red(f"Removed {len(removed_docs)}/{len(hashes)} documents because they had the same hash")
430-
431-
# check if deduplication likely amputated documents
432-
self.loaded_docs = [d for d in self.loaded_docs if d is not None]
433-
present_path = [d.metadata["path"] for d in self.loaded_docs]
434-
435-
intersect = set(removed_paths).intersection(set(present_path))
436-
if intersect:
437-
red(f"Found {len(intersect)} documents that were only partially removed, this results in incomplete documents.")
438-
for i, inte in enumerate(intersect):
439-
red(f" * #{i + 1}: {inte}")
440-
raise Exception()
441-
else:
442-
red(f"Removed {len(removed_paths)}/{len(hashes)} documents because they had the same hash")
443-
444407
else:
445408
self.loaded_docs = None # will be loaded when embeddings are loaded
446409

@@ -1128,7 +1091,7 @@ def query_task(self, query: Optional[str]) -> Optional[str]:
11281091
base_compressor=pipeline, base_retriever=retriever
11291092
)
11301093

1131-
if " >>>> " in query:
1094+
if ">>>>" in query:
11321095
sp = query.split(">>>>")
11331096
assert len(sp) == 2, "The query must contain a maximum of 1 occurence of '>>>>'"
11341097
query_fe = sp[0].strip()
@@ -1208,7 +1171,7 @@ def evaluate_doc_chain(
12081171
reasons = [gen.generation_info["finish_reason"] for gen in out.generations]
12091172
outputs = [gen.text for gen in out.generations]
12101173
# don't crash if finish_reason is not stop, because it can sometimes still be parsed.
1211-
if not all(r in ["stop", "lenghth"] for r in reasons):
1174+
if not all(r in ["stop", "length"] for r in reasons):
12121175
red(f"Unexpected generation finish_reason: '{reasons}' for generations: '{outputs}'")
12131176
assert outputs, "No generations found by query eval llm"
12141177
outputs = [parse_eval_output(o) for o in outputs]
@@ -1254,7 +1217,7 @@ async def do_eval(inputs):
12541217
return outputs
12551218

12561219
# used in most places to increase concurrency limit
1257-
multi = {"max_concurrency": 50 if not self.debug else 1}
1220+
multi = {"max_concurrency": 10 if not self.debug else 1}
12581221

12591222
if self.task == "search":
12601223
if self.query_eval_modelname:

DocToolsLLM/__init__.py

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -23,10 +23,10 @@ def fire_wrapper(
2323
*args,
2424
**kwargs,
2525
) -> dict:
26-
"used to catch --help arg to display it better then fire does on its own"
26+
"used to catch --help arg to display it better than fire would do"
2727

28-
# --help or similar
29-
if ("help" in args and len(args) == 1) or ("help" in kwargs and kwargs["help"]):
28+
# --help but not caught by sys.argv
29+
if "help" in kwargs and kwargs["help"]:
3030
print("Showing help")
3131
md = Markdown(DocToolsLLM.__doc__)
3232
console = Console()
@@ -73,6 +73,12 @@ def fire_wrapper(
7373

7474
def cli_launcher() -> None:
7575
sys_args = sys.argv
76+
if "--help" in sys_args:
77+
print("Showing help")
78+
md = Markdown(DocToolsLLM.__doc__)
79+
console = Console()
80+
console.print(md, style=None)
81+
raise SystemExit()
7682
if "--completion" in sys_args:
7783
return fire.Fire(DocToolsLLM)
7884

DocToolsLLM/docs/USAGE.md

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@
4545

4646
---
4747

48-
* `--modelname`: str, default `"openai/gpt-4o"`
48+
* `--modelname`: str, default `"openrouter/anthropic/claude-3.5-sonnet"`
4949
* Keep in mind that given that the default backend used is litellm
5050
the part of modelname before the slash (/) is the backend name (also called provider).
5151
If the backend is 'testing/' then a fake LLM will be used
@@ -110,14 +110,14 @@
110110
if contains `hyde` but modelname contains `testing` then `hyde` will
111111
be removed.
112112

113-
* `--query_eval_modelname`: str, default `"openai/gpt-3.5-turbo"`
113+
* `--query_eval_modelname`: str, default `"openrouter/anthropic/claude-3.5-sonnet"`
114114
* Cheaper and quicker model than modelname. Used for intermediate
115115
steps in the RAG, not used in other tasks.
116116
If the value is not part of the model list of litellm, will use
117117
fuzzy matching to find the best match.
118118
None to disable.
119119

120-
* `--query_eval_check_number`: int, default `3`
120+
* `--query_eval_check_number`: int, default `1`
121121
* number of pass to do with the eval llm to check if the document
122122
is indeed relevant to the question. The document will not
123123
be processed if all answers from the eval llm are 0, and will
@@ -381,6 +381,14 @@
381381
BeautifulSoup. Useful to decode html stored in .js files.
382382
Do tell me if you want more of this.
383383

384+
* `--min_lang_prob`: float, default `0.5`
385+
* float between 0 and 1 that sets the threshold below which a
386+
document is considered invalid: if fasttext's langdetect
387+
estimates no language with at least that probability, the document is discarded.
388+
For example, setting it to 0.9 means that only documents that
389+
fasttext thinks have at least 90% probability of being a
390+
language are valid.
391+
384392
* `--source_tag`: str, default `None`
385393
* a string that will be added to the document metadata at the
386394
key `source_tag`. Useful when using filetype combination.

DocToolsLLM/utils/batch_file_loader.py

Lines changed: 51 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,8 @@
99
import shutil
1010
import uuid
1111
import re
12+
import sys
13+
import traceback
1214
from tqdm import tqdm
1315
from functools import cache as memoizer
1416
import time
@@ -247,11 +249,16 @@ def load_one_doc_wrapped(**doc_kwargs):
247249
return out
248250
except Exception as err:
249251
filetype = doc_kwargs["filetype"]
250-
red(f"Error when loading doc with filetype {filetype}: '{err}'. Arguments: {doc_kwargs}")
252+
exc_type, exc_obj, exc_tb = sys.exc_info()
253+
formatted_tb = '\n'.join(traceback.format_tb(exc_tb))
254+
red(f"Error when loading doc with filetype {filetype}: '{err}'. "
255+
f"Arguments: {doc_kwargs}"
256+
f"\nLine number: {exc_tb.tb_lineno}"
257+
f"\nFull traceback:\n{formatted_tb}")
251258
if loading_failure == "crash" or is_debug:
252259
raise
253260
elif loading_failure == "warn":
254-
return None
261+
return err
255262
else:
256263
raise ValueError(loading_failure)
257264

@@ -298,11 +305,13 @@ def load_one_doc_wrapped(**doc_kwargs):
298305
red(f"Done loading all {len(to_load)} documents in {time.time()-t_load:.2f}s")
299306
missing_docargs = []
300307
for idoc, d in tqdm(enumerate(doc_lists), total=len(doc_lists), desc="Concatenating results"):
301-
if d is not None:
308+
if isinstance(d, list):
302309
docs.extend(d)
303310
else:
311+
assert isinstance(d, str)
304312
missing_docargs.append(to_load[idoc])
305-
assert None not in docs
313+
missing_docargs[-1]["error_message"] = d
314+
assert not any(isinstance(d, str) for d in docs)
306315

307316
if missing_docargs:
308317
missing_docargs = sorted(missing_docargs, key=lambda x: json.dumps(x))
@@ -337,6 +346,44 @@ def load_one_doc_wrapped(**doc_kwargs):
337346
shutil.rmtree(temp_dir)
338347
assert not temp_dir.exists()
339348

349+
# check that the hash are unique
350+
if len(docs) > 1:
351+
ids = [id(d.metadata) for d in docs]
352+
assert len(ids) == len(set(ids)), (
353+
"Same metadata object is used to store information on "
354+
"multiple documents!")
355+
356+
hashes = [d.metadata["all_hash"] for d in docs]
357+
uniq_hashes = list(set(hashes))
358+
removed_paths = []
359+
removed_docs = []
360+
counter = {h: hashes.count(h) for h in uniq_hashes}
361+
if len(hashes) != len(uniq_hashes):
362+
red("Found duplicate hashes after loading documents:")
363+
364+
for i, doc in enumerate(tqdm(docs, desc="Looking for duplicates")):
365+
h = doc.metadata['all_hash']
366+
n = counter[h]
367+
if n > 1:
368+
removed_docs.append(docs[i])
369+
docs[i] = None
370+
counter[h] -= 1
371+
assert counter[h] > 0
372+
red(f"Removed {len(removed_docs)}/{len(hashes)} documents because they had the same hash")
373+
374+
# check if deduplication likely amputated documents
375+
docs = [d for d in docs if d is not None]
376+
present_path = [d.metadata["path"] for d in docs]
377+
378+
intersect = set(removed_paths).intersection(set(present_path))
379+
if intersect:
380+
red(f"Found {len(intersect)} documents that were only partially removed, this results in incomplete documents.")
381+
for i, inte in enumerate(intersect):
382+
red(f" * #{i + 1}: {inte}")
383+
raise Exception()
384+
else:
385+
red(f"Removed {len(removed_paths)}/{len(hashes)} documents because they had the same hash")
386+
340387
return docs
341388

342389
@optional_typecheck

0 commit comments

Comments
 (0)