
Commit ea8b65b: Merge branch 'dev'
2 parents e38ef9e + 3705a6c

7 files changed: 343 additions, 70 deletions

DocToolsLLM/DocToolsLLM.py (37 additions, 17 deletions)
```diff
@@ -30,7 +30,7 @@
     ankiconnect, debug_chain, model_name_matcher,
     average_word_length, wpm, get_splitter,
     check_docs_tkn_length, get_tkn_length,
-    extra_args_keys, disable_internet, loaders_temp_dir_file)
+    extra_args_keys, disable_internet)
 from .utils.prompts import PR_CONDENSE_QUESTION, PR_EVALUATE_DOC, PR_ANSWER_ONE_DOC, PR_COMBINE_INTERMEDIATE_ANSWERS
 from .utils.tasks.query import format_chat_history, refilter_docs, check_intermediate_answer, parse_eval_output, query_eval_cache

@@ -75,7 +75,7 @@
 class DocToolsLLM_class:
     "This docstring is dynamically replaced by the content of DocToolsLLM/docs/USAGE.md"

-    VERSION: str = "0.45"
+    VERSION: str = "0.49"

     #@optional_typecheck
     @typechecked

@@ -154,9 +154,6 @@ def p(message: str) -> None:
         red(pyfiglet.figlet_format("DocToolsLLM"))
         log.info("Starting DocToolsLLM")

-        # erases content that links to the loaders temporary files at startup
-        loaders_temp_dir_file.write_text("")
-
         # make sure the extra args are valid
         for k in cli_kwargs:
             if k not in self.allowed_extra_keys:

@@ -857,6 +854,13 @@ def prepare_query_task(self) -> None:
         # parse filters as callable for faiss filtering
         if "filter_metadata" in self.cli_kwargs or "filter_content" in self.cli_kwargs:
             if "filter_metadata" in self.cli_kwargs:
+                # get the list of all metadata to see if a filter was not misspelled
+                all_metadata_keys = set()
+                for doc in tqdm(self.loaded_embeddings.docstore._dict.values(), desc="gathering metadata keys", unit="doc"):
+                    for k in doc.metadata.keys():
+                        all_metadata_keys.add(k)
+                assert all_metadata_keys, "No metadata keys found in any metadata, something went wrong!"
+
                 if isinstance(self.cli_kwargs["filter_metadata"], str):
                     filter_metadata = self.cli_kwargs["filter_metadata"].split(",")
                 else:

@@ -921,6 +925,10 @@ def prepare_query_task(self) -> None:
             filters_b_minus_keys = tuple(filters_b_minus_keys)
             filters_b_minus_values = tuple(filters_b_minus_values)

+            # check that all key filters indeed match metadata keys
+            for k in filters_k_plus + filters_k_minus + filters_b_plus_keys + filters_b_minus_keys:
+                assert any(k.match(key) for key in all_metadata_keys), f"Key {k} didn't match any key in the metadata"
+
             def filter_meta(meta: dict) -> bool:
                 # match keys
                 for inc in filters_k_plus:
```
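
The two hunks above add an early sanity check to `prepare_query_task`: every key filter passed via `--filter_metadata` must match at least one metadata key actually present in the loaded embeddings, so a misspelled filter fails fast instead of silently filtering everything out. A minimal standalone sketch of the idea, with the docstore layout and filter syntax simplified (these are assumptions, not the exact DocToolsLLM structures):

```python
import re

# stand-in for self.loaded_embeddings.docstore._dict.values()
docstore = [
    {"source": "a.pdf", "page": 1},
    {"source": "b.pdf", "anki_deck": "default"},
]

# gather every metadata key present in any document
all_metadata_keys = {k for meta in docstore for k in meta}
assert all_metadata_keys, "No metadata keys found in any metadata, something went wrong!"

# key filters are compiled regexes; a misspelled one matches nothing and fails fast
filters_k_plus = (re.compile("sourc.*"), re.compile("anki_.*"))
for f in filters_k_plus:
    assert any(f.match(key) for key in all_metadata_keys), (
        f"Key {f.pattern} didn't match any key in the metadata"
    )
```

Collecting the keys once up front costs a single pass over the docstore, but it turns a silently empty result set into an immediate, explicit assertion error.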
```diff
@@ -1025,7 +1033,7 @@ def filter_cont(cont: str) -> bool:
             self.unfiltered_docstore = self.loaded_embeddings.serialize_to_bytes()
             status = self.loaded_embeddings.delete(ids_to_del)

-            # checking deletiong want well
+            # checking deletions went well
             if status is False:
                 raise Exception("Vectorstore filtering failed")
             elif status is None:

@@ -1132,10 +1140,22 @@ def query_task(self, query: Optional[str]) -> Optional[str]:

         # answer 0 or 1 if the document is related
         if not hasattr(self, "eval_llm"):
-            self.eval_llm_params = litellm.get_supported_openai_params(
-                model=self.query_eval_modelname,
-                custom_llm_provider=self.query_eval_modelbackend,
-            )
+            failed = False
+            if self.query_eval_modelbackend == "openrouter":
+                try:
+                    self.eval_llm_params = litellm.get_supported_openai_params(
+                        model_name_matcher(
+                            self.query_eval_modelname.split("/", 1)[1]
+                        )
+                    )
+                except Exception as err:
+                    failed = True
+                    red(f"Failed to get query_eval_model parameters information bypassing openrouter: '{err}'")
+            if self.modelbackend != "openrouter" or failed:
+                self.eval_llm_params = litellm.get_supported_openai_params(
+                    model=self.query_eval_modelname,
+                    custom_llm_provider=self.query_eval_modelbackend,
+                )
             eval_args = {}
             if "n" in self.eval_llm_params:
                 eval_args["n"] = self.query_eval_check_number
```
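
This hunk makes the eval-model parameter lookup openrouter-aware: it first tries to resolve the underlying model directly, bypassing the `openrouter/` prefix via `model_name_matcher`, and falls back to the generic litellm lookup if that fails. A condensed sketch of the try-then-fallback pattern (model names are illustrative, and `model_name_matcher` is replaced by a plain string split, so this approximates rather than reproduces the committed logic):

```python
import litellm

def get_eval_params(model: str, backend: str):
    if backend == "openrouter":
        try:
            # bypass openrouter: query the underlying model directly,
            # e.g. "openrouter/openai/gpt-4o" -> "openai/gpt-4o"
            return litellm.get_supported_openai_params(model.split("/", 1)[1])
        except Exception as err:
            print(f"Direct lookup failed, falling back: '{err}'")
    # generic path: let litellm resolve the model through the provider
    return litellm.get_supported_openai_params(
        model=model,
        custom_llm_provider=backend,
    )

params = get_eval_params("openrouter/openai/gpt-4o", "openrouter")
print("n" in params)  # whether the eval model supports the `n` sampling argument
```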
```diff
@@ -1186,10 +1206,10 @@ def evaluate_doc_chain(
         if "n" in self.eval_llm_params or self.query_eval_check_number == 1:
             out = self.eval_llm._generate_with_cache(PR_EVALUATE_DOC.format_messages(**inputs))
             reasons = [gen.generation_info["finish_reason"] for gen in out.generations]
-            # don't crash if finish_reason is not stop, because it can sometimes still be parsed.
-            if not all(r == "stop" for r in reasons):
-                red(f"Unexpected generation finish_reason: '{reasons}'")
             outputs = [gen.text for gen in out.generations]
+            # don't crash if finish_reason is not stop, because it can sometimes still be parsed.
+            if not all(r in ["stop", "length"] for r in reasons):
+                red(f"Unexpected generation finish_reason: '{reasons}' for generations: '{outputs}'")
             assert outputs, "No generations found by query eval llm"
             outputs = [parse_eval_output(o) for o in outputs]
             if out.llm_output:

@@ -1216,17 +1236,17 @@ async def do_eval(inputs):
             outs = loop.run_until_complete(asyncio.gather(*outs))
             for out in outs:
                 assert len(out.generations) == 1, f"Query eval llm produced more than 1 evaluation: '{out.generations}'"
-                finish_reason = out.generations[0].generation_info["finish_reason"]
-                if not finish_reason == "stop":
-                    red(f"Unexpected finish_reason: '{finish_reason}'")
                 outputs.append(out.generations[0].text)
+                finish_reason = out.generations[0].generation_info["finish_reason"]
+                if not finish_reason in ["stop", "length"]:
+                    red(f"Unexpected finish_reason: '{finish_reason}' for generation '{outputs[-1]}'")
                 if out.llm_output:
                     new_p += out.llm_output["token_usage"]["prompt_tokens"]
                     new_c += out.llm_output["token_usage"]["completion_tokens"]
             assert outputs, "No generations found by query eval llm"
             outputs = [parse_eval_output(o) for o in outputs]

-            assert len(outputs) == self.query_eval_check_number, f"query eval model failed to produce {self.query_eval_check_number} outputs"
+            assert len(outputs) == self.query_eval_check_number, f"query eval model failed to produce {self.query_eval_check_number} outputs: '{outputs}'"

             self.eval_llm.callbacks[0].prompt_tokens += new_p
             self.eval_llm.callbacks[0].completion_tokens += new_c
```
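
Both evaluation paths now collect the generation text before inspecting `finish_reason`, tolerate `length` in addition to `stop` (a truncated 0/1 eval can often still be parsed), and include the offending text in the warning. A self-contained sketch of the async variant, with the LLM call stubbed out:

```python
import asyncio

async def eval_once(i: int) -> dict:
    await asyncio.sleep(0)  # stand-in for an async LLM eval call
    return {"text": str(i % 2), "finish_reason": "stop"}

async def run_evals(n: int) -> list:
    outs = await asyncio.gather(*(eval_once(i) for i in range(n)))
    outputs = []
    for out in outs:
        outputs.append(out["text"])  # keep the text first...
        # ...so a warning about an odd finish_reason can show it
        if out["finish_reason"] not in ("stop", "length"):
            print(f"Unexpected finish_reason for generation '{outputs[-1]}'")
    assert len(outputs) == n, f"expected {n} outputs: '{outputs}'"
    return outputs

print(asyncio.run(run_evals(3)))  # e.g. ['0', '1', '0']
```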

DocToolsLLM/docs/USAGE.md (4 additions, 4 deletions)
```diff
@@ -248,11 +248,11 @@
 * `--anki_fields`: List[str]
     * List of fields to keep
 * `--anki_mode`: str
-    * any of `window`, `concatenate`, `single_note`: (or _ separated
-      value like `concatenate_window`). By default `single_note`
+    * any of `window`, `concatenate`, `singlecard`: (or _ separated
+      value like `concatenate_window`). By default `singlecard`
       is used.
     * Modes:
-        * `single_note`: 1 document is 1 anki note.
+        * `singlecard`: 1 document is 1 anki card.
         * `window`: 1 document is 5 anki notes, overlapping (so
           10 anki notes will result in 5 documents)
         * `concatenate`: 1 document is all anki notes concatenated as a
```
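
Because the class docstring is generated from this USAGE.md, the CLI flags map onto keyword arguments. A hypothetical query over an anki collection using the renamed mode might look like this (the exact constructor signature is an assumption based on the flags documented above, not something shown in this commit):

```python
from DocToolsLLM.DocToolsLLM import DocToolsLLM_class

# hypothetical call: keyword names mirror the documented CLI flags
doctools = DocToolsLLM_class(
    task="query",
    filetype="anki",
    anki_mode="singlecard",  # was "single_note" before this commit
    anki_fields=["Front", "Back"],
)
```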
```diff
@@ -385,7 +385,7 @@
     * a string that will be added to the document metadata at the
       key `source_tag`. Useful when using filetype combination.

-* `--loading_failure`: str, default `crash`
+* `--loading_failure`: str, default `warn`
     * either `crash` or `warn`. Determines what to do with
       exceptions happening when loading a document. This can be set
       per document if a recursive_paths filetype is used.
```
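
And a hypothetical call relying on the new default: with `loading_failure` unset or set to `warn`, a batch containing one unreadable file keeps going instead of crashing (same caveat as above about the assumed constructor signature):

```python
from DocToolsLLM.DocToolsLLM import DocToolsLLM_class

# hypothetical: load a folder recursively, tolerating individual failures
doctools = DocToolsLLM_class(
    task="query",
    filetype="recursive_paths",
    path="~/Documents/notes",
    loading_failure="warn",  # the new default; use "crash" to fail fast
)
```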

DocToolsLLM/utils/batch_file_loader.py (38 additions, 12 deletions)
```diff
@@ -12,7 +12,6 @@
 from tqdm import tqdm
 from functools import cache as memoizer
 import time
-import os
 from typing import List, Tuple
 from functools import wraps
 import random

@@ -89,8 +88,8 @@ def batch_load_doc(
     if "path" in cli_kwargs and isinstance(cli_kwargs["path"], str):
         cli_kwargs["path"] = cli_kwargs["path"].strip()

-    load_failure = cli_kwargs["load_failure"] if "load_failure" in cli_kwargs else "crash"
-    assert load_failure in ["crash", "warn"], f"load_failure must be either crash or warn. Not {load_failure}"
+    loading_failure = cli_kwargs["loading_failure"] if "loading_failure" in cli_kwargs else "warn"
+    assert loading_failure in ["crash", "warn"], f"loading_failure must be either crash or warn. Not {loading_failure}"

     # expand the list of documents to load as long as there are recursive types
     to_load = [cli_kwargs.copy()]

@@ -144,7 +143,6 @@ def batch_load_doc(
         if new_doc_to_load:
             assert to_load[ild]["filetype"] in recursive_types
             to_load.remove(to_load[ild])
-            ild_done = None
             to_load.extend(new_doc_to_load)
             new_doc_to_load = []
             continue

@@ -176,7 +174,7 @@
                 del doc[k]
         # filter out the usual unexpected
         all_unexp_keys = [a for a in all_unexp_keys if a not in [
-            "out_file", "file_loader_n_jobs"
+            "out_file", "file_loader_n_jobs", "loading_failure",
         ]]
         if all_unexp_keys:
             red(f"Found unexpected keys in doc_kwargs: '{all_unexp_keys}'")

@@ -250,12 +248,12 @@ def load_one_doc_wrapped(**doc_kwargs):
         except Exception as err:
             filetype = doc_kwargs["filetype"]
             red(f"Error when loading doc with filetype {filetype}: '{err}'. Arguments: {doc_kwargs}")
-            if load_failure == "crash" or is_debug:
+            if loading_failure == "crash" or is_debug:
                 raise
-            elif load_failure == "warn":
+            elif loading_failure == "warn":
                 return None
             else:
-                raise ValueError(load_failure)
+                raise ValueError(loading_failure)

     if len(to_load) == 1 or is_debug:
         n_jobs = 1
```
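
The wrapper feeding the parallel loader now uses the renamed `loading_failure` setting: it logs the error, re-raises under `crash`, and returns None under `warn` so the rest of the batch continues. A standalone sketch of the pattern with the loader stubbed out (the real `load_one_doc_wrapped` receives the full `doc_kwargs`):

```python
def load_one_doc(path: str) -> list:
    # hypothetical loader stub that fails on one input
    if path.endswith(".bad"):
        raise ValueError("unparsable file")
    return [f"contents of {path}"]

def load_one_doc_wrapped(path: str, loading_failure: str = "warn"):
    try:
        return load_one_doc(path)
    except Exception as err:
        print(f"Error when loading {path}: '{err}'")
        if loading_failure == "crash":
            raise
        elif loading_failure == "warn":
            return None  # a None result marks this input as failed
        else:
            raise ValueError(loading_failure)

doc_lists = [load_one_doc_wrapped(p) for p in ["a.txt", "b.bad"]]
print(doc_lists)  # [['contents of a.txt'], None]
```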
```diff
@@ -293,12 +291,40 @@ def load_one_doc_wrapped(**doc_kwargs):
             colour="magenta",
         )
     )
+
+    # erases content that links to the loaders temporary files at startup
+    loaders_temp_dir_file.write_text("")
+
     red(f"Done loading all {len(to_load)} documents in {time.time()-t_load:.2f}s")
-    n_failed = len([d for d in doc_lists if d is None])
-    if n_failed:
-        red(f"Number of failed documents: {n_failed}")
-    [docs.extend(d) for d in doc_lists if d is not None]
+    missing_docargs = []
+    for idoc, d in tqdm(enumerate(doc_lists), total=len(doc_lists), desc="Concatenating results"):
+        if d is not None:
+            docs.extend(d)
+        else:
+            missing_docargs.append(to_load[idoc])
     assert None not in docs
+
+    if missing_docargs:
+        missing_docargs = sorted(missing_docargs, key=lambda x: json.dumps(x))
+        red(f"Number of failed documents: {len(missing_docargs)}:")
+        missed_recur = []
+        for imissed, missed in enumerate(missing_docargs):
+            if len(missing_docargs) > 99:
+                red(f"- {imissed + 1:03d}]: '{missed}'")
+            else:
+                red(f"- {imissed + 1:02d}]: '{missed}'")
+            if missed["filetype"] in recursive_types:
+                missed_recur.append(missed)
+
+        if missed_recur:
+            missed_recur = sorted(missed_recur, key=lambda x: json.dumps(x))
+            red("Crashing because some recursive filetypes failed:")
+            for imr, mr in enumerate(missed_recur):
+                red(f"- {imr + 1}]: '{mr}'")
+            raise Exception(f"{len(missed_recur)} recursive filetypes failed to load.")
+    else:
+        red("No document failed to load!")
+
     assert docs, "No documents were successfully loaded!"

     size = sum([get_tkn_length(d.page_content) for d in docs])
```
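
The rewritten concatenation step exploits the fact that the parallel map returns exactly one result per entry of `to_load`: a None at index i traces back to the precise arguments that failed, and failed recursive filetypes escalate to a crash, since every document they would have expanded into is missing. A small sketch of the index-based failure mapping, with illustrative data:

```python
import json

to_load = [{"path": "a.txt", "filetype": "txt"}, {"path": "b.bad", "filetype": "txt"}]
doc_lists = [["doc_a"], None]  # None marks the loader that failed

docs, missing_docargs = [], []
for idoc, d in enumerate(doc_lists):
    if d is not None:
        docs.extend(d)
    else:
        missing_docargs.append(to_load[idoc])  # map the failure back to its arguments

# sort by JSON form so the failure report is deterministic, then print it
for i, missed in enumerate(sorted(missing_docargs, key=json.dumps)):
    print(f"- {i + 1:02d}]: '{missed}'")
```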

DocToolsLLM/utils/embeddings.py (0 additions, 1 deletion)
```diff
@@ -10,7 +10,6 @@
 import faiss
 import random
 import time
-import copy
 from pathlib import Path, PosixPath
 from tqdm import tqdm
 import threading
```
