
Commit eab1d81

Merge branch 'dev'
2 parents: bb9ecf1 + 16df89f

File tree: 5 files changed, +78 / -77 lines

DocToolsLLM/DocToolsLLM.py
DocToolsLLM/utils/llm.py
DocToolsLLM/utils/tasks/summary.py
bumpver.toml
setup.py


DocToolsLLM/DocToolsLLM.py

Lines changed: 53 additions & 68 deletions
@@ -78,7 +78,7 @@
 class DocToolsLLM_class:
     "This docstring is dynamically replaced by the content of DocToolsLLM/docs/USAGE.md"

-    VERSION: str = "0.36"
+    VERSION: str = "0.37"

     #@optional_typecheck
     @typechecked

@@ -282,6 +282,8 @@ def handle_exception(exc_type, exc_value, exc_traceback):
             else:
                 self.llm_cache = SQLiteCache(database_path=(cache_dir / "private_langchain.db").resolve().absolute())
             set_llm_cache(self.llm_cache)
+        else:
+            self.llm_cache = not no_llm_cache

         if llms_api_bases["model"]:
             red(f"Disabling price computation for model because api_base was modified")

@@ -364,7 +366,7 @@ def ntfy(text: str) -> str:
         self.llm = load_llm(
             modelname=modelname,
             backend=self.modelbackend,
-            llm_cache=self.llm_cache if not self.no_llm_cache else False,
+            llm_cache=self.llm_cache,
             temperature=0,
             verbose=self.llm_verbosity,
             api_base=self.llms_api_bases["model"],

@@ -1021,11 +1023,37 @@ def _query(self, query: Optional[str]) -> Optional[str]:
         whi(f"Question to answer: {query_an}")

         # the eval doc chain needs its own caching
-        if not self.no_llm_cache:
+        if self.llm_cache:
             eval_cache_wrapper = doc_eval_cache.cache
         else:
             def eval_cache_wrapper(func): return func

+        # answer 0 or 1 if the document is related
+        if not hasattr(self, "eval_llm"):
+            self.eval_llm_params = litellm.get_supported_openai_params(
+                model=self.query_eval_modelname,
+                custom_llm_provider=self.query_eval_modelbackend,
+            )
+            eval_args = {}
+            if "n" in self.eval_llm_params:
+                eval_args["n"] = self.query_eval_check_number
+            else:
+                red(f"Model {self.query_eval_modelname} does not support parameter 'n' so will be called multiple times instead. This might cost more.")
+            if "max_tokens" in self.eval_llm_params:
+                eval_args["max_tokens"] = 2
+            else:
+                red(f"Model {self.query_eval_modelname} does not support parameter 'max_token' so the result might be of less quality.")
+            self.eval_llm = load_llm(
+                modelname=self.query_eval_modelname,
+                backend=self.query_eval_modelbackend,
+                llm_cache=False,  # disables caching because another caching is used on top
+                verbose=self.llm_verbosity,
+                temperature=1,
+                api_base=self.llms_api_bases["query_eval_model"],
+                private=self.private,
+                **eval_args,
+            )
+
         @chain
         @optional_typecheck
         @eval_cache_wrapper
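Note on the block added above: it probes which OpenAI-style parameters the eval model actually supports before deciding how to call it. A minimal stand-alone sketch of that probe-then-build pattern follows; it only assumes litellm is installed, and the model name and values below are placeholders rather than the project's settings.

import litellm

def build_eval_kwargs(model: str, n_wanted: int) -> dict:
    # Ask litellm which OpenAI-style params this model accepts.
    supported = litellm.get_supported_openai_params(model=model) or []
    kwargs = {}
    if "n" in supported:
        kwargs["n"] = n_wanted      # one request can return several generations
    if "max_tokens" in supported:
        kwargs["max_tokens"] = 2    # the eval answer is just "0" or "1"
    return kwargs

print(build_eval_kwargs("gpt-4o-mini", n_wanted=3))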
@@ -1039,8 +1067,12 @@ def evaluate_doc_chain(
                 outputs = [gen.text for gen in out.generations]
                 assert outputs, "No generations found by query eval llm"
                 outputs = [parse_eval_output(o) for o in outputs]
-                new_p = out.llm_output["token_usage"]["prompt_tokens"]
-                new_c = out.llm_output["token_usage"]["completion_tokens"]
+                if out.llm_output:
+                    new_p = out.llm_output["token_usage"]["prompt_tokens"]
+                    new_c = out.llm_output["token_usage"]["completion_tokens"]
+                else:
+                    new_p = 0
+                    new_c = 0
             else:
                 outputs = []
                 new_p = 0
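The guard introduced here exists because a response served from the LLM cache may carry no llm_output at all, so indexing token_usage unconditionally used to raise. A small illustrative helper (not project code) showing the same defensive accounting:

from typing import Optional, Tuple

def token_usage(llm_output: Optional[dict]) -> Tuple[int, int]:
    # Report usage only when the backend actually returned it;
    # cache hits typically return nothing.
    if not llm_output:
        return 0, 0
    usage = llm_output.get("token_usage", {})
    return usage.get("prompt_tokens", 0), usage.get("completion_tokens", 0)

new_p, new_c = token_usage(None)  # cache hit -> (0, 0)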
@@ -1060,8 +1092,9 @@ async def eval(inputs):
                 for out in outs:
                     assert len(out.generations) == 1, f"Query eval llm produced more than 1 evaluations: '{out.generations}'"
                     outputs.append(out.generations[0].text)
-                    new_p += out.llm_output["token_usage"]["prompt_tokens"]
-                    new_c += out.llm_output["token_usage"]["completion_tokens"]
+                    if out.llm_output:
+                        new_p += out.llm_output["token_usage"]["prompt_tokens"]
+                        new_c += out.llm_output["token_usage"]["completion_tokens"]
                 assert outputs, "No generations found by query eval llm"
                 outputs = [parse_eval_output(o) for o in outputs]

@@ -1072,36 +1105,12 @@ async def eval(inputs):
             self.eval_llm.callbacks[0].total_tokens += new_p + new_c
             return outputs

+        # uses in most places to increase concurrency limit
+        multi = {"max_concurrency": 50 if not self.debug else 1}
+
         if self.task == "search":
             if self.query_eval_modelname:
-                # uses in most places to increase concurrency limit
-                multi = {"max_concurrency": 50 if not self.debug else 1}
-
-                # answer 0 or 1 if the document is related
-                if not hasattr(self, "eval_llm"):
-                    self.eval_llm_params = litellm.get_supported_openai_params(
-                        model=self.query_eval_modelname,
-                        custom_llm_provider=self.query_eval_modelbackend,
-                    )
-                    eval_args = {}
-                    if "n" in self.eval_llm_params:
-                        eval_args["n"] = self.query_eval_check_number
-                    else:
-                        red(f"Model {self.query_eval_modelname} does not support parameter 'n' so will be called multiple times instead. This might cost more.")
-                    if "max_tokens" in self.eval_llm_params:
-                        eval_args["max_tokens"] = 2
-                    else:
-                        red(f"Model {self.query_eval_modelname} does not support parameter 'max_token' so the result might be of less quality.")
-                    self.eval_llm = load_llm(
-                        modelname=self.query_eval_modelname,
-                        backend=self.query_eval_modelbackend,
-                        llm_cache=self.llm_cache if not self.no_llm_cache else False,
-                        verbose=self.llm_verbosity,
-                        temperature=1,
-                        api_base=self.llms_api_bases["query_eval_model"],
-                        private=self.private,
-                        **eval_args,
-                    )
+

                 # for some reason I needed to have at least one chain object otherwise rag_chain is a dict
                 @chain

@@ -1210,35 +1219,6 @@ def retrieve_documents(inputs):
                     | StrOutputParser()
             }

-            # uses in most places to increase concurrency limit
-            multi = {"max_concurrency": 50 if not self.debug else 1}
-
-            # answer 0 or 1 if the document is related
-            if not hasattr(self, "eval_llm"):
-                self.eval_llm_params = litellm.get_supported_openai_params(
-                    model=self.query_eval_modelname,
-                    custom_llm_provider=self.query_eval_modelbackend,
-                )
-                eval_args = {}
-                if "n" in self.eval_llm_params:
-                    eval_args["n"] = self.query_eval_check_number
-                else:
-                    red(f"Model {self.query_eval_modelname} does not support parameter 'n' so will be called multiple times instead. This might cost more.")
-                if "max_tokens" in self.eval_llm_params:
-                    eval_args["max_tokens"] = 2
-                else:
-                    red(f"Model {self.query_eval_modelname} does not support parameter 'max_token' so the result might be of less quality.")
-                self.eval_llm = load_llm(
-                    modelname=self.query_eval_modelname,
-                    backend=self.query_eval_modelbackend,
-                    llm_cache=self.llm_cache if not self.no_llm_cache else False,
-                    verbose=self.llm_verbosity,
-                    temperature=1,
-                    api_base=self.llms_api_bases["query_eval_model"],
-                    private=self.private,
-                    **eval_args,
-                )
-
             # the eval doc chain needs its own caching
             if self.no_llm_cache:
                 def eval_cache_wrapper(func): return func
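This hunk removes a duplicated copy of the eval-LLM setup; the surviving context shows the eval_cache_wrapper trick, where the decorator applied to the evaluation chain is either a real cache or a no-op. A hedged, self-contained sketch of that pattern (the real project uses a joblib-style cache object; lru_cache below is only a stand-in):

import functools

caching_enabled = True

def identity(func):
    # no-op decorator used when caching is disabled
    return func

# Pick the decorator once, then apply it unconditionally.
eval_cache_wrapper = functools.lru_cache(maxsize=None) if caching_enabled else identity

@eval_cache_wrapper
def evaluate_doc(doc: str) -> int:
    return 1 if "relevant" in doc else 0

print(evaluate_doc("a relevant document"))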
@@ -1260,8 +1240,12 @@ def evaluate_doc_chain(
                 outputs = [gen.text for gen in out.generations]
                 assert outputs, "No generations found by query eval llm"
                 outputs = [parse_eval_output(o) for o in outputs]
-                new_p = out.llm_output["token_usage"]["prompt_tokens"]
-                new_c = out.llm_output["token_usage"]["completion_tokens"]
+                if out.llm_output:
+                    new_p = out.llm_output["token_usage"]["prompt_tokens"]
+                    new_c = out.llm_output["token_usage"]["completion_tokens"]
+                else:
+                    new_p = 0
+                    new_c = 0
             else:
                 outputs = []
                 new_p = 0
@@ -1283,8 +1267,9 @@ async def eval(inputs):
                     outputs.append(out.generations[0].text)
                     finish_reason = out.generations[0].generation_info["finish_reason"]
                     assert finish_reason == "stop", f"unexpected finish_reason: '{finish_reason}'"
-                    new_p += out.llm_output["token_usage"]["prompt_tokens"]
-                    new_c += out.llm_output["token_usage"]["completion_tokens"]
+                    if out.llm_output:
+                        new_p += out.llm_output["token_usage"]["prompt_tokens"]
+                        new_c += out.llm_output["token_usage"]["completion_tokens"]
                 assert outputs, "No generations found by query eval llm"
                 outputs = [parse_eval_output(o) for o in outputs]

DocToolsLLM/utils/llm.py

Lines changed: 17 additions & 3 deletions
@@ -39,7 +39,7 @@ def load_llm(
     modelname: str,
     backend: str,
     verbose: bool,
-    llm_cache: Union[bool, SQLiteCache],
+    llm_cache: Union[None, bool, SQLiteCache],
     api_base: Optional[str],
     private: bool,
     **extra_model_args,

@@ -81,7 +81,7 @@ def load_llm(
     else:
         assert os.environ["DOCTOOLS_PRIVATEMODE"] == "false"

-    if not private and backend == "openai" and api_base is None and llm_cache is not False:
+    if not private and backend == "openai" and api_base is None:
         red("Using ChatOpenAI instead of litellm because calling openai server anyway and the caching has a bug on langchain side :( The caching works on ChatOpenAI though. More at https://github.com/langchain-ai/langchain/issues/22389")
         max_tokens = litellm.get_model_info(modelname)["max_tokens"]
         if "max_tokens" not in extra_model_args:
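The widened condition above routes every plain OpenAI call through ChatOpenAI so the langchain cache can be used (per the linked issue), regardless of whether a cache object was passed in. For context, a hedged sketch of how a global SQLite cache is typically wired to ChatOpenAI; it assumes langchain, langchain-community and langchain-openai are installed, and the path and model name are placeholders, not the project's values:

from langchain.globals import set_llm_cache
from langchain_community.cache import SQLiteCache
from langchain_openai import ChatOpenAI

# Register a process-wide cache; chat models consult it on each call.
set_llm_cache(SQLiteCache(database_path="/tmp/langchain_cache.db"))
llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)
# A second identical llm.invoke(...) should be answered from the SQLite cache.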
@@ -98,7 +98,7 @@ def load_llm(
         max_tokens = litellm.get_model_info(modelname)["max_tokens"]
         if "max_tokens" not in extra_model_args:
             extra_model_args["max_tokens"] = max_tokens
-        if llm_cache is not False:
+        if llm_cache is not None:
             red(f"Reminder: caching is disabled for non openai models until langchain approves the fix.")
         llm = ChatLiteLLM(
             model_name=modelname,

@@ -111,6 +111,12 @@ def load_llm(
     if private:
         assert llm.api_base, "private is set but no api_base for llm were found"
         assert llm.api_base == api_base, "private is set but found unexpected llm.api_base value: '{litellm.api_base}'"
+
+    # fix: the SQLiteCache's str appearance is cancelling its own cache lookup!
+    if llm.cache:
+        cur = str(llm.cache)
+        llm.cache.__class__.__repr__ = lambda: cur.split(" at ")[0]
+        llm.cache.__class__.__str__ = lambda: cur.split(" at ")[0]
     return llm

@@ -133,6 +139,14 @@ def __init__(self, verbose, *args, **kwargs):
             "on_chain_error",
         ]

+    def __repr__(self) -> str:
+        # setting __repr__ and __str__ is important because it can
+        # maybe be used for caching?
+        return "PriceCountingCallback"
+
+    def __str__(self) -> str:
+        return "PriceCountingCallback"
+
     def _check_methods_called(self) -> None:
         assert all(meth in dir(self) for meth in self.methods_called), (
             "unexpected method names!")
DocToolsLLM/utils/tasks/summary.py

Lines changed: 6 additions & 4 deletions
@@ -35,8 +35,6 @@ def do_summarize(
     assert "[PROGRESS]" in metadata
     for ird, rd in tqdm(enumerate(docs), desc="Summarising splits", total=len(docs)):
         fixed_index = f"{ird + 1}/{len(docs)}"
-        if ird > 0:
-            assert llm.callbacks[0].total_tokens > 0

         messages = BASE_SUMMARY_PROMPT.format_messages(
             text=rd.page_content,
@@ -50,8 +48,12 @@ def do_summarize(
         assert finish == "stop", f"Unexpected finish_reason: '{finish}'"
         assert len(output.generations) == 1
         out = output.generations[0].text
-        new_p = output.llm_output["token_usage"]["prompt_tokens"]
-        new_c = output.llm_output["token_usage"]["completion_tokens"]
+        if output.llm_output:  # only present if not caching
+            new_p = output.llm_output["token_usage"]["prompt_tokens"]
+            new_c = output.llm_output["token_usage"]["completion_tokens"]
+        else:
+            new_p = 0
+            new_c = 0
         total_tokens += new_p + new_c
         total_cost += (new_p * llm_price[0] + new_c + llm_price[1]) / 1e6
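The last two context lines fold per-million-token prices into running totals. A small worked example of that arithmetic, under the assumption that llm_price holds (prompt, completion) prices in dollars per million tokens (the figures below are invented); the completion count is multiplied by its own price here, which is the usual form of this computation:

llm_price = (5.0, 15.0)    # $5 per 1M prompt tokens, $15 per 1M completion tokens
new_p, new_c = 1_200, 350  # tokens reported for one call

cost = (new_p * llm_price[0] + new_c * llm_price[1]) / 1e6
print(f"${cost:.6f}")      # -> $0.011250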

bumpver.toml

Lines changed: 1 addition & 1 deletion
@@ -1,5 +1,5 @@
 [bumpver]
-current_version = "0.36"
+current_version = "0.37"
 version_pattern = "MAJOR.MINOR"
 commit_message = "bump version {old_version} -> {new_version}"
 tag_message = "{new_version}"

setup.py

Lines changed: 1 addition & 1 deletion
@@ -28,7 +28,7 @@ def run(self):

 setup(
     name="DocToolsLLM",
-    version="0.36",
+    version="0.37",
     description="A perfect RAG and AI summary setup for my needs. Supports all LLM, virt. any filetypes (epub, youtube_playlist, pdf, mp3, etc)",
     long_description=long_description,
     long_description_content_type="text/markdown",
