Update ParseNode chunking to count tokens with the model's tokenizer instead of whitespace-split words

tm-robinson · tm-robinson · commit a8b0e4a35963 · 2024-09-02T08:01:21.000+01:00
diff --git a/scrapegraphai/graphs/abstract_graph.py b/scrapegraphai/graphs/abstract_graph.py
@@ -141,7 +141,7 @@ def _create_llm(self, llm_config: dict) -> object:
         try:
             self.model_token = models_tokens[llm_params["model_provider"]][llm_params["model"]]
         except KeyError:
-            print("Model not found, using default token size (8192)")
+            print(f"Model {llm_params['model_provider']}/{llm_params['model']} not found, using default token size (8192)")
             self.model_token = 8192
 
         try:
diff --git a/scrapegraphai/graphs/deep_scraper_graph.py b/scrapegraphai/graphs/deep_scraper_graph.py
@@ -75,7 +75,8 @@ def _create_repeated_graph(self) -> BaseGraph:
             input="doc",
             output=["parsed_doc"],
             node_config={
-                "chunk_size": self.model_token
+                "chunk_size": self.model_token,
+                "llm_model": self.llm_model
             }
         )
        
diff --git a/scrapegraphai/graphs/markdown_scraper_graph.py b/scrapegraphai/graphs/markdown_scraper_graph.py
@@ -60,7 +60,8 @@ def _create_graph(self) -> BaseGraph:
             output=["parsed_doc"],
             node_config={
                 "parse_html": False,
-                "chunk_size": self.model_token
+                "chunk_size": self.model_token,
+                "llm_model": self.llm_model
             }
         )
         generate_answer_node = GenerateAnswerNode(
diff --git a/scrapegraphai/graphs/omni_scraper_graph.py b/scrapegraphai/graphs/omni_scraper_graph.py
@@ -74,7 +74,8 @@ def _create_graph(self) -> BaseGraph:
             input="doc",
             output=["parsed_doc"],
             node_config={
-                "chunk_size": self.model_token
+                "chunk_size": self.model_token,
+                "llm_model": self.llm_model
             }
         )
         image_to_text_node = ImageToTextNode(
diff --git a/scrapegraphai/graphs/pdf_scraper_graph.py b/scrapegraphai/graphs/pdf_scraper_graph.py
@@ -68,7 +68,8 @@ def _create_graph(self) -> BaseGraph:
             output=["parsed_doc"],
             node_config={
                 "parse_html": False,
-                "chunk_size": self.model_token
+                "chunk_size": self.model_token,
+                "llm_model": self.llm_model
             }
         )
 
diff --git a/scrapegraphai/graphs/script_creator_graph.py b/scrapegraphai/graphs/script_creator_graph.py
@@ -73,7 +73,8 @@ def _create_graph(self) -> BaseGraph:
             input="doc",
             output=["parsed_doc"],
             node_config={"chunk_size": self.model_token,
-                         "parse_html": False
+                         "parse_html": False,
+                         "llm_model": self.llm_model
                          }
         )
         generate_scraper_node = GenerateScraperNode(
diff --git a/scrapegraphai/graphs/search_link_graph.py b/scrapegraphai/graphs/search_link_graph.py
@@ -64,7 +64,8 @@ def _create_graph(self) -> BaseGraph:
             input="doc",
             output=["parsed_doc"],
             node_config={
-                "chunk_size": self.model_token
+                "chunk_size": self.model_token,
+                "llm_model": self.llm_model
             }
         )
         search_link_node = SearchLinkNode(
diff --git a/scrapegraphai/graphs/speech_graph.py b/scrapegraphai/graphs/speech_graph.py
@@ -68,7 +68,8 @@ def _create_graph(self) -> BaseGraph:
             input="doc",
             output=["parsed_doc"],
             node_config={
-                "chunk_size": self.model_token
+                "chunk_size": self.model_token,
+                "llm_model": self.llm_model
             }
         )
         generate_answer_node = GenerateAnswerNode(
diff --git a/scrapegraphai/nodes/parse_node.py b/scrapegraphai/nodes/parse_node.py
@@ -40,6 +40,7 @@ def __init__(
         self.parse_html = (
             True if node_config is None else node_config.get("parse_html", True)
         )
+        self.llm_model = node_config['llm_model']
 
     def execute(self, state: dict) -> dict:
         """
@@ -64,31 +65,33 @@ def execute(self, state: dict) -> dict:
         input_data = [state[key] for key in input_keys]
         docs_transformed = input_data[0]
 
+        def count_tokens(text):
+            from ..utils import token_count
+            return token_count(text, self.llm_model.model_name)
+
         if self.parse_html:
             docs_transformed = Html2TextTransformer().transform_documents(input_data[0])
             docs_transformed = docs_transformed[0]
 
             chunks = chunk(text=docs_transformed.page_content,
                             chunk_size=self.node_config.get("chunk_size", 4096)-250,
-                            token_counter=lambda text: len(text.split()),
+                            token_counter=count_tokens,
                             memoize=False)
         else:
             docs_transformed = docs_transformed[0]
-
             chunk_size = self.node_config.get("chunk_size", 4096)
             chunk_size = min(chunk_size - 500, int(chunk_size * 0.9))
 
             if isinstance(docs_transformed, Document):
                 chunks = chunk(text=docs_transformed.page_content,
                             chunk_size=chunk_size,
-                            token_counter=lambda text: len(text.split()),
+                            token_counter=count_tokens,
                             memoize=False)
             else:
                 chunks = chunk(text=docs_transformed,
                                 chunk_size=chunk_size,
-                                token_counter=lambda text: len(text.split()),
+                                token_counter=count_tokens,
                                 memoize=False)
 
         state.update({self.output[0]: chunks})
-
         return state
diff --git a/scrapegraphai/utils/__init__.py b/scrapegraphai/utils/__init__.py
@@ -11,3 +11,4 @@
 from .cleanup_html import cleanup_html
 from .logging import *
 from .convert_to_md import convert_to_md
+from .token_calculator import *
diff --git a/scrapegraphai/utils/token_calculator.py b/scrapegraphai/utils/token_calculator.py
@@ -6,27 +6,26 @@
 from ..helpers.models_tokens import models_tokens
 
 
-def truncate_text_tokens(text: str, model: str, encoding_name: str) -> List[str]:
+def truncate_text_tokens(text: str, model: str) -> List[str]:
     """
     Truncates text into chunks that are small enough to be processed by specified llm models.
 
     Args:
         text (str): The input text to be truncated.
         model (str): The name of the llm model to determine the maximum token limit.
-        encoding_name (str): The encoding strategy used to encode the text before truncation.
 
     Returns:
         List[str]: A list of text chunks, each within the token limit of the specified model.
 
     Example:
-        >>> truncate_text_tokens("This is a sample text for truncation.", "GPT-3", "EMBEDDING_ENCODING")
+        >>> truncate_text_tokens("This is a sample text for truncation.", "gpt-4o-mini")
         ["This is a sample text", "for truncation."]
 
     This function ensures that each chunk of text can be tokenized 
     by the specified model without exceeding the model's token limit.
     """
 
-    encoding = tiktoken.get_encoding(encoding_name)
+    encoding = tiktoken.encoding_for_model(model)
     max_tokens = min(models_tokens[model] - 500, int(models_tokens[model] * 0.9))
     encoded_text = encoding.encode(text)
 
@@ -36,3 +35,28 @@ def truncate_text_tokens(text: str, model: str, encoding_name: str) -> List[str]
     result = [encoding.decode(chunk) for chunk in chunks]
 
     return result
+
+
+def token_count(text: str, model: str) -> int:
+    """
+    Return the number of tokens within the text, based on the encoding of the specified model.
+
+    Args:
+        text (str): The input text to be counted.
+        model (str): The name of the llm model to determine the encoding.
+
+    Returns:
+        int: Number of tokens.
+
+    Example:
+        >>> token_count("This is a sample text for counting.", "gpt-4o-mini")
+        9
+
+    The count is obtained by encoding the text with the tiktoken encoding
+    that corresponds to the specified model.
+    """
+
+    encoding = tiktoken.encoding_for_model(model)
+    num_tokens = len(encoding.encode(text))
+
+    return num_tokens

Original file line number	Diff line number	Diff line change
`@@ -75,7 +75,8 @@ def _create_repeated_graph(self) -> BaseGraph:`
`75`	`75`	`input="doc",`
`76`	`76`	`output=["parsed_doc"],`
`77`	`77`	`node_config={`
`78`		`- "chunk_size": self.model_token`
	`78`	`+ "chunk_size": self.model_token,`
	`79`	`+ "llm_model": self.llm_model`
`79`	`80`	`}`
`80`	`81`	`)`
`81`	`82`
Original file line number	Diff line number	Diff line change
`@@ -60,7 +60,8 @@ def _create_graph(self) -> BaseGraph:`
`60`	`60`	`output=["parsed_doc"],`
`61`	`61`	`node_config={`
`62`	`62`	`"parse_html": False,`
`63`		`- "chunk_size": self.model_token`
	`63`	`+ "chunk_size": self.model_token,`
	`64`	`+ "llm_model": self.llm_model`
`64`	`65`	`}`
`65`	`66`	`)`
`66`	`67`	`generate_answer_node = GenerateAnswerNode(`
Original file line number	Diff line number	Diff line change
`@@ -74,7 +74,8 @@ def _create_graph(self) -> BaseGraph:`
`74`	`74`	`input="doc",`
`75`	`75`	`output=["parsed_doc"],`
`76`	`76`	`node_config={`
`77`		`- "chunk_size": self.model_token`
	`77`	`+ "chunk_size": self.model_token,`
	`78`	`+ "llm_model": self.llm_model`
`78`	`79`	`}`
`79`	`80`	`)`
`80`	`81`	`image_to_text_node = ImageToTextNode(`
Original file line number	Diff line number	Diff line change
`@@ -68,7 +68,8 @@ def _create_graph(self) -> BaseGraph:`
`68`	`68`	`output=["parsed_doc"],`
`69`	`69`	`node_config={`
`70`	`70`	`"parse_html": False,`
`71`		`- "chunk_size": self.model_token`
	`71`	`+ "chunk_size": self.model_token,`
	`72`	`+ "llm_model": self.llm_model`
`72`	`73`	`}`
`73`	`74`	`)`
`74`	`75`
Original file line number	Diff line number	Diff line change
`@@ -73,7 +73,8 @@ def _create_graph(self) -> BaseGraph:`
`73`	`73`	`input="doc",`
`74`	`74`	`output=["parsed_doc"],`
`75`	`75`	`node_config={"chunk_size": self.model_token,`
`76`		`- "parse_html": False`
	`76`	`+ "parse_html": False,`
	`77`	`+ "llm_model": self.llm_model`
`77`	`78`	`}`
`78`	`79`	`)`
`79`	`80`	`generate_scraper_node = GenerateScraperNode(`
Original file line number	Diff line number	Diff line change
`@@ -64,7 +64,8 @@ def _create_graph(self) -> BaseGraph:`
`64`	`64`	`input="doc",`
`65`	`65`	`output=["parsed_doc"],`
`66`	`66`	`node_config={`
`67`		`- "chunk_size": self.model_token`
	`67`	`+ "chunk_size": self.model_token,`
	`68`	`+ "llm_model": self.llm_model`
`68`	`69`	`}`
`69`	`70`	`)`
`70`	`71`	`search_link_node = SearchLinkNode(`