Skip to content

Commit a8b0e4a

Browse files
committed
Updated token calculation in the parse node
1 parent fccf034 commit a8b0e4a

File tree

11 files changed

+52
-17
lines changed

11 files changed

+52
-17
lines changed

scrapegraphai/graphs/abstract_graph.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -141,7 +141,7 @@ def _create_llm(self, llm_config: dict) -> object:
141141
try:
142142
self.model_token = models_tokens[llm_params["model_provider"]][llm_params["model"]]
143143
except KeyError:
144-
print("Model not found, using default token size (8192)")
144+
print(f"Model {llm_params['model_provider']}/{llm_params['model']} not found, using default token size (8192)")
145145
self.model_token = 8192
146146

147147
try:

scrapegraphai/graphs/deep_scraper_graph.py

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -75,7 +75,8 @@ def _create_repeated_graph(self) -> BaseGraph:
7575
input="doc",
7676
output=["parsed_doc"],
7777
node_config={
78-
"chunk_size": self.model_token
78+
"chunk_size": self.model_token,
79+
"llm_model": self.llm_model
7980
}
8081
)
8182

scrapegraphai/graphs/markdown_scraper_graph.py

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -60,7 +60,8 @@ def _create_graph(self) -> BaseGraph:
6060
output=["parsed_doc"],
6161
node_config={
6262
"parse_html": False,
63-
"chunk_size": self.model_token
63+
"chunk_size": self.model_token,
64+
"llm_model": self.llm_model
6465
}
6566
)
6667
generate_answer_node = GenerateAnswerNode(

scrapegraphai/graphs/omni_scraper_graph.py

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -74,7 +74,8 @@ def _create_graph(self) -> BaseGraph:
7474
input="doc",
7575
output=["parsed_doc"],
7676
node_config={
77-
"chunk_size": self.model_token
77+
"chunk_size": self.model_token,
78+
"llm_model": self.llm_model
7879
}
7980
)
8081
image_to_text_node = ImageToTextNode(

scrapegraphai/graphs/pdf_scraper_graph.py

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -68,7 +68,8 @@ def _create_graph(self) -> BaseGraph:
6868
output=["parsed_doc"],
6969
node_config={
7070
"parse_html": False,
71-
"chunk_size": self.model_token
71+
"chunk_size": self.model_token,
72+
"llm_model": self.llm_model
7273
}
7374
)
7475

scrapegraphai/graphs/script_creator_graph.py

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -73,7 +73,8 @@ def _create_graph(self) -> BaseGraph:
7373
input="doc",
7474
output=["parsed_doc"],
7575
node_config={"chunk_size": self.model_token,
76-
"parse_html": False
76+
"parse_html": False,
77+
"llm_model": self.llm_model
7778
}
7879
)
7980
generate_scraper_node = GenerateScraperNode(

scrapegraphai/graphs/search_link_graph.py

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -64,7 +64,8 @@ def _create_graph(self) -> BaseGraph:
6464
input="doc",
6565
output=["parsed_doc"],
6666
node_config={
67-
"chunk_size": self.model_token
67+
"chunk_size": self.model_token,
68+
"llm_model": self.llm_model
6869
}
6970
)
7071
search_link_node = SearchLinkNode(

scrapegraphai/graphs/speech_graph.py

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -68,7 +68,8 @@ def _create_graph(self) -> BaseGraph:
6868
input="doc",
6969
output=["parsed_doc"],
7070
node_config={
71-
"chunk_size": self.model_token
71+
"chunk_size": self.model_token,
72+
"llm_model": self.llm_model
7273
}
7374
)
7475
generate_answer_node = GenerateAnswerNode(

scrapegraphai/nodes/parse_node.py

Lines changed: 8 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -40,6 +40,7 @@ def __init__(
4040
self.parse_html = (
4141
True if node_config is None else node_config.get("parse_html", True)
4242
)
43+
self.llm_model = node_config['llm_model']
4344

4445
def execute(self, state: dict) -> dict:
4546
"""
@@ -64,31 +65,33 @@ def execute(self, state: dict) -> dict:
6465
input_data = [state[key] for key in input_keys]
6566
docs_transformed = input_data[0]
6667

68+
def count_tokens(text):
69+
from ..utils import token_count
70+
return token_count(text, self.llm_model.model_name)
71+
6772
if self.parse_html:
6873
docs_transformed = Html2TextTransformer().transform_documents(input_data[0])
6974
docs_transformed = docs_transformed[0]
7075

7176
chunks = chunk(text=docs_transformed.page_content,
7277
chunk_size=self.node_config.get("chunk_size", 4096)-250,
73-
token_counter=lambda text: len(text.split()),
78+
token_counter=count_tokens,
7479
memoize=False)
7580
else:
7681
docs_transformed = docs_transformed[0]
77-
7882
chunk_size = self.node_config.get("chunk_size", 4096)
7983
chunk_size = min(chunk_size - 500, int(chunk_size * 0.9))
8084

8185
if isinstance(docs_transformed, Document):
8286
chunks = chunk(text=docs_transformed.page_content,
8387
chunk_size=chunk_size,
84-
token_counter=lambda text: len(text.split()),
88+
token_counter=count_tokens,
8589
memoize=False)
8690
else:
8791
chunks = chunk(text=docs_transformed,
8892
chunk_size=chunk_size,
89-
token_counter=lambda text: len(text.split()),
93+
token_counter=count_tokens,
9094
memoize=False)
9195

9296
state.update({self.output[0]: chunks})
93-
9497
return state

scrapegraphai/utils/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -11,3 +11,4 @@
1111
from .cleanup_html import cleanup_html
1212
from .logging import *
1313
from .convert_to_md import convert_to_md
14+
from .token_calculator import *

0 commit comments

Comments
 (0)