Skip to content

Commit 3d265a8

Browse files
committed
change GenerateScraperNode to only use first chunk
1 parent a8b0e4a commit 3d265a8

File tree

2 files changed

+14
-4
lines changed

2 files changed

+14
-4
lines changed

scrapegraphai/graphs/script_creator_graph.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -78,7 +78,7 @@ def _create_graph(self) -> BaseGraph:
7878
}
7979
)
8080
generate_scraper_node = GenerateScraperNode(
81-
input="user_prompt & (doc)",
81+
input="user_prompt & (parsed_doc)",
8282
output=["answer"],
8383
node_config={
8484
"llm_model": self.llm_model,

scrapegraphai/nodes/generate_scraper_node.py

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -102,9 +102,19 @@ def execute(self, state: dict) -> dict:
102102
TEMPLATE_NO_CHUNKS += self.additional_info
103103

104104
if len(doc) > 1:
105-
raise NotImplementedError(
106-
"Currently GenerateScraperNode cannot handle more than 1 context chunks"
107-
)
105+
# Short term partial fix for issue #543 (Context length exceeded)
106+
# If more than one chunk is returned by ParseNode, we just use the first one
107+
# on the basis that the structure of the remainder of the HTML page is probably
108+
# very similar to the first chunk, so the generated script should still work.
109+
# The better fix is to generate multiple scripts then use the LLM to merge them.
110+
111+
#raise NotImplementedError(
112+
# "Currently GenerateScraperNode cannot handle more than 1 context chunks"
113+
#)
114+
self.logger.warn(f"Warning: {self.node_name} Node provided with {len(doc)} chunks but can only "
115+
"support 1, ignoring remaining chunks")
116+
doc = [doc[0]]
117+
template = TEMPLATE_NO_CHUNKS
108118
else:
109119
template = TEMPLATE_NO_CHUNKS
110120

0 commit comments

Comments
 (0)