change GenerateScraperNode to only use first chunk

tm-robinson · tm-robinson · commit 3d265a863173 · 2024-09-02T08:21:19.000+01:00
diff --git a/scrapegraphai/graphs/script_creator_graph.py b/scrapegraphai/graphs/script_creator_graph.py
@@ -78,7 +78,7 @@ def _create_graph(self) -> BaseGraph:
                          }
         )
         generate_scraper_node = GenerateScraperNode(
-            input="user_prompt & (doc)",
+            input="user_prompt & (parsed_doc)",
             output=["answer"],
             node_config={
                 "llm_model": self.llm_model,
diff --git a/scrapegraphai/nodes/generate_scraper_node.py b/scrapegraphai/nodes/generate_scraper_node.py
@@ -102,9 +102,19 @@ def execute(self, state: dict) -> dict:
             TEMPLATE_NO_CHUNKS += self.additional_info
 
         if len(doc) > 1:
-            raise NotImplementedError(
-                "Currently GenerateScraperNode cannot handle more than 1 context chunks"
-            )
+            # Short-term partial fix for issue #543 (Context length exceeded).
+            # If ParseNode returns more than one chunk, we use only the first one,
+            # on the basis that the structure of the remainder of the HTML page is probably
+            # very similar to the first chunk, so the generated script should still work.
+            # The better fix is to generate multiple scripts, then use the LLM to merge them.
+
+            #raise NotImplementedError(
+            #    "Currently GenerateScraperNode cannot handle more than 1 context chunks"
+            #)
+            self.logger.warn(f"Warning: {self.node_name} Node provided with {len(doc)} chunks but can only "
+                "support 1, ignoring remaining chunks")
+            doc = [doc[0]]
+            template = TEMPLATE_NO_CHUNKS
         else:
             template = TEMPLATE_NO_CHUNKS
 

Original file line number	Diff line number	Diff line change
`@@ -78,7 +78,7 @@ def _create_graph(self) -> BaseGraph:`
`78`	`78`	`}`
`79`	`79`	`)`
`80`	`80`	`generate_scraper_node = GenerateScraperNode(`
`81`		`- input="user_prompt & (doc)",`
	`81`	`+ input="user_prompt & (parsed_doc)",`
`82`	`82`	`output=["answer"],`
`83`	`83`	`node_config={`
`84`	`84`	`"llm_model": self.llm_model,`