File tree Expand file tree Collapse file tree 3 files changed +56
-4
lines changed Expand file tree Collapse file tree 3 files changed +56
-4
lines changed Original file line number Diff line number Diff line change 1+ """
2+ This example shows how to skip processing of the HTML code during the fetch phase
3+ """
4+
5+ import os , json
6+ from scrapegraphai .graphs import SmartScraperGraph
7+ from scrapegraphai .utils import prettify_exec_info
8+
9+
10+ # ************************************************
11+ # Define the configuration for the graph
12+ # ************************************************
13+
14+
15+ graph_config = {
16+ "llm" : {
17+ "api_key" : "s" ,
18+ "model" : "gpt-3.5-turbo" ,
19+ },
20+ "cut" : False ,
21+ "verbose" : True ,
22+ "headless" : False ,
23+ }
24+
25+ # ************************************************
26+ # Create the SmartScraperGraph instance and run it
27+ # ************************************************
28+
29+ smart_scraper_graph = SmartScraperGraph (
30+ prompt = "Extract me the python code inside the page" ,
31+ source = "https://www.exploit-db.com/exploits/51447" ,
32+ config = graph_config
33+ )
34+
35+ result = smart_scraper_graph .run ()
36+ print (json .dumps (result , indent = 4 ))
37+
38+ # ************************************************
39+ # Get graph execution info
40+ # ************************************************
41+
42+ graph_exec_info = smart_scraper_graph .get_execution_info ()
43+ print (prettify_exec_info (graph_exec_info ))
Original file line number Diff line number Diff line change @@ -66,6 +66,8 @@ def _create_graph(self) -> BaseGraph:
6666 output = ["doc" , "link_urls" , "img_urls" ],
6767 node_config = {
6868 "llm_model" : self .llm_model ,
69+ "force" : self .config .get ("force" , False ),
70+ "cut" : self .config .get ("cut" , True ),
6971 "loader_kwargs" : self .config .get ("loader_kwargs" , {}),
7072 }
7173 )
Original file line number Diff line number Diff line change @@ -71,6 +71,10 @@ def __init__(
7171 False if node_config is None else node_config .get ("script_creator" , False )
7272 )
7373
74+ self .cut = (
75+ False if node_config is None else node_config .get ("cut" , True )
76+ )
77+
7478 def execute (self , state ):
7579 """
7680 Executes the node's logic to fetch HTML content from a specified URL and
@@ -105,7 +109,7 @@ def execute(self, state):
105109 compressed_document = [
106110 source
107111 ]
108-
112+
109113 state .update ({self .output [0 ]: compressed_document })
110114 return state
111115 # handling pdf
@@ -165,10 +169,13 @@ def execute(self, state):
165169 if response .status_code == 200 :
166170 if not response .text .strip ():
167171 raise ValueError ("No HTML body content found in the response." )
172+
173+ parsed_content = response
174+
175+ if not self .cut :
176+ parsed_content = cleanup_html (response , source )
168177
169- parsed_content = cleanup_html (response , source )
170-
171- if isinstance (self .llm_model , OpenAI ) and not self .script_creator or self .force and not self .script_creator and not :
178+ if (isinstance (self .llm_model , OpenAI ) and not self .script_creator ) or (self .force and not self .script_creator ):
172179 parsed_content = convert_to_md (source )
173180 compressed_document = [Document (page_content = parsed_content )]
174181 else :
You can’t perform that action at this time.
0 commit comments