1+ """
2+ OmniScraperGraph Module
3+ """
4+
5+ from .base_graph import BaseGraph
6+ from ..nodes import (
7+ FetchNode ,
8+ ParseNode ,
9+ ImageToTextNode ,
10+ RAGNode ,
11+ GenerateAnswerOmniNode
12+ )
13+ from scrapegraphai .models import OpenAIImageToText
14+ from .abstract_graph import AbstractGraph
15+
16+
class OmniScraperGraph(AbstractGraph):
    """
    OmniScraper is a scraping pipeline that automates the process of
    extracting information from web pages
    using a natural language model to interpret and answer prompts.

    Attributes:
        prompt (str): The prompt for the graph.
        source (str): The source of the graph.
        config (dict): Configuration parameters for the graph.
        llm_model: An instance of a language model client, configured for generating answers.
        embedder_model: An instance of an embedding model client,
        configured for generating embeddings.
        verbose (bool): A flag indicating whether to show print statements during execution.
        headless (bool): A flag indicating whether to run the graph in headless mode.
        max_images (int): Maximum number of images passed to the image-to-text
        node (default 5, overridable via config["max_images"]).

    Args:
        prompt (str): The prompt for the graph.
        source (str): The source of the graph.
        config (dict): Configuration parameters for the graph.

    Example:
        >>> omni_scraper = OmniScraperGraph(
        ...     "List me all the attractions in Chioggia and describe their pictures.",
        ...     "https://en.wikipedia.org/wiki/Chioggia",
        ...     {"llm": {"model": "gpt-4o"}}
        ... )
        >>> result = omni_scraper.run()
    """

    def __init__(self, prompt: str, source: str, config: dict):
        """
        Initialize the pipeline with a user prompt, a source and a configuration.

        Args:
            prompt (str): The prompt for the graph.
            source (str): URL (anything starting with "http") or a local directory.
            config (dict): Configuration parameters for the graph.
        """
        # NOTE: max_images must be assigned *before* super().__init__ —
        # AbstractGraph.__init__ builds the graph via _create_graph, which
        # reads self.max_images.
        self.max_images = 5 if config is None else config.get("max_images", 5)

        super().__init__(prompt, config, source)

        # Graph input key: remote sources are fetched by "url",
        # everything else is treated as a local directory.
        self.input_key = "url" if source.startswith("http") else "local_dir"

    def _create_graph(self) -> BaseGraph:
        """
        Creates the graph of nodes representing the workflow for web scraping.

        Returns:
            BaseGraph: A graph instance representing the web scraping workflow.
        """
        # Fetch page content plus the link and image URLs found in it.
        fetch_node = FetchNode(
            input="url | local_dir",
            output=["doc", "link_urls", "img_urls"],
            node_config={
                "loader_kwargs": self.config.get("loader_kwargs", {}),
            }
        )
        # Split the document into chunks sized to the model's token limit.
        parse_node = ParseNode(
            input="doc",
            output=["parsed_doc"],
            node_config={
                "chunk_size": self.model_token
            }
        )
        # Describe up to max_images of the scraped images with a vision model.
        image_to_text_node = ImageToTextNode(
            input="img_urls",
            output=["img_desc"],
            node_config={
                "llm_model": OpenAIImageToText(self.config["llm"]),
                "max_images": self.max_images
            }
        )
        # Retrieve the chunks most relevant to the user prompt.
        rag_node = RAGNode(
            input="user_prompt & (parsed_doc | doc)",
            output=["relevant_chunks"],
            node_config={
                "llm_model": self.llm_model,
                "embedder_model": self.embedder_model
            }
        )
        # Combine text chunks and image descriptions into the final answer.
        generate_answer_omni_node = GenerateAnswerOmniNode(
            input="user_prompt & (relevant_chunks | parsed_doc | doc) & img_desc",
            output=["answer"],
            node_config={
                "llm_model": self.llm_model
            }
        )

        return BaseGraph(
            nodes=[
                fetch_node,
                parse_node,
                image_to_text_node,
                rag_node,
                generate_answer_omni_node,
            ],
            edges=[
                (fetch_node, parse_node),
                (parse_node, image_to_text_node),
                (image_to_text_node, rag_node),
                (rag_node, generate_answer_omni_node)
            ],
            entry_point=fetch_node
        )

    def run(self) -> str:
        """
        Executes the scraping process and returns the answer to the prompt.

        Returns:
            str: The answer to the prompt, or "No answer found." if the
            graph produced no "answer" key.
        """
        inputs = {"user_prompt": self.prompt, self.input_key: self.source}
        self.final_state, self.execution_info = self.graph.execute(inputs)

        return self.final_state.get("answer", "No answer found.")
0 commit comments