Skip to content

Commit d830d13

Browse files
committed
Resolve merge conflicts
2 parents 7599234 + d277b34 commit d830d13

File tree

10 files changed

+418
-8
lines changed

10 files changed

+418
-8
lines changed

CHANGELOG.md

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,47 @@
1+
## [0.8.0-beta.1](https://github.com/VinciGit00/Scrapegraph-ai/compare/v0.7.0...v0.8.0-beta.1) (2024-05-03)
2+
3+
4+
### Features
5+
6+
* add pdf scraper ([10a9453](https://github.com/VinciGit00/Scrapegraph-ai/commit/10a94530e3fd4dfde933ecfa96cb3e21df72e606))
7+
8+
9+
### CI
10+
11+
* **release:** 0.7.0-beta.3 [skip ci] ([fbb06ab](https://github.com/VinciGit00/Scrapegraph-ai/commit/fbb06ab551fac9cc9824ad567f042e55450277bd))
12+
13+
## [0.7.0](https://github.com/VinciGit00/Scrapegraph-ai/compare/v0.6.2...v0.7.0) (2024-05-03)
14+
15+
### Features
16+
17+
* add base_node to __init__.py ([cb1cb61](https://github.com/VinciGit00/Scrapegraph-ai/commit/cb1cb616b7998d3624bf57b19b5f1b1945fea4ef))
18+
* Azure implementation + embeddings refactoring ([aa9271e](https://github.com/VinciGit00/Scrapegraph-ai/commit/aa9271e7bc4daa54860499d0615580b17550ff58))
19+
20+
21+
### Refactor
22+
23+
* Changed the way embedding model is created in AbstractGraph class and removed handling of embedding model creation from RAGNode. Now AbstractGraph will call a dedicated method for embedding models instead of _create_llm. This makes it easy to use any LLM with any supported embedding model. ([819cbcd](https://github.com/VinciGit00/Scrapegraph-ai/commit/819cbcd3be1a8cb195de0b44c6b6d4d824e2a42a))
24+
25+
26+
### CI
27+
28+
* **release:** 0.7.0-beta.1 [skip ci] ([98dec36](https://github.com/VinciGit00/Scrapegraph-ai/commit/98dec36c60d1dc8b072482e8d514c3869a45a3f8))
29+
* **release:** 0.7.0-beta.2 [skip ci] ([42fa02e](https://github.com/VinciGit00/Scrapegraph-ai/commit/42fa02e65a3a81796bd66e55cf9dd1d1b692cb89))
30+
31+
32+
## [0.7.0-beta.3](https://github.com/VinciGit00/Scrapegraph-ai/compare/v0.7.0-beta.2...v0.7.0-beta.3) (2024-05-03)
33+
## [0.7.0-beta.2](https://github.com/VinciGit00/Scrapegraph-ai/compare/v0.7.0-beta.1...v0.7.0-beta.2) (2024-05-03)
34+
35+
36+
### Features
37+
38+
* Azure implementation + embeddings refactoring ([aa9271e](https://github.com/VinciGit00/Scrapegraph-ai/commit/aa9271e7bc4daa54860499d0615580b17550ff58))
39+
* add pdf scraper ([10a9453](https://github.com/VinciGit00/Scrapegraph-ai/commit/10a94530e3fd4dfde933ecfa96cb3e21df72e606))
40+
41+
### Refactor
42+
43+
* Changed the way embedding model is created in AbstractGraph class and removed handling of embedding model creation from RAGNode. Now AbstractGraph will call a dedicated method for embedding models instead of _create_llm. This makes it easy to use any LLM with any supported embedding model. ([819cbcd](https://github.com/VinciGit00/Scrapegraph-ai/commit/819cbcd3be1a8cb195de0b44c6b6d4d824e2a42a))
44+
145
## [0.7.0-beta.1](https://github.com/VinciGit00/Scrapegraph-ai/compare/v0.6.2...v0.7.0-beta.1) (2024-05-03)
246

347

examples/groq/smart_scraper_groq_openai.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@
2525
},
2626
"embeddings": {
2727
"api_key": openai_key,
28-
"model": "gpt-3.5-turbo",
28+
"model": "openai",
2929
},
3030
"headless": False
3131
}

examples/openai/smart_scraper_openai.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@
2121
"api_key": openai_key,
2222
"model": "gpt-3.5-turbo",
2323
},
24-
"verbose":False,
24+
"verbose": True,
2525
}
2626

2727
# ************************************************

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
[tool.poetry]
22
name = "scrapegraphai"
33

4-
version = "0.7.0b1"
4+
version = "0.8.0b1"
55

66
description = "A web scraping library based on LangChain which uses LLM and direct graph logic to create scraping pipelines."
77
authors = [

scrapegraphai/graphs/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,3 +10,4 @@
1010
from .xml_scraper_graph import XMLScraperGraph
1111
from .json_scraper_graph import JSONScraperGraph
1212
from .csv_scraper_graph import CSVScraperGraph
13+
from .pdf_scraper_graph import PDFScraperGraph

scrapegraphai/graphs/abstract_graph.py

Lines changed: 86 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,12 @@
55
from abc import ABC, abstractmethod
66
from typing import Optional
77

8-
from ..models import OpenAI, Gemini, Ollama, AzureOpenAI, HuggingFace, Groq, Bedrock
8+
from langchain_aws.embeddings.bedrock import BedrockEmbeddings
9+
from langchain_community.embeddings import HuggingFaceHubEmbeddings, OllamaEmbeddings
10+
from langchain_openai import AzureOpenAIEmbeddings, OpenAIEmbeddings
11+
912
from ..helpers import models_tokens
13+
from ..models import AzureOpenAI, Bedrock, Gemini, Groq, HuggingFace, Ollama, OpenAI
1014

1115

1216
class AbstractGraph(ABC):
@@ -43,7 +47,8 @@ def __init__(self, prompt: str, config: dict, source: Optional[str] = None):
4347
self.source = source
4448
self.config = config
4549
self.llm_model = self._create_llm(config["llm"], chat=True)
46-
self.embedder_model = self.llm_model if "embeddings" not in config else self._create_llm(
50+
self.embedder_model = self._create_default_embedder(
51+
) if "embeddings" not in config else self._create_embedder(
4752
config["embeddings"])
4853

4954
# Set common configuration parameters
@@ -172,6 +177,85 @@ def _create_llm(self, llm_config: dict, chat=False) -> object:
172177
else:
173178
raise ValueError(
174179
"Model provided by the configuration not supported")
180+
181+
def _create_default_embedder(self) -> object:
182+
"""
183+
Create an embedding model instance based on the chosen llm model.
184+
185+
Returns:
186+
object: An instance of the embedding model client.
187+
188+
Raises:
189+
ValueError: If the model is not supported.
190+
"""
191+
192+
if isinstance(self.llm_model, OpenAI):
193+
return OpenAIEmbeddings(api_key=self.llm_model.openai_api_key)
194+
elif isinstance(self.llm_model, AzureOpenAIEmbeddings):
195+
return self.llm_model
196+
elif isinstance(self.llm_model, AzureOpenAI):
197+
return AzureOpenAIEmbeddings()
198+
elif isinstance(self.llm_model, Ollama):
199+
# unwrap the kwargs from the model, which is a dict
200+
params = self.llm_model._lc_kwargs
201+
# remove streaming and temperature
202+
params.pop("streaming", None)
203+
params.pop("temperature", None)
204+
205+
return OllamaEmbeddings(**params)
206+
elif isinstance(self.llm_model, HuggingFace):
207+
return HuggingFaceHubEmbeddings(model=self.llm_model.model)
208+
elif isinstance(self.llm_model, Bedrock):
209+
return BedrockEmbeddings(client=None, model_id=self.llm_model.model_id)
210+
else:
211+
raise ValueError("Embedding Model missing or not supported")
212+
213+
def _create_embedder(self, embedder_config: dict) -> object:
214+
"""
215+
Create an embedding model instance based on the configuration provided.
216+
217+
Args:
218+
embedder_config (dict): Configuration parameters for the embedding model.
219+
220+
Returns:
221+
object: An instance of the embedding model client.
222+
223+
Raises:
224+
KeyError: If an ollama, hugging_face, or bedrock model is not listed in models_tokens.
225+
"""
226+
227+
# Instantiate the embedding model based on the model name
228+
if "openai" in embedder_config["model"]:
229+
return OpenAIEmbeddings(api_key=embedder_config["api_key"])
230+
231+
elif "azure" in embedder_config["model"]:
232+
return AzureOpenAIEmbeddings()
233+
234+
elif "ollama" in embedder_config["model"]:
235+
embedder_config["model"] = embedder_config["model"].split("/")[-1]
236+
try:
237+
models_tokens["ollama"][embedder_config["model"]]
238+
except KeyError:
239+
raise KeyError("Model not supported")
240+
return OllamaEmbeddings(**embedder_config)
241+
242+
elif "hugging_face" in embedder_config["model"]:
243+
try:
244+
models_tokens["hugging_face"][embedder_config["model"]]
245+
except KeyError:
246+
raise KeyError("Model not supported")
247+
return HuggingFaceHubEmbeddings(model=embedder_config["model"])
248+
249+
elif "bedrock" in embedder_config["model"]:
250+
embedder_config["model"] = embedder_config["model"].split("/")[-1]
251+
try:
252+
models_tokens["bedrock"][embedder_config["model"]]
253+
except KeyError:
254+
raise KeyError("Model not supported")
255+
return BedrockEmbeddings(client=None, model_id=embedder_config["model"])
256+
else:
257+
raise ValueError(
258+
"Model provided by the configuration not supported")
175259

176260
def get_state(self, key=None) -> dict:
177261
"""""
Lines changed: 118 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,118 @@
1+
"""
2+
PDFScraperGraph Module
3+
"""
4+
5+
from .base_graph import BaseGraph
6+
from ..nodes import (
7+
FetchNode,
8+
ParseNode,
9+
RAGNode,
10+
GenerateAnswerNode
11+
)
12+
from .abstract_graph import AbstractGraph
13+
14+
15+
class PDFScraperGraph(AbstractGraph):
16+
"""
17+
PDFScraperGraph is a scraping pipeline that extracts information from pdf files using a natural
18+
language model to interpret and answer prompts.
19+
20+
Attributes:
21+
prompt (str): The prompt for the graph.
22+
source (str): The source of the graph.
23+
config (dict): Configuration parameters for the graph.
24+
llm_model: An instance of a language model client, configured for generating answers.
25+
embedder_model: An instance of an embedding model client,
26+
configured for generating embeddings.
27+
verbose (bool): A flag indicating whether to show print statements during execution.
28+
headless (bool): A flag indicating whether to run the graph in headless mode.
29+
model_token (int): The token limit for the language model.
30+
31+
Args:
32+
prompt (str): The prompt for the graph.
33+
source (str): The source of the graph.
34+
config (dict): Configuration parameters for the graph.
35+
36+
Example:
37+
>>> pdf_scraper = PDFScraperGraph(
38+
... "List me all the attractions in Chioggia.",
39+
... "data/chioggia.pdf",
40+
... {"llm": {"model": "gpt-3.5-turbo"}}
41+
... )
42+
>>> result = pdf_scraper.run()
43+
"""
44+
45+
def __init__(self, prompt: str, source: str, config: dict):
46+
super().__init__(prompt, config, source)
47+
48+
self.input_key = "pdf" if source.endswith("pdf") else "pdf_dir"
49+
50+
def _create_graph(self) -> BaseGraph:
51+
"""
52+
Creates the graph of nodes representing the workflow for PDF scraping.
53+
54+
Returns:
55+
BaseGraph: A graph instance representing the PDF scraping workflow.
56+
"""
57+
58+
fetch_node = FetchNode(
59+
input="pdf_dir",
60+
output=["doc"],
61+
node_config={
62+
"headless": self.headless,
63+
"verbose": self.verbose
64+
}
65+
)
66+
parse_node = ParseNode(
67+
input="doc",
68+
output=["parsed_doc"],
69+
node_config={
70+
"chunk_size": self.model_token,
71+
"verbose": self.verbose
72+
}
73+
)
74+
rag_node = RAGNode(
75+
input="user_prompt & (parsed_doc | doc)",
76+
output=["relevant_chunks"],
77+
node_config={
78+
"llm": self.llm_model,
79+
"embedder_model": self.embedder_model,
80+
"verbose": self.verbose
81+
}
82+
)
83+
generate_answer_node = GenerateAnswerNode(
84+
input="user_prompt & (relevant_chunks | parsed_doc | doc)",
85+
output=["answer"],
86+
node_config={
87+
"llm": self.llm_model,
88+
"verbose": self.verbose
89+
}
90+
)
91+
92+
return BaseGraph(
93+
nodes=[
94+
fetch_node,
95+
parse_node,
96+
rag_node,
97+
generate_answer_node,
98+
],
99+
edges=[
100+
(fetch_node, parse_node),
101+
(parse_node, rag_node),
102+
(rag_node, generate_answer_node)
103+
],
104+
entry_point=fetch_node
105+
)
106+
107+
def run(self) -> str:
108+
"""
109+
Executes the PDF scraping process and returns the answer to the prompt.
110+
111+
Returns:
112+
str: The answer to the prompt.
113+
"""
114+
115+
inputs = {"user_prompt": self.prompt, self.input_key: self.source}
116+
self.final_state, self.execution_info = self.graph.execute(inputs)
117+
118+
return self.final_state.get("answer", "No answer found.")

scrapegraphai/nodes/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,3 +16,4 @@
1616
from .search_link_node import SearchLinkNode
1717
from .robots_node import RobotsNode
1818
from .generate_answer_csv_node import GenerateAnswerCSVNode
19+
from .generate_answer_pdf_node import GenerateAnswerPDFNode

0 commit comments

Comments
 (0)