Skip to content

Commit 7e8acd8

Browse files
authored
Merge branch 'pre/beta' into fix/fetch-node-proxybroker
2 parents 0c36a7e + 30758b4 commit 7e8acd8

24 files changed

+383
-193
lines changed

CHANGELOG.md

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,23 @@
1+
## [0.11.0-beta.4](https://github.com/VinciGit00/Scrapegraph-ai/compare/v0.11.0-beta.3...v0.11.0-beta.4) (2024-05-12)
2+
3+
4+
### Features
5+
6+
* add new prompt info ([e2350ed](https://github.com/VinciGit00/Scrapegraph-ai/commit/e2350eda6249d8e121344d12c92645a3887a5b76))
7+
8+
## [0.11.0-beta.3](https://github.com/VinciGit00/Scrapegraph-ai/compare/v0.11.0-beta.2...v0.11.0-beta.3) (2024-05-12)
9+
10+
11+
### Features
12+
13+
* add support for deepseek-chat ([156b67b](https://github.com/VinciGit00/Scrapegraph-ai/commit/156b67b91e1798f67082123e2c0087d358a32d4d)), closes [#222](https://github.com/VinciGit00/Scrapegraph-ai/issues/222)
14+
15+
16+
### Docs
17+
18+
* add diagram showing general structure/flow of the library ([13ae918](https://github.com/VinciGit00/Scrapegraph-ai/commit/13ae9180ac5e7ef11dad1a210cf8790e797397dd))
19+
* update overview diagram with more models ([b441b30](https://github.com/VinciGit00/Scrapegraph-ai/commit/b441b30a5c60dda105964f69bd4cef06825f5c74))
20+
121
## [0.11.0-beta.2](https://github.com/VinciGit00/Scrapegraph-ai/compare/v0.11.0-beta.1...v0.11.0-beta.2) (2024-05-10)
222

323

51.8 KB
Binary file not shown.
82 KB
Loading

docs/source/introduction/overview.rst

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,4 +22,14 @@ ScrapegraphAI, leveraging the power of LLMs, adapts to changes in website struct
2222
This flexibility ensures that scrapers remain functional even when website layouts change.
2323

2424
We support many Large Language Models (LLMs) including GPT, Gemini, Groq, Azure, Hugging Face etc.
25-
as well as local models which can run on your machine using Ollama.
25+
as well as local models which can run on your machine using Ollama.
26+
27+
Diagram
28+
=======
29+
With ScrapegraphAI, you first construct a pipeline of the steps you want to execute by combining nodes into a graph.
30+
Executing the graph takes care of all the steps that are often part of scraping: fetching, parsing, etc.
31+
Finally, the scraped and processed data is fed to an LLM, which generates a response.
32+
33+
.. image:: ../../assets/project_overview_diagram.png
34+
:align: center
35+
:alt: ScrapegraphAI Overview
Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
"""
2+
Basic example of scraping pipeline using SmartScraper
3+
"""
4+
5+
import os
6+
from dotenv import load_dotenv
7+
from scrapegraphai.graphs import SmartScraperGraph
8+
from scrapegraphai.utils import prettify_exec_info
9+
10+
load_dotenv()
11+
12+
13+
# ************************************************
14+
# Define the configuration for the graph
15+
# ************************************************
16+
17+
deepseek_key = os.getenv("DEEPSEEK_APIKEY")
18+
19+
graph_config = {
20+
"llm": {
21+
"model": "deepseek-chat",
22+
"openai_api_key": deepseek_key,
23+
"openai_api_base": 'https://api.deepseek.com/v1',
24+
},
25+
"verbose": True,
26+
}
27+
28+
# ************************************************
29+
# Create the SmartScraperGraph instance and run it
30+
# ************************************************
31+
32+
smart_scraper_graph = SmartScraperGraph(
33+
prompt="List me all the projects with their description.",
34+
# also accepts a string with the already downloaded HTML code
35+
source="https://perinim.github.io/projects/",
36+
config=graph_config
37+
)
38+
39+
result = smart_scraper_graph.run()
40+
print(result)
41+
42+
# ************************************************
43+
# Get graph execution info
44+
# ************************************************
45+
46+
graph_exec_info = smart_scraper_graph.get_execution_info()
47+
print(prettify_exec_info(graph_exec_info))
Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
"""
2+
Basic example of scraping pipeline using DeepScraper
3+
"""
4+
5+
import os
6+
from dotenv import load_dotenv
7+
from scrapegraphai.graphs import DeepScraperGraph
8+
from scrapegraphai.utils import prettify_exec_info
9+
10+
load_dotenv()
11+
12+
13+
# ************************************************
14+
# Define the configuration for the graph
15+
# ************************************************
16+
17+
openai_key = os.getenv("OPENAI_APIKEY")
18+
19+
graph_config = {
20+
"llm": {
21+
"api_key": openai_key,
22+
"model": "gpt-4",
23+
},
24+
"verbose": True,
25+
}
26+
27+
# ************************************************
28+
# Create the DeepScraperGraph instance and run it
29+
# ************************************************
30+
31+
deep_scraper_graph = DeepScraperGraph(
32+
prompt="List me all the job titles and detailed job description.",
33+
# also accepts a string with the already downloaded HTML code
34+
source="https://www.google.com/about/careers/applications/jobs/results/?location=Bangalore%20India",
35+
config=graph_config
36+
)
37+
38+
result = deep_scraper_graph.run()
39+
print(result)
40+
41+
# ************************************************
42+
# Get graph execution info
43+
# ************************************************
44+
45+
graph_exec_info = deep_scraper_graph.get_execution_info()
46+
print(deep_scraper_graph.get_state("relevant_links"))
47+
print(prettify_exec_info(graph_exec_info))

examples/openai/script_generator_openai.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -27,20 +27,20 @@
2727
# Create the ScriptCreatorGraph instance and run it
2828
# ************************************************
2929

30-
smart_scraper_graph = ScriptCreatorGraph(
31-
prompt="List me all the news with their description.",
30+
script_creator_graph = ScriptCreatorGraph(
31+
prompt="List me all the projects with their description.",
3232
# also accepts a string with the already downloaded HTML code
3333
source="https://perinim.github.io/projects",
3434
config=graph_config
3535
)
3636

37-
result = smart_scraper_graph.run()
37+
result = script_creator_graph.run()
3838
print(result)
3939

4040
# ************************************************
4141
# Get graph execution info
4242
# ************************************************
4343

44-
graph_exec_info = smart_scraper_graph.get_execution_info()
44+
graph_exec_info = script_creator_graph.get_execution_info()
4545
print(prettify_exec_info(graph_exec_info))
4646

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
[tool.poetry]
22
name = "scrapegraphai"
33

4-
version = "0.11.0b2"
4+
version = "0.11.0b4"
55

66
description = "A web scraping library based on LangChain which uses LLM and direct graph logic to create scraping pipelines."
77
authors = [

scrapegraphai/graphs/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
from .abstract_graph import AbstractGraph
66
from .base_graph import BaseGraph
77
from .smart_scraper_graph import SmartScraperGraph
8+
from .deep_scraper_graph import DeepScraperGraph
89
from .speech_graph import SpeechGraph
910
from .search_graph import SearchGraph
1011
from .script_creator_graph import ScriptCreatorGraph

scrapegraphai/graphs/abstract_graph.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
from langchain_community.embeddings import HuggingFaceHubEmbeddings, OllamaEmbeddings, BedrockEmbeddings
88
from langchain_google_genai import GoogleGenerativeAIEmbeddings
99
from ..helpers import models_tokens
10-
from ..models import AzureOpenAI, Bedrock, Gemini, Groq, HuggingFace, Ollama, OpenAI, Anthropic, Claude
10+
from ..models import AzureOpenAI, Bedrock, Gemini, Groq, HuggingFace, Ollama, OpenAI, Anthropic, Claude, DeepSeek
1111

1212

1313
class AbstractGraph(ABC):
@@ -203,6 +203,12 @@ def _create_llm(self, llm_config: dict, chat=False) -> object:
203203
elif "claude-3-" in llm_params["model"]:
204204
self.model_token = models_tokens["claude"]["claude3"]
205205
return Anthropic(llm_params)
206+
elif "deepseek" in llm_params["model"]:
207+
try:
208+
self.model_token = models_tokens["deepseek"][llm_params["model"]]
209+
except KeyError as exc:
210+
raise KeyError("Model not supported") from exc
211+
return DeepSeek(llm_params)
206212
else:
207213
raise ValueError(
208214
"Model provided by the configuration not supported")

0 commit comments

Comments
 (0)