Skip to content

Commit 1cb71ed

Browse files
authored
Merge pull request #289 from stoensin/patch-1
Update abstract_graph.py
2 parents aa14271 + f00ed35 commit 1cb71ed

File tree

7 files changed

+50
-18
lines changed

7 files changed

+50
-18
lines changed

CHANGELOG.md

Lines changed: 10 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,22 +1,25 @@
1+
12
## [1.4.0-beta.2](https://github.com/VinciGit00/Scrapegraph-ai/compare/v1.4.0-beta.1...v1.4.0-beta.2) (2024-05-19)
23

34

45
### Features
56

6-
* **docloaders:** undetected-playwright ([7b3ee4e](https://github.com/VinciGit00/Scrapegraph-ai/commit/7b3ee4e71e4af04edeb47999d70d398b67c93ac4))
7+
* Add new models and update existing ones ([58289ec](https://github.com/VinciGit00/Scrapegraph-ai/commit/58289eccc523814a2898650c41410f9a35b4e4c2))
78

8-
## [1.4.0-beta.1](https://github.com/VinciGit00/Scrapegraph-ai/compare/v1.3.0...v1.4.0-beta.1) (2024-05-19)
9+
## [1.3.2](https://github.com/VinciGit00/Scrapegraph-ai/compare/v1.3.1...v1.3.2) (2024-05-22)
910

1011

11-
### Features
12+
### Bug Fixes
1213

13-
* **base_graph:** aligned with main ([73fa31d](https://github.com/VinciGit00/Scrapegraph-ai/commit/73fa31db0f791d1fd63b489ac88cc6e595aa07f9))
14+
* pdf scraper bug ([f2dffe5](https://github.com/VinciGit00/Scrapegraph-ai/commit/f2dffe534f51aa83aed5ac491243604a443f4373))
1415

16+
## [1.3.1](https://github.com/VinciGit00/Scrapegraph-ai/compare/v1.3.0...v1.3.1) (2024-05-21)
1517

16-
### CI
1718

18-
* **release:** 1.2.0-beta.1 [skip ci] ([fd3e0aa](https://github.com/VinciGit00/Scrapegraph-ai/commit/fd3e0aa5823509dfb46b4f597521c24d4eb345f1))
19-
* **release:** 1.3.0-beta.1 [skip ci] ([191db0b](https://github.com/VinciGit00/Scrapegraph-ai/commit/191db0bc779e4913713b47b68ec4162a347da3ea))
19+
### Bug Fixes
20+
21+
* add deepseek embeddings ([659fad7](https://github.com/VinciGit00/Scrapegraph-ai/commit/659fad770a5b6ace87511513e5233a3bc1269009))
22+
2023

2124
## [1.3.0](https://github.com/VinciGit00/Scrapegraph-ai/compare/v1.2.4...v1.3.0) (2024-05-19)
2225

README.md

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -180,9 +180,14 @@ Wanna visualize the roadmap in a more interactive way? Check out the [markmap](h
180180
## ❤️ Contributors
181181
[![Contributors](https://contrib.rocks/image?repo=VinciGit00/Scrapegraph-ai)](https://github.com/VinciGit00/Scrapegraph-ai/graphs/contributors)
182182
## Sponsors
183-
<p align="center">
184-
<a href="https://serpapi.com?utm_source=scrapegraphai"><img src="https://raw.githubusercontent.com/VinciGit00/Scrapegraph-ai/main/docs/assets/serp_api_logo.png" alt="SerpAPI" style="width: 10%;"></a>
185-
</p>
183+
<div style="text-align: center;">
184+
<a href="https://serpapi.com?utm_source=scrapegraphai">
185+
<img src="https://raw.githubusercontent.com/VinciGit00/Scrapegraph-ai/main/docs/assets/serp_api_logo.png" alt="SerpAPI" style="width: 10%;">
186+
</a>
187+
<a href="https://dashboard.statproxies.com/?refferal=scrapegraph">
188+
<img src="https://raw.githubusercontent.com/VinciGit00/Scrapegraph-ai/main/docs/assets/transparent_stat.png" alt="Stats" style="width: 10%;">
189+
</a>
190+
</div>
186191

187192
## 🎓 Citations
188193
If you have used our library for research purposes, please cite us using the following reference:

docs/assets/transparent_stat.png

217 KB
Loading

pyproject.toml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,10 @@
11
[project]
22
name = "scrapegraphai"
33

4+
45
version = "1.4.0b2"
56

7+
68
description = "A web scraping library based on LangChain which uses LLM and direct graph logic to create scraping pipelines."
79
authors = [
810
{ name = "Marco Vinciguerra", email = "[email protected]" },

scrapegraphai/graphs/abstract_graph.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,8 @@
77
from langchain_openai import AzureOpenAIEmbeddings, OpenAIEmbeddings
88
from langchain_community.embeddings import HuggingFaceHubEmbeddings, OllamaEmbeddings
99
from langchain_google_genai import GoogleGenerativeAIEmbeddings
10+
from ..helpers import models_tokens
11+
from ..models import AzureOpenAI, Bedrock, Gemini, Groq, HuggingFace, Ollama, OpenAI, Anthropic, DeepSeek
1012
from langchain_google_genai.embeddings import GoogleGenerativeAIEmbeddings
1113

1214
from ..helpers import models_tokens
@@ -169,7 +171,7 @@ def _create_llm(self, llm_config: dict, chat=False) -> object:
169171
raise KeyError("Model not supported") from exc
170172
return Anthropic(llm_params)
171173
elif "ollama" in llm_params["model"]:
172-
llm_params["model"] = llm_params["model"].split("/")[-1]
174+
llm_params["model"] = llm_params["model"].split("ollama/")[-1]
173175

174176
# allow user to set model_tokens in config
175177
try:
@@ -243,6 +245,8 @@ def _create_default_embedder(self, llm_config=None) -> object:
243245
model="models/embedding-001")
244246
if isinstance(self.llm_model, OpenAI):
245247
return OpenAIEmbeddings(api_key=self.llm_model.openai_api_key)
248+
elif isinstance(self.llm_model, DeepSeek):
249+
return OpenAIEmbeddings(api_key=self.llm_model.openai_api_key)
246250
elif isinstance(self.llm_model, AzureOpenAIEmbeddings):
247251
return self.llm_model
248252
elif isinstance(self.llm_model, AzureOpenAI):
@@ -283,7 +287,7 @@ def _create_embedder(self, embedder_config: dict) -> object:
283287
elif "azure" in embedder_config["model"]:
284288
return AzureOpenAIEmbeddings()
285289
elif "ollama" in embedder_config["model"]:
286-
embedder_config["model"] = embedder_config["model"].split("/")[-1]
290+
embedder_config["model"] = embedder_config["model"].split("ollama/")[-1]
287291
try:
288292
models_tokens["ollama"][embedder_config["model"]]
289293
except KeyError as exc:

scrapegraphai/helpers/models_tokens.py

Lines changed: 21 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -33,9 +33,18 @@
3333
},
3434

3535
"ollama": {
36+
"command-r": 12800,
37+
"command-r-plus": 12800,
38+
"codellama": 16000,
39+
"dbrx": 32768,
40+
"dbrx:instruct": 32768,
41+
"deepseek-coder:33b": 16000,
42+
"dolphin-mixtral": 32000,
3643
"llama2": 4096,
3744
"llama3": 8192,
45+
"llama3:70b-instruct": 8192,
3846
"llava": 4096,
47+
"llava:34b": 4096,
3948
"llava_next": 4096,
4049
"mistral": 8192,
4150
"falcon": 2048,
@@ -46,13 +55,21 @@
4655
"command-r-plus": 12800,
4756
"command-r": 12800,
4857
"mistral:7b-instruct": 32768,
49-
"llama3:70b-instruct": 8192,
58+
"mistral-openorca": 32000,
5059
"mixtral:8x22b-instruct": 65536,
51-
"wizardlm2:8x22b": 65536,
52-
"dbrx": 32768,
53-
"dbrx:instruct": 32768,
5460
"nous-hermes2:34b": 4096,
5561
"orca-mini": 2048,
62+
"phi3:3.8b": 12800,
63+
"phi3:14b": 12800,
64+
"qwen:0.5b": 32000,
65+
"qwen:1.8b": 32000,
66+
"qwen:4b": 32000,
67+
"qwen:14b": 32000,
68+
"qwen:32b": 32000,
69+
"qwen:72b": 32000,
70+
"qwen:110b": 32000,
71+
"stablelm-zephyr": 8192,
72+
"wizardlm2:8x22b": 65536,
5673
# embedding models
5774
"nomic-embed-text": 8192,
5875
"snowflake-arctic-embed:335m": 8192,

scrapegraphai/nodes/fetch_node.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -86,13 +86,14 @@ def execute(self, state):
8686
input_keys[0] == "json_dir"
8787
or input_keys[0] == "xml_dir"
8888
or input_keys[0] == "csv_dir"
89+
or input_keys[0] == "pdf_dir"
8990
):
9091
compressed_document = [
9192
Document(page_content=source, metadata={"source": "local_dir"})
9293
]
9394
state.update({self.output[0]: compressed_document})
9495
return state
95-
96+
9697
# handling for pdf
9798
elif input_keys[0] == "pdf":
9899
loader = PyPDFLoader(source)
@@ -108,7 +109,7 @@ def execute(self, state):
108109
]
109110
state.update({self.output[0]: compressed_document})
110111
return state
111-
112+
112113
elif input_keys[0] == "json":
113114
f = open(source)
114115
compressed_document = [

0 commit comments

Comments
 (0)