Skip to content

Commit e5cdedf

Browse files
committed
add possibility to choose the python library for scripting the scraping
1 parent 1107630 commit e5cdedf

File tree

12 files changed

+73
-57
lines changed

12 files changed

+73
-57
lines changed

.gitignore

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,3 +31,7 @@ examples/graph_examples/ScrapeGraphAI_generated_graph
3131
examples/**/*.csv
3232
examples/**/*.json
3333
main.py
34+
poetry.lock
35+
36+
# lock files
37+
*.lock

examples/gemini/script_generator_gemini.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
"api_key": gemini_key,
2222
"model": "gpt-3.5-turbo",
2323
},
24+
"library": "beautifoulsoup"
2425
}
2526

2627
# ************************************************

examples/local_models/Docker/script_generator_docker.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,8 @@
1818
"embeddings": {
1919
"model": "ollama/nomic-embed-text",
2020
"temperature": 0,
21-
}
21+
},
22+
"library": "beautifoulsoup"
2223
}
2324

2425
# ************************************************

examples/local_models/Ollama/script_generator_ollama.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
"""
1+
"""
22
Basic example of scraping pipeline using ScriptCreatorGraph
33
"""
44
from scrapegraphai.graphs import ScriptCreatorGraph
@@ -19,7 +19,8 @@
1919
"model": "ollama/nomic-embed-text",
2020
"temperature": 0,
2121
"base_url": "http://localhost:11434", # set ollama URL arbitrarily
22-
}
22+
},
23+
"library": "beautifoulsoup"
2324
}
2425

2526
# ************************************************

examples/openai/script_generator_openai.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
"api_key": openai_key,
2121
"model": "gpt-3.5-turbo",
2222
},
23+
"library": "beautifoulsoup"
2324
}
2425

2526
# ************************************************

manual deployment/commit_and_push.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ cd ..
2121
commit_message="$1"
2222

2323
# Run Pylint on the specified Python files
24-
pylint pylint scrapegraphai/**/*.py scrapegraphai/*.py
24+
pylint pylint scrapegraphai/**/*.py scrapegraphai/*.py tests/*.py
2525
#Make the pull
2626
git pull
2727

poetry.lock

Lines changed: 38 additions & 34 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

scrapegraphai/graphs/script_creator_graph.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,8 @@ def __init__(self, prompt: str, source: str, config: dict):
2525

2626
self.input_key = "url" if source.startswith("http") else "local_dir"
2727

28+
self.library = config['library']
29+
2830
def _create_graph(self):
2931
"""
3032
Creates the graph of nodes representing the workflow for web scraping.
@@ -50,6 +52,7 @@ def _create_graph(self):
5052
input="user_prompt & (relevant_chunks | parsed_doc | doc)",
5153
output=["answer"],
5254
node_config={"llm": self.llm_model},
55+
library=self.library
5356
)
5457

5558
return BaseGraph(

scrapegraphai/nodes/fetch_node.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -72,15 +72,16 @@ def execute(self, state):
7272

7373
# if it is a local directory
7474
if not source.startswith("http"):
75-
compressedDocument = [Document(page_content=remover(source), metadata={
75+
compressed_document = [Document(page_content=remover(source), metadata={
7676
"source": "local_dir"
7777
})]
7878

7979
# if it is a URL
8080
else:
8181
loader = AsyncHtmlLoader(source)
8282
document = loader.load()
83-
compressedDocument = [Document(page_content=remover(str(document)))]
83+
compressed_document = [
84+
Document(page_content=remover(str(document)))]
8485

85-
state.update({self.output[0]: compressedDocument})
86+
state.update({self.output[0]: compressed_document})
8687
return state

scrapegraphai/nodes/generate_scraper_node.py

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@ class GenerateScraperNode(BaseNode):
4040
"""
4141

4242
def __init__(self, input: str, output: List[str], node_config: dict,
43-
node_name: str = "GenerateAnswer"):
43+
library: str, node_name: str = "GenerateAnswer"):
4444
"""
4545
Initializes the GenerateScraperNode with a language model client and a node name.
4646
Args:
@@ -49,6 +49,7 @@ def __init__(self, input: str, output: List[str], node_config: dict,
4949
"""
5050
super().__init__(node_name, "node", input, output, 2, node_config)
5151
self.llm_model = node_config["llm"]
52+
self.library = library
5253

5354
def execute(self, state):
5455
"""
@@ -87,21 +88,23 @@ def execute(self, state):
8788
PROMPT:
8889
You are a website scraper script creator and you have just scraped the
8990
following content from a website.
90-
Write the code in python with the Beautiful Soup library to extract the informations requested by the task.\n \n
91+
Write the code in python for extracting the informations requested by the task.\n The library to use is specified in the instructions \n
9192
The website is big so I am giving you one chunk at the time to be merged later with the other chunks.\n
9293
CONTENT OF {chunk_id}: {context}.
9394
Ignore all the context sentences that ask you not to extract information from the html code
95+
LIBRARY: {library}
9496
INSTRUCTIONS: {format_instructions}
9597
QUESTION: {question}
9698
"""
9799
template_no_chunks = """
98100
PROMPT:
99101
You are a website scraper script creator and you have just scraped the
100102
following content from a website.
101-
Write the code in python with the Beautiful Soup library to extract the informations requested by the task.\n \n
103+
Write the code in python for extracting the informations requested by the task.\n The library to use is specified in the instructions \n
102104
The website is big so I am giving you one chunk at the time to be merged later with the other chunks.\n
103105
CONTENT OF {chunk_id}: {context}.
104106
Ignore all the context sentences that ask you not to extract information from the html code
107+
LIBRARY: {library}
105108
INSTRUCTIONS: {format_instructions}
106109
QUESTION: {question}
107110
"""
@@ -130,8 +133,10 @@ def execute(self, state):
130133
template=template,
131134
input_variables=["question"],
132135
partial_variables={"context": chunk.page_content,
133-
"chunk_id": i + 1,
134-
"format_instructions": format_instructions},
136+
"chunk_id": i + 1,
137+
"format_instructions": format_instructions,
138+
"library": self.library
139+
},
135140
)
136141
# Dynamically name the chains based on their index
137142
chain_name = f"chunk{i+1}"

0 commit comments

Comments
 (0)