Skip to content

Commit 61c732f

Browse files
authored
Merge pull request #124 from VinciGit00/csv-scraper
add csv scraper
2 parents 9356124 + 4d542a8 commit 61c732f

File tree

17 files changed

+680
-1
lines changed

17 files changed

+680
-1
lines changed

.gitignore

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,8 @@ venv/
2828
*.sqlite
2929
*.google-cookie
3030
examples/graph_examples/ScrapeGraphAI_generated_graph
31-
examples/**/*.csv
31+
examples/**/result.csv
32+
examples/**/result.json
3233
main.py
3334
poetry.lock
3435

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
"""
2+
Basic example of scraping pipeline using CSVScraperGraph from CSV documents
3+
"""
4+
5+
import os
6+
from dotenv import load_dotenv
7+
import pandas as pd
8+
from scrapegraphai.graphs import CSVScraperGraph
9+
from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info
10+
11+
load_dotenv()
12+
13+
# ************************************************
14+
# Read the csv file
15+
# ************************************************
16+
17+
text = pd.read_csv("inputs/username.csv")
18+
19+
# ************************************************
20+
# Define the configuration for the graph
21+
# ************************************************
22+
23+
graph_config = {
24+
"llm": {
25+
"model": "ollama/mistral",
26+
"temperature": 0,
27+
"format": "json", # Ollama needs the format to be specified explicitly
28+
# "model_tokens": 2000, # set context length arbitrarily
29+
"base_url": "http://localhost:11434",
30+
},
31+
"embeddings": {
32+
"model": "ollama/nomic-embed-text",
33+
"temperature": 0,
34+
"base_url": "http://localhost:11434",
35+
}
36+
}
37+
38+
# ************************************************
39+
# Create the CSVScraperGraph instance and run it
40+
# ************************************************
41+
42+
csv_scraper_graph = CSVScraperGraph(
43+
prompt="List me all the last names",
44+
source=str(text), # Pass the parsed CSV as text (str() of the DataFrame), not the DataFrame object itself
45+
config=graph_config
46+
)
47+
48+
result = csv_scraper_graph.run()
49+
print(result)
50+
51+
# ************************************************
52+
# Get graph execution info
53+
# ************************************************
54+
55+
graph_exec_info = csv_scraper_graph.get_execution_info()
56+
print(prettify_exec_info(graph_exec_info))
57+
58+
# Save the result to both CSV and JSON
59+
convert_to_csv(result, "result")
60+
convert_to_json(result, "result")
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
Username; Identifier;First name;Last name
2+
booker12;9012;Rachel;Booker
3+
grey07;2070;Laura;Grey
4+
johnson81;4081;Craig;Johnson
5+
jenkins46;9346;Mary;Jenkins
6+
smith79;5079;Jamie;Smith
7+

examples/gemini/scrape_xml_gemini.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
from dotenv import load_dotenv
77
from scrapegraphai.graphs import SmartScraperGraph
88
from scrapegraphai.utils import prettify_exec_info
9+
910
load_dotenv()
1011

1112
# ************************************************
Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
"""
2+
Basic example of scraping pipeline using CSVScraperGraph from CSV documents
3+
"""
4+
5+
import pandas as pd
6+
from scrapegraphai.graphs import CSVScraperGraph
7+
from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info
8+
9+
# ************************************************
10+
# Read the csv file
11+
# ************************************************
12+
13+
text = pd.read_csv("inputs/username.csv")
14+
15+
# ************************************************
16+
# Define the configuration for the graph
17+
# ************************************************
18+
19+
graph_config = {
20+
"llm": {
21+
"model": "ollama/mistral",
22+
"temperature": 0,
23+
"format": "json", # Ollama needs the format to be specified explicitly
24+
# "model_tokens": 2000, # set context length arbitrarily
25+
},
26+
"embeddings": {
27+
"model": "ollama/nomic-embed-text",
28+
"temperature": 0,
29+
}
30+
}
31+
32+
# ************************************************
33+
# Create the CSVScraperGraph instance and run it
34+
# ************************************************
35+
36+
csv_scraper_graph = CSVScraperGraph(
37+
prompt="List me all the last names",
38+
source=str(text), # Pass the parsed CSV as text (str() of the DataFrame), not the DataFrame object itself
39+
config=graph_config
40+
)
41+
42+
result = csv_scraper_graph.run()
43+
print(result)
44+
45+
# ************************************************
46+
# Get graph execution info
47+
# ************************************************
48+
49+
graph_exec_info = csv_scraper_graph.get_execution_info()
50+
print(prettify_exec_info(graph_exec_info))
51+
52+
# Save the result to both CSV and JSON
53+
convert_to_csv(result, "result")
54+
convert_to_json(result, "result")
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
Username; Identifier;First name;Last name
2+
booker12;9012;Rachel;Booker
3+
grey07;2070;Laura;Grey
4+
johnson81;4081;Craig;Johnson
5+
jenkins46;9346;Mary;Jenkins
6+
smith79;5079;Jamie;Smith
7+
Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
"""
2+
Basic example of scraping pipeline using CSVScraperGraph from CSV documents
3+
"""
4+
5+
import pandas as pd
6+
from scrapegraphai.graphs import CSVScraperGraph
7+
from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info
8+
9+
# ************************************************
10+
# Read the csv file
11+
# ************************************************
12+
13+
text = pd.read_csv("inputs/username.csv")
14+
15+
# ************************************************
16+
# Define the configuration for the graph
17+
# ************************************************
18+
19+
graph_config = {
20+
"llm": {
21+
"model": "ollama/mistral",
22+
"temperature": 0,
23+
"format": "json", # Ollama needs the format to be specified explicitly
24+
# "model_tokens": 2000, # set context length arbitrarily
25+
"base_url": "http://localhost:11434",
26+
},
27+
"embeddings": {
28+
"model": "ollama/nomic-embed-text",
29+
"temperature": 0,
30+
"base_url": "http://localhost:11434",
31+
}
32+
}
33+
34+
# ************************************************
35+
# Create the CSVScraperGraph instance and run it
36+
# ************************************************
37+
38+
csv_scraper_graph = CSVScraperGraph(
39+
prompt="List me all the last names",
40+
source=str(text), # Pass the parsed CSV as text (str() of the DataFrame), not the DataFrame object itself
41+
config=graph_config
42+
)
43+
44+
result = csv_scraper_graph.run()
45+
print(result)
46+
47+
# ************************************************
48+
# Get graph execution info
49+
# ************************************************
50+
51+
graph_exec_info = csv_scraper_graph.get_execution_info()
52+
print(prettify_exec_info(graph_exec_info))
53+
54+
# Save the result to both CSV and JSON
55+
convert_to_csv(result, "result")
56+
convert_to_json(result, "result")
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
Username; Identifier;First name;Last name
2+
booker12;9012;Rachel;Booker
3+
grey07;2070;Laura;Grey
4+
johnson81;4081;Craig;Johnson
5+
jenkins46;9346;Mary;Jenkins
6+
smith79;5079;Jamie;Smith
7+
Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
"""
2+
Basic example of scraping pipeline using CSVScraperGraph from CSV documents
3+
"""
4+
5+
import os
6+
from dotenv import load_dotenv
7+
import pandas as pd
8+
from scrapegraphai.graphs import CSVScraperGraph
9+
from scrapegraphai.utils import convert_to_csv, convert_to_json, prettify_exec_info
10+
11+
load_dotenv()
12+
# ************************************************
13+
# Read the csv file
14+
# ************************************************
15+
16+
text = pd.read_csv("inputs/username.csv")
17+
18+
# ************************************************
19+
# Define the configuration for the graph
20+
# ************************************************
21+
22+
openai_key = os.getenv("OPENAI_APIKEY")
23+
24+
graph_config = {
25+
"llm": {
26+
"api_key": openai_key,
27+
"model": "gpt-3.5-turbo",
28+
},
29+
}
30+
31+
# ************************************************
32+
# Create the CSVScraperGraph instance and run it
33+
# ************************************************
34+
35+
csv_scraper_graph = CSVScraperGraph(
36+
prompt="List me all the last names",
37+
source=str(text), # Pass the parsed CSV as text (str() of the DataFrame), not the DataFrame object itself
38+
config=graph_config
39+
)
40+
41+
result = csv_scraper_graph.run()
42+
print(result)
43+
44+
# ************************************************
45+
# Get graph execution info
46+
# ************************************************
47+
48+
graph_exec_info = csv_scraper_graph.get_execution_info()
49+
print(prettify_exec_info(graph_exec_info))
50+
51+
# Save the result to both CSV and JSON
52+
convert_to_csv(result, "result")
53+
convert_to_json(result, "result")
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
Username; Identifier;First name;Last name
2+
booker12;9012;Rachel;Booker
3+
grey07;2070;Laura;Grey
4+
johnson81;4081;Craig;Johnson
5+
jenkins46;9346;Mary;Jenkins
6+
smith79;5079;Jamie;Smith
7+

0 commit comments

Comments
 (0)