Skip to content

Commit 9bd06e3

Browse files
author
Thordata
committed
feat: v0.2.0 - Add new tools, update for SDK v0.4.0
- Add ThordataUniversalTool for advanced scraping
- Add ThordataProxyTool for geo-targeted requests
- Update existing tools for SDK v0.4.0 compatibility
- Update examples
- Improve documentation
1 parent 77faf82 commit 9bd06e3

File tree

10 files changed

+695
-340
lines changed

10 files changed

+695
-340
lines changed

.env.example

Lines changed: 14 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,16 @@
1-
# Copy this file to .env and fill in your Thordata credentials.
2-
# Never commit your .env file to version control.
1+
# Thordata LangChain Tools Configuration
2+
# Copy this file to .env and fill in your credentials
33

4-
# Found at the bottom of the Thordata Dashboard
5-
THORDATA_SCRAPER_TOKEN=replace_with_your_scraper_token
4+
# Required: Get from Thordata Dashboard
5+
THORDATA_SCRAPER_TOKEN=your_scraper_token_here
66

7-
# Found in the "Public API" section
8-
THORDATA_PUBLIC_TOKEN=replace_with_your_public_token
9-
THORDATA_PUBLIC_KEY=replace_with_your_public_key
7+
# Optional: For advanced features
8+
THORDATA_PUBLIC_TOKEN=your_public_token_here
9+
THORDATA_PUBLIC_KEY=your_public_key_here
10+
11+
# Optional: For geo-targeted proxy requests
12+
THORDATA_USERNAME=your_proxy_username
13+
THORDATA_PASSWORD=your_proxy_password
14+
15+
# Optional: For agent examples
16+
OPENAI_API_KEY=your_openai_key_here

examples/simple_agent.py

Lines changed: 100 additions & 78 deletions
Original file line numberDiff line numberDiff line change
@@ -1,105 +1,127 @@
1-
from __future__ import annotations
1+
"""
2+
Simple LangChain Agent Example
23
3-
from typing import Any, Dict, List, Optional
4+
Demonstrates using Thordata tools with a LangChain agent to:
5+
1. Search for information
6+
2. Scrape a webpage
7+
3. Summarize the content
48
9+
Requirements:
10+
pip install langchain-openai openai
11+
12+
Usage:
13+
export OPENAI_API_KEY=your_key
14+
python examples/simple_agent.py
15+
"""
16+
17+
import os
18+
import sys
519
from dotenv import load_dotenv
20+
21+
load_dotenv()
22+
23+
# Check required environment variables
24+
if not os.getenv("THORDATA_SCRAPER_TOKEN"):
25+
print("❌ Error: Set THORDATA_SCRAPER_TOKEN in your .env file")
26+
sys.exit(1)
27+
28+
if not os.getenv("OPENAI_API_KEY"):
29+
print("❌ Error: Set OPENAI_API_KEY in your .env file")
30+
sys.exit(1)
31+
632
from langchain_openai import ChatOpenAI
733
from langchain_core.messages import HumanMessage
834

935
from thordata_langchain_tools import ThordataSerpTool, ThordataScrapeTool
1036

11-
load_dotenv() # Load OPENAI_API_KEY and THORDATA_* from a local .env file
12-
1337

14-
def find_thordata_homepage(max_results: int = 5) -> Optional[str]:
15-
"""
16-
Use ThordataSerpTool to find the official Thordata homepage URL.
17-
18-
Returns:
19-
The first organic result's link, or None if nothing is found.
20-
"""
38+
def search_for_homepage(query: str) -> str:
39+
"""Use SERP tool to find a homepage URL."""
40+
print(f"🔍 Searching for: '{query}'")
41+
2142
serp_tool = ThordataSerpTool()
43+
results = serp_tool.invoke({
44+
"query": query,
45+
"engine": "google",
46+
"num": 3,
47+
})
2248

23-
serp_result: Dict[str, Any] = serp_tool.invoke(
24-
{
25-
"query": "Thordata official homepage",
26-
"engine": "google",
27-
"num": max_results,
28-
}
29-
)
30-
31-
organic: List[Dict[str, Any]] = serp_result.get("organic") or []
49+
organic = results.get("organic", [])
3250
for item in organic:
33-
link = item.get("link")
34-
if link:
51+
link = item.get("link", "")
52+
if link and "thordata" in link.lower():
3553
return link
54+
55+
# Return first result if no thordata link found
56+
if organic:
57+
return organic[0].get("link", "")
58+
59+
raise RuntimeError("No results found")
60+
61+
62+
def scrape_page(url: str) -> str:
63+
"""Use Scrape tool to get page content."""
64+
print(f"📄 Scraping: {url}")
65+
66+
scrape_tool = ThordataScrapeTool()
67+
html = scrape_tool.invoke({
68+
"url": url,
69+
"js_render": False,
70+
"max_length": 5000,
71+
})
72+
73+
return html
74+
75+
76+
def summarize_with_llm(html: str, topic: str) -> str:
77+
"""Use LLM to summarize the content."""
78+
print("🤖 Summarizing with LLM...")
79+
80+
llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)
3681

37-
return None
82+
prompt = f"""You are a helpful assistant that summarizes web content.
3883
84+
Based on the following HTML content, provide a brief summary about {topic}.
85+
Focus on the key products, services, or features mentioned.
3986
40-
def scrape_url(url: str) -> str:
41-
"""
42-
Use ThordataScrapeTool to fetch the HTML of a given URL.
87+
Provide your summary in 3-5 bullet points.
4388
44-
The tool itself truncates overly long HTML to avoid huge LLM inputs.
45-
"""
46-
scrape_tool = ThordataScrapeTool()
47-
result = scrape_tool.invoke(
48-
{
49-
"url": url,
50-
"js_render": False,
51-
"output_format": "HTML",
52-
}
53-
)
54-
55-
if isinstance(result, str):
56-
return result
57-
return str(result)
58-
59-
60-
def summarize_html_with_llm(html: str) -> str:
61-
"""
62-
Call OpenAI (via LangChain ChatOpenAI) exactly once to summarize the HTML.
63-
We deliberately truncate the HTML to keep the token count very small,
64-
so that it fits comfortably under strict TPM limits.
65-
"""
66-
# Hard cap: we only keep the first 3000 characters of the HTML.
67-
# 3000 chars ~= 1000–1500 tokens, which is safe for your 60k TPM limit.
68-
MAX_HTML_FOR_LLM = 3000
69-
if len(html) > MAX_HTML_FOR_LLM:
70-
html = (
71-
html[:MAX_HTML_FOR_LLM]
72-
+ "\n\n[Truncated to first "
73-
f"{MAX_HTML_FOR_LLM} characters before sending to the LLM]"
74-
)
75-
76-
llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)
77-
78-
prompt = (
79-
"You are a technical writer.\n"
80-
"You will be given (a truncated portion of) the Thordata homepage HTML.\n"
81-
"Based on this excerpt, summarize Thordata's core products and services "
82-
"in at most 5 bullet points.\n"
83-
"Be concise and concrete, and avoid marketing fluff.\n\n"
84-
f"HTML content (truncated):\n{html}"
85-
)
89+
HTML Content:
90+
{html[:4000]}
91+
"""
8692

8793
response = llm.invoke([HumanMessage(content=prompt)])
8894
return response.content
8995

9096

91-
if __name__ == "__main__":
92-
homepage_url = find_thordata_homepage()
97+
def main():
98+
print("=" * 60)
99+
print("🚀 Thordata LangChain Agent Demo")
100+
print("=" * 60)
101+
print()
93102

94-
if not homepage_url:
95-
print("Could not determine Thordata homepage URL from SERP results.")
96-
raise SystemExit(1)
103+
try:
104+
# Step 1: Search for Thordata
105+
url = search_for_homepage("Thordata proxy network official site")
106+
print(f" Found URL: {url}\n")
97107

98-
print(f"Detected Thordata homepage URL: {homepage_url}")
108+
# Step 2: Scrape the page
109+
html = scrape_page(url)
110+
print(f" Scraped {len(html)} characters\n")
99111

100-
html = scrape_url(homepage_url)
112+
# Step 3: Summarize
113+
summary = summarize_with_llm(html, "Thordata's services")
114+
115+
print()
116+
print("=" * 60)
117+
print("📋 Summary:")
118+
print("=" * 60)
119+
print(summary)
101120

102-
summary = summarize_html_with_llm(html)
121+
except Exception as e:
122+
print(f"❌ Error: {e}")
123+
sys.exit(1)
103124

104-
print("\n=== Summary of Thordata Services ===")
105-
print(summary)
125+
126+
if __name__ == "__main__":
127+
main()

examples/simple_scrape.py

Lines changed: 42 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,24 +1,49 @@
1-
from dotenv import load_dotenv
1+
"""
2+
Simple Web Scraping Example
23
3-
from thordata_langchain_tools import ThordataScrapeTool
4+
Demonstrates using ThordataScrapeTool to fetch webpage content.
5+
6+
Usage:
7+
python examples/simple_scrape.py
8+
"""
9+
10+
import os
11+
import sys
12+
from dotenv import load_dotenv
413

514
load_dotenv()
615

16+
if not os.getenv("THORDATA_SCRAPER_TOKEN"):
17+
print("❌ Error: Set THORDATA_SCRAPER_TOKEN in your .env file")
18+
sys.exit(1)
719

8-
if __name__ == "__main__":
20+
from thordata_langchain_tools import ThordataScrapeTool
21+
22+
23+
def main():
924
tool = ThordataScrapeTool()
1025

11-
result = tool.invoke(
12-
{
13-
"url": "https://www.thordata.com",
14-
"js_render": False,
15-
"output_format": "HTML",
16-
}
17-
)
18-
19-
# For HTML output this will be a long string.
20-
# Print only the first 1000 characters to keep the console readable.
21-
if isinstance(result, str):
22-
print(result[:1000])
23-
else:
24-
print(result)
26+
url = "https://example.com"
27+
print(f"🌐 Scraping: {url}")
28+
print()
29+
30+
# Scrape the page
31+
html = tool.invoke({
32+
"url": url,
33+
"js_render": False,
34+
"max_length": 2000,
35+
})
36+
37+
if html.startswith("Error"):
38+
print(f"❌ {html}")
39+
return
40+
41+
print("📄 HTML Content (first 1000 chars):")
42+
print("-" * 50)
43+
print(html[:1000])
44+
print("-" * 50)
45+
print(f"\n✅ Successfully scraped {len(html)} characters")
46+
47+
48+
if __name__ == "__main__":
49+
main()

examples/simple_serp.py

Lines changed: 54 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,61 @@
1+
"""
2+
Simple SERP Search Example
3+
4+
Demonstrates using ThordataSerpTool to search the web.
5+
6+
Usage:
7+
python examples/simple_serp.py
8+
"""
9+
10+
import os
11+
import sys
112
from dotenv import load_dotenv
213

3-
from thordata_langchain_tools import ThordataSerpTool
14+
# Load environment variables
15+
load_dotenv()
416

5-
load_dotenv() # Load THORDATA_* tokens and keys from a local .env file
17+
# Check for required token
18+
if not os.getenv("THORDATA_SCRAPER_TOKEN"):
19+
print("❌ Error: Set THORDATA_SCRAPER_TOKEN in your .env file")
20+
sys.exit(1)
621

22+
from thordata_langchain_tools import ThordataSerpTool
723

8-
if __name__ == "__main__":
24+
25+
def main():
26+
# Create the tool
927
tool = ThordataSerpTool()
1028

11-
result = tool.invoke(
12-
{
13-
"query": "Thordata proxy network",
14-
"engine": "google",
15-
"num": 3,
16-
}
17-
)
18-
print(result)
29+
print("🔍 Searching for: 'Python web scraping best practices'")
30+
print()
31+
32+
# Execute search
33+
results = tool.invoke({
34+
"query": "Python web scraping best practices",
35+
"engine": "google",
36+
"num": 5,
37+
})
38+
39+
# Check for errors
40+
if "error" in results:
41+
print(f"❌ Error: {results['error']}")
42+
return
43+
44+
# Display organic results
45+
organic = results.get("organic", [])
46+
print(f"📊 Found {len(organic)} organic results:\n")
47+
48+
for i, item in enumerate(organic, 1):
49+
title = item.get("title", "No title")
50+
link = item.get("link", "No link")
51+
snippet = item.get("snippet", "")[:100]
52+
53+
print(f"{i}. {title}")
54+
print(f" 🔗 {link}")
55+
if snippet:
56+
print(f" 📝 {snippet}...")
57+
print()
58+
59+
60+
if __name__ == "__main__":
61+
main()

0 commit comments

Comments
 (0)