|
30 | 30 | "| Class | Package | Serializable | JS support | Package latest |\n",
|
31 | 31 | "| :--- | :--- | :---: | :---: | :---: |\n",
|
32 | 32 | "| [SmartScraperTool](https://python.langchain.com/docs/integrations/tools/scrapegraph) | langchain-scrapegraph | ✅ | ❌ |  |\n",
|
| 33 | + "| [SmartCrawlerTool](https://python.langchain.com/docs/integrations/tools/scrapegraph) | langchain-scrapegraph | ✅ | ❌ |  |\n", |
33 | 34 | "| [MarkdownifyTool](https://python.langchain.com/docs/integrations/tools/scrapegraph) | langchain-scrapegraph | ✅ | ❌ |  |\n",
|
34 |
| - "| [LocalScraperTool](https://python.langchain.com/docs/integrations/tools/scrapegraph) | langchain-scrapegraph | ✅ | ❌ |  |\n", |
35 | 35 | "| [GetCreditsTool](https://python.langchain.com/docs/integrations/tools/scrapegraph) | langchain-scrapegraph | ✅ | ❌ |  |\n",
|
36 | 36 | "\n",
|
37 | 37 | "### Tool features\n",
|
38 | 38 | "\n",
|
39 | 39 | "| Tool | Purpose | Input | Output |\n",
|
40 | 40 | "| :--- | :--- | :--- | :--- |\n",
|
41 | 41 | "| SmartScraperTool | Extract structured data from websites | URL + prompt | JSON |\n",
|
| 42 | + "| SmartCrawlerTool | Extract data from multiple pages with crawling | URL + prompt + crawl options | JSON |\n", |
42 | 43 | "| MarkdownifyTool | Convert webpages to markdown | URL | Markdown text |\n",
|
43 |
| - "| LocalScraperTool | Extract data from HTML content | HTML + prompt | JSON |\n", |
44 | 44 | "| GetCreditsTool | Check API credits | None | Credit info |\n",
|
45 | 45 | "\n",
|
46 | 46 | "\n",
|
|
122 | 122 | },
|
123 | 123 | {
|
124 | 124 | "cell_type": "code",
|
125 |
| - "execution_count": 7, |
| 125 | + "execution_count": null, |
126 | 126 | "id": "8b3ddfe9",
|
127 | 127 | "metadata": {},
|
128 | 128 | "outputs": [],
|
129 | 129 | "source": [
|
| 130 | + "from scrapegraph_py.logger import sgai_logger\n", |
| 131 | + "import json\n", |
| 132 | + "\n", |
130 | 133 | "from langchain_scrapegraph.tools import (\n",
|
131 | 134 | " GetCreditsTool,\n",
|
132 |
| - " LocalScraperTool,\n", |
133 | 135 | " MarkdownifyTool,\n",
|
| 136 | + " SmartCrawlerTool,\n", |
134 | 137 | " SmartScraperTool,\n",
|
135 | 138 | ")\n",
|
136 | 139 | "\n",
|
| 140 | + "sgai_logger.set_logging(level=\"INFO\")\n", |
| 141 | + "\n", |
137 | 142 | "smartscraper = SmartScraperTool()\n",
|
| 143 | + "smartcrawler = SmartCrawlerTool()\n", |
138 | 144 | "markdownify = MarkdownifyTool()\n",
|
139 |
| - "localscraper = LocalScraperTool()\n", |
140 | 145 | "credits = GetCreditsTool()"
|
141 | 146 | ]
|
142 | 147 | },
|
|
152 | 157 | "Let's try each tool individually:"
|
153 | 158 | ]
|
154 | 159 | },
|
| 160 | + { |
| 161 | + "cell_type": "markdown", |
| 162 | + "id": "d5a88cf2", |
| 163 | + "metadata": { |
| 164 | + "vscode": { |
| 165 | + "languageId": "raw" |
| 166 | + } |
| 167 | + }, |
| 168 | + "source": [ |
| 169 | + "### SmartCrawler Tool\n", |
| 170 | + "\n", |
| 171 | + "The SmartCrawlerTool allows you to crawl multiple pages from a website and extract structured data with advanced crawling options like depth control, page limits, and domain restrictions.\n" |
| 172 | + ] |
| 173 | + }, |
155 | 174 | {
|
156 | 175 | "cell_type": "code",
|
157 |
| - "execution_count": 6, |
| 176 | + "execution_count": null, |
158 | 177 | "id": "65310a8b",
|
159 | 178 | "metadata": {},
|
160 | 179 | "outputs": [
|
|
189 | 208 | "markdown = markdownify.invoke({\"website_url\": \"https://scrapegraphai.com\"})\n",
|
190 | 209 | "print(\"\\nMarkdownify Result (first 200 chars):\", markdown[:200])\n",
|
191 | 210 | "\n",
|
192 |
| - "local_html = \"\"\"\n", |
193 |
| - "<html>\n", |
194 |
| - " <body>\n", |
195 |
| - " <h1>Company Name</h1>\n", |
196 |
| - " <p>We are a technology company focused on AI solutions.</p>\n", |
197 |
| - " <div class=\"contact\">\n", |
198 |
| - " <p>Email: [email protected]</p>\n", |
199 |
| - " <p>Phone: (555) 123-4567</p>\n", |
200 |
| - " </div>\n", |
201 |
| - " </body>\n", |
202 |
| - "</html>\n", |
203 |
| - "\"\"\"\n", |
204 |
| - "\n", |
205 |
| - "# LocalScraper\n", |
206 |
| - "result_local = localscraper.invoke(\n", |
| 211 | + "# SmartCrawler\n", |
| 212 | + "url = \"https://scrapegraphai.com/\"\n", |
| 213 | + "prompt = (\n", |
| 214 | + " \"What does the company do? and I need text content from their privacy and terms\"\n", |
| 215 | + ")\n", |
| 216 | + "\n", |
| 217 | + "# Use the tool with crawling parameters\n", |
| 218 | + "result_crawler = smartcrawler.invoke(\n", |
207 | 219 | " {\n",
|
208 |
| - " \"user_prompt\": \"Make a summary of the webpage and extract the email and phone number\",\n", |
209 |
| - " \"website_html\": local_html,\n", |
| 220 | + " \"url\": url,\n", |
| 221 | + " \"prompt\": prompt,\n", |
| 222 | + " \"cache_website\": True,\n", |
| 223 | + " \"depth\": 2,\n", |
| 224 | + " \"max_pages\": 2,\n", |
| 225 | + " \"same_domain_only\": True,\n", |
210 | 226 | " }\n",
|
211 | 227 | ")\n",
|
212 |
| - "print(\"LocalScraper Result:\", result_local)\n", |
| 228 | + "\n", |
| 229 | + "print(\"\\nSmartCrawler Result:\")\n", |
| 230 | + "print(json.dumps(result_crawler, indent=2))\n", |
213 | 231 | "\n",
|
214 | 232 | "# Check credits\n",
|
215 | 233 | "credits_info = credits.invoke({})\n",
|
216 | 234 | "print(\"\\nCredits Info:\", credits_info)"
|
217 | 235 | ]
|
218 | 236 | },
|
| 237 | + { |
| 238 | + "cell_type": "code", |
| 239 | + "execution_count": null, |
| 240 | + "id": "f13fb466", |
| 241 | + "metadata": {}, |
| 242 | + "outputs": [], |
| 243 | + "source": [ |
| 244 | + "# SmartCrawler example\n", |
| 245 | + "from scrapegraph_py.logger import sgai_logger\n", |
| 246 | + "import json\n", |
| 247 | + "\n", |
| 248 | + "from langchain_scrapegraph.tools import SmartCrawlerTool\n", |
| 249 | + "\n", |
| 250 | + "sgai_logger.set_logging(level=\"INFO\")\n", |
| 251 | + "\n", |
| 252 | + "# Will automatically get SGAI_API_KEY from environment\n", |
| 253 | + "tool = SmartCrawlerTool()\n", |
| 254 | + "\n", |
| 255 | + "# Example based on the provided code snippet\n", |
| 256 | + "url = \"https://scrapegraphai.com/\"\n", |
| 257 | + "prompt = (\n", |
| 258 | + " \"What does the company do? and I need text content from their privacy and terms\"\n", |
| 259 | + ")\n", |
| 260 | + "\n", |
| 261 | + "# Use the tool with crawling parameters\n", |
| 262 | + "result = tool.invoke(\n", |
| 263 | + " {\n", |
| 264 | + " \"url\": url,\n", |
| 265 | + " \"prompt\": prompt,\n", |
| 266 | + " \"cache_website\": True,\n", |
| 267 | + " \"depth\": 2,\n", |
| 268 | + " \"max_pages\": 2,\n", |
| 269 | + " \"same_domain_only\": True,\n", |
| 270 | + " }\n", |
| 271 | + ")\n", |
| 272 | + "\n", |
| 273 | + "print(json.dumps(result, indent=2))" |
| 274 | + ] |
| 275 | + }, |
219 | 276 | {
|
220 | 277 | "cell_type": "markdown",
|
221 | 278 | "id": "d6e73897",
|
|
350 | 407 | "source": [
|
351 | 408 | "## API reference\n",
|
352 | 409 | "\n",
|
353 |
| - "For detailed documentation of all ScrapeGraph features and configurations head to the Langchain API reference: https://python.langchain.com/docs/integrations/tools/scrapegraph\n", |
| 410 | + "For detailed documentation of all ScrapeGraph features and configurations head to [the Langchain API reference](https://python.langchain.com/docs/integrations/tools/scrapegraph).\n", |
354 | 411 | "\n",
|
355 |
| - "Or to the official SDK repo: https://github.com/ScrapeGraphAI/langchain-scrapegraph" |
| 412 | + "Or to [the official SDK repo](https://github.com/ScrapeGraphAI/langchain-scrapegraph)." |
356 | 413 | ]
|
357 | 420 | }
|
358 | 421 | ],
|
359 | 422 | "metadata": {
|
360 | 423 | "kernelspec": {
|
361 |
| - "display_name": "Python 3", |
| 424 | + "display_name": "langchain", |
362 | 425 | "language": "python",
|
363 | 426 | "name": "python3"
|
364 | 427 | },
|
|
372 | 435 | "name": "python",
|
373 | 436 | "nbconvert_exporter": "python",
|
374 | 437 | "pygments_lexer": "ipython3",
|
375 |
| - "version": "3.11.9" |
| 438 | + "version": "3.10.16" |
376 | 439 | }
|
377 | 440 | },
|
378 | 441 | "nbformat": 4,
|
|