diff --git a/.github/workflows/python-publish.yml b/.github/workflows/python-publish.yml
index 0c362a8..8431e3e 100644
--- a/.github/workflows/python-publish.yml
+++ b/.github/workflows/python-publish.yml
@@ -6,15 +6,11 @@ name: Upload Python Package
on:
  release:
    types: [published]
-    paths:
-      - 'scrapegraph-py/**'
jobs:
  deploy:
    runs-on: ubuntu-latest
-    # Only run if scrapegraph-py has changes
-    if: contains(github.event.release.body, 'scrapegraph-py/')
    steps:
      - uses: actions/checkout@v4
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
index cb2e3a6..2f7a203 100644
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -4,15 +4,11 @@ on:
    branches:
      - main
      - pre/*
-    paths:
-      - 'scrapegraph-py/**'
jobs:
  build:
    name: Build
    runs-on: ubuntu-latest
-    # Only run if scrapegraph-py has changes
-    if: contains(github.event.head_commit.modified, 'scrapegraph-py/') || contains(github.event.head_commit.added, 'scrapegraph-py/') || contains(github.event.head_commit.removed, 'scrapegraph-py/')
    steps:
      - name: Install git
        run: |
diff --git a/scrapegraph-js/README.md b/scrapegraph-js/README.md
index 9ed7150..fe68c7e 100644
--- a/scrapegraph-js/README.md
+++ b/scrapegraph-js/README.md
@@ -35,6 +35,7 @@ yarn add scrapegraph-js
```javascript
import { smartScraper } from 'scrapegraph-js';
+import 'dotenv/config';
// Initialize variables
const apiKey = process.env.SGAI_APIKEY; // Set your API key as an environment variable
@@ -105,12 +106,43 @@ const schema = z.object({
})();
```
+### Scraping local HTML
+
+Extract structured data from local HTML content.
+
+```javascript
+import { localScraper } from 'scrapegraph-js';
+
+const apiKey = 'your_api_key';
+const prompt = 'What does the company do?';
+
+const websiteHtml = `
+  <html>
+    <body>
+      <h1>Company Name</h1>
+      <p>We are a technology company focused on AI solutions.</p>
+      <p>Email: contact@company.com</p>
+    </body>
+  </html>
+`;
+(async () => {
+  try {
+    const response = await localScraper(apiKey, websiteHtml, prompt);
+    console.log(response);
+  } catch (error) {
+    console.error(error);
+  }
+})();
+```
+
### Markdownify
+
Converts a webpage into clean, well-structured markdown format.
+
```javascript
import { smartScraper } from 'scrapegraph-js';
-const apiKey = "your_api_key";
+const apiKey = 'your_api_key';
const url = 'https://scrapegraphai.com/';
(async () => {
@@ -123,7 +155,6 @@ const url = 'https://scrapegraphai.com/';
})();
```
-
### Checking API Credits
```javascript
diff --git a/scrapegraph-js/examples/localScraper_example.js b/scrapegraph-js/examples/localScraper_example.js
new file mode 100644
index 0000000..95552e5
--- /dev/null
+++ b/scrapegraph-js/examples/localScraper_example.js
@@ -0,0 +1,33 @@
+import { localScraper, getLocalScraperRequest } from 'scrapegraph-js';
+import 'dotenv/config';
+
+// localScraper function example
+const apiKey = process.env.SGAI_APIKEY;
+const prompt = 'What does the company do?';
+
+const websiteHtml = `
+  <html>
+    <body>
+      <h1>Company Name</h1>
+      <p>We are a technology company focused on AI solutions.</p>
+      <p>Email: contact@company.com</p>
+    </body>
+  </html>
+`;
+
+try {
+  const response = await localScraper(apiKey, websiteHtml, prompt);
+  console.log(response);
+} catch (error) {
+  console.error(error);
+}
+
+// getLocalScraperRequest function example
+const requestId = 'b8d97545-9ed3-441b-a01f-4b661b4f0b4c';
+
+try {
+  const response = await getLocalScraperRequest(apiKey, requestId);
+  console.log(response);
+} catch (error) {
+  console.error(error);
+}
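
The second half of this example fetches an already-known request by ID. For a freshly submitted job, the same pair of functions can be combined into a polling loop. A minimal sketch, assuming the submit response carries a `request_id` field and that a finished request reports `status === 'completed'`; both field names are assumptions, not confirmed by this diff:

```javascript
import { localScraper, getLocalScraperRequest } from 'scrapegraph-js';
import 'dotenv/config';

const apiKey = process.env.SGAI_APIKEY;

// Hypothetical polling helper built on the two exported functions.
// Assumes `submitted.request_id` identifies the job and that a finished
// request reports `status === 'completed'`.
async function scrapeAndWait(websiteHtml, prompt, intervalMs = 2000, maxAttempts = 15) {
  const submitted = await localScraper(apiKey, websiteHtml, prompt);
  for (let attempt = 0; attempt < maxAttempts; attempt++) {
    const result = await getLocalScraperRequest(apiKey, submitted.request_id);
    if (result.status === 'completed') return result;
    // Wait before polling again
    await new Promise((resolve) => setTimeout(resolve, intervalMs));
  }
  throw new Error('localScraper request did not complete in time');
}
```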
diff --git a/scrapegraph-js/examples/schema_localScraper_example.js b/scrapegraph-js/examples/schema_localScraper_example.js
new file mode 100644
index 0000000..1de6344
--- /dev/null
+++ b/scrapegraph-js/examples/schema_localScraper_example.js
@@ -0,0 +1,28 @@
+import { localScraper } from 'scrapegraph-js';
+import { z } from 'zod';
+import 'dotenv/config';
+
+// localScraper with output schema example
+const apiKey = process.env.SGAI_APIKEY;
+const prompt = 'Extract the contact information';
+
+const websiteHtml = `
+  <html>
+    <body>
+      <h1>Company Name</h1>
+      <p>We are a technology company focused on AI solutions.</p>
+      <p>Email: contact@company.com</p>
+    </body>
+  </html>
+`;
+
+const schema = z.object({
+  contact: z.string().describe('email contact'),
+});
+
+try {
+  const response = await localScraper(apiKey, websiteHtml, prompt, schema);
+  console.log(response);
+} catch (error) {
+  console.error(error);
+}
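
Because the schema is converted with zod-to-json-schema (see src/localScraper.js below), nested Zod schemas work the same way as flat ones. A sketch with a richer, hypothetical schema:

```javascript
import { z } from 'zod';

// Hypothetical richer schema: nested objects and arrays are converted
// to JSON Schema just like the flat example above.
const companySchema = z.object({
  name: z.string().describe('company name'),
  description: z.string().describe('what the company does'),
  contacts: z
    .array(
      z.object({
        type: z.string().describe('contact channel, e.g. email or phone'),
        value: z.string().describe('the contact itself'),
      })
    )
    .describe('every contact method found on the page'),
});

// Passed as the fourth argument:
// const response = await localScraper(apiKey, websiteHtml, prompt, companySchema);
```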
diff --git a/scrapegraph-js/index.js b/scrapegraph-js/index.js
index ca4dbb7..1e4c1c5 100644
--- a/scrapegraph-js/index.js
+++ b/scrapegraph-js/index.js
@@ -1,4 +1,5 @@
export { smartScraper, getSmartScraperRequest } from './src/smartScraper.js';
export { markdownify, getMarkdownifyRequest } from './src/markdownify.js';
+export { localScraper, getLocalScraperRequest } from './src/localScraper.js';
export { getCredits } from './src/credits.js';
export { sendFeedback } from './src/feedback.js';
diff --git a/scrapegraph-js/src/localScraper.js b/scrapegraph-js/src/localScraper.js
new file mode 100644
index 0000000..412aac4
--- /dev/null
+++ b/scrapegraph-js/src/localScraper.js
@@ -0,0 +1,66 @@
+import axios from 'axios';
+import handleError from './utils/handleError.js';
+import { ZodType } from 'zod';
+import { zodToJsonSchema } from 'zod-to-json-schema';
+
+/**
+ * Extract structured data from local HTML content using ScrapeGraph AI.
+ *
+ * @param {string} apiKey - The API key for ScrapeGraph AI.
+ * @param {string} websiteHtml - HTML content as a string from the local web page to scrape.
+ * @param {string} prompt - A natural language description of the data to extract.
+ * @param {Object} [schema] - (Optional) Schema object defining the structure of the desired output.
+ * @returns {Promise} A promise that resolves to the extracted data, formatted to match the provided schema.
+ * @throws {Error} If an HTTP error or validation issue occurs.
+ */
+export async function localScraper(apiKey, websiteHtml, prompt, schema = null) {
+  const endpoint = 'https://api.scrapegraphai.com/v1/localscraper';
+  const headers = {
+    'accept': 'application/json',
+    'SGAI-APIKEY': apiKey,
+    'Content-Type': 'application/json',
+  };
+
+  const payload = {
+    website_html: websiteHtml,
+    user_prompt: prompt,
+  };
+
+  if (schema) {
+    if (schema instanceof ZodType) {
+      payload.output_schema = zodToJsonSchema(schema);
+    } else {
+      throw new Error('The schema must be an instance of a valid Zod schema');
+    }
+  }
+
+  try {
+    const response = await axios.post(endpoint, payload, { headers });
+    return response.data;
+  } catch (error) {
+    handleError(error);
+  }
+}
+
+/**
+ * Retrieve the status or result of a localScraper request, including results of previous requests.
+ *
+ * @param {string} apiKey - The API key for ScrapeGraph AI.
+ * @param {string} requestId - The unique ID associated with the localScraper request.
+ * @returns {Promise} A promise that resolves to the status or result of the scraping request.
+ * @throws {Error} If an error occurs while retrieving the request details.
+ */
+export async function getLocalScraperRequest(apiKey, requestId) {
+  const endpoint = 'https://api.scrapegraphai.com/v1/localscraper/' + requestId;
+  const headers = {
+    'accept': 'application/json',
+    'SGAI-APIKEY': apiKey,
+  };
+
+  try {
+    const response = await axios.get(endpoint, { headers });
+    return response.data;
+  } catch (error) {
+    handleError(error);
+  }
+}
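
The only transformation applied to a schema before it is sent is the zodToJsonSchema call above. To inspect what ends up in the `output_schema` field of the payload, the conversion can be run standalone; a minimal sketch:

```javascript
import { z } from 'zod';
import { zodToJsonSchema } from 'zod-to-json-schema';

const schema = z.object({
  contact: z.string().describe('email contact'),
});

// Prints the JSON Schema object that localScraper would attach to the
// payload as `output_schema`.
console.log(JSON.stringify(zodToJsonSchema(schema), null, 2));
```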
diff --git a/scrapegraph-js/src/markdownify.js b/scrapegraph-js/src/markdownify.js
index 5a1d4e5..14ae0e3 100644
--- a/scrapegraph-js/src/markdownify.js
+++ b/scrapegraph-js/src/markdownify.js
@@ -9,7 +9,7 @@ import handleError from './utils/handleError.js';
* @returns {Promise} A promise that resolves to the markdown representation of the webpage.
* @throws {Error} Throws an error if the HTTP request fails.
*/
-export async function markdownify(apiKey, url){
+export async function markdownify(apiKey, url) {
  const endpoint = 'https://api.scrapegraphai.com/v1/markdownify';
  const headers = {
    'accept': 'application/json',
@@ -24,7 +24,7 @@ export async function markdownify(apiKey, url){
    const response = await axios.post(endpoint, payload, { headers });
    return response.data;
  } catch (error) {
-    handleError(error)
+    handleError(error);
  }
}
@@ -36,7 +36,7 @@ export async function markdownify(apiKey, url){
* @returns {Promise} A promise that resolves with details about the status or outcome of the specified request.
* @throws {Error} Throws an error if the HTTP request fails.
*/
-export async function getMarkdownifyRequest(apiKey, requestId){
+export async function getMarkdownifyRequest(apiKey, requestId) {
  const endpoint = 'https://api.scrapegraphai.com/v1/markdownify/' + requestId;
  const headers = {
    'accept': 'application/json',
@@ -47,6 +47,6 @@ export async function getMarkdownifyRequest(apiKey, requestId){
    const response = await axios.get(endpoint, { headers });
    return response.data;
  } catch (error) {
-    handleError(error)
+    handleError(error);
  }
-}
\ No newline at end of file
+}
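
For reference, a minimal usage sketch of the two functions documented above, assuming the API key is provided via the SGAI_APIKEY environment variable as in the other examples:

```javascript
import { markdownify, getMarkdownifyRequest } from 'scrapegraph-js';
import 'dotenv/config';

const apiKey = process.env.SGAI_APIKEY;

(async () => {
  try {
    // Convert a page to markdown; the same request can later be
    // retrieved by its ID with getMarkdownifyRequest(apiKey, requestId).
    const markdown = await markdownify(apiKey, 'https://scrapegraphai.com/');
    console.log(markdown);
  } catch (error) {
    console.error(error);
  }
})();
```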
diff --git a/scrapegraph-py/CHANGELOG.md b/scrapegraph-py/CHANGELOG.md
index 26cc6f1..a293570 100644
--- a/scrapegraph-py/CHANGELOG.md
+++ b/scrapegraph-py/CHANGELOG.md
@@ -1,3 +1,34 @@
+## [1.9.0-beta.3](https://github.com/ScrapeGraphAI/scrapegraph-sdk/compare/v1.9.0-beta.2...v1.9.0-beta.3) (2024-12-10)
+
+
+### Bug Fixes
+
+* come back to py 3.10 ([26d3a75](https://github.com/ScrapeGraphAI/scrapegraph-sdk/commit/26d3a75ed973590e21d55c985bf71f3905a3ac0e))
+
+## [1.9.0-beta.2](https://github.com/ScrapeGraphAI/scrapegraph-sdk/compare/v1.9.0-beta.1...v1.9.0-beta.2) (2024-12-10)
+
+
+### Bug Fixes
+
+* add new python compatibility ([77b67f6](https://github.com/ScrapeGraphAI/scrapegraph-sdk/commit/77b67f646d75abd3a558b40cb31c52c12cc7182e))
+
+## [1.9.0-beta.1](https://github.com/ScrapeGraphAI/scrapegraph-sdk/compare/v1.8.0...v1.9.0-beta.1) (2024-12-10)
+
+
+### Features
+
+* add localScraper functionality ([8701eb2](https://github.com/ScrapeGraphAI/scrapegraph-sdk/commit/8701eb2ca7f108b922eb1617c850a58c0f88f8f9))
+* revert to old release ([d88a3ac](https://github.com/ScrapeGraphAI/scrapegraph-sdk/commit/d88a3ac6969a0abdf1f6b8eccde9ad8284d41d20))
+
+
+### Bug Fixes
+
+* .toml file ([e719881](https://github.com/ScrapeGraphAI/scrapegraph-sdk/commit/e7198817d8dac802361ab84bc4d5d961fb926767))
+* add revert ([09257e0](https://github.com/ScrapeGraphAI/scrapegraph-sdk/commit/09257e08246d8aee96b3944ac14cc14b88e5f818))
+* minor fix version ([0b972c6](https://github.com/ScrapeGraphAI/scrapegraph-sdk/commit/0b972c69a9ea843d8ec89327f35c287b0d7a2bb4))
+* pyproject ([2440f7f](https://github.com/ScrapeGraphAI/scrapegraph-sdk/commit/2440f7f2a5179c6e3a86faf4eefa1d5edf7524c8))
+* python version ([24366b0](https://github.com/ScrapeGraphAI/scrapegraph-sdk/commit/24366b08eefe0789da9a0ccafb8058e8744ee58b))
+
## [1.8.0](https://github.com/ScrapeGraphAI/scrapegraph-sdk/compare/v1.7.0...v1.8.0) (2024-12-08)
diff --git a/scrapegraph-py/examples/smartscraper_example.py b/scrapegraph-py/examples/smartscraper_example.py
index 37e4542..75891e9 100644
--- a/scrapegraph-py/examples/smartscraper_example.py
+++ b/scrapegraph-py/examples/smartscraper_example.py
@@ -1,10 +1,11 @@
from scrapegraph_py import Client
from scrapegraph_py.logger import sgai_logger
-sgai_logger.set_logging(level="INFO")
+# Set logging to DEBUG level to see all logs
+sgai_logger.set_logging(level="DEBUG")
# Initialize the client with explicit API key
-sgai_client = Client(api_key="your-api-key-here")
+sgai_client = Client(api_key="your_api_key")
# SmartScraper request
response = sgai_client.smartscraper(
diff --git a/scrapegraph-py/pyproject.toml b/scrapegraph-py/pyproject.toml
index da5ef04..fb53b2e 100644
--- a/scrapegraph-py/pyproject.toml
+++ b/scrapegraph-py/pyproject.toml
@@ -97,4 +97,4 @@ build-backend = "hatchling.build"
[tool.poe.tasks]
pylint-local = "pylint scrapegraph_py/**/*.py"
-pylint-ci = "pylint --disable=C0114,C0115,C0116 --exit-zero scrapegraph_py/**/*.py"
+pylint-ci = "pylint --disable=C0114,C0115,C0116 --exit-zero scrapegraph_py/**/*.py"
\ No newline at end of file
diff --git a/scrapegraph-py/uv.lock b/scrapegraph-py/uv.lock
index 1990785..bb5cf94 100644
--- a/scrapegraph-py/uv.lock
+++ b/scrapegraph-py/uv.lock
@@ -557,11 +557,11 @@ wheels = [
[[package]]
name = "idna"
-version = "3.10"
+version = "3.9"
source = { registry = "https://pypi.org/simple" }
-sdist = { url = "https://files.pythonhosted.org/packages/f1/70/7703c29685631f5a7590aa73f1f1d3fa9a380e654b86af429e0934a32f7d/idna-3.10.tar.gz", hash = "sha256:12f65c9b470abda6dc35cf8e63cc574b1c52b11df2c86030af0ac09b01b13ea9", size = 190490 }
+sdist = { url = "https://files.pythonhosted.org/packages/00/6f/93e724eafe34e860d15d37a4f72a1511dd37c43a76a8671b22a15029d545/idna-3.9.tar.gz", hash = "sha256:e5c5dafde284f26e9e0f28f6ea2d6400abd5ca099864a67f576f3981c6476124", size = 191636 }
wheels = [
- { url = "https://files.pythonhosted.org/packages/76/c6/c88e154df9c4e1a2a66ccf0005a88dfb2650c1dffb6f5ce603dfbd452ce3/idna-3.10-py3-none-any.whl", hash = "sha256:946d195a0d259cbba61165e88e65941f16e9b36ea6ddb97f00452bae8b1287d3", size = 70442 },
+ { url = "https://files.pythonhosted.org/packages/6d/15/61933d1999bc5ad8cad612d67f02fa5b16a423076ea0816e39c2e797af12/idna-3.9-py3-none-any.whl", hash = "sha256:69297d5da0cc9281c77efffb4e730254dd45943f45bbfb461de5991713989b1e", size = 71671 },
]
[[package]]
diff --git a/scrapegraph-py/scrapegraph_py/utils/helpers.py b/scrapegraph-py/scrapegraph_py/utils/helpers.py
new file mode 100644
index 0000000..2157562
--- /dev/null
+++ b/scrapegraph-py/scrapegraph_py/utils/helpers.py
@@ -0,0 +1,43 @@
+from urllib.parse import urlparse
+import socket
+import requests
+from scrapegraph_py.logger import sgai_logger as logger
+
+def validate_website_url(url: str) -> None:
+    """Validate that the website URL is well-formed and reachable."""
+    logger.info(f"🔍 Validating website URL: {url}")
+
+    try:
+        # Validate URL format
+        parsed = urlparse(url)
+        if not all([parsed.scheme, parsed.netloc]):
+            logger.error(f"❌ Invalid URL format: {url}")
+            raise ValueError("Invalid URL format")
+        logger.info("✅ URL format is valid")
+
+        # Try to resolve domain
+        logger.info(f"🔍 Checking domain accessibility: {parsed.netloc}")
+        socket.gethostbyname(parsed.netloc)
+
+        # Try to make a HEAD request to verify the website responds
+        logger.info("🔍 Verifying website response...")
+        response = requests.head(url, timeout=5, allow_redirects=True)
+        response.raise_for_status()
+        logger.info("✅ Website is accessible and responding")
+
+    except socket.gaierror:
+        error_msg = f"Could not resolve domain: {url}"
+        logger.error(f"❌ {error_msg}")
+        raise ValueError(error_msg)
+    except requests.exceptions.RequestException as e:
+        error_msg = f"Website not reachable: {url} - {str(e)}"
+        logger.error(f"❌ {error_msg}")
+        raise ValueError(error_msg)
+    except ValueError:
+        # Re-raise the format error from above unchanged instead of
+        # re-wrapping it as a generic "Invalid URL" error below
+        raise
+    except Exception as e:
+        error_msg = f"Invalid URL: {str(e)}"
+        logger.error(f"❌ {error_msg}")
+        raise ValueError(error_msg)
\ No newline at end of file