Commit 5a7092c: scrapegraphai init
1 parent d00e08f

8 files changed, +748 -3 lines
Lines changed: 34 additions & 0 deletions
@@ -0,0 +1,34 @@
import scrapegraphai from "../../scrapegraphai.app.mjs";

export default {
  key: "scrapegraphai-fetch-scraping-results",
  name: "Fetch Scraping Results",
  description: "Retrieves the results of a completed scraping job. [See the documentation]().",
  version: "0.0.{{ts}}",
  type: "action",
  props: {
    scrapegraphai,
    jobId: {
      propDefinition: [
        scrapegraphai,
        "jobId",
      ],
    },
    filterDataFields: {
      propDefinition: [
        scrapegraphai,
        "filterDataFields",
      ],
      optional: true,
    },
  },
  async run({ $ }) {
    const results = await this.scrapegraphai.retrieveScrapingResults({
      $,
      jobId: this.jobId,
      filterDataFields: this.filterDataFields,
    });
    $.export("$summary", `Successfully retrieved scraping results for job ${this.jobId}`);
    return results;
  },
};
Lines changed: 53 additions & 0 deletions
@@ -0,0 +1,53 @@
import scrapegraphai from "../../scrapegraphai.app.mjs";

export default {
  key: "scrapegraphai-start-scraping-job",
  name: "Start Scraping Job",
  description: "Starts a new web scraping job. [See the documentation](${{docsLink}})",
  version: "0.0.{{ts}}",
  type: "action",
  props: {
    scrapegraphai,
    url: {
      propDefinition: [
        scrapegraphai,
        "url",
      ],
    },
    dataFields: {
      propDefinition: [
        scrapegraphai,
        "dataFields",
      ],
      optional: true,
    },
    paginationSettings: {
      propDefinition: [
        scrapegraphai,
        "paginationSettings",
      ],
      optional: true,
    },
    headers: {
      propDefinition: [
        scrapegraphai,
        "headers",
      ],
      optional: true,
    },
  },
  async run({ $ }) {
    const response = await this.scrapegraphai.startScrapingJob({
      $,
      url: this.url,
      dataFields: this.dataFields,
      paginationSettings: this.paginationSettings,
      headers: this.headers,
    });
    $.export("$summary", `Started scraping job with Job ID: ${response.job_id}`);
    return response;
  },
};
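
Because startScrapingJob() JSON-parses each entry of the dataFields, paginationSettings, and headers props, this action expects those string[] props to hold JSON-encoded values. A minimal sketch of that transformation, with illustrative values only (the data_fields shape below is an assumption, not taken from ScrapeGraphAI's documentation):

// Sketch: how the string[] props become the request payload.
// The field names below are illustrative assumptions, not documented parameters.
const dataFields = [
  "{\"name\": \"title\", \"selector\": \"h1\"}",
  "{\"name\": \"price\", \"selector\": \".price\"}",
];
const payload = {
  url: "https://example.com/products",
  data_fields: dataFields.map((field) => JSON.parse(field)),
};
console.log(payload.data_fields[0].name); // "title"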
Lines changed: 26 additions & 0 deletions
@@ -0,0 +1,26 @@
import scrapegraphai from "../../scrapegraphai.app.mjs";

export default {
  key: "scrapegraphai-stop-running-job",
  name: "Stop Running Job",
  description: "Stops a currently running web scraping job. [See the documentation](https://docs.scrapegraphai.com/)",
  version: "0.0.{{ts}}",
  type: "action",
  props: {
    scrapegraphai,
    jobId: {
      propDefinition: [
        scrapegraphai,
        "jobId",
      ],
    },
  },
  async run({ $ }) {
    const response = await this.scrapegraphai.stopScrapingJob({
      $,
      jobId: this.jobId,
    });
    $.export("$summary", `Stopped scraping job ${this.jobId}`);
    return response;
  },
};

components/scrapegraphai/package.json

Lines changed: 1 addition & 1 deletion
@@ -12,4 +12,4 @@
   "publishConfig": {
     "access": "public"
   }
-}
+}

(The removed and added closing braces are identical, so the change appears to be only the trailing newline at the end of the file.)
Lines changed: 215 additions & 2 deletions
@@ -1,11 +1,224 @@
import { axios } from "@pipedream/platform";

export default {
  type: "app",
  app: "scrapegraphai",
  version: "0.0.{{ts}}",
  propDefinitions: {
    url: {
      type: "string",
      label: "URL to Scrape",
      description: "The URL of the website to scrape.",
    },
    jobId: {
      type: "string",
      label: "Job ID",
      description: "The ID of the scraping job.",
    },
    dataFields: {
      type: "string[]",
      label: "Data Fields",
      description: "Optional data fields to extract from the scraped content.",
      optional: true,
    },
    paginationSettings: {
      type: "string[]",
      label: "Pagination Settings",
      description: "Optional pagination settings for the scraping job.",
      optional: true,
    },
    headers: {
      type: "string[]",
      label: "Headers",
      description: "Optional headers to include in the scraping request.",
      optional: true,
    },
    filterDataFields: {
      type: "string[]",
      label: "Filter Data Fields",
      description: "Optional data fields to filter the results.",
      optional: true,
    },
    taskId: {
      type: "string",
      label: "Task ID",
      description: "The ID of the scraping task to monitor.",
      optional: true,
    },
    scrapingJobFilter: {
      type: "string",
      label: "Scraping Job Filter",
      description: "Filter events by specific scraping jobs.",
      optional: true,
    },
    dataTypeFilter: {
      type: "string",
      label: "Data Type Filter",
      description: "Filter events by specific data types.",
      optional: true,
    },
    scrapingTaskNameFilter: {
      type: "string",
      label: "Scraping Task Name Filter",
      description: "Filter events by specific scraping task names.",
      optional: true,
    },
    errorTypeFilter: {
      type: "string",
      label: "Error Type Filter",
      description: "Filter error events by specific error types.",
      optional: true,
    },
  },
  methods: {
    // this.$auth contains connected account data
    authKeys() {
      console.log(Object.keys(this.$auth));
    },
    _baseUrl() {
      return "https://api.scrapegraphai.com/v1";
    },
    // Shared request helper: prefixes the base URL and attaches the API key
    async _makeRequest(opts = {}) {
      const {
        $ = this, method = "GET", path = "/", headers, ...otherOpts
      } = opts;
      return axios($, {
        ...otherOpts,
        method,
        url: this._baseUrl() + path,
        headers: {
          ...headers,
          Authorization: `Bearer ${this.$auth.api_key}`,
        },
      });
    },
    async startScrapingJob(opts = {}) {
      const {
        url,
        dataFields,
        paginationSettings,
        headers,
        ...otherOpts
      } = opts;
      const data = {
        url,
      };
      // The string[] props hold JSON-encoded entries; parse each one before sending
      if (dataFields) {
        data.data_fields = dataFields.map((field) => JSON.parse(field));
      }
      if (paginationSettings) {
        data.pagination_settings = paginationSettings.map((setting) => JSON.parse(setting));
      }
      if (headers) {
        data.headers = headers.map((header) => JSON.parse(header));
      }
      return this._makeRequest({
        method: "POST",
        path: "/smartscraper/start",
        data,
        ...otherOpts,
      });
    },
    async retrieveScrapingResults(opts = {}) {
      const {
        jobId, filterDataFields, ...otherOpts
      } = opts;
      const params = {
        job_id: jobId,
      };
      if (filterDataFields) {
        params.filter_data_fields = filterDataFields;
      }
      return this._makeRequest({
        method: "GET",
        path: "/smartscraper/get-results",
        params,
        ...otherOpts,
      });
    },
    async stopScrapingJob(opts = {}) {
      const {
        jobId, ...otherOpts
      } = opts;
      return this._makeRequest({
        method: "POST",
        path: "/smartscraper/stop",
        data: {
          job_id: jobId,
        },
        ...otherOpts,
      });
    },
    async onTaskCompleted(opts = {}) {
      const {
        taskId, scrapingJobFilter, ...otherOpts
      } = opts;
      const params = {};
      if (taskId) {
        params.task_id = taskId;
      }
      if (scrapingJobFilter) {
        params.scraping_job = scrapingJobFilter;
      }
      return this._makeRequest({
        method: "GET",
        path: "/events/task-completed",
        params,
        ...otherOpts,
      });
    },
    async onNewDataAvailable(opts = {}) {
      const {
        dataTypeFilter, scrapingTaskNameFilter, ...otherOpts
      } = opts;
      const params = {};
      if (dataTypeFilter) {
        params.data_type = dataTypeFilter;
      }
      if (scrapingTaskNameFilter) {
        params.scraping_task_name = scrapingTaskNameFilter;
      }
      return this._makeRequest({
        method: "GET",
        path: "/events/new-data",
        params,
        ...otherOpts,
      });
    },
    async onErrorOccurred(opts = {}) {
      const {
        errorTypeFilter, scrapingJobFilter, ...otherOpts
      } = opts;
      const params = {};
      if (errorTypeFilter) {
        params.error_type = errorTypeFilter;
      }
      if (scrapingJobFilter) {
        params.scraping_job = scrapingJobFilter;
      }
      return this._makeRequest({
        method: "GET",
        path: "/events/error",
        params,
        ...otherOpts,
      });
    },
    // Collects items across pages until a page has no items or has_more is false
    async paginate(fn, opts = {}) {
      const results = [];
      const fetchPage = async (page = 1) => {
        const response = await fn({
          page,
          ...opts,
        });
        if (response && response.items && response.items.length > 0) {
          results.push(...response.items);
          if (response.has_more) {
            await fetchPage(page + 1);
          }
        }
      };
      await fetchPage();
      return results;
    },
  },
};
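
The paginate() helper above assumes the wrapped method returns an object with items and has_more, and it passes an incrementing page value through to that method. A minimal usage sketch from inside a component (the bound method and the filter value are illustrative, not part of this commit):

// Minimal sketch, assuming `this.scrapegraphai` is the app prop of a component.
// paginate() calls fn({ page, ...opts }) until a page comes back without items
// or with has_more falsy, then returns the collected items.
const events = await this.scrapegraphai.paginate(
  this.scrapegraphai.onTaskCompleted.bind(this.scrapegraphai),
  {
    scrapingJobFilter: "job_123", // illustrative job ID
  },
);
console.log(`Collected ${events.length} task-completed events`);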
