Skip to content

Commit a25c5c4

Browse files
Add Oxylabs Document Loader (#4625)
* Add Oxylabs Document Loader * Update Oxylabs.ts --------- Co-authored-by: Henry Heng <[email protected]>
1 parent 768de61 commit a25c5c4

File tree

3 files changed

+361
-0
lines changed

3 files changed

+361
-0
lines changed
Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
import { INodeParams, INodeCredential } from '../src/Interface'
2+
3+
/**
 * Credential definition for the Oxylabs API.
 *
 * Declares two inputs: a plain-text `username` and a `password` field
 * (typed `password`, as opposed to `string`).
 */
class OxylabsApiCredential implements INodeCredential {
    label: string
    name: string
    version: number
    description: string
    inputs: INodeParams[]

    constructor() {
        this.label = 'Oxylabs API'
        // Referenced by the loader node through `credentialNames: ['oxylabsApi']`.
        this.name = 'oxylabsApi'
        this.version = 1.0
        this.description = 'Oxylabs API credentials description, to add more info'
        this.inputs = [
            {
                label: 'Oxylabs Username',
                name: 'username',
                type: 'string'
            },
            {
                label: 'Oxylabs Password',
                name: 'password',
                type: 'password'
            }
        ]
    }
}
29+
30+
// Expose the credential class through CommonJS exports under the `credClass` key.
module.exports = { credClass: OxylabsApiCredential }
Lines changed: 327 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,327 @@
1+
import { TextSplitter } from 'langchain/text_splitter'
2+
import { DocumentInterface } from '@langchain/core/documents'
3+
import { BaseDocumentLoader } from 'langchain/document_loaders/base'
4+
import { INode, INodeData, INodeParams, ICommonObject, INodeOutputsValue } from '../../../src/Interface'
5+
import { getCredentialData, getCredentialParam, handleEscapeCharacters } from '../../../src/utils'
6+
import axios, { AxiosResponse } from 'axios'
7+
8+
/** Document shape produced by the Oxylabs loader; identical to `DocumentInterface`. */
type OxylabsDocument = DocumentInterface
9+
10+
/**
 * Top-level body of a response from the Oxylabs queries endpoint as modelled
 * by this file: a list of results plus the job that produced them.
 */
interface OxylabsResponse {
    /** One entry per returned page/result; each becomes one document. */
    results: Result[]
    /** Job descriptor; its `id` is used as the prefix of document ids. */
    job: Job
}
14+
15+
/**
 * A single entry of `OxylabsResponse.results`.
 *
 * Only `content` is read by the loader in this file; the remaining fields
 * mirror the response payload.
 */
interface Result {
    // Left as `any`: the loader assigns it directly to a document's `pageContent`.
    // NOTE(review): presumably a string for raw output and an object for parsed output — confirm.
    content: any
    created_at: string
    updated_at: string
    page: number
    url: string
    job_id: string
    is_render_forced: boolean
    status_code: number
    parser_type: string
}
26+
27+
/**
 * The `job` object of an Oxylabs response as modelled by this file.
 *
 * Only `id` is read in this file (to build document ids). Fields whose shape
 * is not pinned down here are typed `unknown` rather than `any`, so any future
 * use has to narrow them explicitly.
 */
interface Job {
    callback_url: string
    client_id: number
    context: unknown
    created_at: string
    domain: string
    geo_location: unknown
    id: string
    limit: number
    locale: unknown
    pages: number
    parse: boolean
    parser_type: unknown
    parser_preset: unknown
    parsing_instructions: unknown
    browser_instructions: unknown
    render: unknown
    url: unknown
    query: string
    source: string
    start_page: number
    status: string
    storage_type: unknown
    storage_url: unknown
    subdomain: string
    content_encoding: string
    updated_at: string
    user_agent_type: string
    is_premium_domain: boolean
}
57+
58+
/**
 * Constructor arguments for `OxylabsLoader`.
 *
 * `username`/`password` are used for HTTP Basic authentication. The last four
 * properties correspond to node inputs declared with `optional: true`, so they
 * may legitimately be absent and are typed as optional.
 */
interface OxylabsLoaderParameters {
    username: string
    password: string
    /** Value of the node's "Query" input. */
    query: string
    /** Value of the node's "Source" input, e.g. `universal`. */
    source: string
    geo_location?: string
    render?: boolean
    parse?: boolean
    user_agent_type?: string
}
68+
69+
/**
 * Document loader that submits a single job to the Oxylabs realtime endpoint
 * and turns every returned result into a document.
 */
export class OxylabsLoader extends BaseDocumentLoader {
    /**
     * Sources offered by the node whose target must be sent as `query`
     * (search keyword / product identifier) rather than `url`, per the Oxylabs
     * Web Scraper API documentation. Any other source keeps sending `url`.
     */
    private static readonly QUERY_SOURCES: ReadonlySet<string> = new Set(['google_search', 'amazon_product', 'amazon_search'])

    private params: OxylabsLoaderParameters

    /**
     * @param loaderParams Credentials plus scraping options for the request.
     */
    constructor(loaderParams: OxylabsLoaderParameters) {
        super()
        this.params = loaderParams
    }

    /**
     * POSTs `params` as JSON to the Oxylabs queries endpoint using HTTP Basic auth.
     *
     * Entries whose value is `null`, `undefined` or `''` are dropped from the
     * payload first; the caller's object is not modified.
     *
     * @param params Request payload (unset options may be present and are filtered out).
     * @returns The axios response.
     * @throws Error when the response status is 400 or above. With axios' default
     *         `validateStatus`, axios itself rejects non-2xx responses before that
     *         check is reached.
     */
    private async sendAPIRequest<R>(params: any): Promise<AxiosResponse<R, any>> {
        const payload = Object.fromEntries(
            Object.entries(params).filter(([, value]) => value !== null && value !== '' && value !== undefined)
        )

        const auth = Buffer.from(`${this.params.username}:${this.params.password}`).toString('base64')

        const response = await axios.post<R>('https://realtime.oxylabs.io/v1/queries', payload, {
            headers: {
                'Content-Type': 'application/json',
                'x-oxylabs-sdk': 'oxylabs-integration-flowise/1.0.0 (1.0.0; 64bit)',
                Authorization: `Basic ${auth}`
            }
        })

        // Defensive guard: only reachable if axios is configured not to reject on error statuses.
        if (response.status >= 400) {
            throw new Error(`Oxylabs: Failed to call Oxylabs API: ${response.status}`)
        }

        return response
    }

    /**
     * Converts a result's `content` into the string required for `pageContent`.
     * Strings pass through, `null`/`undefined` become `''`, and anything else
     * (e.g. parsed structured output) is JSON-serialised.
     */
    private static contentToString(content: unknown): string {
        if (typeof content === 'string') {
            return content
        }
        if (content === null || content === undefined) {
            return ''
        }
        return JSON.stringify(content)
    }

    /**
     * Runs the scraping job and maps each result to a document.
     *
     * @returns One document per result, with id `<job id>-<result index>`,
     *          string `pageContent` and empty metadata. An absent `results`
     *          array yields an empty list.
     */
    public async load(): Promise<DocumentInterface[]> {
        const { query, source, geo_location, render, parse, user_agent_type } = this.params

        // Search-style sources take `query`; URL-based sources (e.g. `universal`) take `url`.
        const target = OxylabsLoader.QUERY_SOURCES.has(source) ? { query } : { url: query }

        const response = await this.sendAPIRequest<OxylabsResponse>({
            ...target,
            source,
            geo_location,
            // The API expects a render mode string, not a boolean; omitted when rendering is off.
            render: render ? 'html' : undefined,
            parse,
            user_agent_type
        })

        const jobId = String(response.data.job.id)
        const results = response.data.results ?? []

        const docs: OxylabsDocument[] = results.map((result, index) => ({
            id: `${jobId}-${index}`,
            pageContent: OxylabsLoader.contentToString(result.content),
            metadata: {}
        }))

        return docs
    }
}
116+
117+
/**
 * Node definition for the Oxylabs document loader.
 *
 * The constructor declares the node's metadata, its credential requirement,
 * its inputs and its two outputs (`document` and `text`); `init` performs the
 * actual load.
 */
class Oxylabs_DocumentLoaders implements INode {
    label: string
    name: string
    description: string
    type: string
    icon: string
    version: number
    category: string
    baseClasses: string[]
    inputs: INodeParams[]
    credential: INodeParams
    outputs: INodeOutputsValue[]

    constructor() {
        this.label = 'Oxylabs'
        this.name = 'oxylabs'
        this.type = 'Document'
        this.icon = 'oxylabs.svg'
        this.version = 1.0
        this.category = 'Document Loaders'
        this.description = 'Extract data from URLs using Oxylabs'
        this.baseClasses = [this.type]
        this.credential = {
            label: 'Oxylabs API',
            name: 'credential',
            type: 'credential',
            credentialNames: ['oxylabsApi']
        }
        this.inputs = [
            {
                label: 'Text Splitter',
                name: 'textSplitter',
                type: 'TextSplitter',
                // NOTE(review): declared mandatory, yet `init` tolerates a missing splitter — confirm intent.
                optional: false
            },
            {
                label: 'Query',
                name: 'query',
                type: 'string',
                description: 'Website URL or query keyword.'
            },
            {
                label: 'Source',
                name: 'source',
                type: 'options',
                description: 'Target website to scrape.',
                options: [
                    {
                        label: 'Universal',
                        name: 'universal'
                    },
                    {
                        label: 'Google Search',
                        name: 'google_search'
                    },
                    {
                        label: 'Amazon Product',
                        name: 'amazon_product'
                    },
                    {
                        label: 'Amazon Search',
                        name: 'amazon_search'
                    }
                ],
                default: 'universal'
            },
            {
                label: 'Geolocation',
                name: 'geo_location',
                type: 'string',
                description: "Sets the proxy's geo location to retrieve data. Check Oxylabs documentation for more details.",
                optional: true
            },
            {
                label: 'Render',
                name: 'render',
                type: 'boolean',
                description: 'Enables JavaScript rendering when set to true.',
                optional: true,
                default: false
            },
            {
                label: 'Parse',
                name: 'parse',
                type: 'boolean',
                description:
                    "Returns parsed data when set to true, as long as a dedicated parser exists for the submitted URL's page type.",
                optional: true,
                default: false
            },
            {
                label: 'User Agent Type',
                name: 'user_agent_type',
                type: 'options',
                description: 'Device type and browser.',
                options: [
                    {
                        label: 'Desktop',
                        name: 'desktop'
                    },
                    {
                        label: 'Desktop Chrome',
                        name: 'desktop_chrome'
                    },
                    {
                        label: 'Desktop Edge',
                        name: 'desktop_edge'
                    },
                    {
                        label: 'Desktop Firefox',
                        name: 'desktop_firefox'
                    },
                    {
                        label: 'Desktop Opera',
                        name: 'desktop_opera'
                    },
                    {
                        label: 'Desktop Safari',
                        name: 'desktop_safari'
                    },
                    {
                        label: 'Mobile',
                        name: 'mobile'
                    },
                    {
                        label: 'Mobile Android',
                        name: 'mobile_android'
                    },
                    {
                        label: 'Mobile iOS',
                        name: 'mobile_ios'
                    },
                    {
                        label: 'Tablet',
                        name: 'tablet'
                    },
                    {
                        label: 'Tablet Android',
                        name: 'tablet_android'
                    },
                    {
                        label: 'Tablet iOS',
                        name: 'tablet_ios'
                    }
                ],
                optional: true
            }
        ]
        this.outputs = [
            {
                label: 'Document',
                name: 'document',
                description: 'Array of document objects containing metadata and pageContent',
                baseClasses: [...this.baseClasses, 'json']
            },
            {
                label: 'Text',
                name: 'text',
                description: 'Concatenated string from pageContent of documents',
                baseClasses: ['string', 'json']
            }
        ]
    }

    /**
     * Loads documents through `OxylabsLoader` using the node's inputs and the
     * selected credential, optionally splits them, and returns either the
     * documents or their concatenated text.
     *
     * @param nodeData Node instance data: inputs, the credential reference and the selected output.
     * @param _ Unused.
     * @param options Passed through to `getCredentialData`.
     * @returns The document array when the selected output is `document`; otherwise every
     *          `pageContent` followed by a newline, concatenated and passed through
     *          `handleEscapeCharacters(text, false)`.
     */
    async init(nodeData: INodeData, _: string, options: ICommonObject): Promise<any> {
        const query = nodeData.inputs?.query as string
        const textSplitter = nodeData.inputs?.textSplitter as TextSplitter
        const source = nodeData.inputs?.source as string
        const geo_location = nodeData.inputs?.geo_location as string
        const render = nodeData.inputs?.render as boolean
        const parse = nodeData.inputs?.parse as boolean
        const user_agent_type = nodeData.inputs?.user_agent_type as string

        const credentialData = await getCredentialData(nodeData.credential ?? '', options)
        const username = getCredentialParam('username', credentialData, nodeData)
        const password = getCredentialParam('password', credentialData, nodeData)

        const output = nodeData.outputs?.output as string

        const input: OxylabsLoaderParameters = {
            username,
            password,
            query,
            source,
            geo_location,
            render,
            parse,
            user_agent_type
        }

        const loader = new OxylabsLoader(input)

        let docs: OxylabsDocument[] = await loader.load()

        // Split only when a splitter is connected and there is something to split.
        if (textSplitter && docs.length > 0) {
            docs = await textSplitter.splitDocuments(docs)
        }

        if (output === 'document') {
            return docs
        }

        // Any other output selection yields the text form: one line per document.
        const finaltext = docs.map((doc) => `${doc.pageContent}\n`).join('')
        return handleEscapeCharacters(finaltext, false)
    }
}
326+
327+
// Expose the node class through CommonJS exports under the `nodeClass` key.
module.exports = { nodeClass: Oxylabs_DocumentLoaders }
Lines changed: 4 additions & 0 deletions
Loading

0 commit comments

Comments
 (0)