Skip to content

Commit d50d0d3

Browse files
authored
refactor(auditor): update Firecrawl API to v2 and improve error handling (#1878)
1 parent c432067 commit d50d0d3

File tree

1 file changed

+34
-17
lines changed

1 file changed

+34
-17
lines changed

apps/app/src/jobs/tasks/auditor/generate-auditor-content.ts

Lines changed: 34 additions & 17 deletions
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,5 @@
11
import { getOrganizationContext } from '@/jobs/tasks/onboarding/onboard-organization-helpers';
2-
import { openai } from '@ai-sdk/openai';
2+
import { groq } from '@ai-sdk/groq';
33
import { db } from '@db';
44
import { logger, metadata, schemaTask } from '@trigger.dev/sdk';
55
import { generateText } from 'ai';
@@ -134,7 +134,8 @@ async function scrapeWebsite(website: string): Promise<string> {
134134
throw new Error('Firecrawl API key is not configured');
135135
}
136136

137-
const initialResponse = await fetch('https://api.firecrawl.dev/v1/extract', {
137+
// Start extraction job using v2 API
138+
const initialResponse = await fetch('https://api.firecrawl.dev/v2/extract', {
138139
method: 'POST',
139140
headers: {
140141
'Content-Type': 'application/json',
@@ -143,27 +144,31 @@ async function scrapeWebsite(website: string): Promise<string> {
143144
body: JSON.stringify({
144145
urls: [website],
145146
prompt:
146-
'Extract all text content from this website, including company information, services, mission, vision, and any other relevant business information.',
147-
scrapeOptions: {
148-
onlyMainContent: true,
149-
removeBase64Images: true,
150-
},
147+
'Extract all text content from this website, including company information, services, mission, vision, and any other relevant business information. Return the content as plain text or markdown.',
151148
}),
152149
});
153150

154151
const initialData = await initialResponse.json();
155152

156-
if (!initialData.success || !initialData.id) {
153+
if (!initialData.success) {
154+
logger.error('Failed to start Firecrawl extraction', { initialData });
157155
throw new Error('Failed to start Firecrawl extraction');
158156
}
159157

158+
if (!initialData.id) {
159+
logger.error('Firecrawl did not return job ID', { initialData });
160+
throw new Error('Firecrawl did not return job ID');
161+
}
162+
160163
const jobId = initialData.id;
161164
const startTime = Date.now();
165+
logger.info('Firecrawl extraction started, polling for completion', { jobId });
162166

167+
// Poll for completion
163168
while (Date.now() - startTime < MAX_POLL_DURATION_MS) {
164169
await sleep(POLL_INTERVAL_MS);
165170

166-
const statusResponse = await fetch(`https://api.firecrawl.dev/v1/extract/${jobId}`, {
171+
const statusResponse = await fetch(`https://api.firecrawl.dev/v2/extract/${jobId}`, {
167172
method: 'GET',
168173
headers: {
169174
'Content-Type': 'application/json',
@@ -173,26 +178,38 @@ async function scrapeWebsite(website: string): Promise<string> {
173178

174179
const statusData = await statusResponse.json();
175180

176-
if (statusData.status === 'completed' && statusData.data) {
181+
logger.info('Firecrawl status check', {
182+
status: statusData.status,
183+
jobId,
184+
hasData: !!statusData.data,
185+
});
186+
187+
if (statusData.status === 'completed') {
188+
if (!statusData.data) {
189+
logger.error('Firecrawl completed but no data returned', { statusData, jobId });
190+
throw new Error('Firecrawl extraction completed but returned no data');
191+
}
192+
193+
// v2 API returns data as an object, convert to string for processing
177194
const extractedData = statusData.data;
178195
if (typeof extractedData === 'string') {
179196
return extractedData;
180197
}
181-
if (typeof extractedData === 'object' && extractedData.content) {
182-
return typeof extractedData.content === 'string'
183-
? extractedData.content
184-
: JSON.stringify(extractedData.content);
185-
}
186-
return JSON.stringify(extractedData);
198+
// Convert structured data to readable text format
199+
return JSON.stringify(extractedData, null, 2);
187200
}
188201

189202
if (statusData.status === 'failed') {
203+
logger.error('Firecrawl extraction failed', { statusData, jobId });
190204
throw new Error('Firecrawl extraction failed');
191205
}
192206

193207
if (statusData.status === 'cancelled') {
208+
logger.error('Firecrawl extraction was cancelled', { statusData, jobId });
194209
throw new Error('Firecrawl extraction was cancelled');
195210
}
211+
212+
// Status is still 'processing', continue polling
196213
}
197214

198215
throw new Error('Firecrawl extraction timed out');
@@ -205,7 +222,7 @@ async function generateSectionContent(
205222
contextHubText: string,
206223
): Promise<string> {
207224
const { text } = await generateText({
208-
model: openai('gpt-4.1'),
225+
model: groq('openai/gpt-oss-120b'),
209226
system: `You are an expert at extracting and organizing company information for audit purposes.
210227
211228
CRITICAL RULES:

0 commit comments

Comments (0)