Skip to content

Commit 0c895ff

Browse files
committed
Täsäfication improvements in dalai
1 parent 21fc9dd commit 0c895ff

File tree

3 files changed

+67
-55
lines changed

3 files changed

+67
-55
lines changed

dalai/package-lock.json

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

dalai/package.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
"type": "module",
33
"scripts": {
44
"start": "node worker.ts",
5-
"dev": "tsx watch worker.ts"
5+
"dev": "tsx watch worker.ts --clear-screen=false"
66
},
77
"dependencies": {
88
"@aws-sdk/client-s3": "3.882.0",

dalai/worker.ts

Lines changed: 65 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -1,27 +1,29 @@
1+
import { GetObjectCommand, PutObjectCommand, S3Client } from '@aws-sdk/client-s3'
12
import { Worker } from 'bullmq'
2-
import path from 'node:path'
3+
import dotenv from 'dotenv'
4+
import Redis, { type RedisOptions } from 'ioredis'
5+
import { createWriteStream } from 'node:fs'
36
import fs from 'node:fs/promises'
4-
import { S3Client, GetObjectCommand, PutObjectCommand } from '@aws-sdk/client-s3'
7+
import path from 'node:path'
58
import { pipeline } from 'node:stream'
6-
import { createWriteStream, createReadStream } from 'node:fs'
7-
import Redis from 'ioredis'
8-
import { pdfToPng } from 'pdf-to-png-converter'
9-
import { v4 as uuidv4 } from 'uuid'
109
import { promisify } from 'node:util'
1110
import pdfToText from 'pdf-parse-fork'
12-
import dotenv from 'dotenv'
11+
import { pdfToPng, type PngPageOutput } from 'pdf-to-png-converter'
1312

1413
dotenv.config()
1514

1615
const pipelineAsync = promisify(pipeline)
1716

18-
async function downloadS3ToFile(s3, bucket, key, destPath) {
17+
async function downloadS3ToFile(s3: S3Client, bucket, key: string, destPath: string) {
1918
const res = await s3.send(new GetObjectCommand({ Bucket: bucket, Key: key }))
2019
await fs.mkdir(path.dirname(destPath), { recursive: true })
20+
if (!res.Body) {
21+
throw new Error('No Body in S3 GetObject response')
22+
}
2123
await pipelineAsync(res.Body, createWriteStream(destPath))
2224
}
2325

24-
async function uploadFileToS3(s3, bucket, key, filePath, contentType) {
26+
async function uploadFileToS3(s3: S3Client, bucket, key: string, filePath: string, contentType: string) {
2527
const Body = await fs.readFile(filePath)
2628
await s3.send(new PutObjectCommand({ Bucket: bucket, Key: key, Body, ContentType: contentType }))
2729
}
@@ -35,7 +37,7 @@ async function pathExists(p) {
3537
}
3638
}
3739

38-
function guessContentType(filePath) {
40+
function guessContentType(filePath: string) {
3941
const ext = path.extname(filePath).toLowerCase()
4042
if (ext === '.txt') return 'text/plain charset=utf-8'
4143
if (ext === '.json') return 'application/json'
@@ -55,9 +57,9 @@ const CA = process.env.CA || undefined
5557
const CERT = process.env.CERT
5658
const KEY = process.env.KEY
5759

58-
let creds = {
60+
let creds: RedisOptions = {
5961
host: REDIS_HOST,
60-
port: REDIS_PORT,
62+
port: Number(REDIS_PORT) || 6379,
6163
maxRetriesPerRequest: null,
6264
}
6365

@@ -69,7 +71,7 @@ if (CA !== undefined) {
6971
cert: CERT,
7072
key: KEY,
7173
servername: REDIS_HOST,
72-
}
74+
},
7375
}
7476
}
7577

@@ -79,10 +81,10 @@ const connection = new Redis(creds)
7981

8082
const QUEUE_NAME = process.env.LLAMA_SCAN_QUEUE || 'llama-scan-queue'
8183
const S3_HOST = process.env.S3_HOST || ''
82-
const S3_ACCESS_KEY = process.env.S3_ACCESS_KEY
83-
const S3_SECRET_ACCESS_KEY = process.env.S3_SECRET_ACCESS_KEY
84+
const S3_ACCESS_KEY = process.env.S3_ACCESS_KEY || ''
85+
const S3_SECRET_ACCESS_KEY = process.env.S3_SECRET_ACCESS_KEY || ''
8486
const OLLAMA_URL = process.env.LAAMA_API_URL ?? process.env.OLLAMA_URL
85-
const LAAMA_API_TOKEN = process.LAAMA_API_TOKEN ?? ''
87+
const LAAMA_API_TOKEN = process.env.LAAMA_API_TOKEN ?? ''
8688

8789
const s3 = new S3Client({
8890
region: 'eu-north-1',
@@ -94,15 +96,15 @@ const s3 = new S3Client({
9496
},
9597
})
9698

97-
async function retryOllamaCall(fn, maxRetries = 3) {
98-
let lastError
99+
async function retryOllamaCall<T>(fn: () => Promise<T>, maxRetries = 3): Promise<T> {
100+
let lastError: any
99101
for (let i = 0; i < maxRetries; i++) {
100102
// Health check before each attempt
101103
try {
102104
return await fn()
103105
} catch (err) {
104106
lastError = err
105-
await new Promise(r => setTimeout(r, 1000 * (i + 1)))
107+
await new Promise((r) => setTimeout(r, 1000 * (i + 1)))
106108
}
107109
}
108110
throw lastError
@@ -113,11 +115,7 @@ async function retryOllamaCall(fn, maxRetries = 3) {
113115
const worker = new Worker(
114116
QUEUE_NAME,
115117
async (job) => {
116-
const {
117-
s3Bucket,
118-
s3Key,
119-
outputBucket,
120-
} = job.data || {}
118+
const { s3Bucket, s3Key, outputBucket } = job.data || {}
121119

122120
console.log(`Processing job ${job.id}`)
123121

@@ -128,7 +126,11 @@ const worker = new Worker(
128126
throw new Error('outputBucket is required in job data')
129127
}
130128

131-
const jobIdPath = job.id.replaceAll('\/', '_')
129+
const jobId = job.id
130+
if (!jobId) {
131+
throw new Error('Job ID is missing')
132+
}
133+
const jobIdPath = jobId.replaceAll('\/', '_')
132134

133135
const uploadsDir = './uploads'
134136
const jobRootDir = path.join(uploadsDir, jobIdPath)
@@ -154,24 +156,25 @@ const worker = new Worker(
154156
}
155157

156158
/**
157-
* Convert PDF pages to text
158-
*/
159+
* Convert PDF pages to text
160+
*/
159161
function pagerender(pageData) {
160162
let render_options = {
161163
normalizeWhitespace: false,
162164
disableCombineTextItems: false,
163165
}
164166
return pageData.getTextContent(render_options).then((textContent) => {
165-
let lastY, text = ''
167+
let lastY: number | null = null,
168+
text = ''
166169
for (let item of textContent.items) {
167-
if (lastY == item.transform[5] || !lastY) {
170+
if (lastY === item.transform[5] || !lastY) {
168171
text += item.str
169172
} else {
170-
text += "\n" + item.str
173+
text += '\n' + item.str
171174
}
172175
lastY = item.transform[5]
173176
}
174-
return `${JSON.stringify({ text, pageNumber: pageData.pageNumber })}\n`;
177+
return `${JSON.stringify({ text, pageNumber: pageData.pageNumber })}\n`
175178
})
176179
}
177180

@@ -180,16 +183,19 @@ const worker = new Worker(
180183
try {
181184
const dataBuffer = await fs.readFile(inputLocalPath)
182185
const data = await pdfToText(dataBuffer, { pagerender })
183-
const jsonObjStrs = data.text.split('\n').filter(line => line.trim().startsWith('{') && line.trim().endsWith('}'))
184-
jsonObjStrs.map(line => {
185-
try {
186-
return JSON.parse(line)
187-
} catch {
188-
return null
189-
}
190-
}).filter(page => page !== null && typeof page.pageNumber === 'number' && typeof page.text === 'string').forEach(page => {
191-
pages[page.pageNumber] = page.text
192-
})
186+
const jsonObjStrs = data.text.split('\n').filter((line) => line.trim().startsWith('{') && line.trim().endsWith('}'))
187+
jsonObjStrs
188+
.map((line) => {
189+
try {
190+
return JSON.parse(line)
191+
} catch {
192+
return null
193+
}
194+
})
195+
.filter((page) => page !== null && typeof page.pageNumber === 'number' && typeof page.text === 'string')
196+
.forEach((page) => {
197+
pages[page.pageNumber] = page.text
198+
})
193199
console.log(`Job ${job.id}: PDF to text conversion complete`)
194200
} catch (error) {
195201
console.error(`Job ${job.id} failed: PDF to text conversion failed`, error)
@@ -199,12 +205,12 @@ const worker = new Worker(
199205
/**
200206
* Convert PDF pages to PNG images
201207
*/
202-
let pngPages
208+
let pngPages: PngPageOutput[] = []
203209
try {
204210
pngPages = await pdfToPng(inputLocalPath, {
205211
outputFileMaskFunc: (pageNumber) => `page_${pageNumber}.png`,
206212
outputFolder: outputImagesDir,
207-
});
213+
})
208214
} catch (error) {
209215
console.error(`Job ${job.id} failed: PDF to PNG conversion failed`, error)
210216
throw new Error('PDF to PNG conversion failed')
@@ -251,8 +257,8 @@ const worker = new Worker(
251257
But you are always obligated to keep the **image** tags intact.`,
252258
prompt: `Parsed PDF text:\n${pdfText}\n\nImage transcription:`,
253259
stream: false,
254-
images: [image.toString('base64')]
255-
})
260+
images: [image.toString('base64')],
261+
}),
256262
})
257263
if (!response.ok) {
258264
const errorBody = await response.text()
@@ -269,7 +275,7 @@ const worker = new Worker(
269275
method: 'POST',
270276
headers: {
271277
'Content-Type': 'application/json',
272-
'token': LAAMA_API_TOKEN
278+
token: LAAMA_API_TOKEN,
273279
},
274280
body: JSON.stringify({
275281
model: 'qwen2.5vl:7b',
@@ -282,16 +288,19 @@ const worker = new Worker(
282288
Remeber you are always obligated to keep the **image** tags and tags insides intact.`,
283289
prompt: `Transcription:\n${transcription}\n\nPDF:\n${pdfText}\n\nCombined Markdown:`,
284290
stream: false,
285-
})
291+
}),
286292
})
287293
if (!response2.ok) {
288294
const errorBody = await response2.text()
289295
throw new Error(`Ollama Markdown API request failed with status ${response2.status}: ${errorBody}`)
290296
}
291297
const data2 = await response2.json()
292298
let text = data2?.response || ''
293-
if (text.trim().startsWith("```markdown")) {
294-
text = text.replace(/^```markdown/, '').replace(/```$/, '').trim()
299+
if (text.trim().startsWith('```markdown')) {
300+
text = text
301+
.replace(/^```markdown/, '')
302+
.replace(/```$/, '')
303+
.trim()
295304
}
296305
// Add page number to the end of the first line if it's a heading
297306
function appendToFirstLine(content, suffix) {
@@ -312,7 +321,6 @@ const worker = new Worker(
312321
}
313322

314323
resultingMarkdown += `\n\n${finalText}`
315-
316324
}
317325

318326
const resultFileName = `${inputFileName}.md`
@@ -333,12 +341,14 @@ const worker = new Worker(
333341
output: { bucket: outputBucket },
334342
}
335343
} finally {
336-
try { await fs.rm(jobRootDir, { recursive: true, force: true }) } catch { }
344+
try {
345+
await fs.rm(jobRootDir, { recursive: true, force: true })
346+
} catch {}
337347
}
338348
},
339349
{
340350
connection,
341-
}
351+
},
342352
)
343353

344354
console.log(`Worker started. Listening to queue "${QUEUE_NAME}"...`)
@@ -353,7 +363,9 @@ worker.on('failed', (job, err) => {
353363

354364
async function shutdown() {
355365
console.log('Shutting down worker...')
356-
try { await worker.close() } catch { }
366+
try {
367+
await worker.close()
368+
} catch {}
357369
process.exit(0)
358370
}
359371
process.on('SIGINT', shutdown)

0 commit comments

Comments
 (0)