Skip to content

Commit 9807b3f

Browse files
authored
Merge pull request #1813 from boazsender/automated-transcriptions
send only audio to assembly
2 parents fc8b6d1 + eba856e commit 9807b3f

File tree

5 files changed

+672
-462
lines changed

5 files changed

+672
-462
lines changed

firebase.json

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,8 @@
77
"functions": {
88
"predeploy": ["yarn build:functions"],
99
"source": "functions",
10-
"runtime": "nodejs18"
10+
"runtime": "nodejs18",
11+
"runtimeConfig": ".runtimeconfig.json"
1112
},
1213
"firestore": {
1314
"rules": "firestore.rules",

functions/.runtimeconfig.json

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
{
2+
"runtime": {
3+
"nodejs18": {
4+
"apt": {
5+
"packages": ["ffmpeg"]
6+
}
7+
}
8+
}
9+
}

functions/package.json

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
"date-fns": "^2.30.0",
2121
"firebase-admin": "^10",
2222
"firebase-functions": "^3.22.0",
23+
"fluent-ffmpeg": "^2.1.3",
2324
"fuse.js": "6.5.3",
2425
"handlebars": "^4.7.8",
2526
"js-sha256": "^0.11.0",
@@ -34,6 +35,7 @@
3435
"zod": "^3.20.2"
3536
},
3637
"devDependencies": {
38+
"@types/fluent-ffmpeg": "^2.1.27",
3739
"@types/jest": "^27.4.0",
3840
"@types/jsdom": "^21.1.7",
3941
"@types/luxon": "^2.0.9",

functions/src/events/scrapeEvents.ts

Lines changed: 76 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ import { DateTime } from "luxon"
33
import { JSDOM } from "jsdom"
44
import { AssemblyAI } from "assemblyai"
55
import { logFetchError } from "../common"
6-
import { db, Timestamp } from "../firebase"
6+
import { admin, db, Timestamp } from "../firebase"
77
import * as api from "../malegislature"
88
import {
99
BaseEvent,
@@ -20,7 +20,8 @@ import { currentGeneralCourt } from "../shared"
2020
import { randomBytes } from "node:crypto"
2121
import { sha256 } from "js-sha256"
2222
import { withinCutoff } from "./helpers"
23-
23+
import ffmpeg from "fluent-ffmpeg"
24+
import fs from "fs"
2425
abstract class EventScraper<ListItem, Event extends BaseEvent> {
2526
private schedule
2627
private timeout
@@ -33,7 +34,8 @@ abstract class EventScraper<ListItem, Event extends BaseEvent> {
3334
get function() {
3435
return runWith({
3536
timeoutSeconds: this.timeout,
36-
secrets: ["ASSEMBLY_API_KEY"]
37+
secrets: ["ASSEMBLY_API_KEY"],
38+
memory: "2GB"
3739
})
3840
.pubsub.schedule(this.schedule)
3941
.onRun(() => this.run())
@@ -94,7 +96,7 @@ class SpecialEventsScraper extends EventScraper<
9496
SpecialEvent
9597
> {
9698
constructor() {
97-
super("every 60 minutes", 120)
99+
super("every 60 minutes", 540)
98100
}
99101

100102
async listEvents() {
@@ -136,6 +138,53 @@ class SessionScraper extends EventScraper<SessionContent, Session> {
136138
}
137139
}
138140

141+
const extractAudioFromVideo = async (
142+
EventId: number,
143+
videoUrl: string
144+
): Promise<string> => {
145+
const tmpFilePath = `/tmp/hearing-${EventId}-${Date.now()}.wav`
146+
147+
// Stream directly from URL to MP3
148+
await new Promise<void>((resolve, reject) => {
149+
ffmpeg(videoUrl)
150+
.noVideo()
151+
.audioCodec("copy")
152+
.format("wav")
153+
.on("end", () => resolve())
154+
.on("error", reject)
155+
.save(tmpFilePath)
156+
})
157+
158+
// Upload the audio file
159+
const bucket = admin.storage().bucket()
160+
const audioFileName = `hearing-${EventId}-${Date.now()}.wav`
161+
const file = bucket.file(audioFileName)
162+
await file.save(tmpFilePath)
163+
164+
// Clean up temporary file
165+
await fs.promises.unlink(tmpFilePath)
166+
167+
const [url] = await file.getSignedUrl({
168+
action: "read",
169+
expires: Date.now() + 24 * 60 * 60 * 1000
170+
})
171+
172+
// Delete old files
173+
const [files] = await bucket.getFiles({
174+
prefix: "hearing-",
175+
maxResults: 1000
176+
})
177+
const oneDayAgo = Date.now() - 24 * 60 * 60 * 1000
178+
const oldFiles = files.filter(file => {
179+
const timestamp = parseInt(file.name.split("-").pop()?.split(".")[0] || "0")
180+
return timestamp < oneDayAgo
181+
})
182+
await Promise.all(oldFiles.map(file => file.delete()))
183+
184+
// Return the new audio url
185+
return url
186+
}
187+
139188
const submitTranscription = async ({
140189
EventId,
141190
maybeVideoUrl
@@ -148,11 +197,12 @@ const submitTranscription = async ({
148197
})
149198

150199
const newToken = randomBytes(16).toString("hex")
200+
const audioUrl = await extractAudioFromVideo(EventId, maybeVideoUrl)
151201

152202
const transcript = await assembly.transcripts.submit({
153203
audio:
154204
// test with: "https://assemblyaiusercontent.com/playground/aKUqpEtmYmI.flac",
155-
maybeVideoUrl,
205+
audioUrl,
156206
webhook_url:
157207
// make sure process.env.FUNCTIONS_API_BASE equals
158208
// https://us-central1-digital-testimony-prod.cloudfunctions.net
@@ -226,25 +276,34 @@ class HearingScraper extends EventScraper<HearingListItem, Hearing> {
226276
const content = HearingContent.check(data)
227277

228278
if (await shouldScrapeVideo(EventId)) {
229-
const maybeVideoUrl = await getHearingVideoUrl(EventId)
230-
if (maybeVideoUrl) {
231-
const transcriptId = await submitTranscription({
232-
maybeVideoUrl,
233-
EventId
234-
})
235-
279+
try {
280+
const maybeVideoUrl = await getHearingVideoUrl(EventId)
281+
if (maybeVideoUrl) {
282+
const transcriptId = await submitTranscription({
283+
maybeVideoUrl,
284+
EventId
285+
})
286+
287+
return {
288+
id: `hearing-${EventId}`,
289+
type: "hearing",
290+
content,
291+
...this.timestamps(content),
292+
videoURL: maybeVideoUrl,
293+
videoFetchedAt: Timestamp.now(),
294+
videoTranscriptionId: transcriptId // using the assembly Id as our transcriptionId
295+
} as Hearing
296+
}
297+
} catch (error) {
298+
console.error(`Failed to process audio for hearing ${EventId}:`, error)
236299
return {
237300
id: `hearing-${EventId}`,
238301
type: "hearing",
239302
content,
240-
...this.timestamps(content),
241-
videoURL: maybeVideoUrl,
242-
videoFetchedAt: Timestamp.now(),
243-
videoTranscriptionId: transcriptId // using the assembly Id as our transcriptionId
303+
...this.timestamps(content)
244304
} as Hearing
245305
}
246306
}
247-
248307
return {
249308
id: `hearing-${EventId}`,
250309
type: "hearing",

0 commit comments

Comments
 (0)