@@ -3,7 +3,7 @@ import { DateTime } from "luxon"
3
3
import { JSDOM } from "jsdom"
4
4
import { AssemblyAI } from "assemblyai"
5
5
import { logFetchError } from "../common"
6
- import { db , Timestamp } from "../firebase"
6
+ import { admin , db , Timestamp } from "../firebase"
7
7
import * as api from "../malegislature"
8
8
import {
9
9
BaseEvent ,
@@ -20,7 +20,8 @@ import { currentGeneralCourt } from "../shared"
20
20
import { randomBytes } from "node:crypto"
21
21
import { sha256 } from "js-sha256"
22
22
import { withinCutoff } from "./helpers"
23
-
23
+ import ffmpeg from "fluent-ffmpeg"
24
+ import fs from "fs"
24
25
abstract class EventScraper < ListItem , Event extends BaseEvent > {
25
26
private schedule
26
27
private timeout
@@ -33,7 +34,8 @@ abstract class EventScraper<ListItem, Event extends BaseEvent> {
33
34
get function ( ) {
34
35
return runWith ( {
35
36
timeoutSeconds : this . timeout ,
36
- secrets : [ "ASSEMBLY_API_KEY" ]
37
+ secrets : [ "ASSEMBLY_API_KEY" ] ,
38
+ memory : "2GB"
37
39
} )
38
40
. pubsub . schedule ( this . schedule )
39
41
. onRun ( ( ) => this . run ( ) )
@@ -94,7 +96,7 @@ class SpecialEventsScraper extends EventScraper<
94
96
SpecialEvent
95
97
> {
96
98
constructor ( ) {
97
- super ( "every 60 minutes" , 120 )
99
+ super ( "every 60 minutes" , 540 )
98
100
}
99
101
100
102
async listEvents ( ) {
@@ -136,6 +138,53 @@ class SessionScraper extends EventScraper<SessionContent, Session> {
136
138
}
137
139
}
138
140
141
/**
 * Extracts the audio track from a hearing video, uploads it to the default
 * Storage bucket, and returns a signed read URL for the uploaded audio.
 *
 * Steps:
 *  1. ffmpeg reads `videoUrl`, drops the video stream and writes the audio
 *     into a WAV container under `/tmp`.
 *  2. The temp file is uploaded as `hearing-<EventId>-<ms timestamp>.wav`.
 *  3. The temp file is removed (also when step 1 or 2 fails).
 *  4. A signed read URL valid for 24 hours is generated.
 *  5. Previously uploaded hearing audio older than 24 hours is deleted on a
 *     best-effort basis.
 *
 * @param EventId - Id of the hearing the video belongs to; used in file names.
 * @param videoUrl - URL of the source video, passed to ffmpeg as its input.
 * @returns A signed URL from which the uploaded audio can be read for 24 hours.
 * @throws If ffmpeg fails, the upload fails, or the signed URL cannot be created.
 */
const extractAudioFromVideo = async (
  EventId: number,
  videoUrl: string
): Promise<string> => {
  const oneDayMs = 24 * 60 * 60 * 1000

  // Use a single timestamp so the temp file and the uploaded object share a name.
  const audioFileName = `hearing-${EventId}-${Date.now()}.wav`
  const tmpFilePath = `/tmp/${audioFileName}`

  const bucket = admin.storage().bucket()
  const file = bucket.file(audioFileName)

  try {
    // Stream directly from the URL into a WAV file, discarding the video stream.
    // NOTE(review): "copy" keeps the source audio codec rather than transcoding
    // to PCM — confirm the transcription service accepts the resulting file.
    await new Promise<void>((resolve, reject) => {
      ffmpeg(videoUrl)
        .noVideo()
        .audioCodec("copy")
        .format("wav")
        .on("end", () => resolve())
        .on("error", reject)
        .save(tmpFilePath)
    })

    // Upload the file's contents. `file.save(path)` would store the path
    // string itself as the object body, so use `bucket.upload` instead.
    await bucket.upload(tmpFilePath, {
      destination: audioFileName,
      metadata: { contentType: "audio/wav" }
    })
  } finally {
    // Always clean up the temporary file, even when extraction or upload
    // failed; ignore "file does not exist" and similar cleanup errors so they
    // never mask the original failure.
    await fs.promises.unlink(tmpFilePath).catch(() => undefined)
  }

  const [url] = await file.getSignedUrl({
    action: "read",
    expires: Date.now() + oneDayMs
  })

  // Delete old files. This is housekeeping only: the new audio is already
  // uploaded, so a failure here is logged instead of failing the caller.
  try {
    const [files] = await bucket.getFiles({
      prefix: "hearing-",
      maxResults: 1000
    })
    const oneDayAgo = Date.now() - oneDayMs
    // Only touch objects that exactly match the names this function creates,
    // so unrelated "hearing-*" objects are never deleted by accident.
    const generatedName = /^hearing-\d+-(\d+)\.wav$/
    const oldFiles = files.filter(candidate => {
      const match = generatedName.exec(candidate.name)
      if (!match) return false
      return Number.parseInt(match[1], 10) < oneDayAgo
    })
    const results = await Promise.allSettled(
      oldFiles.map(oldFile => oldFile.delete())
    )
    const failures = results.filter(result => result.status === "rejected")
    if (failures.length > 0) {
      console.error(
        `Failed to delete ${failures.length} old hearing audio file(s)`
      )
    }
  } catch (error) {
    console.error("Failed to clean up old hearing audio files:", error)
  }

  // Return the new audio url
  return url
}
187
+
139
188
const submitTranscription = async ( {
140
189
EventId,
141
190
maybeVideoUrl
@@ -148,11 +197,12 @@ const submitTranscription = async ({
148
197
} )
149
198
150
199
const newToken = randomBytes ( 16 ) . toString ( "hex" )
200
+ const audioUrl = await extractAudioFromVideo ( EventId , maybeVideoUrl )
151
201
152
202
const transcript = await assembly . transcripts . submit ( {
153
203
audio :
154
204
// test with: "https://assemblyaiusercontent.com/playground/aKUqpEtmYmI.flac",
155
- maybeVideoUrl ,
205
+ audioUrl ,
156
206
webhook_url :
157
207
// make sure process.env.FUNCTIONS_API_BASE equals
158
208
// https://us-central1-digital-testimony-prod.cloudfunctions.net
@@ -226,25 +276,34 @@ class HearingScraper extends EventScraper<HearingListItem, Hearing> {
226
276
const content = HearingContent . check ( data )
227
277
228
278
if ( await shouldScrapeVideo ( EventId ) ) {
229
- const maybeVideoUrl = await getHearingVideoUrl ( EventId )
230
- if ( maybeVideoUrl ) {
231
- const transcriptId = await submitTranscription ( {
232
- maybeVideoUrl,
233
- EventId
234
- } )
235
-
279
+ try {
280
+ const maybeVideoUrl = await getHearingVideoUrl ( EventId )
281
+ if ( maybeVideoUrl ) {
282
+ const transcriptId = await submitTranscription ( {
283
+ maybeVideoUrl,
284
+ EventId
285
+ } )
286
+
287
+ return {
288
+ id : `hearing-${ EventId } ` ,
289
+ type : "hearing" ,
290
+ content,
291
+ ...this . timestamps ( content ) ,
292
+ videoURL : maybeVideoUrl ,
293
+ videoFetchedAt : Timestamp . now ( ) ,
294
+ videoTranscriptionId : transcriptId // using the assembly Id as our transcriptionId
295
+ } as Hearing
296
+ }
297
+ } catch ( error ) {
298
+ console . error ( `Failed to process audio for hearing ${ EventId } :` , error )
236
299
return {
237
300
id : `hearing-${ EventId } ` ,
238
301
type : "hearing" ,
239
302
content,
240
- ...this . timestamps ( content ) ,
241
- videoURL : maybeVideoUrl ,
242
- videoFetchedAt : Timestamp . now ( ) ,
243
- videoTranscriptionId : transcriptId // using the assembly Id as our transcriptionId
303
+ ...this . timestamps ( content )
244
304
} as Hearing
245
305
}
246
306
}
247
-
248
307
return {
249
308
id : `hearing-${ EventId } ` ,
250
309
type : "hearing" ,
0 commit comments