@@ -137,6 +137,75 @@ class SessionScraper extends EventScraper<SessionContent, Session> {
137
137
}
138
138
}
139
139
140
/**
 * Submits a hearing video to AssemblyAI for transcription and stores the
 * hashed webhook credential for that hearing.
 *
 * A fresh random token is generated for every submission. AssemblyAI is asked
 * to echo it back in the `x-maple-webhook` header when it calls the
 * transcription webhook; only `sha256(token)` is written to
 * `events/hearing-<EventId>/private/webhookAuth`.
 *
 * @param EventId - id of the hearing the video belongs to
 * @param maybeVideoUrl - URL of the media handed to AssemblyAI as `audio`
 * @returns the `id` of the transcript returned by AssemblyAI
 */
const submitTranscription = async ({
  EventId,
  maybeVideoUrl
}: {
  EventId: number
  maybeVideoUrl: string
}) => {
  const webhookToken = randomBytes(16).toString("hex")

  // The webhook target depends on the environment the scraper runs in.
  // test with: "https://ngrokid.ngrok-free.app/demo-dtp/us-central1/transcription"
  let webhookUrl =
    "https://us-central1-digital-testimony-prod.cloudfunctions.net/transcription"
  if (process.env.NODE_ENV === "development") {
    webhookUrl =
      "https://us-central1-digital-testimony-dev.cloudfunctions.net/transcription"
  }

  // test with: "https://assemblyaiusercontent.com/playground/aKUqpEtmYmI.flac"
  const submitted = await assembly.transcripts.submit({
    audio: maybeVideoUrl,
    webhook_url: webhookUrl,
    speaker_labels: true,
    webhook_auth_header_name: "x-maple-webhook",
    webhook_auth_header_value: webhookToken
  })

  // Persist only the hash of the token, in the hearing's private subcollection.
  const webhookAuthDoc = db
    .collection("events")
    .doc(`hearing-${String(EventId)}`)
    .collection("private")
    .doc("webhookAuth")
  await webhookAuthDoc.set({
    videoAssemblyWebhookToken: sha256(webhookToken)
  })

  return submitted.id
}
174
+
175
/**
 * Looks up the video URL of a hearing by scraping its public detail page.
 *
 * Fetches `https://malegislature.gov/Events/Hearings/Detail/<EventId>`, parses
 * the HTML with JSDOM and reads the `src` of the first `<source>` element
 * nested inside a `<video>`.
 *
 * @param EventId - id of the hearing whose detail page is fetched
 * @returns the video source URL, or `null` when the page responds with a
 *   non-2xx status, has an empty body, contains no `video source` element, or
 *   that element has an empty `src`
 */
const getHearingVideoUrl = async (EventId: number): Promise<string | null> => {
  const response = await fetch(
    `https://malegislature.gov/Events/Hearings/Detail/${EventId}`
  )
  // An error page is not a hearing page; don't try to scrape a video from it.
  if (!response.ok) {
    return null
  }

  const html = await response.text()
  if (!html) {
    return null
  }

  const { document } = new JSDOM(html).window
  const firstVideoSource =
    document.querySelector<HTMLSourceElement>("video source")
  if (!firstVideoSource) {
    return null
  }

  // Normalise an empty `src` to null so callers only ever see a usable URL.
  return firstVideoSource.src || null
}
193
+
194
/**
 * Decides whether the video for a hearing should be looked up and submitted
 * for transcription.
 *
 * Reads `events/hearing-<EventId>` and answers `false` when the document does
 * not exist or already has `videoFetchedAt` set. Otherwise the decision is
 * delegated to `withinCutoff` using the hearing's start time.
 *
 * @param EventId - id of the hearing to check
 * @returns the result of `withinCutoff` for the hearing's start time, or
 *   `false` when the hearing is unknown, already processed, or has no
 *   parseable start time
 */
const shouldScrapeVideo = async (EventId: number) => {
  const snapshot = await db
    .collection("events")
    .doc(`hearing-${String(EventId)}`)
    .get()
  const eventData = snapshot.data()

  if (!eventData || eventData.videoFetchedAt) {
    return false
  }

  // Prefer the `startsAt` timestamp (anything exposing `toDate()`).
  // NOTE(review): assumes stored hearing documents carry `startsAt` as a
  // Firestore Timestamp and the raw API payload under `content` — confirm
  // against the Hearing type. The raw `StartTime` fields are kept as fallbacks.
  const startsAt = eventData.startsAt
  const startTime: Date =
    startsAt && typeof startsAt.toDate === "function"
      ? startsAt.toDate()
      : new Date(eventData.StartTime ?? eventData.content?.StartTime)

  // Without a valid start time there is nothing to compare against the cutoff.
  if (Number.isNaN(startTime.getTime())) {
    return false
  }

  return withinCutoff(startTime)
}
208
+
140
209
class HearingScraper extends EventScraper < HearingListItem , Hearing > {
141
210
constructor ( ) {
142
211
super ( "every 60 minutes" , 240 )
@@ -150,69 +219,24 @@ class HearingScraper extends EventScraper<HearingListItem, Hearing> {
150
219
async getEvent ( { EventId } : HearingListItem /* e.g. 4962 */ ) {
151
220
const data = await api . getHearing ( EventId )
152
221
const content = HearingContent . check ( data )
153
- const eventInDb = await db
154
- . collection ( "events" )
155
- . doc ( `hearing-${ String ( EventId ) } ` )
156
- . get ( )
157
- const eventData = eventInDb . data ( )
158
- const hearing = Hearing . check ( eventData )
159
- const shouldScrape = withinCutoff ( hearing . startsAt . toDate ( ) )
160
-
161
- let maybeVideoURL = null
162
- let transcript = null
163
-
164
- if ( ! hearing . videoFetchedAt && shouldScrape ) {
165
- const req = await fetch (
166
- `https://malegislature.gov/Events/Hearings/Detail/${ EventId } `
167
- )
168
- const res = await req . text ( )
169
- if ( res ) {
170
- const dom = new JSDOM ( res )
171
- if ( dom ) {
172
- const maybeVideoSource =
173
- dom . window . document . querySelectorAll ( "video source" )
174
- if ( maybeVideoSource . length && maybeVideoSource [ 0 ] ) {
175
- const newToken = randomBytes ( 16 ) . toString ( "hex" )
176
- const firstVideoSource = maybeVideoSource [ 0 ] as HTMLSourceElement
177
- maybeVideoURL = firstVideoSource . src
178
-
179
- transcript = await assembly . transcripts . submit ( {
180
- audio :
181
- // test with: "https://assemblyaiusercontent.com/playground/aKUqpEtmYmI.flac",
182
- firstVideoSource . src ,
183
- webhook_url :
184
- // test with: "https://ngrokid.ngrok-free.app/demo-dtp/us-central1/transcription",
185
- process . env . NODE_ENV === "development"
186
- ? "https://us-central1-digital-testimony-dev.cloudfunctions.net/transcription"
187
- : "https://us-central1-digital-testimony-prod.cloudfunctions.net/transcription" ,
188
- speaker_labels : true ,
189
- webhook_auth_header_name : "x-maple-webhook" ,
190
- webhook_auth_header_value : newToken
191
- } )
192
-
193
- await db
194
- . collection ( "events" )
195
- . doc ( `hearing-${ String ( EventId ) } ` )
196
- . set ( {
197
- id : `hearing-${ EventId } ` ,
198
- type : "hearing" ,
199
- content,
200
- ...this . timestamps ( content ) ,
201
- videoURL : maybeVideoURL ,
202
- videoFetchedAt : Timestamp . now ( ) ,
203
- videoAssemblyId : transcript . id
204
- } )
205
-
206
- await db
207
- . collection ( "events" )
208
- . doc ( `hearing-${ String ( EventId ) } ` )
209
- . collection ( "private" )
210
- . doc ( "webhookAuth" )
211
- . set ( {
212
- videoAssemblyWebhookToken : sha256 ( newToken )
213
- } )
214
- }
215
- }
222
+
223
+ if ( await shouldScrapeVideo ( EventId ) ) {
224
+ const maybeVideoUrl = await getHearingVideoUrl ( EventId )
225
+ if ( maybeVideoUrl ) {
226
+ const transcriptId = await submitTranscription ( {
227
+ maybeVideoUrl,
228
+ EventId
229
+ } )
230
+
231
+ return {
232
+ id : `hearing-${ EventId } ` ,
233
+ type : "hearing" ,
234
+ content,
235
+ ...this . timestamps ( content ) ,
236
+ videoURL : maybeVideoUrl ,
237
+ videoFetchedAt : Timestamp . now ( ) ,
238
+ videoTranscriptionId : transcriptId // using the assembly Id as our transcriptionId
239
+ } as Hearing
216
240
}
217
241
}
218
242
0 commit comments