codeforboston · Mephistic · Apr 22, 2025 · Mar 26, 2025 · Mar 27, 2025 · Apr 5, 2025
@@ -137,6 +137,75 @@ class SessionScraper extends EventScraper<SessionContent, Session> {
   }
 }
 
+/**
+ * Submits a hearing video to AssemblyAI for transcription and stores the hash
+ * of a freshly generated webhook token for that hearing.
+ *
+ * @param EventId - numeric hearing id; the token hash is written to
+ *   `events/hearing-<EventId>/private/webhookAuth`
+ * @param maybeVideoUrl - media URL handed to AssemblyAI as `audio`
+ * @returns the `id` of the transcript returned by `assembly.transcripts.submit`
+ */
+const submitTranscription = async ({
+  EventId,
+  maybeVideoUrl
+}: {
+  EventId: number
+  maybeVideoUrl: string
+}) => {
+  // Random secret sent to AssemblyAI as the webhook auth header value.
+  const webhookSecret = randomBytes(16).toString("hex")
+
+  // test with: "https://ngrokid.ngrok-free.app/demo-dtp/us-central1/transcription",
+  let webhookUrl =
+    "https://us-central1-digital-testimony-prod.cloudfunctions.net/transcription"
+  if (process.env.NODE_ENV === "development") {
+    webhookUrl =
+      "https://us-central1-digital-testimony-dev.cloudfunctions.net/transcription"
+  }
+
+  const submitted = await assembly.transcripts.submit({
+    // test with: "https://assemblyaiusercontent.com/playground/aKUqpEtmYmI.flac",
+    audio: maybeVideoUrl,
+    webhook_url: webhookUrl,
+    speaker_labels: true,
+    webhook_auth_header_name: "x-maple-webhook",
+    webhook_auth_header_value: webhookSecret
+  })
+
+  // Persist sha256(secret) rather than the raw secret.
+  const webhookAuthDoc = db
+    .collection("events")
+    .doc(`hearing-${String(EventId)}`)
+    .collection("private")
+    .doc("webhookAuth")
+  await webhookAuthDoc.set({
+    videoAssemblyWebhookToken: sha256(webhookSecret)
+  })
+
+  return submitted.id
+}
+
+/**
+ * Downloads the hearing detail page from malegislature.gov and looks for the
+ * first `source` element nested inside a `video` element.
+ *
+ * @param EventId - numeric hearing id used to build the detail page URL
+ * @returns the `src` of that element, or `null` when the page body is empty
+ *   or no such element exists
+ */
+const getHearingVideoUrl = async (EventId: number) => {
+  const response = await fetch(
+    `https://malegislature.gov/Events/Hearings/Detail/${EventId}`
+  )
+  const html = await response.text()
+  if (!html) {
+    return null
+  }
+
+  // querySelector returns the first match in document order, which is the
+  // same element querySelectorAll("video source")[0] would give.
+  const { document } = new JSDOM(html).window
+  const firstSource = document.querySelector("video source")
+  if (!firstSource) {
+    return null
+  }
+
+  return (firstSource as HTMLSourceElement).src
+}
+
+/**
+ * Decides whether the video for a hearing should be scraped now.
+ *
+ * Returns `false` when the event document does not exist or already has
+ * `videoFetchedAt` set; otherwise returns `withinCutoff(start)` for the
+ * hearing's start time.
+ *
+ * @param EventId - numeric hearing id; the document read is
+ *   `events/hearing-<EventId>`
+ */
+const shouldScrapeVideo = async (EventId: number) => {
+  const snapshot = await db
+    .collection("events")
+    .doc(`hearing-${String(EventId)}`)
+    .get()
+  const eventData = snapshot.data()
+
+  // Nothing stored yet, or the video was already fetched: nothing to do.
+  if (!eventData || eventData.videoFetchedAt) {
+    return false
+  }
+
+  // getEvent writes event documents as { id, type, content, ...timestamps },
+  // so a top-level `StartTime` is normally absent and `new Date(undefined)`
+  // is an Invalid Date. Prefer the stored `startsAt` Firestore Timestamp,
+  // then the API field under `content`, and only then the legacy top-level
+  // field. NOTE(review): assumes `content.StartTime` is a date string as
+  // returned by the legislature API -- confirm against HearingContent.
+  const startsAt = eventData.startsAt
+  const start =
+    startsAt && typeof startsAt.toDate === "function"
+      ? startsAt.toDate()
+      : new Date(eventData.content?.StartTime ?? eventData.StartTime)
+
+  return withinCutoff(start)
+}
+
 class HearingScraper extends EventScraper<HearingListItem, Hearing> {
   constructor() {
     super("every 60 minutes", 240)
@@ -150,88 +219,33 @@ class HearingScraper extends EventScraper<HearingListItem, Hearing> {
   async getEvent({ EventId }: HearingListItem /* e.g. 4962 */) { // Builds the Hearing record for one event; may also submit its video for transcription.
     const data = await api.getHearing(EventId)
     const content = HearingContent.check(data)
-    const eventInDb = await db
-      .collection("events")
-      .doc(`hearing-${String(EventId)}`)
-      .get()
-    const eventData = eventInDb.data()
-    const hearing = Hearing.check(eventData)
-    const shouldScrape = withinCutoff(hearing.startsAt.toDate())
-
-    let payload: Hearing = {
+
+    if (await shouldScrapeVideo(EventId)) { // true only when the stored event has no videoFetchedAt and passes withinCutoff
+      const maybeVideoUrl = await getHearingVideoUrl(EventId) // null when the detail page has no "video source" element
+      if (maybeVideoUrl) {
+        const transcriptId = await submitTranscription({
+          maybeVideoUrl,
+          EventId
+        })
+
+        return {
+          id: `hearing-${EventId}`,
+          type: "hearing",
+          content,
+          ...this.timestamps(content),
+          videoURL: maybeVideoUrl,
+          videoFetchedAt: Timestamp.now(), // once stored, shouldScrapeVideo returns false for this hearing (presumably persisted by the base scraper -- confirm)
+          videoTranscriptionId: transcriptId // using the assembly Id as our transcriptionId
+        } as Hearing
+      }
+    }
+
+    return { // no transcription was submitted: hearing record without any video fields
       id: `hearing-${EventId}`,
       type: "hearing",
       content,
       ...this.timestamps(content)
-    }
-    if (hearing) {
-      payload = {
-        ...payload,
-        videoURL: hearing.videoURL,
-        videoFetchedAt: hearing.videoFetchedAt,
-        videoAssemblyId: hearing.videoAssemblyId
-      }
-    }
-    let maybeVideoURL = null
-    let transcript = null
-
-    if (!hearing.videoFetchedAt && shouldScrape) {
-      const req = await fetch(
-        `https://malegislature.gov/Events/Hearings/Detail/${EventId}`
-      )
-      const res = await req.text()
-      if (res) {
-        const dom = new JSDOM(res)
-        if (dom) {
-          const maybeVideoSource =
-            dom.window.document.querySelectorAll("video source")
-          if (maybeVideoSource.length && maybeVideoSource[0]) {
-            const newToken = randomBytes(16).toString("hex")
-            const firstVideoSource = maybeVideoSource[0] as HTMLSourceElement
-            maybeVideoURL = firstVideoSource.src
-
-            transcript = await assembly.transcripts.submit({
-              webhook_url:
-                process.env.NODE_ENV === "development"
-                  ? "https://us-central1-digital-testimony-dev.cloudfunctions.net/transcription"
-                  : "https://us-central1-digital-testimony-prod.cloudfunctions.net/transcription",
-              webhook_auth_header_name: "X-Maple-Webhook",
-              webhook_auth_header_value: newToken,
-              audio: firstVideoSource.src,
-              auto_highlights: true,
-              custom_topics: true,
-              entity_detection: true,
-              iab_categories: false,
-              format_text: true,
-              punctuate: true,
-              speaker_labels: true,
-              summarization: true,
-              summary_model: "informative",
-              summary_type: "bullets"
-            })
-
-            await db
-              .collection("events")
-              .doc(`hearing-${String(EventId)}`)
-              .collection("private")
-              .doc("webhookAuth")
-              .set({
-                videoAssemblyWebhookToken: sha256(newToken)
-              })
-
-            payload = {
-              ...payload,
-              videoURL: maybeVideoURL,
-              videoFetchedAt: Timestamp.now(),
-              videoAssemblyId: transcript.id
-            }
-          }
-        }
-      }
-    }
-
-    const event: Hearing = payload
-    return event
+    } as Hearing
   }
 }
 

@@ -8,23 +8,19 @@ const assembly = new AssemblyAI({
 })
 
 export const transcription = functions.https.onRequest(async (req, res) => {
-  if (
-    req.headers["X-Maple-Webhook"] &&
-    req.headers["webhook_auth_header_value"]
-  ) {
+  if (req.headers["x-maple-webhook"]) {
     if (req.body.status === "completed") {
       const transcript = await assembly.transcripts.get(req.body.transcript_id)
       if (transcript && transcript.webhook_auth) {
         const maybeEventInDb = await db
           .collection("events")
           .where("videoAssemblyId", "==", transcript.id)
           .get()
+
         if (maybeEventInDb.docs.length) {
           const authenticatedEventsInDb = maybeEventInDb.docs.filter(
             async e => {
-              const hashedToken = sha256(
-                String(req.headers["webhook_auth_header_value"])
-              )
+              const hashedToken = sha256(String(req.headers["x-maple-webhook"]))
 
               const tokenInDb = await db
                 .collection("events")
@@ -33,24 +29,62 @@ export const transcription = functions.https.onRequest(async (req, res) => {
                 .doc("webhookAuth")
                 .get()
               const tokenInDbData = tokenInDb.data()
+              console.log("tokenInDbData", tokenInDbData)
+
               if (tokenInDbData) {
                 return hashedToken === tokenInDbData.videoAssemblyWebhookToken
               }
               return false
             }
           )
+
+          const { id, text, audio_url, utterances, words } = transcript
           if (authenticatedEventsInDb) {
             try {
-              await db
+              const transcriptionInDb = db
                 .collection("transcriptions")
                 .doc(transcript.id)
-                .set({ _timestamp: new Date(), ...transcript })
 
-              authenticatedEventsInDb.forEach(async d => {
-                await d.ref.update({
-                  ["webhook_auth_header_value"]: null
+              transcriptionInDb.set({
+                id,
+                text,
+                timestamp: new Date(),
+                audio_url,
+                words
+              })
+
+              transcriptionInDb
+                .collection("timestamps")
+                .doc("utterances")
+                .set({
+                  utterances: utterances?.map(
+                    ({ speaker, confidence, start, end, text }) => ({
+                      speaker,
+                      confidence,
+                      start,
+                      end,
+                      text
+                    })
+                  )
                 })
+
+              transcriptionInDb.collection("timestamps").doc("words").set({
+                words
               })
+
+              const batch = db.batch()
+
+              batch.set(db.collection("transcriptions").doc(transcript.id), {
+                _timestamp: new Date(),
+                ...transcript
+              })
+
+              authenticatedEventsInDb.forEach(doc => {
+                batch.update(doc.ref, { ["x-maple-webhook"]: null })
+              })
+
+              await batch.commit()
+
               console.log("transcript saved in db")
             } catch (error) {
               console.log(error)