Skip to content

Commit f16f46e

Browse files
committed
Address review feedback on transcriptions system.
1 parent ca0f2bb commit f16f46e

File tree

2 files changed

+128
-74
lines changed

2 files changed

+128
-74
lines changed

functions/src/events/scrapeEvents.ts

Lines changed: 87 additions & 63 deletions
Original file line numberDiff line numberDiff line change
@@ -137,6 +137,75 @@ class SessionScraper extends EventScraper<SessionContent, Session> {
137137
}
138138
}
139139

140+
/**
 * Submits a hearing video to AssemblyAI for transcription and stores the
 * SHA-256 hash of the webhook auth token for that hearing.
 *
 * A fresh random token is generated per submission. The raw token is passed
 * to AssemblyAI as the value of the `x-maple-webhook` header; only its hash
 * is written to `events/hearing-{EventId}/private/webhookAuth`.
 *
 * @param EventId - Numeric id of the hearing the video belongs to.
 * @param maybeVideoUrl - URL of the media to transcribe.
 * @returns The id of the submitted AssemblyAI transcript.
 */
const submitTranscription = async ({
  EventId,
  maybeVideoUrl
}: {
  EventId: number
  maybeVideoUrl: string
}) => {
  const webhookToken = randomBytes(16).toString("hex")

  // test with: "https://ngrokid.ngrok-free.app/demo-dtp/us-central1/transcription",
  const webhookUrl =
    process.env.NODE_ENV === "development"
      ? "https://us-central1-digital-testimony-dev.cloudfunctions.net/transcription"
      : "https://us-central1-digital-testimony-prod.cloudfunctions.net/transcription"

  const submitted = await assembly.transcripts.submit({
    // test with: "https://assemblyaiusercontent.com/playground/aKUqpEtmYmI.flac",
    audio: maybeVideoUrl,
    webhook_url: webhookUrl,
    speaker_labels: true,
    webhook_auth_header_name: "x-maple-webhook",
    webhook_auth_header_value: webhookToken
  })

  // Persist only the hash of the token, never the raw value.
  const webhookAuthRef = db
    .collection("events")
    .doc(`hearing-${String(EventId)}`)
    .collection("private")
    .doc("webhookAuth")

  await webhookAuthRef.set({
    videoAssemblyWebhookToken: sha256(webhookToken)
  })

  return submitted.id
}
174+
175+
/**
 * Looks up the video source URL on a hearing's public detail page.
 *
 * Downloads `https://malegislature.gov/Events/Hearings/Detail/{EventId}`,
 * parses the HTML, and reads the `src` of the first element matching
 * `video source`.
 *
 * @param EventId - Numeric id of the hearing.
 * @returns The `src` of the first matching element, or `null` when the page
 *   body is empty or no such element exists.
 */
const getHearingVideoUrl = async (EventId: number) => {
  const response = await fetch(
    `https://malegislature.gov/Events/Hearings/Detail/${EventId}`
  )
  const html = await response.text()
  if (!html) {
    return null
  }

  const { document } = new JSDOM(html).window
  // querySelector yields the first match in document order, i.e. the same
  // element as index 0 of querySelectorAll.
  const firstVideoSource = document.querySelector(
    "video source"
  ) as HTMLSourceElement | null
  if (!firstVideoSource) {
    return null
  }

  return firstVideoSource.src
}
193+
194+
/**
 * Decides whether a hearing's video still needs to be fetched and submitted
 * for transcription.
 *
 * Returns `false` when the hearing has no document in the `events`
 * collection or when `videoFetchedAt` is already set. Otherwise the decision
 * is delegated to `withinCutoff`, applied to the hearing's start date.
 *
 * @param EventId - Numeric id of the hearing.
 * @returns Whether the video should be scraped now.
 */
const shouldScrapeVideo = async (EventId: number) => {
  const snapshot = await db
    .collection("events")
    .doc(`hearing-${String(EventId)}`)
    .get()
  const eventData = snapshot.data()

  if (!eventData || eventData.videoFetchedAt) {
    return false
  }

  // Stored hearing documents expose their start as a `startsAt` timestamp
  // (read elsewhere in this file via `.toDate()`). Prefer it, because a
  // missing top-level `StartTime` would make `new Date(undefined)` an
  // Invalid Date.
  const startsAt = eventData.startsAt
  if (startsAt && typeof startsAt.toDate === "function") {
    return withinCutoff(startsAt.toDate())
  }

  // Fallback: keep the original behaviour for documents without `startsAt`.
  return withinCutoff(new Date(eventData.StartTime))
}
208+
140209
class HearingScraper extends EventScraper<HearingListItem, Hearing> {
141210
constructor() {
142211
super("every 60 minutes", 240)
@@ -150,69 +219,24 @@ class HearingScraper extends EventScraper<HearingListItem, Hearing> {
150219
async getEvent({ EventId }: HearingListItem /* e.g. 4962 */) {
151220
const data = await api.getHearing(EventId)
152221
const content = HearingContent.check(data)
153-
const eventInDb = await db
154-
.collection("events")
155-
.doc(`hearing-${String(EventId)}`)
156-
.get()
157-
const eventData = eventInDb.data()
158-
const hearing = Hearing.check(eventData)
159-
const shouldScrape = withinCutoff(hearing.startsAt.toDate())
160-
161-
let maybeVideoURL = null
162-
let transcript = null
163-
164-
if (!hearing.videoFetchedAt && shouldScrape) {
165-
const req = await fetch(
166-
`https://malegislature.gov/Events/Hearings/Detail/${EventId}`
167-
)
168-
const res = await req.text()
169-
if (res) {
170-
const dom = new JSDOM(res)
171-
if (dom) {
172-
const maybeVideoSource =
173-
dom.window.document.querySelectorAll("video source")
174-
if (maybeVideoSource.length && maybeVideoSource[0]) {
175-
const newToken = randomBytes(16).toString("hex")
176-
const firstVideoSource = maybeVideoSource[0] as HTMLSourceElement
177-
maybeVideoURL = firstVideoSource.src
178-
179-
transcript = await assembly.transcripts.submit({
180-
audio:
181-
// test with: "https://assemblyaiusercontent.com/playground/aKUqpEtmYmI.flac",
182-
firstVideoSource.src,
183-
webhook_url:
184-
// test with: "https://ngrokid.ngrok-free.app/demo-dtp/us-central1/transcription",
185-
process.env.NODE_ENV === "development"
186-
? "https://us-central1-digital-testimony-dev.cloudfunctions.net/transcription"
187-
: "https://us-central1-digital-testimony-prod.cloudfunctions.net/transcription",
188-
speaker_labels: true,
189-
webhook_auth_header_name: "x-maple-webhook",
190-
webhook_auth_header_value: newToken
191-
})
192-
193-
await db
194-
.collection("events")
195-
.doc(`hearing-${String(EventId)}`)
196-
.set({
197-
id: `hearing-${EventId}`,
198-
type: "hearing",
199-
content,
200-
...this.timestamps(content),
201-
videoURL: maybeVideoURL,
202-
videoFetchedAt: Timestamp.now(),
203-
videoAssemblyId: transcript.id
204-
})
205-
206-
await db
207-
.collection("events")
208-
.doc(`hearing-${String(EventId)}`)
209-
.collection("private")
210-
.doc("webhookAuth")
211-
.set({
212-
videoAssemblyWebhookToken: sha256(newToken)
213-
})
214-
}
215-
}
222+
223+
if (await shouldScrapeVideo(EventId)) {
224+
const maybeVideoUrl = await getHearingVideoUrl(EventId)
225+
if (maybeVideoUrl) {
226+
const transcriptId = await submitTranscription({
227+
maybeVideoUrl,
228+
EventId
229+
})
230+
231+
return {
232+
id: `hearing-${EventId}`,
233+
type: "hearing",
234+
content,
235+
...this.timestamps(content),
236+
videoURL: maybeVideoUrl,
237+
videoFetchedAt: Timestamp.now(),
238+
videoTranscriptionId: transcriptId // using the assembly Id as our transcriptionId
239+
} as Hearing
216240
}
217241
}
218242

functions/src/webhooks/transcription.ts

Lines changed: 41 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -8,19 +8,15 @@ const assembly = new AssemblyAI({
88
})
99

1010
export const transcription = functions.https.onRequest(async (req, res) => {
11-
console.log("req.headers", req.headers)
1211
if (req.headers["x-maple-webhook"]) {
13-
console.log("req.body.status", req.body.status)
14-
1512
if (req.body.status === "completed") {
1613
const transcript = await assembly.transcripts.get(req.body.transcript_id)
17-
console.log("transcript.webhook_auth", transcript.webhook_auth)
1814
if (transcript && transcript.webhook_auth) {
1915
const maybeEventInDb = await db
2016
.collection("events")
2117
.where("videoAssemblyId", "==", transcript.id)
2218
.get()
23-
console.log("maybeEventInDb.docs.length", maybeEventInDb.docs.length)
19+
2420
if (maybeEventInDb.docs.length) {
2521
const authenticatedEventsInDb = maybeEventInDb.docs.filter(
2622
async e => {
@@ -41,20 +37,54 @@ export const transcription = functions.https.onRequest(async (req, res) => {
4137
return false
4238
}
4339
)
44-
console.log("authenticatedEventsInDb", authenticatedEventsInDb)
4540

41+
const { id, text, audio_url, utterances, words } = transcript
4642
if (authenticatedEventsInDb) {
4743
try {
48-
await db
44+
const transcriptionInDb = db
4945
.collection("transcriptions")
5046
.doc(transcript.id)
51-
.set({ _timestamp: new Date(), ...transcript })
5247

53-
authenticatedEventsInDb.forEach(async d => {
54-
await d.ref.update({
55-
["x-maple-webhook"]: null
48+
transcriptionInDb.set({
49+
id,
50+
text,
51+
timestamp: new Date(),
52+
audio_url,
53+
words
54+
})
55+
56+
transcriptionInDb
57+
.collection("timestamps")
58+
.doc("utterances")
59+
.set({
60+
utterances: utterances?.map(
61+
({ speaker, confidence, start, end, text }) => ({
62+
speaker,
63+
confidence,
64+
start,
65+
end,
66+
text
67+
})
68+
)
5669
})
70+
71+
transcriptionInDb.collection("timestamps").doc("words").set({
72+
words
73+
})
74+
75+
const batch = db.batch()
76+
77+
batch.set(db.collection("transcriptions").doc(transcript.id), {
78+
_timestamp: new Date(),
79+
...transcript
80+
})
81+
82+
authenticatedEventsInDb.forEach(doc => {
83+
batch.update(doc.ref, { ["x-maple-webhook"]: null })
5784
})
85+
86+
await batch.commit()
87+
5888
console.log("transcript saved in db")
5989
} catch (error) {
6090
console.log(error)

0 commit comments

Comments
 (0)