Skip to content

Commit 3ff729e

Browse files
authored
Merge pull request #1747 from boazsender/automated-transcriptions
WIP automated transcriptions
2 parents 61ee874 + 5fdb0f2 commit 3ff729e

File tree

9 files changed

+539
-9
lines changed

9 files changed

+539
-9
lines changed

functions/package.json

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,12 +15,15 @@
1515
"dependencies": {
1616
"@google-cloud/firestore": "^5.0.2",
1717
"@google-cloud/pubsub": "^3.0.1",
18+
"assemblyai": "^4.9.0",
1819
"axios": "^0.25.0",
1920
"date-fns": "^2.30.0",
2021
"firebase-admin": "^10",
2122
"firebase-functions": "^3.22.0",
2223
"fuse.js": "6.5.3",
2324
"handlebars": "^4.7.8",
25+
"js-sha256": "^0.11.0",
26+
"jsdom": "^26.0.0",
2427
"lodash": "^4.17.21",
2528
"luxon": "^2.3.1",
2629
"nanoid": "^3.3.2",
@@ -32,6 +35,7 @@
3235
},
3336
"devDependencies": {
3437
"@types/jest": "^27.4.0",
38+
"@types/jsdom": "^21.1.7",
3539
"@types/luxon": "^2.0.9",
3640
"@types/object-hash": "^2.2.1",
3741
"copyfiles": "^2.4.1",

functions/src/events/helpers.test.ts

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
import { addDays, subDays } from "date-fns"
2+
import { withinCutoff } from "./helpers"
3+
4+
// Unit tests for withinCutoff. Each case derives its input from a `now`
// captured inside the test, so the assertions hold whenever the suite runs.
describe("withinCutoff", () => {
  beforeEach(() => {
    jest.useFakeTimers()
  })

  afterEach(() => {
    jest.useRealTimers()
  })

  it("should return true for a date within three days", () => {
    const now = new Date()

    const threeDaysAgo = subDays(now, 3)

    const result = withinCutoff(threeDaysAgo)
    expect(result).toEqual(true)
  })

  it("should return true for the current instant", () => {
    // The upper bound is inclusive: a date that is not after "now" passes.
    const now = new Date()

    const result = withinCutoff(now)
    expect(result).toEqual(true)
  })

  it("should return false for a date that is 9 days ago", () => {
    const now = new Date()

    const nineDaysAgo = subDays(now, 9)

    const result = withinCutoff(nineDaysAgo)
    expect(result).toEqual(false)
  })

  it("should return false for a date exactly at the 8 day cutoff", () => {
    // The lower bound is exclusive: the date must be strictly after the cutoff.
    const now = new Date()

    const eightDaysAgo = subDays(now, 8)

    const result = withinCutoff(eightDaysAgo)
    expect(result).toEqual(false)
  })

  it("should return false for a date that is 2 days in the future", () => {
    const now = new Date()

    const twoDaysFromNow = addDays(now, 2)

    const result = withinCutoff(twoDaysFromNow)
    expect(result).toEqual(false)
  })
})

functions/src/events/helpers.ts

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
import { isAfter, subDays } from "date-fns"
2+
3+
/**
 * Reports whether `date` falls inside the trailing eight-day window that ends
 * at the moment of the call.
 *
 * @param date - The instant to test.
 * @returns `true` when `date` is strictly after (now - 8 days) and not after
 *   now; `false` otherwise, including for any future date.
 */
export const withinCutoff = (date: Date) => {
  const current = new Date()
  const earliestAllowed = subDays(current, 8)

  const isRecentEnough = isAfter(date, earliestAllowed)
  const isInFuture = isAfter(date, current)

  return isRecentEnough && !isInFuture
}

functions/src/events/scrapeEvents.ts

Lines changed: 74 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
import { runWith } from "firebase-functions"
22
import { DateTime } from "luxon"
3+
import { JSDOM } from "jsdom"
4+
import { AssemblyAI } from "assemblyai"
35
import { logFetchError } from "../common"
46
import { db, Timestamp } from "../firebase"
57
import * as api from "../malegislature"
@@ -15,6 +17,13 @@ import {
1517
SpecialEventContent
1618
} from "./types"
1719
import { currentGeneralCourt } from "../shared"
20+
import { randomBytes } from "node:crypto"
21+
import { sha256 } from "js-sha256"
22+
import { withinCutoff } from "./helpers"
23+
24+
// Module-level AssemblyAI client. The key is read from ASSEMBLY_API_KEY and
// falls back to an empty string when the variable is unset or empty.
const assembly = new AssemblyAI({
  apiKey: process.env.ASSEMBLY_API_KEY || ""
})
1827

1928
abstract class EventScraper<ListItem, Event extends BaseEvent> {
2029
private schedule
@@ -40,7 +49,7 @@ abstract class EventScraper<ListItem, Event extends BaseEvent> {
4049
if (!list) return
4150

4251
const writer = db.bulkWriter()
43-
const upcomingOrRecentCutoff = DateTime.now().minus({ days: 1 })
52+
const upcomingOrRecentCutoff = DateTime.now().minus({ days: 8 })
4453

4554
for (let item of list) {
4655
const id = (item as any)?.EventId,
@@ -138,12 +147,73 @@ class HearingScraper extends EventScraper<HearingListItem, Hearing> {
138147
return events.filter(HearingListItem.guard)
139148
}
140149

141-
async getEvent({ EventId }: HearingListItem) {
142-
const content = HearingContent.check(await api.getHearing(EventId))
150+
async getEvent({ EventId }: HearingListItem /* e.g. 4962 */) {
151+
const data = await api.getHearing(EventId)
152+
const content = HearingContent.check(data)
153+
const eventInDb = await db
154+
.collection("events")
155+
.doc(`hearing-${String(EventId)}`)
156+
.get()
157+
const eventData = eventInDb.data()
158+
const hearing = Hearing.check(eventData)
159+
const shouldScrape = withinCutoff(hearing.startsAt.toDate())
160+
161+
let maybeVideoURL = null
162+
let transcript = null
163+
if (!hearing.videoFetchedAt && shouldScrape) {
164+
const req = await fetch(
165+
`https://malegislature.gov/Events/Hearings/Detail/${EventId}`
166+
)
167+
const res = await req.text()
168+
if (res) {
169+
const dom = new JSDOM(res)
170+
if (dom) {
171+
const maybeVideoSource =
172+
dom.window.document.querySelectorAll("video source")
173+
if (maybeVideoSource.length && maybeVideoSource[0]) {
174+
const newToken = randomBytes(16).toString("hex")
175+
const firstVideoSource = maybeVideoSource[0] as HTMLSourceElement
176+
maybeVideoURL = firstVideoSource.src
177+
178+
transcript = await assembly.transcripts.submit({
179+
webhook_url:
180+
process.env.NODE_ENV === "development"
181+
? "https://us-central1-digital-testimony-dev.cloudfunctions.net/transcription"
182+
: "https://us-central1-digital-testimony-prod.cloudfunctions.net/transcription",
183+
webhook_auth_header_name: "X-Maple-Webhook",
184+
webhook_auth_header_value: newToken,
185+
audio: firstVideoSource.src,
186+
auto_highlights: true,
187+
custom_topics: true,
188+
entity_detection: true,
189+
iab_categories: false,
190+
format_text: true,
191+
punctuate: true,
192+
speaker_labels: true,
193+
summarization: true,
194+
summary_model: "informative",
195+
summary_type: "bullets"
196+
})
197+
198+
await db
199+
.collection("events")
200+
.doc(`hearing-${String(EventId)}`)
201+
.collection("private")
202+
.doc("webhookAuth")
203+
.set({
204+
videoAssemblyWebhookToken: sha256(newToken)
205+
})
206+
}
207+
}
208+
}
209+
}
143210
const event: Hearing = {
144-
id: `hearing-${content.EventId}`,
211+
id: `hearing-${EventId}`,
145212
type: "hearing",
146213
content,
214+
videoURL: maybeVideoURL,
215+
videoFetchedAt: maybeVideoURL ? Timestamp.now() : null,
216+
videoAssemblyId: transcript ? transcript.id : null,
147217
...this.timestamps(content)
148218
}
149219
return event

functions/src/events/types.ts

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -76,7 +76,10 @@ export const HearingListItem = Record({ EventId: Number })
7676
export type Hearing = Static<typeof Hearing>
7777
export const Hearing = BaseEvent.extend({
7878
type: L("hearing"),
79-
content: HearingContent
79+
content: HearingContent,
80+
videoURL: Nullable(String),
81+
videoAssemblyId: Nullable(String),
82+
videoFetchedAt: Nullable(InstanceOf(Timestamp))
8083
})
8184

8285
export type Event = Static<typeof Event>

functions/src/index.ts

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,8 @@ export {
4848
unfollowUser
4949
} from "./subscriptions"
5050

51+
export { transcription } from "./webhooks"
52+
5153
export * from "./triggerPubsubFunction"
5254

5355
// Export the health check last so it is loaded last.

functions/src/webhooks/index.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
export * from "./transcription"
Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
import * as functions from "firebase-functions"
2+
import { AssemblyAI } from "assemblyai"
3+
import { db } from "../firebase"
4+
import { sha256 } from "js-sha256"
5+
6+
// Module-level AssemblyAI client. The key is read from ASSEMBLY_API_KEY and
// falls back to an empty string when the variable is unset or empty.
const assembly = new AssemblyAI({
  apiKey: process.env.ASSEMBLY_API_KEY || ""
})
9+
10+
/**
 * HTTPS webhook invoked when an AssemblyAI transcription job changes status.
 *
 * The hearing scraper submits each job with a random token as the value of the
 * `X-Maple-Webhook` header and stores the SHA-256 hash of that token at
 * `events/{eventId}/private/webhookAuth.videoAssemblyWebhookToken`. This
 * handler hashes the header value it receives and accepts the request only if
 * the hash matches the one stored for an event whose `videoAssemblyId` equals
 * the reported transcript id. On success the full transcript is written to
 * `transcriptions/{transcriptId}`.
 *
 * Responses:
 * - 200: transcript stored, or nothing to do (status is not "completed", or
 *   the fetched transcript has no webhook auth configured)
 * - 400: "completed" notification without a usable `transcript_id`
 * - 401: auth header missing, or token matches no event for this transcript
 * - 404: no event references this transcript id
 * - 500: a lookup or write failed; a non-2xx reply lets the sender retry
 */
export const transcription = functions.https.onRequest(async (req, res) => {
  // Node lower-cases incoming header names, so indexing req.headers with the
  // mixed-case name never matches. req.get() is case-insensitive.
  const token = req.get("X-Maple-Webhook")
  if (!token) {
    res.status(401).send("Unauthorized")
    return
  }

  // Only completed transcripts are persisted; acknowledge everything else.
  if (req.body?.status !== "completed") {
    res.status(200).send()
    return
  }

  const transcriptId: unknown = req.body?.transcript_id
  if (typeof transcriptId !== "string" || transcriptId.length === 0) {
    res.status(400).send("Bad Request")
    return
  }

  try {
    const matchingEvents = await db
      .collection("events")
      .where("videoAssemblyId", "==", transcriptId)
      .get()
    if (matchingEvents.docs.length === 0) {
      // Return here so a second response is never sent for this request.
      res.status(404).send("Not Found")
      return
    }

    // Array.prototype.filter does not await its callback, so resolve every
    // token comparison first and filter on the settled booleans.
    const hashedToken = sha256(token)
    const tokenMatches = await Promise.all(
      matchingEvents.docs.map(async eventDoc => {
        const authDoc = await db
          .collection("events")
          .doc(eventDoc.id)
          .collection("private")
          .doc("webhookAuth")
          .get()
        return authDoc.data()?.videoAssemblyWebhookToken === hashedToken
      })
    )
    const authenticatedEvents = matchingEvents.docs.filter(
      (_, index) => tokenMatches[index]
    )
    if (authenticatedEvents.length === 0) {
      res.status(401).send("Unauthorized")
      return
    }

    // Fetch the transcript only after the caller has been authenticated.
    const transcript = await assembly.transcripts.get(transcriptId)
    if (!transcript || !transcript.webhook_auth) {
      res.status(200).send()
      return
    }

    await db
      .collection("transcriptions")
      .doc(transcript.id)
      .set({ _timestamp: new Date(), ...transcript })

    // NOTE(review): this nulls a `webhook_auth_header_value` field on the
    // event document itself; the stored token hash lives in
    // private/webhookAuth and is left untouched. Confirm which was intended.
    await Promise.all(
      authenticatedEvents.map(eventDoc =>
        eventDoc.ref.update({
          ["webhook_auth_header_value"]: null
        })
      )
    )
    console.log("transcript saved in db")
    res.status(200).send()
  } catch (error) {
    console.error(error)
    res.status(500).send("Internal Server Error")
  }
})

0 commit comments

Comments
 (0)