Skip to content

Commit efd1752

Browse files
authored
Add Scrape Hearing Feature to Admin Page (#2021)
* feature(admin): Add a quick UI to the admin page for scraping a single hearing. This will make it easy to run one-offs in the future.
* chore(admin): Update the local Docker instance to run ffmpeg, and add a dummy AssemblyAI secret to avoid a startup error.
* fix(scraper): Use an empty array instead of undefined as the default for committeeChairs in the hearing scraper, because Firestore rejects undefined values by default.
1 parent 1e4b5c7 commit efd1752

File tree

8 files changed

+196
-8
lines changed

8 files changed

+196
-8
lines changed
Lines changed: 109 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,109 @@
1+
import { useState } from "react"
2+
import { Title } from "react-admin"
3+
import { Box, Button, TextField, Alert, CircularProgress } from "@mui/material"
4+
import { httpsCallable } from "firebase/functions"
5+
import { functions } from "components/firebase"
6+
7+
/** Payload sent to the `scrapeSingleHearing` callable. */
interface ScrapeHearingRequest {
  eventId: number
}

/** Shape of the data returned by the `scrapeSingleHearing` callable. */
interface ScrapeHearingResponse {
  status: string
  message: string
  hearingId: string
}

/** Typed client handle for the "scrapeSingleHearing" callable function. */
const scrapeSingleHearing = httpsCallable<
  ScrapeHearingRequest,
  ScrapeHearingResponse
>(functions, "scrapeSingleHearing")
18+
19+
/**
 * Builds the text shown in the error alert from whatever the callable
 * rejected with.
 *
 * The cloud function reports failures as an HttpsError whose `details`
 * payload is `{ details: <underlying error message> }`, so that nested value
 * is appended to the top-level message when present. `details.message` is
 * still consulted for backward compatibility.
 */
const describeScrapeError = (error: unknown): string => {
  const fallback = "Failed to scrape hearing"
  if (typeof error !== "object" || error === null) return fallback

  const { message, details } = error as {
    message?: unknown
    details?: unknown
  }
  const summary = typeof message === "string" && message ? message : fallback

  const payload =
    typeof details === "object" && details !== null
      ? (details as { details?: unknown; message?: unknown })
      : undefined
  const cause = [payload?.details, payload?.message].find(
    (value): value is string => typeof value === "string" && value !== ""
  )

  return cause && cause !== summary ? `${summary}: ${cause}` : summary
}

/**
 * Admin form that scrapes one hearing by its numeric EventId.
 *
 * Submitting calls the `scrapeSingleHearing` callable and shows the outcome
 * in an alert below the button. The field is cleared after a successful
 * scrape and kept on failure so the ID can be corrected and retried.
 */
export const ScrapeHearingForm = () => {
  const [eventId, setEventId] = useState("")
  const [loading, setLoading] = useState(false)
  const [result, setResult] = useState<{
    type: "success" | "error"
    message: string
  } | null>(null)

  const handleSubmit = async (e: React.FormEvent) => {
    e.preventDefault()
    setResult(null)

    // Number() parses the whole string, unlike parseInt, which would turn
    // "1e3" into 1 and "12.5" into 12 and scrape the wrong hearing.
    const parsedEventId = Number(eventId.trim())
    if (!Number.isSafeInteger(parsedEventId) || parsedEventId <= 0) {
      setResult({
        type: "error",
        message: "Please enter a valid numeric Event ID"
      })
      return
    }

    setLoading(true)
    try {
      const response = await scrapeSingleHearing({ eventId: parsedEventId })
      setResult({
        type: "success",
        message: `${response.data.message} (ID: ${response.data.hearingId})`
      })
      setEventId("")
    } catch (error: unknown) {
      setResult({
        type: "error",
        message: describeScrapeError(error)
      })
    } finally {
      setLoading(false)
    }
  }

  return (
    <Box sx={{ padding: 2 }}>
      <Title title="Scrape Hearing" />
      <Box
        component="form"
        onSubmit={handleSubmit}
        sx={{
          display: "flex",
          flexDirection: "column",
          gap: 2,
          maxWidth: 400
        }}
      >
        <TextField
          label="Hearing Event ID"
          placeholder="e.g., 1234"
          value={eventId}
          onChange={e => setEventId(e.target.value)}
          disabled={loading}
          required
          type="number"
          helperText="Enter the EventId from the MA Legislature website"
          fullWidth
        />
        <Button
          type="submit"
          variant="contained"
          disabled={loading || !eventId}
          sx={{ alignSelf: "flex-start" }}
        >
          {loading ? (
            <>
              <CircularProgress size={20} sx={{ mr: 1 }} color="inherit" />
              Scraping...
            </>
          ) : (
            "Scrape Hearing"
          )}
        </Button>
        {result && (
          <Alert severity={result.type} sx={{ mt: 2 }}>
            {result.message}
          </Alert>
        )}
      </Box>
    </Box>
  )
}
108+
109+
/** List-view wrapper that renders the scrape form with no extra props. */
export const ScrapeHearingList = () => {
  return <ScrapeHearingForm />
}

components/moderation/index.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ export * from "./ListPublishedTestimony"
33
export * from "./ListReports"
44
export * from "./EditReports"
55
export * from "./ListProfiles"
6+
export * from "./ScrapeHearing"
67

78
import dynamic from "next/dynamic"
89

components/moderation/moderation.tsx

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ import { FirebaseDataProvider } from "react-admin-firebase"
44
import { QueryClient, QueryClientProvider } from "react-query"
55
import { EditReports, ListReports } from "./"
66
import { ListProfiles } from "./ListProfiles"
7+
import { ScrapeHearingList } from "./ScrapeHearing"
78
import {
89
createMyOne,
910
getMyListGroup,
@@ -48,6 +49,11 @@ const App = () => {
4849
list={ListProfiles}
4950
options={{ label: "Upgrade Requests" }}
5051
/>
52+
<Resource
53+
name="scrape-hearing"
54+
list={ScrapeHearingList}
55+
options={{ label: "Scrape Hearing" }}
56+
/>
5157
</Admin>
5258
</QueryClientProvider>
5359
)

functions/.secret.local

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1 +1,2 @@
11
TYPESENSE_API_KEY=test-api-key
2+
ASSEMBLY_API_KEY=test-api-key

functions/src/events/index.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1 +1,2 @@
11
export * from "./scrapeEvents"
2+
export { scrapeSingleHearing } from "./scrapeEvents"

functions/src/events/scrapeEvents.ts

Lines changed: 71 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,9 @@
1+
import * as functions from "firebase-functions"
12
import { RuntimeOptions, runWith } from "firebase-functions"
23
import { DateTime } from "luxon"
34
import { JSDOM } from "jsdom"
45
import { AssemblyAI } from "assemblyai"
5-
import { logFetchError } from "../common"
6+
import { checkAuth, checkAdmin, logFetchError } from "../common"
67
import { db, storage, Timestamp } from "../firebase"
78
import * as api from "../malegislature"
89
import {
@@ -287,7 +288,10 @@ export const getHearingVideoUrl = async (EventId: number) => {
287288
return null
288289
}
289290

290-
const shouldScrapeVideo = async (EventId: number) => {
291+
const shouldScrapeVideo = async (
292+
EventId: number,
293+
ignoreCutoff: boolean = false
294+
) => {
291295
const eventInDb = await db
292296
.collection("events")
293297
.doc(`hearing-${String(EventId)}`)
@@ -300,7 +304,10 @@ const shouldScrapeVideo = async (EventId: number) => {
300304
return false
301305
}
302306
if (!eventData.videoURL) {
303-
return withinCutoff(new Date(Hearing.check(eventData).startsAt.toDate()))
307+
return (
308+
ignoreCutoff ||
309+
withinCutoff(new Date(Hearing.check(eventData).startsAt.toDate()))
310+
)
304311
}
305312
return false
306313
}
@@ -346,7 +353,10 @@ class HearingScraper extends EventScraper<HearingListItem, Hearing> {
346353
return events.filter(HearingListItem.guard)
347354
}
348355

349-
async getEvent({ EventId }: HearingListItem /* e.g. 4962 */) {
356+
async getEvent(
357+
{ EventId }: HearingListItem /* e.g. 4962 */,
358+
{ ignoreCutoff = false }: { ignoreCutoff?: boolean } = {}
359+
) {
350360
const data = await api.getHearing(EventId)
351361
const content = HearingContent.check(data)
352362

@@ -359,9 +369,9 @@ class HearingScraper extends EventScraper<HearingListItem, Hearing> {
359369
host.GeneralCourtNumber,
360370
host.CommitteeCode
361371
)
362-
: undefined
372+
: []
363373

364-
if (await shouldScrapeVideo(EventId)) {
374+
if (await shouldScrapeVideo(EventId, ignoreCutoff)) {
365375
try {
366376
const maybeVideoUrl = await getHearingVideoUrl(EventId)
367377
if (maybeVideoUrl) {
@@ -411,6 +421,61 @@ class HearingScraper extends EventScraper<HearingListItem, Hearing> {
411421
}
412422
}
413423

424+
/**
 * Callable cloud function to scrape a single hearing by EventId.
 *
 * The caller must pass both `checkAuth` and `checkAdmin`, which prevents
 * abuse of API call limits. The hearing is fetched through
 * `HearingScraper.getEvent` with `ignoreCutoff: true` and merged into the
 * `/events/{hearing.id}` document.
 *
 * @param data - Object containing the EventId (e.g., `{ eventId: 1234 }`)
 * @param context - Firebase callable context with auth information
 * @returns `{ status, message, hearingId }` on success
 * @throws HttpsError "invalid-argument" when `eventId` is not a positive
 *   integer, and "internal" (with the cause under `details.details`) when
 *   scraping or saving fails
 */
export const scrapeSingleHearing = functions
  .runWith({
    timeoutSeconds: 480,
    secrets: ["ASSEMBLY_API_KEY"],
    memory: "4GB"
  })
  .https.onCall(async (data: { eventId: number }, context) => {
    // Require admin authentication
    checkAuth(context, false)
    checkAdmin(context)

    // The payload comes from the client, so the parameter annotation is not
    // enforced at runtime; `data` may be null or carry a non-numeric eventId.
    const eventId: unknown = data?.eventId

    if (
      typeof eventId !== "number" ||
      !Number.isSafeInteger(eventId) ||
      eventId <= 0
    ) {
      throw new functions.https.HttpsError(
        "invalid-argument",
        "The function must be called with a valid eventId (number)."
      )
    }

    try {
      // Create a temporary scraper instance to reuse the existing logic
      const scraper = new HearingScraper()
      const hearing = await scraper.getEvent(
        { EventId: eventId },
        { ignoreCutoff: true }
      )

      // Save the hearing to Firestore
      await db.doc(`/events/${hearing.id}`).set(hearing, { merge: true })

      console.log(`Successfully scraped hearing ${eventId}`, hearing)

      return {
        status: "success",
        message: `Successfully scraped hearing ${eventId}`,
        hearingId: hearing.id
      }
    } catch (error: unknown) {
      console.error(`Failed to scrape hearing ${eventId}:`, error)
      // Anything can be thrown, so do not assume an Error instance here.
      const cause = error instanceof Error ? error.message : String(error)
      throw new functions.https.HttpsError(
        "internal",
        `Failed to scrape hearing ${eventId}`,
        { details: cause }
      )
    }
  })
478+
414479
export const scrapeSpecialEvents = new SpecialEventsScraper().function
415480
export const scrapeSessions = new SessionScraper().function
416481
export const scrapeHearings = new HearingScraper().function

functions/src/index.ts

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,12 @@ export {
1515
startCommitteeBatches,
1616
updateCommitteeRosters
1717
} from "./committees"
18-
export { scrapeHearings, scrapeSessions, scrapeSpecialEvents } from "./events"
18+
export {
19+
scrapeHearings,
20+
scrapeSessions,
21+
scrapeSpecialEvents,
22+
scrapeSingleHearing
23+
} from "./events"
1924
export {
2025
syncHearingToSearchIndex,
2126
upgradeHearingSearchIndex

infra/Dockerfile.firebase

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
FROM andreysenov/firebase-tools:latest-node-20
22

33
USER root
4-
RUN apt update && apt install -y curl python3 python3-pip python3-venv
4+
RUN apt update && apt install -y curl python3 python3-pip python3-venv ffmpeg
55

66
WORKDIR /app
77
RUN chown -R node:node .

0 commit comments

Comments
 (0)