Skip to content

Commit b9e6f51

Browse files
authored
Add Python document on_created trigger to populate summary and topics (#1908)
* Initial commit for on document created trigger * Make a test fail * Annotate and refactor * Untested commit * Idempotent pass * Add some documentation * Add the working functions * Minor refactors
1 parent 06080a0 commit b9e6f51

File tree

5 files changed

+402
-0
lines changed

5 files changed

+402
-0
lines changed

functions/src/bills/types.ts

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,9 @@ export const BillTopic = Record({
5959
* to appear in results when sorting by that value. */
6060
export const MISSING_TIMESTAMP = Timestamp.fromMillis(0)
6161

62+
/**
63+
* If you update this you also need to update `llm/bill_on_document_created.py`
64+
*/
6265
export const TOPICS_BY_CATEGORY = {
6366
Commerce: [
6467
"Banking and financial institutions regulation",

llm/bill_on_document_created.py

Lines changed: 322 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,322 @@
1+
from firebase_functions.firestore_fn import (
2+
Event,
3+
DocumentSnapshot,
4+
)
5+
from llm_functions import get_summary_api_function, get_tags_api_function_v2
6+
from typing import TypedDict, NewType
7+
8+
Category = NewType("Category", str)
9+
10+
11+
# This allows us to type the return of `get_topics`
class TopicAndCategory(TypedDict):
    """A single topic paired with the category it belongs to.

    Instances are built by `get_categories_from_topics` and written to the
    bill document's `topics` field by `run_trigger`.
    """

    # We use the name `tag` in Python, but `topic` in the database
    topic: str
    # Topic can be mapped directly to a category
    category: Category
17+
18+
19+
# Pair every known topic with its category; topics that are absent from
# `topic_to_category` are left out of the result.
def get_categories_from_topics(
    topics: list[str], topic_to_category: dict[str, Category]
) -> list[TopicAndCategory]:
    """Return a `TopicAndCategory` for each topic that has a category.

    Args:
        topics: Topic names, in the order they should appear in the result.
        topic_to_category: Lookup table from topic name to its category.

    Returns:
        One entry per topic whose looked-up category is truthy, in input
        order. Topics with no entry (or a falsy one) are skipped.
    """
    paired: list[TopicAndCategory] = []
    for name in topics:
        found = topic_to_category.get(name)
        # Missing keys yield None; skip those (and any falsy category).
        if not found:
            continue
        paired.append(TopicAndCategory(topic=name, category=found))
    return paired
29+
30+
31+
# When a bill is created for a given session, we want to populate both the
# summary and the tags for that bill. This is an idempotent function.
def run_trigger(event: Event[DocumentSnapshot | None]) -> None:
    """Populate the `summary` and `topics` fields of a newly created bill.

    Each field is generated only when it is absent from the inserted
    document, so a bill that already has both fields triggers no writes.

    Args:
        event: Firestore "document created" event. `event.params["bill_id"]`
            identifies the bill and `event.data` is the created snapshot
            (or `None`).

    Returns:
        None. Progress and failures are reported with `print`; the failure
        cases handled here return early instead of raising.
    """
    bill_id = event.params["bill_id"]
    inserted_data = event.data
    if inserted_data is None:
        print(f"bill with id `{bill_id}` has no event data")
        return

    inserted_content = inserted_data.to_dict()
    if inserted_content is None:
        print(f"bill with id `{bill_id}` has no inserted content")
        return

    # Read the title up front: the tagging step needs it even when the summary
    # already exists and the summary branch below is skipped. `or {}` also
    # covers a `contents` key that is present but explicitly null.
    contents = inserted_content.get("contents") or {}
    document_title = contents.get("Title")

    # If the summary is already populated, only run the tags code
    summary = inserted_content.get("summary")
    if summary is None:
        document_text = contents.get("DocumentText")
        if document_text is None or document_title is None:
            print(f"bill with id `{bill_id}` unable to fetch document text or title")
            return

        summary_response = get_summary_api_function(
            bill_id, document_title, document_text
        )

        # Statuses -1 and -2 are the failure codes this trigger checks for.
        if summary_response["status"] in [-1, -2]:
            print(
                f"failed to generate summary for bill with id `{bill_id}`, got {summary_response['status']}"
            )
            return

        # Set and insert the summary for the categorization step
        summary = summary_response["summary"]
        inserted_data.reference.update({"summary": summary})
        print(f"Successfully updated summary for bill with id `{bill_id}`")

    # If the topics are already populated, we are done
    topics = inserted_content.get("topics")
    if topics is not None:
        print(f"bill with id `{bill_id}` has topics")
        return

    # Reached with a pre-existing summary: the title was never validated above.
    if document_title is None:
        print(f"bill with id `{bill_id}` unable to fetch document title")
        return

    tags = get_tags_api_function_v2(bill_id, document_title, summary)

    # Only status 1 is treated as success for tagging.
    if tags["status"] != 1:
        print(
            f"failed to generate tags for bill with id `{bill_id}`, got {tags['status']}"
        )
        return
    topics_and_categories = get_categories_from_topics(tags["tags"], CATEGORY_BY_TOPIC)
    inserted_data.reference.update({"topics": topics_and_categories})
    print(f"Successfully updated topics for bill with id `{bill_id}`")
    return
84+
85+
86+
# Every category and the topics that belong to it. `CATEGORY_BY_TOPIC` is
# derived from this table.
#
# The TypeScript `TOPICS_BY_CATEGORY` in `functions/src/bills/types.ts` carries
# a note to update this file whenever it changes, so presumably the two lists
# must stay identical — keep them in sync.
#
# NOTE(review): "Corporation law and goverance" and "Payroll and emplyoment
# tax" look like typos. They are runtime strings that presumably mirror the
# TypeScript list; confirm there before correcting them here.
TOPICS_BY_CATEGORY: dict[Category, list[str]] = {
    Category("Commerce"): [
        "Banking and financial institutions regulation",
        "Consumer protection",
        "Corporation law and goverance",
        "Commercial insurance",
        "Marketing and advertising",
        "Non-profit law and governance",
        "Occupational licensing",
        "Partnerships and limited liability companies",
        "Retail and wholesale trades",
        "Securities",
    ],
    Category("Crime and Law Enforcement"): [
        "Assault and harassment offenses",
        "Correctional facilities",
        "Crimes against animals and natural resources",
        "Crimes against children",
        "Criminal investigation, prosecution, interrogation",
        "Criminal justice information and records",
        "Criminal justice reform",
        "Criminal sentencing",
        "Firearms and explosives",
        "Fraud offenses and financial crimes",
        "Property crimes",
    ],
    Category("Economics and Public Finance"): [
        "Budget process",
        "Debt collection",
        "Eminent domain",
        "Financial literacy",
        "Financial services and investments",
        "Government contractors",
        "Pension and retirement benefits",
    ],
    Category("Education"): [
        "Academic performance and assessments",
        "Adult education and literacy",
        "Charter and private schools",
        "Curriculum and standards",
        "Education technology",
        "Educational facilities and institutions",
        "Elementary and secondary education",
        "Higher education",
        "Special education",
        "Student aid and college costs",
        "Teachers and educators",
        "Vocational and technical education",
    ],
    Category("Emergency Management"): [
        "Disaster relief and insurance",
        "Emergency communications systems",
        "Emergency medical services and trauma care",
        "Emergency planning and evacuation",
        "Hazards and emergency operations",
    ],
    Category("Energy"): [
        "Energy costs assistance",
        "Energy efficiency and conservation",
        "Energy infrastructure and storage",
        "Energy prices and subsidies",
        "Energy research",
        "Renewable energy sources",
    ],
    Category("Environmental Protection"): [
        "Air quality",
        "Environmental assessment, monitoring, research",
        "Environmental education",
        "Environmental health",
        "Environmental regulatory procedures",
        "Hazardous wastes and toxic substances",
        "Pollution control and abatement",
        "Soil pollution",
        "Trash and recycling",
        "Water quality",
        "Wetlands",
        "Wildlife conservation",
    ],
    Category("Families"): [
        "Adoption and foster care",
        "Family planning and birth control",
        "Family relationships and status",
        "Family services",
        "Life insurance",
        "Parenting and parental rights",
    ],
    Category("Food, Drugs, and Alcohol"): [
        "Alcoholic beverages and licenses",
        "Drug, alcohol, tobacco use",
        "Drug safety, medical device, and laboratory regulation",
        "Food industry and services",
        "Food service employment",
        "Food supply, safety, and labeling",
        "Nutrition and diet",
    ],
    Category("Government Operations and Elections"): [
        "Census and government statistics",
        "Government information and archives",
        "Government studies and investigations",
        "Government trust funds",
        "Lobbying and campaign finance",
        'Municipality oversight and "home rule petitions"',
        "Political advertising",
        "Public-private partnerships",
        "Voting and elections",
    ],
    Category("Healthcare"): [
        "Alternative treatments",
        "Dental care",
        "Health care costs",
        "Health facilities and institutions",
        "Health information and medical records",
        "Health insurance and coverage",
        "Health technology, devices, supplies",
        "Healthcare workforce",
        "Medical research",
        "Mental health",
        "Prescription drugs",
        "Sex and reproductive health",
        "Substance use disorder and addiction",
        "Telehealth",
        "Veterinary services and pets",
    ],
    Category("Housing and Community Development"): [
        "Community life and organization",
        "Cooperative and condominium housing",
        "Homelessness and emergency shelter",
        "Housing discrimination",
        "Housing finance and home ownership",
        "Housing for the elderly and disabled",
        "Housing industry and standards",
        "Housing supply and affordability",
        "Landlord and tenant",
        "Low- and moderate-income housing",
        "Residential rehabilitation and home repair",
    ],
    Category("Immigrants and Foreign Nationals"): [
        "Immigrant health and welfare",
        "Refugees, asylum, displaced persons",
        "Right to shelter",
        "Translation and language services",
    ],
    Category("Labor and Employment"): [
        "Employee benefits",
        "Employment discrimination",
        "Employee leave",
        "Employee pensions",
        "Employee performance",
        "Migrant, seasonal, agricultural labor",
        "Self-employment",
        "Temporary and part-time employment",
        "Workers' compensation",
        "Workforce development and employment training",
        "Worker safety and health",
        "Youth employment and child labor",
    ],
    Category("Law and Judiciary"): [
        "Civil disturbances",
        "Evidence and witnesses",
        "Judicial and court records",
        "Judicial review and appeals",
        "Jurisdiction and venue",
        "Legal fees and court costs",
    ],
    Category("Public and Natural Resources"): [
        "Agriculture and aquaculture",
        "Coastal zones and ocean",
        "Forests, forestry, trees",
        "Monuments and memorials",
        "Watershed and water resources",
        "Wildlife",
    ],
    Category("Social Services"): [
        "Child care and development",
        "Domestic violence and child abuse",
        "Food assistance and relief",
        "Home and outpatient care",
        "Social work, volunteer service, charitable organizations",
        "Unemployment",
        "Urban and suburban affairs and development",
        "Veterans' education, employment, rehabilitation",
        "Veterans' loans, housing, homeless programs",
        "Veterans' medical care",
    ],
    Category("Sports and Recreation"): [
        "Art and culture",
        "Gambling and lottery",
        "Hunting and fishing",
        "Outdoor recreation",
        "Professional sports, stadiums and arenas",
        "Public parks",
        "Sports and recreation facilities",
    ],
    Category("Taxation"): [
        "Capital gains tax",
        "Corporate tax",
        "Estate tax",
        "Excise tax",
        "Gift tax",
        "Income tax",
        "Payroll and emplyoment tax",
        "Property tax",
        "Sales tax",
        "Tax-exempt organizations",
        "Transfer and inheritance taxes",
    ],
    Category("Technology and Communications"): [
        "Advanced technology and technological innovations",
        "Atmospheric science and weather",
        "Broadband and internet access",
        "Computers and information technology",
        "Cybersecurity and identity theft",
        "Data privacy",
        "Emerging technology (artificial intelligence, blockchain, etc.)",
        "Genetics",
        "Internet, web applications, social media",
        "Photography and imaging",
        "Telecommunication rates and fees",
        "Telephone and wireless communication",
    ],
    Category("Transportation and Public Works"): [
        "Aviation and airports",
        "Highways and roads",
        "MBTA & public transportation",
        "Public utilities and utility rates",
        "Railroads",
        "Vehicle insurance and repairs",
        "Water storage",
        "Water use and supply",
    ],
}
317+
318+
# Reverse lookup of `TOPICS_BY_CATEGORY`: topic name -> owning category.
# If a topic were listed under two categories, the later category would win.
CATEGORY_BY_TOPIC: dict[str, Category] = {
    topic_name: owner
    for owner, member_topics in TOPICS_BY_CATEGORY.items()
    for topic_name in member_topics
}

llm/main.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,12 @@
44
from firebase_admin import initialize_app
55
from firebase_functions import https_fn, options
66
import os
7+
from firebase_functions.firestore_fn import (
8+
on_document_created,
9+
Event,
10+
DocumentSnapshot,
11+
)
12+
import bill_on_document_created
713

814
initialize_app()
915
app = Flask(__name__)
@@ -71,3 +77,12 @@ def ready():
7177
def httpsflaskexample(req: https_fn.Request) -> https_fn.Response:
7278
with app.request_context(req.environ):
7379
return app.full_dispatch_request()
80+
81+
82+
@on_document_created(
    secrets=["OPENAI_DEV", "OPENAI_PROD"],
    document="generalCourts/{session_id}/bills/{bill_id}",
)
def add_summary_on_document_created(event: Event[DocumentSnapshot | None]) -> None:
    """Firestore trigger for documents created at the bill path above.

    Delegates all work to `bill_on_document_created.run_trigger`.
    """
    # Presumably configures the OpenAI client from the secrets declared in the
    # decorator — confirm against `set_openai_api_key`.
    set_openai_api_key()
    bill_on_document_created.run_trigger(event)

llm/readme.md

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -193,3 +193,25 @@ The current API is a little wonky because we take `bill_id` **and** `bill_text`.
193193
We could just look up the `bill_text` via the `bill_id` using the Firestore API.
194194
It might make sense to avoid the HTTP wrapper all-together and figure out how
195195
JS <-> Python communication works without an HTTP layer.
196+
197+
## Document triggers
198+
199+
The first trigger is an `@on_document_created` trigger in
200+
`bill_on_document_created.py`. The goal is to populate the `summary` and
201+
`topics` fields on bills which don't already have them. I've introduced some
202+
tests which you can run with `pytest`, plus additional type safety checked with
203+
`mypy`. If you haven't used `NewType`s before, I explain them below.
204+
205+
### NewType in Python
206+
207+
First, `Category = NewType("Category", str)` creates a wrapper around `str`
208+
which can be used anywhere in the code as a `str` but is type checked as
209+
`Category`. This is useful in `get_categories_from_topics` to avoid mixing up
210+
the topic with the category! These are often called "newtypes". It is important
211+
to point out that you shouldn't go wrap **every** type this way but they are useful
212+
when you have functions like,
213+
214+
```python
215+
def could_easily_goof_up_the_order(topic: str, category: str):
216+
return ...
217+
```

0 commit comments

Comments
 (0)