Skip to content

Commit 3cefc70

Browse files
committed
Add some examples
1 parent 49100d3 commit 3cefc70

File tree

5 files changed

+93
-14
lines changed

5 files changed

+93
-14
lines changed

README.md

Lines changed: 53 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,8 @@ to go for non-developers.
5555
1. Clone the whole repo, `cd` into the `pyrs990` directory
5656
1. Install dependencies - `poetry install`
5757
1. Run it; some very simple examples are below:
58-
1. `poetry run python -m pyrs990 --zip 59801 --use-disk-cache`
58+
1. `poetry run pyrs990 --zip 59801 --use-disk-cache`
59+
1. `poetry run pyrs990 --load-filters examples/has-a-website.json --use-disk-cache`
5960
1. ...more examples coming soon
6061
1. Run the commands again, notice the cache speedup
6162
1. The cache is set to `./.pyrs990-cache/`
@@ -151,6 +152,57 @@ reduce the number of files you have to download.
151152

152153
See the example queries for more information.
153154

155+
### Index Fields
156+
157+
The index fields available for filtering are listed below. Note that,
158+
in general, the Annual index may be a bit more reliable since it points
159+
directly at the filing data files and doesn't require joining on the
160+
EIN, which we haven't entirely figured out yet (there seem to be EIN
161+
values missing from one or the other index in some cases).
162+
163+
**BMF Index:**
164+
165+
- EIN - used to join indices
166+
- NAME
167+
- ICO
168+
- STREET
169+
- CITY
170+
- STATE
171+
- ZIP
172+
- GROUP
173+
- SUBSECTION
174+
- AFFILIATION
175+
- CLASSIFICATION
176+
- RULING
177+
- DEDUCTIBILITY
178+
- FOUNDATION
179+
- ACTIVITY
180+
- ORGANIZATION
181+
- STATUS
182+
- TAX_PERIOD
183+
- ASSET_CD
184+
- INCOME_CD
185+
- FILING_REQ_CD
186+
- PF_FILING_REQ_CD
187+
- ACCT_PD
188+
- ASSET_AMT
189+
- INCOME_AMT
190+
- REVENUE_AMT
191+
- NTEE_CD
192+
- SORT_NAME
193+
194+
**Annual Index:**
195+
196+
- RETURN_ID
197+
- FILING_TYPE
198+
- EIN - used to join indices
199+
- TAX_PERIOD
200+
- SUB_DATE
201+
- TAXPAYER_NAME
202+
- RETURN_TYPE
203+
- DLN
204+
- OBJECT_ID - points to filing data
205+
154206
### Sources
155207

156208
The [IRS BMF index files](https://www.irs.gov/charities-non-profits/exempt-organizations-business-master-file-extract-eo-bmf)

examples/has-a-website.json

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
{
2+
"index": {
3+
"zip": "59801"
4+
},
5+
"filing": {
6+
"website_address": "^(?!N/A).*$"
7+
}
8+
}

examples/words-in-name.json

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
{
2+
"index": {
3+
"zip": "59801",
4+
"taxpayer_name": "land|river"
5+
},
6+
"filing": {}
7+
}

pyrs990/filters.py

Lines changed: 21 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
import re
2-
from typing import Mapping
2+
from typing import List, Mapping, Pattern
33

44
from .filing import Filing, FilingFilter
55
from .index import IndexFilter, IndexRecord
@@ -13,16 +13,20 @@ def __init__(self, name: str):
1313

1414

1515
def filter_filings(filters: Mapping[str, str],) -> FilingFilter:
16+
patterns: List[Pattern] = []
17+
for field_name, filter_text in filters.items():
18+
patterns.append(re.compile(filter_text, flags=re.IGNORECASE))
19+
1620
def _filter(filing: Filing) -> bool:
1721
passed = True
18-
for fieldName in filters:
19-
filter_text = filters[fieldName]
20-
if not hasattr(filing, fieldName):
21-
raise FieldNotFound(fieldName)
22-
value = getattr(filing, fieldName)
22+
for i, f in enumerate(filters):
23+
if not hasattr(filing, f):
24+
raise FieldNotFound(f)
25+
value = getattr(filing, f)
2326
if value is None:
2427
return False
25-
match = re.match(filter_text, str(value))
28+
pattern = patterns[i]
29+
match = pattern.search(str(value))
2630
if match is None:
2731
passed = False
2832
break
@@ -32,16 +36,20 @@ def _filter(filing: Filing) -> bool:
3236

3337

3438
def filter_index_record(filters: Mapping[str, str],) -> IndexFilter:
39+
patterns: List[Pattern] = []
40+
for field_name, filter_text in filters.items():
41+
patterns.append(re.compile(filter_text, flags=re.IGNORECASE))
42+
3543
def _filter(record: IndexRecord) -> bool:
3644
passed = True
37-
for field_name in filters:
38-
filter_text = filters[field_name]
39-
if not record.has_field(field_name):
40-
raise FieldNotFound(field_name)
41-
value = record.get_field(field_name)
45+
for i, f in enumerate(filters):
46+
if not record.has_field(f):
47+
raise FieldNotFound(f)
48+
value = record.get_field(f)
4249
if value is None:
4350
return False
44-
match = re.match(filter_text, str(value))
51+
pattern = patterns[i]
52+
match = pattern.search(str(value))
4553
if match is None:
4654
passed = False
4755
break

pyrs990/index.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,10 @@ class IndexRecord(NamedTuple):
2525
that both represent the same organization, as indicated by the EIN
2626
field. In all cases the two records here should have the same `ein`
2727
value.
28+
29+
TODO: EINs seem to be inconsistent, may want to use a heuristic
30+
Maybe base it on the organization name + zip code or something?
31+
But that would be kind of a pain to validate, so not sure.
2832
"""
2933

3034
annual_record: AnnualRecord

0 commit comments

Comments
 (0)