-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathget_filings.py
More file actions
251 lines (203 loc) · 10.2 KB
/
get_filings.py
File metadata and controls
251 lines (203 loc) · 10.2 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
import pandas as pd
#from config import API_KEY #later on we will individually need to create a config.py file with api keys
import requests
import xml.etree.ElementTree as ET
import yfinance as yf
from tqdm import tqdm
import json
from bs4 import BeautifulSoup
from name_to_ticker import get_ticker_from_name, clean_security_name
def fetch_holdings_data(URL):
    """
    Fetch and parse the 13F holdings listed in a single SEC filing document.

    Args:
        URL (str): URL of a raw 13F filing text document on sec.gov, e.g.
            https://www.sec.gov/Archives/edgar/data/1652044/000156761922020202/0001567619-22-020202.txt

    Returns:
        pd.DataFrame: One row per holding with issuer name, CUSIP, value,
            shares, investment discretion, resolved ticker, and a voting
            authority breakdown. Empty DataFrame when the request fails or
            no <informationTable> XML can be located/parsed.
    """
    # NOTE(review): display-only global side effect, kept for parity with
    # existing behavior — consider moving to the caller.
    pd.set_option('display.max_columns', None)
    pd.set_option('display.max_colwidth', None)

    # SEC requires a descriptive User-Agent identifying the requester.
    headers = {"User-Agent": "Some Name (some.email@example.com)"}
    response = requests.get(URL, headers=headers)
    if response.status_code != 200:
        return pd.DataFrame()

    # The filing is a text wrapper around embedded XML; slice out the
    # <informationTable> ... </informationTable> payload.
    start_index = response.text.find("<informationTable")
    if start_index == -1:
        return pd.DataFrame()
    xml_text = response.text[start_index:]
    end_index = xml_text.find("</informationTable>")
    if end_index == -1:
        # Fixed: original printed the literal "{URL}" (missing f-prefix).
        print(f"Can't find </informationTable> in {URL}")
        return pd.DataFrame()
    xml_text = xml_text[:end_index + len("</informationTable>")]

    try:
        root = ET.fromstring(xml_text)
    except ET.ParseError as e:
        print("XML parsing error:", e)
        return pd.DataFrame()

    # Namespace used by 13F information tables.
    ns = {'info': 'http://www.sec.gov/edgar/document/thirteenf/informationtable'}

    def _to_int(node):
        """Return int(node.text) when the element holds a digit string, else None."""
        # Also guards node.text being None, which the original would crash on.
        if node is None or node.text is None or not node.text.isdigit():
            return None
        return int(node.text)

    # Load and clean the name-to-ticker mapping once per filing.
    tickers_df = pd.read_csv("name_ticker_mapping.csv")
    tickers_df["Clean Name"] = tickers_df["Security Name"].apply(clean_security_name)
    choices = tickers_df["Clean Name"].tolist()

    # Hoisted: the original called root.findall(...) twice.
    info_tables = root.findall('.//info:infoTable', ns)
    print(f"Fetching {len(info_tables)} holdings from this filing.")

    holdings = []
    for info in tqdm(info_tables):
        issuer_name = info.find('info:nameOfIssuer', ns)
        cusip = info.find('info:cusip', ns)
        value = info.find('info:value', ns)
        shares = info.find('info:shrsOrPrnAmt/info:sshPrnamt', ns)
        investment_discretion = info.find('info:investmentDiscretion', ns)
        voting = info.find('info:votingAuthority', ns)
        sole = voting.find('info:Sole', ns) if voting is not None else None
        shared = voting.find('info:Shared', ns) if voting is not None else None
        none_val = voting.find('info:None', ns) if voting is not None else None

        issuer_text = issuer_name.text if issuer_name is not None else None
        value_int = _to_int(value)
        holding = {
            "issuer_name": issuer_text,
            "cusip": cusip.text if cusip is not None else None,
            # 13F values are reported in thousands of dollars.
            "value": value_int * 1000 if value_int is not None else None,
            "shares": _to_int(shares),
            "investment_discretion": investment_discretion.text if investment_discretion is not None else None,
            # Guard against a missing issuer name (original would raise AttributeError).
            "holding_ticker": get_ticker_from_name(issuer_text, choices, tickers_df) if issuer_text else None,
            "voting_authority": {
                "sole": _to_int(sole),
                "shared": _to_int(shared),
                "none": _to_int(none_val),
            }
        }
        holdings.append(holding)
    return pd.DataFrame(holdings)
def get_filings(cik):
    """
    Fetch all recent 13F filings for a given Central Index Key (CIK).

    Args:
        cik (str): The CIK number of the institution.

    Returns:
        pd.DataFrame: One row per 13F filing with columns: form, cik, date,
            url, text_url, and data (a DataFrame of the filing's holdings).
            Empty DataFrame on HTTP error or when the CIK has no 13F filings.
    """
    # SEC EDGAR submissions API; see
    # https://www.sec.gov/search-filings/edgar-application-programming-interfaces
    url = f"https://data.sec.gov/submissions/CIK{int(cik):010d}.json"
    headers = {"User-Agent": "Some Name (some.email@example.com)"}
    response = requests.get(url, headers=headers)
    if response.status_code != 200:
        print(f"There was an error fetching the data for CIK {cik}.")
        # Fixed: was `return []`, which broke callers that check `.empty`.
        return pd.DataFrame()
    data = response.json()
    filings = data.get("filings", {}).get("recent", {})
    indices = [i for i, form in enumerate(filings.get("form", [])) if "13F" in form]
    if not indices:
        return pd.DataFrame()  # this CIK has no 13F filings
    # Hoisted: the original printed this constant message on every iteration.
    print(f"num of filings for cik {cik}: {len(indices)}")
    results = []
    for i in indices:
        accession_no = filings["accessionNumber"][i]
        accession = accession_no.replace("-", "")
        base = f"https://www.sec.gov/Archives/edgar/data/{int(cik)}/{accession}"
        text_url = f"{base}/{accession_no}.txt"
        results.append({
            "form": filings["form"][i],
            "cik": cik,
            "date": filings["filingDate"][i],
            "url": f"{base}/{accession_no}-index.html",
            "text_url": text_url,
            # The textual document contains the filing's holdings table.
            "data": fetch_holdings_data(text_url),
        })
    return pd.DataFrame(results)
def fetch_cik_dict(demo_ciks=True):
    """
    Return a dict mapping zero-padded CIK strings to (ticker, institution name).

    With demo_ciks=True a small hard-coded set of well-known issuers is
    returned (large institutions with thousands of holdings are avoided
    since they make poor demos); otherwise the full company-ticker list is
    downloaded from the SEC website (empty dict on HTTP failure).
    """
    if not demo_ciks:
        url = "https://www.sec.gov/files/company_tickers.json"
        headers = {"User-Agent": "Some Name (some.email@example.com)"}
        response = requests.get(url, headers=headers)
        if response.status_code != 200:
            return {}
        payload = response.json()
        return {
            str(entry["cik_str"]).zfill(10): (entry["ticker"], entry["title"])
            for entry in payload.values()
        }

    # Demo set: (cik, ticker, institution name), padded to 10 digits below.
    demo_entries = [
        ("1018724", "AMZN", "Amazon.com Inc"),
        ("0001652044", "GOOGL", "Alphabet Inc."),
        ("0001326801", "META", "Meta Platforms, Inc."),
        ("0000320193", "AAPL", "Apple Inc."),
        ("0000789019", "MSFT", "MICROSOFT CORP"),
        ("0001045810", "NVDA", "NVIDIA CORP"),
    ]
    return {cik.zfill(10): (ticker, name) for cik, ticker, name in demo_entries}
def get_all_13f_filings(MAX_NUM_TO_FETCH=6):
    """
    Fetch 13F filings for up to MAX_NUM_TO_FETCH CIKs and enrich each row
    with institution metadata (ticker, name, sector, AUM, fund type).

    Args:
        MAX_NUM_TO_FETCH (int): Maximum number of CIKs (that actually have
            13F filings) to scrape, to save time.

    Returns:
        pd.DataFrame: All fetched filings across the processed CIKs.
    """
    cik_mapping = fetch_cik_dict(demo_ciks=True)
    frames = []
    count = 0
    for cik, (ticker, institution) in tqdm(cik_mapping.items(), desc="Fetching 13F Filings"):
        cik_filings = get_filings(cik)
        if cik_filings.empty:
            continue
        cik_filings["Ticker Symbol"] = ticker
        cik_filings["Institution Name"] = institution
        cik_filings["Sector/Industry"] = cik_filings["Ticker Symbol"].apply(
            lambda x: get_sector_from_yahoo(x) if pd.notna(x) else "N/A")
        # Fixed: the original called get_aum_and_fund_type twice per row
        # (two Yahoo requests per ticker); look it up once and split.
        aum_fund = cik_filings["Ticker Symbol"].apply(
            lambda x: get_aum_and_fund_type(x) if pd.notna(x)
            else {"AUM": "N/A", "Fund Type": "N/A"})
        cik_filings["Assets Under Management (AUM)"] = aum_fund.apply(lambda d: d["AUM"])
        cik_filings["Fund Type"] = aum_fund.apply(lambda d: d["Fund Type"])
        # Collect and concatenate once at the end instead of re-concatenating
        # the accumulated frame on every iteration.
        frames.append(cik_filings)
        count += 1
        if count == MAX_NUM_TO_FETCH:
            break
    return pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
def get_sector_from_yahoo(ticker):
    """Fetch the sector/industry string for *ticker* from Yahoo Finance.

    Returns "N/A" when the ticker is unknown or the lookup fails.
    """
    try:
        stock = yf.Ticker(ticker)
        return stock.info.get("sector", "N/A")
    except Exception:
        # Fixed: bare `except:` also swallowed KeyboardInterrupt/SystemExit.
        return "N/A"
def get_aum_and_fund_type(ticker):
    """Fetch AUM (Assets Under Management) and Fund Type from Yahoo Finance.

    Returns:
        dict: {"AUM": str, "Fund Type": str}; numeric AUM is formatted as
            "$X.XXB", and both fields fall back to "N/A" on any failure.
    """
    try:
        stock = yf.Ticker(ticker)
        aum = stock.info.get("totalAssets", "N/A")
        fund_type = stock.info.get("category", "N/A")
        if isinstance(aum, (int, float)):
            aum = f"${aum/1e9:.2f}B"  # format raw dollars as billions
        return {"AUM": aum, "Fund Type": fund_type}
    except Exception:
        # Fixed: bare `except:` also swallowed KeyboardInterrupt/SystemExit.
        return {"AUM": "N/A", "Fund Type": "N/A"}
def main():
    """Scrape demo 13F filings, flatten nested holdings to JSON, write a CSV."""
    filings = get_all_13f_filings(MAX_NUM_TO_FETCH=10)
    print(f"Fetched {len(filings)} 13F filings.")
    # Each "data" cell is a DataFrame of holdings; serialize it to a JSON
    # string of records so it survives the round-trip through CSV.
    filings["data"] = filings["data"].apply(
        lambda holdings: json.dumps(holdings.to_dict(orient="records")))
    filings.to_csv("13f_filings_demo.csv", index=False)
    print("✅ CSV file '13f_filings_demo.csv' has been created successfully.")


if __name__ == "__main__":
    main()