|
| 1 | +import json |
| 2 | +import concurrent.futures |
| 3 | +from polygon import RESTClient |
| 4 | + |
# Polygon REST client. No API key is passed here, so this assumes the
# POLYGON_API_KEY environment variable is set up.
# NOTE(review): trace=True presumably turns on request tracing output - confirm.
client = RESTClient(trace=True)

# Shared result store, filled in by process_ticker:
#   industry group name -> {"sic_description": <group name>, "companies": [...]}
sic_code_groups = {}
| 12 | + |
| 13 | + |
| 14 | +# https://en.wikipedia.org/wiki/Standard_Industrial_Classification |
| 15 | +# https://www.investopedia.com/terms/s/sic_code.asp |
def sic_code_to_group(sic_code):
    """
    Return the SIC industry group name for a SIC code.

    The code is converted with int(), so numeric strings (including ones with
    leading zeros) are accepted; a value int() cannot convert raises the
    usual ValueError/TypeError. Codes that fall outside every range listed
    below yield None.
    """
    code = int(sic_code)

    # Inclusive (low, high, group name) ranges; they do not overlap.
    divisions = (
        (100, 999, "Agriculture, Forestry and Fishing"),
        (1000, 1499, "Mining"),
        (1500, 1799, "Construction"),
        # 1800-1999 are not used
        (2000, 3999, "Manufacturing"),
        (4000, 4999, "Transportation and Public Utilities"),
        (5000, 5199, "Wholesale Trade"),
        (5200, 5999, "Retail Trade"),
        (6000, 6799, "Finance, Insurance and Real Estate"),
        (7000, 8999, "Services"),
        (9100, 9729, "Public Administration"),
        (9900, 9999, "Nonclassifiable"),
    )

    for low, high, group_name in divisions:
        if low <= code <= high:
            return group_name
    return None
| 46 | + |
| 47 | + |
def process_ticker(ticker_snapshot):
    """
    Look up one ticker's details and file it under its SIC industry group.

    Details are fetched through the module-level ``client``. The ticker is
    skipped when its type is not "CS" (common stock), when it has no market
    cap, when it has no SIC code, or when the SIC code maps to no group.
    Otherwise ``{"ticker": ..., "market_cap": ...}`` is appended to
    ``sic_code_groups[group]["companies"]``, creating the group entry on
    first use.

    This function is run on many worker threads at once, so the group entry
    is created with a single dict.setdefault call instead of a membership
    test followed by an assignment. With check-then-assign, two threads can
    both see the group as missing, and the later assignment replaces the
    earlier entry and silently drops companies already appended to it.

    Args:
        ticker_snapshot: object exposing a ``ticker`` attribute.

    Returns:
        None. Any exception raised while processing is caught and reported
        with print, so one bad ticker does not stop the rest (best effort).
    """
    ticker = ticker_snapshot.ticker

    try:
        details = client.get_ticker_details(ticker)

        # Only common stock with a known market cap is of interest.
        market_cap = getattr(details, "market_cap", None)
        if getattr(details, "type", None) != "CS" or market_cap is None:
            return

        # getattr keeps this consistent with the other attribute reads above.
        sic_code = getattr(details, "sic_code", None)
        if not sic_code:
            return

        sic_group = sic_code_to_group(sic_code)
        if sic_group is None:
            return

        # Fetch-or-create the group entry in one dict operation (see docstring).
        group_entry = sic_code_groups.setdefault(
            sic_group,
            {"sic_description": sic_group, "companies": []},
        )
        group_entry["companies"].append(
            {"ticker": ticker, "market_cap": market_cap}
        )

    except Exception as e:
        print(f"Error processing ticker {ticker}: {e}")
| 98 | + |
| 99 | + |
# Fetch the snapshot of all stock tickers
snapshot = client.get_snapshot_all("stocks")

# Run process_ticker for every snapshot entry on a pool of at most 100
# threads; leaving the with-block waits for the submitted work to finish
with concurrent.futures.ThreadPoolExecutor(max_workers=100) as executor:
    executor.map(process_ticker, snapshot)

# Add to each company its share ("weight") of its group's total market cap
for group_data in sic_code_groups.values():
    members = group_data["companies"]
    group_cap = sum(
        member["market_cap"] for member in members if member["market_cap"]
    )

    # A zero total would mean dividing by zero, so such groups get no weights
    if group_cap == 0:
        continue

    for member in members:
        cap = member["market_cap"]
        # A missing or zero market cap contributes a weight of 0
        member["weight"] = cap / group_cap if cap else 0

# Write the grouped, weighted data out as JSON
with open("sic_code_groups.json", "w") as f:
    json.dump(sic_code_groups, f)

print("Data collection complete and saved to 'sic_code_groups.json'")
0 commit comments