From e7027689d943e7b89677a3153c0becb4a72273a4 Mon Sep 17 00:00:00 2001 From: Jerry Liu Date: Tue, 9 Sep 2025 23:03:40 -0700 Subject: [PATCH 1/3] cr --- .../sec_filing_classify_extract.ipynb | 853 ++++++++++++++++++ 1 file changed, 853 insertions(+) create mode 100644 examples/classify/sec_filing_classify_extract.ipynb diff --git a/examples/classify/sec_filing_classify_extract.ipynb b/examples/classify/sec_filing_classify_extract.ipynb new file mode 100644 index 00000000..b4642088 --- /dev/null +++ b/examples/classify/sec_filing_classify_extract.ipynb @@ -0,0 +1,853 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "3b349365", + "metadata": {}, + "source": [ + "# Classifying and Extracting from SEC Filings\n", + "\n", + "\"Open\n", + "\n", + "This notebook demonstrates how to classify and extract information from SEC filings using LlamaCloud. We'll walk through the process of classifying a document as a 10-K, 10-Q, 8-K, or proxy statement (DEF 14A) and then extracting the information relevant to that filing type.\n", + "\n", + "**Note**: The classification module is currently in *beta*, so we are still ironing out some interface/implementation details. 
Please let us know your feedback!\n", + "\n", + "Status:\n", + "| Last Executed | Version | State |\n", + "|---------------|---------|------------|\n", + "| Sep-09-2025 | 0.6.65 | Maintained |" + ] + }, + { + "cell_type": "markdown", + "id": "22fb430e", + "metadata": {}, + "source": [ + "## Overview\n", + "\n", + "This notebook demonstrates a classify+extract workflow on SEC filings using LlamaCloud and LlamaIndex Workflows.\n", + "\n", + "- Classify each document as one of: 10-K, 10-Q, 8-K, Proxy (DEF 14A)\n", + "- Extract a type-specific schema depending on the classification\n", + "- Orchestrate via an event-driven LlamaIndex Workflow\n", + "\n", + "We also include public example documents, so anyone can run this end-to-end.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "12356c11", + "metadata": {}, + "outputs": [], + "source": [ + "# Install and imports\n", + "import os\n", + "from typing import List, Optional\n", + "from datetime import date\n", + "from decimal import Decimal\n", + "from pydantic import BaseModel, Field\n", + "\n", + "from dotenv import load_dotenv\n", + "\n", + "load_dotenv()\n", + "\n", + "# LlamaIndex workflow imports\n", + "from llama_index.core.workflow import (\n", + " Event,\n", + " StartEvent,\n", + " StopEvent,\n", + " Context,\n", + " Workflow,\n", + " step,\n", + ")\n", + "from llama_index.core.prompts import ChatPromptTemplate\n", + "from llama_index.llms.openai import OpenAI\n", + "\n", + "# LlamaCloud classify/extract\n", + "from llama_cloud.client import AsyncLlamaCloud\n", + "from llama_cloud.types import ClassifierRule, ClassifyParsingConfiguration\n", + "from llama_cloud_services.beta.classifier.client import ClassifyClient\n", + "from llama_cloud_services import LlamaExtract, ExtractionAgent\n", + "from llama_cloud import ExtractConfig\n", + "from llama_cloud.core.api_error import ApiError" + ] + }, + { + "cell_type": "markdown", + "id": "635e00b7", + "metadata": {}, + "source": [ + "## Sample 
documents\n", + "\n", + "We will download four public SEC filings (10-K, 10-Q, 8-K, Proxy) into a local `data/` directory (relative to the notebook's working directory) and run the workflow over them.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c839b119", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'10-K': 'data/msft_10k.pdf',\n", + " '10-Q': 'data/msft_10q.pdf',\n", + " '8-K': 'data/msft_8k.pdf',\n", + " 'Proxy': 'data/msft_proxy.pdf'}" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Download Microsoft PDFs for all four types\n", + "import pathlib\n", + "import requests\n", + "\n", + "DATA_DIR = pathlib.Path(\"data\")\n", + "DATA_DIR.mkdir(parents=True, exist_ok=True)\n", + "\n", + "MSFT_DOCS = {\n", + " \"10-K\": \"https://microsoft.gcs-web.com/static-files/1c864583-06f7-40cc-a94d-d11400c83cc8\",\n", + " \"10-Q\": \"https://microsoft.gcs-web.com/static-files/f96f7d38-36ce-4a26-9e29-61701cdca7a7\",\n", + " \"8-K\": \"https://microsoft.gcs-web.com/static-files/dc50633a-2880-4303-bebb-bdca89149f65\",\n", + " \"Proxy\": \"https://microsoft.gcs-web.com/static-files/d5ec87b3-e29d-4d33-9d84-5ce1f194dcf1\",\n", + "}\n", + "\n", + "local_files = {}\n", + "for k, url in MSFT_DOCS.items():\n", + " out_path = DATA_DIR / f\"msft_{k.replace('-', '').lower()}.pdf\"\n", + " if not out_path.exists():\n", + " # fetch only if the file is not already cached locally\n", + " r = requests.get(url, timeout=60)\n", + " r.raise_for_status()\n", + " with open(out_path, \"wb\") as f:\n", + " f.write(r.content)\n", + " local_files[k] = str(out_path)\n", + "\n", + "local_files" + ] + }, + { + "cell_type": "markdown", + "id": "c882ec38", + "metadata": {}, + "source": [ + "## Define type-specific extraction schemas\n", + "\n", + "We define concise Pydantic schemas for 10-K, 10-Q, 8-K, and Proxy (DEF 14A).\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3450e8d8", + "metadata": {}, + "outputs": [], + "source": 
[ + "class Form10K(BaseModel):\n", + " company_name: str\n", + " fiscal_year_end: date\n", + " annual_revenue: Optional[Decimal] = None\n", + " net_income: Optional[Decimal] = None\n", + " total_assets: Optional[Decimal] = None\n", + " employee_count: Optional[int] = None\n", + " business_description: str\n", + " primary_risk_factors: List[str]\n", + " business_segments: List[str]\n", + " geographic_markets: List[str]\n", + "\n", + "\n", + "class Form10Q(BaseModel):\n", + " company_name: str\n", + " quarter_end: date\n", + " quarterly_revenue: Optional[Decimal] = None\n", + " quarterly_net_income: Optional[Decimal] = None\n", + " revenue_change_pct: Optional[float] = None\n", + " material_changes: List[str]\n", + " subsequent_events: List[str]\n", + "\n", + "\n", + "class Form8K(BaseModel):\n", + " company_name: str\n", + " filing_date: date\n", + " event_date: date\n", + " event_type: str\n", + " material_event_description: str\n", + " financial_impact: Optional[Decimal] = None\n", + " involved_parties: List[str]\n", + "\n", + "\n", + "class ProxyStatement(BaseModel):\n", + " company_name: str\n", + " meeting_date: date\n", + " ceo_name: str\n", + " ceo_total_compensation: Optional[Decimal] = None\n", + " board_members: List[str]\n", + " executive_officers: List[str]\n", + " shareholder_proposals: List[str]\n", + " voting_matters: List[str]\n", + " audit_firm: Optional[str] = None" + ] + }, + { + "cell_type": "markdown", + "id": "6a68087e", + "metadata": {}, + "source": [ + "## Classification rules\n", + "\n", + "We define four `ClassifierRule` entries describing each SEC form in natural language. 
The classifier returns the `type` string for the best-matching rule.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "73a78a11", + "metadata": {}, + "outputs": [], + "source": [ + "# NOTE: the types need to be in lowercase\n", + "SEC_CLASSIFICATION_RULES: list[ClassifierRule] = [\n", + " ClassifierRule(\n", + " type=\"10-k\",\n", + " description=(\n", + " \"Annual report on Form 10-K, includes business overview, risk factors, management's\"\n", + " \" discussion and analysis, audited financial statements for the fiscal year.\"\n", + " ),\n", + " ),\n", + " ClassifierRule(\n", + " type=\"10-q\",\n", + " description=(\n", + " \"Quarterly report on Form 10-Q, includes unaudited quarterly financial statements,\"\n", + " \" MD&A for the quarter, and updates on risk factors.\"\n", + " ),\n", + " ),\n", + " ClassifierRule(\n", + " type=\"8-k\",\n", + " description=(\n", + " \"Current report on Form 8-K, discloses material events such as acquisitions,\"\n", + " \" executive changes, earnings releases, or other significant occurrences.\"\n", + " ),\n", + " ),\n", + " ClassifierRule(\n", + " type=\"proxy\",\n", + " description=(\n", + " \"DEF 14A proxy statement for shareholder meetings including proposals and voting,\"\n", + " \" board of directors, executive compensation (CD&A), and auditor information.\"\n", + " ),\n", + " ),\n", + "]" + ] + }, + { + "cell_type": "markdown", + "id": "ae680962", + "metadata": {}, + "source": [ + "## Initialize clients\n", + "\n", + "We create clients for classification and extraction. Set `LLAMA_CLOUD_API_KEY` in your environment. 
Optionally set `LLAMA_CLOUD_BASE_URL`, `LLAMA_CLOUD_PROJECT_ID`, `LLAMA_CLOUD_ORGANIZATION_ID`.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "446dbb35", + "metadata": {}, + "outputs": [], + "source": [ + "api_key = os.getenv(\"LLAMA_CLOUD_API_KEY\")\n", + "base_url = os.getenv(\"LLAMA_CLOUD_BASE_URL\")\n", + "project_id = os.getenv(\"LLAMA_CLOUD_PROJECT_ID\")\n", + "organization_id = os.getenv(\"LLAMA_CLOUD_ORGANIZATION_ID\")\n", + "\n", + "if not api_key:\n", + " raise ValueError(\"LLAMA_CLOUD_API_KEY not set. Please set it in your environment.\")\n", + "\n", + "async_client = AsyncLlamaCloud(token=api_key, base_url=base_url)\n", + "classify_client = ClassifyClient(\n", + " async_client, project_id=project_id, organization_id=organization_id\n", + ")\n", + "\n", + "extract_config = ExtractConfig(extraction_mode=\"BALANCED\")\n", + "llama_extract = LlamaExtract(project_id=project_id, organization_id=organization_id)\n", + "\n", + "# Model for LLM summarization in prompts if needed\n", + "llm = OpenAI(model=\"gpt-4o\")" + ] + }, + { + "cell_type": "markdown", + "id": "b21d50f0", + "metadata": {}, + "source": [ + "## Using the Classify Module\n", + "\n", + "In this section we show how to use the `ClassifyClient` module on its own, before using it in an end-to-end workflow.\n", + "\n", + "We run the classifier on the Microsoft 10-K file to verify that it is assigned the expected class (`10-k`).\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f356085d", + "metadata": {}, + "outputs": [], + "source": [ + "# set parsing configuration\n", + "parsing_config = ClassifyParsingConfiguration(max_pages=5)\n", + "\n", + "# classify file\n", + "results = await classify_client.aclassify_file_path(\n", + " rules=SEC_CLASSIFICATION_RULES,\n", + " file_input_path=\"data/msft_10k.pdf\",\n", + " parsing_configuration=parsing_config,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "2d02406d", + "metadata": {}, 
+ "source": [ + "The result will not only contain the classification type, but also the reasoning." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "34249af0", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "10-k\n", + "The document is titled 'FORM 10-K' and is labeled as an 'ANNUAL REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934' for the fiscal year ended June 30, 2024. It contains all the hallmarks of a 10-K filing, including a business overview, risk factors, management's discussion and analysis (MD&A), and references to audited financial statements. The index lists all required sections for a 10-K, such as Business, Risk Factors, MD&A, Financial Statements, and more. There is no indication that this is a quarterly report (10-Q), a current report (8-K), or a proxy statement (DEF 14A). The content and structure perfectly match the definition of a 10-K.\n" + ] + } + ], + "source": [ + "print(results.items[0].result.type)\n", + "print(results.items[0].result.reasoning)" + ] + }, + { + "cell_type": "markdown", + "id": "8408bb00", + "metadata": {}, + "source": [ + "## Workflow: Classify then Extract\n", + "\n", + "We build a `Workflow` with steps:\n", + "- `classify_file`: upload and classify the document\n", + "- `extract_by_type`: create/select an agent for the type and extract the corresponding schema\n", + "- `format_output`: return unified JSON with `type` and `data`\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "75ddf93a", + "metadata": {}, + "outputs": [], + "source": [ + "class ClassifiedEvent(Event):\n", + " file_path: str\n", + " doc_type: str\n", + "\n", + "\n", + "class ExtractedEvent(Event):\n", + " file_path: str\n", + " doc_type: str\n", + " data: dict\n", + "\n", + "\n", + "def _schema_for_type(doc_type: str):\n", + " if doc_type == \"10-k\":\n", + " return Form10K\n", + " if doc_type == \"10-q\":\n", + " return 
Form10Q\n", + " if doc_type == \"8-k\":\n", + " return Form8K\n", + " if doc_type == \"proxy\":\n", + " return ProxyStatement\n", + " raise ValueError(f\"Unsupported doc_type: {doc_type}\")\n", + "\n", + "\n", + "def _agent_name_for_type(doc_type: str) -> str:\n", + " return f\"sec-{doc_type.lower()}-extractor\"\n", + "\n", + "\n", + "class SECClassifyExtractWorkflow(Workflow):\n", + " def __init__(self, **kwargs):\n", + " super().__init__(**kwargs)\n", + " self.agent_registry: dict[str, ExtractionAgent] = {}\n", + "\n", + " @step\n", + " async def classify_file(self, ctx: Context, ev: StartEvent) -> ClassifiedEvent:\n", + " file_path = ev.file_path\n", + " parsing_config = ClassifyParsingConfiguration(max_pages=5)\n", + " results = await classify_client.aclassify_file_path(\n", + " rules=SEC_CLASSIFICATION_RULES,\n", + " file_input_path=file_path,\n", + " parsing_configuration=parsing_config,\n", + " )\n", + " item = results.items[0]\n", + " doc_type = item.result.type\n", + " return ClassifiedEvent(file_path=file_path, doc_type=doc_type)\n", + "\n", + " @step\n", + " async def extract_by_type(\n", + " self, ctx: Context, ev: ClassifiedEvent\n", + " ) -> ExtractedEvent:\n", + " schema = _schema_for_type(ev.doc_type)\n", + " agent_name = _agent_name_for_type(ev.doc_type)\n", + "\n", + " # Lazily create agent if not present\n", + " if ev.doc_type not in self.agent_registry:\n", + " try:\n", + " existing = llama_extract.get_agent(name=agent_name)\n", + " if existing:\n", + " llama_extract.delete_agent(existing.id)\n", + " except ApiError as e:\n", + " if e.status_code != 404:\n", + " raise\n", + " agent = llama_extract.create_agent(\n", + " name=agent_name, data_schema=schema, config=extract_config\n", + " )\n", + " self.agent_registry[ev.doc_type] = agent\n", + "\n", + " extraction = await self.agent_registry[ev.doc_type].aextract(ev.file_path)\n", + " data = (\n", + " extraction.data\n", + " if isinstance(extraction.data, dict)\n", + " else 
extraction.model_dump()\n", + " )\n", + " return ExtractedEvent(file_path=ev.file_path, doc_type=ev.doc_type, data=data)\n", + "\n", + " @step\n", + " async def format_output(self, ctx: Context, ev: ExtractedEvent) -> StopEvent:\n", + " return StopEvent(\n", + " result={\"type\": ev.doc_type, \"data\": ev.data, \"file\": ev.file_path}\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a98463ec", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Running workflow for 10-K...\n", + "Running step classify_file\n", + "Step classify_file produced event ClassifiedEvent\n", + "Running step extract_by_type\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Uploading files: 100%|██████████| 1/1 [00:01<00:00, 1.84s/it]\n", + "Creating extraction jobs: 100%|██████████| 1/1 [00:00<00:00, 1.18it/s]\n", + "Extracting files: 100%|██████████| 1/1 [00:19<00:00, 19.01s/it]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Step extract_by_type produced event ExtractedEvent\n", + "Running step format_output\n", + "Step format_output produced event StopEvent\n", + "Running workflow for 10-Q...\n", + "Running step classify_file\n", + "Step classify_file produced event ClassifiedEvent\n", + "Running step extract_by_type\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Uploading files: 100%|██████████| 1/1 [00:01<00:00, 1.31s/it]\n", + "Creating extraction jobs: 100%|██████████| 1/1 [00:00<00:00, 1.43it/s]\n", + "Extracting files: 100%|██████████| 1/1 [00:25<00:00, 25.95s/it]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Step extract_by_type produced event ExtractedEvent\n", + "Running step format_output\n", + "Step format_output produced event StopEvent\n", + "Running workflow for 8-K...\n", + "Running step classify_file\n", + "Step classify_file produced event ClassifiedEvent\n", + "Running 
step extract_by_type\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Uploading files: 100%|██████████| 1/1 [00:01<00:00, 1.16s/it]\n", + "Creating extraction jobs: 100%|██████████| 1/1 [00:00<00:00, 1.40it/s]\n", + "Extracting files: 100%|██████████| 1/1 [01:58<00:00, 118.02s/it]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Step extract_by_type produced event ExtractedEvent\n", + "Running step format_output\n", + "Step format_output produced event StopEvent\n", + "Running workflow for Proxy...\n", + "Running step classify_file\n", + "Step classify_file produced event ClassifiedEvent\n", + "Running step extract_by_type\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Uploading files: 100%|██████████| 1/1 [00:01<00:00, 1.44s/it]\n", + "Creating extraction jobs: 100%|██████████| 1/1 [00:00<00:00, 1.28it/s]\n", + "Extracting files: 100%|██████████| 1/1 [02:07<00:00, 127.58s/it]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Step extract_by_type produced event ExtractedEvent\n", + "Running step format_output\n", + "Step format_output produced event StopEvent\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "data": { + "text/plain": [ + "{'10-K': {'type': '10-k', 'file': 'data/msft_10k.pdf'},\n", + " '10-Q': {'type': '10-q', 'file': 'data/msft_10q.pdf'},\n", + " '8-K': {'type': '8-k', 'file': 'data/msft_8k.pdf'},\n", + " 'Proxy': {'type': 'proxy', 'file': 'data/msft_proxy.pdf'}}" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Run the workflow on each Microsoft document\n", + "import nest_asyncio\n", + "\n", + "nest_asyncio.apply()\n", + "\n", + "workflow = SECClassifyExtractWorkflow(verbose=True, timeout=None)\n", + "\n", + "results = {}\n", + "for doc_type, path in local_files.items():\n", + " print(f\"Running workflow for 
{path}...\")\n", + " res = await workflow.run(file_path=path)\n", + " results[doc_type] = res\n", + "\n", + "{t: {\"type\": v[\"type\"], \"file\": v[\"file\"]} for t, v in results.items()}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4bd8aee0", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "==== 10-k ====\n", + "{'annual_revenue': 245122000000.0,\n", + " 'company_name': 'Microsoft Corporation',\n", + " 'fiscal_year_end': 'June 30, 2024',\n", + " 'net_income': 88136000000.0,\n", + " 'primary_risk_factors': ['We face intense competition across all markets for '\n", + " 'our products and services from a range of '\n", + " 'competitors varying in size and specialization. '\n", + " 'Barriers to entry are low in many markets, and we '\n", + " 'experience rapid evolution in technologies and user '\n", + " 'needs. Competition includes firms with competing '\n", + " 'platforms and business models such as cloud-based '\n", + " 'services and open source software. We are investing '\n", + " 'in AI as a highly competitive area. Cybersecurity '\n", + " 'threats from nation-state actors and other '\n", + " 'malicious parties pose significant risks. We are '\n", + " 'subject to a wide range of laws and regulations '\n", + " 'globally related to privacy, telecommunications, '\n", + " 'data protection, advertising, and content.',\n", + " 'Security vulnerabilities, data corruption, or '\n", + " 'reduced performance in our products and services '\n", + " 'could harm our reputation and reduce customer '\n", + " 'purchases. Improper disclosure or misuse of '\n", + " 'personal data by us or vendors could lead to legal '\n", + " 'liabilities and damage to reputation. Platforms may '\n", + " 'be abused for hostile content, leading to '\n", + " 'regulatory or reputational harm. 
Defective '\n", + " 'products, operational failures, supply chain '\n", + " 'disruptions, or infrastructure issues may cause '\n", + " 'financial or legal consequences. Legal and '\n", + " 'regulatory risks include antitrust and trade '\n", + " 'restrictions, along with data privacy laws that '\n", + " 'could increase costs or impede product adoption. '\n", + " 'Enforcement on privacy, content moderation, AI, and '\n", + " 'data protection could increase compliance costs and '\n", + " 'operational changes. Exposure exists to claims '\n", + " 'related to business practices, AI services, '\n", + " 'intellectual property, and tax liabilities.',\n", + " 'The software, device, and cloud services markets '\n", + " 'are dynamic and highly competitive. Demand for '\n", + " 'products and services is influenced by global '\n", + " 'macroeconomic and geopolitical factors. Supply '\n", + " 'chain disruptions can impact device manufacturing '\n", + " 'timelines. Success depends on attracting and '\n", + " 'retaining qualified employees. Foreign exchange '\n", + " 'rate fluctuations significantly affect revenues and '\n", + " 'expenses. The industry continually sees rapid '\n", + " 'technological and business model changes.',\n", + " 'We face economic risks including foreign exchange '\n", + " 'rate fluctuations, interest rate changes, credit '\n", + " 'risk, and equity price volatility.',\n", + " 'Product failure rates impact hardware warranty '\n", + " 'costs. Advertising expenses vary and present '\n", + " 'management risks. Investment impairments may occur '\n", + " 'due to market and credit conditions. Foreign '\n", + " 'currency risk is managed via derivatives. Interest '\n", + " 'rate risk arises from fixed-income securities. 
'\n", + " 'Credit risk is managed through credit default swaps '\n", + " 'in the fixed-income portfolio.',\n", + " 'We are subject to investigations such as the Irish '\n", + " 'Data Protection Commission investigation into '\n", + " \"LinkedIn's GDPR practices, which could result in \"\n", + " 'regulatory fines. Claims and lawsuits arise in the '\n", + " 'ordinary course of business with potential material '\n", + " 'adverse impacts. Uncertain tax positions related to '\n", + " 'transfer pricing issues with the IRS could '\n", + " 'materially impact financial statements.']}\n", + "\n", + "==== 10-q ====\n", + "{'company_name': 'Microsoft Corporation',\n", + " 'material_changes': ['In August 2024, Microsoft announced changes to the '\n", + " 'composition of its segments to better align with '\n", + " 'current business management, notably consolidating the '\n", + " 'commercial components of Microsoft 365 within the '\n", + " 'Productivity and Business Processes segment. These '\n", + " 'segment changes, which began in fiscal year 2025, were '\n", + " 'also reflected retrospectively in prior period segment '\n", + " 'information. The changes impacted Note 8 – Goodwill, '\n", + " 'Note 12 – Unearned Revenue, and Note 17 – Segment '\n", + " 'Information and Geographic Data but did not affect the '\n", + " 'consolidated balance sheets, income statements, or cash '\n", + " 'flow statements.',\n", + " 'On October 13, 2023, Microsoft completed its '\n", + " 'acquisition of Activision Blizzard, Inc. for $75.4 '\n", + " 'billion, primarily in cash. The purchase price '\n", + " 'allocation was completed by September 30, 2024. Due to '\n", + " 'the segment changes in fiscal year 2025, the reporting '\n", + " 'units and goodwill allocations were adjusted using a '\n", + " 'relative fair value approach. As of September 30, 2024, '\n", + " 'no commercial paper was issued or outstanding, '\n", + " 'following a prior $6.7 billion outstanding as of June '\n", + " '30, 2024. 
The Board of Directors approved a $60 billion '\n", + " 'share repurchase program on September 16, 2024, set to '\n", + " \"commence after the previous program's completion. Also, \"\n", + " 'in October 2024, the Irish Data Protection Commission '\n", + " 'issued a final GDPR decision involving LinkedIn, a '\n", + " 'Microsoft subsidiary.',\n", + " 'During the period, Microsoft Cloud revenues increased '\n", + " 'by 22% to $38.9 billion, with various components '\n", + " 'showing growth including Microsoft 365 Commercial '\n", + " '(13%), Microsoft 365 Consumer (5%), LinkedIn (10%), '\n", + " 'Dynamics (14%), Server products (23%), and Xbox content '\n", + " 'and services (61%) driven by the Activision '\n", + " 'acquisition. Search and news advertising revenues '\n", + " 'increased by 18%. Gross margin percentages decreased, '\n", + " 'particularly within Intelligent Cloud and Microsoft '\n", + " 'Cloud margins, partly due to scaling AI infrastructure. '\n", + " 'Operating expenses rose by $1.6 billion (12%), driven '\n", + " 'by Gaming and cloud engineering investments, while '\n", + " 'operating income increased by $3.7 billion (14%).',\n", + " 'Cash from operations increased by $3.6 billion to $34.2 '\n", + " 'billion for the three months ended September 30, 2024. '\n", + " 'Cash used in financing activities increased by $31.3 '\n", + " 'billion to $16.6 billion, primarily due to increased '\n", + " 'debt repayments. Cash used in investing rose by $15.7 '\n", + " 'billion to $15.2 billion. An $4.4 billion short-term '\n", + " 'transition tax installment is payable in fiscal year '\n", + " '2026. Share repurchases totaled 7 million shares for '\n", + " '$2.8 billion, and dividends declared amounted to $6.2 '\n", + " 'billion for the same quarter. As of September 30, 2024, '\n", + " '$7.5 billion remained available under the $60 billion '\n", + " 'share repurchase program. 
Capital expenditures are '\n", + " 'expected to rise to support cloud growth and AI '\n", + " 'infrastructure investments.',\n", + " 'Risks disclosed include security, privacy, and '\n", + " 'execution risks related to products and services '\n", + " 'including AI and IoT solutions; integration and '\n", + " 'reliability risks with third-party products; legal, '\n", + " 'regulatory, and litigation risks including antitrust '\n", + " 'and government competition law enforcement; supply '\n", + " 'chain and component quality issues; compliance risks '\n", + " 'with corruption, trade, and sanctions laws; evolving '\n", + " 'data privacy laws increasing compliance burdens; '\n", + " 'operational risks from outages and disruptions; '\n", + " 'increased scrutiny and regulation concerning AI, '\n", + " 'cybersecurity, and ESG requirements; tax risks '\n", + " 'including ongoing IRS audits for $28.9 billion plus '\n", + " 'penalties; sustainability regulatory requirements and '\n", + " 'commitments; intellectual property protection risks; '\n", + " 'general reputational risks from product issues and '\n", + " 'social scrutiny; adverse economic and market '\n", + " 'conditions; geopolitical, catastrophic, and '\n", + " 'climate-related risks; employment market and '\n", + " 'unionization challenges; and risks from global '\n", + " 'operations such as currency fluctuations and '\n", + " 'protectionist trends.'],\n", + " 'quarter_end': 'September 30, 2024',\n", + " 'quarterly_revenue': 65585.0,\n", + " 'revenue_change_pct': 16.05}\n", + "\n", + "==== 8-k ====\n", + "{'company_name': 'Microsoft Corporation',\n", + " 'event_date': 'August 21, 2024',\n", + " 'event_type': 'Regulation FD Disclosure – Updated Segment Reporting and FY25 '\n", + " 'Investor Metrics',\n", + " 'material_event_description': 'Microsoft Corporation announced changes to its '\n", + " 'segment structure and key metrics reporting, '\n", + " 'effective for Fiscal Year 2025. 
The company '\n", + " 'updated its reporting segments, notably '\n", + " 'consolidating Microsoft 365 Commercial revenue '\n", + " 'streams into the Productivity and Business '\n", + " 'Processes segment, reclassifying Copilot Pro '\n", + " 'and Nuance Enterprise revenue, and making '\n", + " 'corresponding adjustments to key performance '\n", + " 'indicators (KPIs). Microsoft also provided '\n", + " 'restated investor metrics and mechanical '\n", + " 'updates to its FY25 Q1 outlook solely to '\n", + " 'reflect the presentation changes. On August '\n", + " '21, 2024, Microsoft posted related '\n", + " 'presentation materials to its Investor '\n", + " 'Relations website.'}\n", + "\n", + "==== proxy ====\n", + "{'ceo_name': 'Satya Nadella',\n", + " 'ceo_total_compensation': None,\n", + " 'company_name': 'Microsoft Corporation',\n", + " 'meeting_date': '12/10/24',\n", + " 'voting_matters': ['Election of Directors',\n", + " 'Advisory Vote to Approve Named Executive Officer '\n", + " 'Compensation (\"say-on-pay vote\")',\n", + " 'Ratification of the Selection of Deloitte & Touche LLP as '\n", + " 'our Independent Auditor for Fiscal Year 2025',\n", + " 'Report on Risks of Weapons Development',\n", + " 'Assessment of Investing in Bitcoin',\n", + " 'Report on Data Operations in Human Rights Hotspots',\n", + " 'Report on Artificial Intelligence and Machine Learning '\n", + " 'Tools for Oil and Gas Development and Production',\n", + " 'Report on AI Misinformation and Disinformation',\n", + " 'Report on AI Data Sourcing Accountability']}\n" + ] + } + ], + "source": [ + "# Pretty print a subset of fields for each type\n", + "import json\n", + "from pprint import pprint\n", + "\n", + "\n", + "def summarize(doc_type: str, data: dict):\n", + " print(f\"\\n==== {doc_type} ====\")\n", + " if doc_type == \"10-k\":\n", + " keys = [\n", + " \"company_name\",\n", + " \"fiscal_year_end\",\n", + " \"annual_revenue\",\n", + " \"net_income\",\n", + " \"primary_risk_factors\",\n", + " ]\n", 
+ " elif doc_type == \"10-q\":\n", + " keys = [\n", + " \"company_name\",\n", + " \"quarter_end\",\n", + " \"quarterly_revenue\",\n", + " \"revenue_change_pct\",\n", + " \"material_changes\",\n", + " ]\n", + " elif doc_type == \"8-k\":\n", + " keys = [\n", + " \"company_name\",\n", + " \"event_date\",\n", + " \"event_type\",\n", + " \"material_event_description\",\n", + " ]\n", + " else: # Proxy\n", + " keys = [\n", + " \"company_name\",\n", + " \"meeting_date\",\n", + " \"ceo_name\",\n", + " \"ceo_total_compensation\",\n", + " \"voting_matters\",\n", + " ]\n", + " subset = {k: data.get(k) for k in keys}\n", + " pprint(subset)\n", + "\n", + "\n", + "for t, out in results.items():\n", + " summarize(out[\"type\"], out[\"data\"])" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "llama_parse", + "language": "python", + "name": "llama_parse" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From c49347afbf61614108df2472874320cb8e87b52f Mon Sep 17 00:00:00 2001 From: Jerry Liu Date: Wed, 10 Sep 2025 10:08:34 -0700 Subject: [PATCH 2/3] cr --- .../sec_filing_classify_extract.ipynb | 557 ++++++++++++------ 1 file changed, 361 insertions(+), 196 deletions(-) diff --git a/examples/classify/sec_filing_classify_extract.ipynb b/examples/classify/sec_filing_classify_extract.ipynb index b4642088..1613bc5f 100644 --- a/examples/classify/sec_filing_classify_extract.ipynb +++ b/examples/classify/sec_filing_classify_extract.ipynb @@ -355,7 +355,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 32, "id": "75ddf93a", "metadata": {}, "outputs": [], @@ -443,7 +443,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 34, "id": "a98463ec", "metadata": {}, "outputs": [ @@ -451,7 +451,9 @@ 
"name": "stdout", "output_type": "stream", "text": [ - "Running workflow for 10-K...\n", + "Running step classify_file\n", + "Running step classify_file\n", + "Running step classify_file\n", "Running step classify_file\n", "Step classify_file produced event ClassifiedEvent\n", "Running step extract_by_type\n" @@ -461,20 +463,20 @@ "name": "stderr", "output_type": "stream", "text": [ - "Uploading files: 100%|██████████| 1/1 [00:01<00:00, 1.84s/it]\n", - "Creating extraction jobs: 100%|██████████| 1/1 [00:00<00:00, 1.18it/s]\n", - "Extracting files: 100%|██████████| 1/1 [00:19<00:00, 19.01s/it]\n" + "\n", + "\u001b[A\n", + "Uploading files: 100%|██████████| 1/1 [00:01<00:00, 1.31s/it]\n", + "\n", + "\u001b[A\n", + "Creating extraction jobs: 100%|██████████| 1/1 [00:00<00:00, 2.86it/s]\n", + "\n", + "\u001b[A" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "Step extract_by_type produced event ExtractedEvent\n", - "Running step format_output\n", - "Step format_output produced event StopEvent\n", - "Running workflow for 10-Q...\n", - "Running step classify_file\n", "Step classify_file produced event ClassifiedEvent\n", "Running step extract_by_type\n" ] @@ -483,20 +485,57 @@ "name": "stderr", "output_type": "stream", "text": [ - "Uploading files: 100%|██████████| 1/1 [00:01<00:00, 1.31s/it]\n", - "Creating extraction jobs: 100%|██████████| 1/1 [00:00<00:00, 1.43it/s]\n", - "Extracting files: 100%|██████████| 1/1 [00:25<00:00, 25.95s/it]\n" + "\n", + "\n", + "\u001b[A\u001b[A\n", + "\n", + "Uploading files: 100%|██████████| 1/1 [00:00<00:00, 1.67it/s]\n", + "\n", + "\n", + "\u001b[A\u001b[A\n", + "\n", + "Creating extraction jobs: 100%|██████████| 1/1 [00:00<00:00, 3.09it/s]\n", + "\n", + "\n", + "\u001b[A\u001b[A" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Step classify_file produced event ClassifiedEvent\n", + "Running step extract_by_type\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + 
"\n", + "\n", + "\u001b[A\u001b[A\u001b[A\n", + "\n", + "\n", + "Uploading files: 100%|██████████| 1/1 [00:00<00:00, 1.56it/s]\n", + "\n", + "\n", + "\n", + "\u001b[A\u001b[A\u001b[A\n", + "\n", + "\n", + "Creating extraction jobs: 100%|██████████| 1/1 [00:01<00:00, 1.43s/it]\n", + "\n", + "\n", + "\n", + "\u001b[A\u001b[A\u001b[A" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "Step extract_by_type produced event ExtractedEvent\n", - "Running step format_output\n", - "Step format_output produced event StopEvent\n", - "Running workflow for 8-K...\n", - "Running step classify_file\n", "Step classify_file produced event ClassifiedEvent\n", "Running step extract_by_type\n" ] @@ -505,9 +544,12 @@ "name": "stderr", "output_type": "stream", "text": [ - "Uploading files: 100%|██████████| 1/1 [00:01<00:00, 1.16s/it]\n", - "Creating extraction jobs: 100%|██████████| 1/1 [00:00<00:00, 1.40it/s]\n", - "Extracting files: 100%|██████████| 1/1 [01:58<00:00, 118.02s/it]\n" + "\n", + "\n", + "\n", + "\n", + "\u001b[A\u001b[A\u001b[A\u001b[A\n", + "Extracting files: 100%|██████████| 1/1 [00:19<00:00, 19.24s/it]\n" ] }, { @@ -516,20 +558,61 @@ "text": [ "Step extract_by_type produced event ExtractedEvent\n", "Running step format_output\n", - "Step format_output produced event StopEvent\n", - "Running workflow for Proxy...\n", - "Running step classify_file\n", - "Step classify_file produced event ClassifiedEvent\n", - "Running step extract_by_type\n" + "Step format_output produced event StopEvent\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "\n", + "\n", + "\n", + "Uploading files: 100%|██████████| 1/1 [00:00<00:00, 1.97it/s]\n", + "\n", + "\u001b[A\n", + "Creating extraction jobs: 100%|██████████| 1/1 [00:00<00:00, 3.94it/s]\n", + "\n", + "\u001b[A\n", + "\n", + "Extracting files: 100%|██████████| 1/1 [00:42<00:00, 42.17s/it]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Step extract_by_type 
produced event ExtractedEvent\n", + "Running step format_output\n", + "Step format_output produced event StopEvent\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "Extracting files: 100%|██████████| 1/1 [00:38<00:00, 38.45s/it]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Step extract_by_type produced event ExtractedEvent\n", + "Running step format_output\n", + "Step format_output produced event StopEvent\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "Uploading files: 100%|██████████| 1/1 [00:01<00:00, 1.44s/it]\n", - "Creating extraction jobs: 100%|██████████| 1/1 [00:00<00:00, 1.28it/s]\n", - "Extracting files: 100%|██████████| 1/1 [02:07<00:00, 127.58s/it]" + "\n", + "\n", + "\n", + "Extracting files: 100%|██████████| 1/1 [01:16<00:00, 76.33s/it]" ] }, { @@ -557,31 +640,51 @@ " 'Proxy': {'type': 'proxy', 'file': 'data/msft_proxy.pdf'}}" ] }, - "execution_count": null, + "execution_count": 34, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "# Run the workflow on each Microsoft document\n", - "import nest_asyncio\n", + "# # Run the workflow on each Microsoft document\n", + "# import nest_asyncio\n", "\n", + "# nest_asyncio.apply()\n", + "\n", + "# workflow = SECClassifyExtractWorkflow(verbose=True, timeout=None)\n", + "\n", + "# results = {}\n", + "# for doc_type, path in local_files.items():\n", + "# print(f\"Running workflow for {path}...\")\n", + "# res = await workflow.run(file_path=path)\n", + "# results[doc_type] = res\n", + "\n", + "# {t: {\"type\": v[\"type\"], \"file\": v[\"file\"]} for t, v in results.items()}\n", + "\n", + "\n", + "import asyncio\n", + "import nest_asyncio\n", "nest_asyncio.apply()\n", "\n", - "workflow = SECClassifyExtractWorkflow(verbose=True, timeout=None)\n", + "# Optional: limit concurrency (helps with rate limits)\n", + "sem = asyncio.Semaphore(4)\n", + "\n", + "async def run_one(doc_type: str, path: str):\n", + " async 
with sem: # remove this line and the 'async with' to run fully unbounded\n", + " wf = SECClassifyExtractWorkflow(verbose=True, timeout=180)\n", + " result = await wf.run(file_path=path)\n", + " return doc_type, result\n", "\n", - "results = {}\n", - "for doc_type, path in local_files.items():\n", - " print(f\"Running workflow for {path}...\")\n", - " res = await workflow.run(file_path=path)\n", - " results[doc_type] = res\n", + "pairs = await asyncio.gather(*(run_one(t, p) for t, p in local_files.items()))\n", + "results_by_type = {t: res for t, res in pairs}\n", "\n", - "{t: {\"type\": v[\"type\"], \"file\": v[\"file\"]} for t, v in results.items()}" + "# Quick view\n", + "{t: {\"type\": v[\"type\"], \"file\": v[\"file\"]} for t, v in results_by_type.items()}" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 36, "id": "4bd8aee0", "metadata": {}, "outputs": [ @@ -591,176 +694,230 @@ "text": [ "\n", "==== 10-k ====\n", - "{'annual_revenue': 245122000000.0,\n", + "{'annual_revenue': 245157565500.0,\n", " 'company_name': 'Microsoft Corporation',\n", " 'fiscal_year_end': 'June 30, 2024',\n", " 'net_income': 88136000000.0,\n", " 'primary_risk_factors': ['We face intense competition across all markets for '\n", - " 'our products and services from a range of '\n", - " 'competitors varying in size and specialization. '\n", - " 'Barriers to entry are low in many markets, and we '\n", - " 'experience rapid evolution in technologies and user '\n", - " 'needs. Competition includes firms with competing '\n", - " 'platforms and business models such as cloud-based '\n", - " 'services and open source software. We are investing '\n", - " 'in AI as a highly competitive area. Cybersecurity '\n", - " 'threats from nation-state actors and other '\n", - " 'malicious parties pose significant risks. 
We are '\n", - " 'subject to a wide range of laws and regulations '\n", - " 'globally related to privacy, telecommunications, '\n", - " 'data protection, advertising, and content.',\n", - " 'Security vulnerabilities, data corruption, or '\n", - " 'reduced performance in our products and services '\n", - " 'could harm our reputation and reduce customer '\n", - " 'purchases. Improper disclosure or misuse of '\n", - " 'personal data by us or vendors could lead to legal '\n", - " 'liabilities and damage to reputation. Platforms may '\n", - " 'be abused for hostile content, leading to '\n", - " 'regulatory or reputational harm. Defective '\n", - " 'products, operational failures, supply chain '\n", - " 'disruptions, or infrastructure issues may cause '\n", - " 'financial or legal consequences. Legal and '\n", - " 'regulatory risks include antitrust and trade '\n", - " 'restrictions, along with data privacy laws that '\n", - " 'could increase costs or impede product adoption. '\n", - " 'Enforcement on privacy, content moderation, AI, and '\n", - " 'data protection could increase compliance costs and '\n", - " 'operational changes. Exposure exists to claims '\n", - " 'related to business practices, AI services, '\n", - " 'intellectual property, and tax liabilities.',\n", - " 'The software, device, and cloud services markets '\n", - " 'are dynamic and highly competitive. Demand for '\n", - " 'products and services is influenced by global '\n", - " 'macroeconomic and geopolitical factors. Supply '\n", - " 'chain disruptions can impact device manufacturing '\n", - " 'timelines. Success depends on attracting and '\n", - " 'retaining qualified employees. Foreign exchange '\n", - " 'rate fluctuations significantly affect revenues and '\n", - " 'expenses. 
The industry continually sees rapid '\n", - " 'technological and business model changes.',\n", - " 'We face economic risks including foreign exchange '\n", - " 'rate fluctuations, interest rate changes, credit '\n", - " 'risk, and equity price volatility.',\n", - " 'Product failure rates impact hardware warranty '\n", - " 'costs. Advertising expenses vary and present '\n", - " 'management risks. Investment impairments may occur '\n", - " 'due to market and credit conditions. Foreign '\n", - " 'currency risk is managed via derivatives. Interest '\n", - " 'rate risk arises from fixed-income securities. '\n", - " 'Credit risk is managed through credit default swaps '\n", - " 'in the fixed-income portfolio.',\n", - " 'We are subject to investigations such as the Irish '\n", - " 'Data Protection Commission investigation into '\n", - " \"LinkedIn's GDPR practices, which could result in \"\n", - " 'regulatory fines. Claims and lawsuits arise in the '\n", - " 'ordinary course of business with potential material '\n", - " 'adverse impacts. Uncertain tax positions related to '\n", - " 'transfer pricing issues with the IRS could '\n", - " 'materially impact financial statements.']}\n", + " 'our products and services, which may adversely '\n", + " 'affect our results of operations. Competition in '\n", + " 'the technology sector from diversified global '\n", + " 'companies and small, specialized firms. Barriers to '\n", + " 'entry in many of our businesses are low and many '\n", + " 'areas in which we compete evolve rapidly with '\n", + " 'changing and disruptive technologies, shifting user '\n", + " 'needs, and frequent introductions of new products '\n", + " 'and services. Competition from firms that provide '\n", + " 'competing platforms and platform-based ecosystems. 
'\n", + " 'Competition from vertically-integrated models where '\n", + " 'firms control software, hardware, and related '\n", + " 'services, increasing their revenue and potential '\n", + " 'security/performance benefits. Competing platforms '\n", + " 'offer content and application marketplaces with '\n", + " 'scale and significant installed bases, challenging '\n", + " 'our ability to attract developers. Cloud-based '\n", + " 'services competition for consumers and business '\n", + " 'customers; pricing and delivery models are '\n", + " 'evolving. Highly competitive and rapidly evolving '\n", + " 'AI technology and services market. Competition from '\n", + " 'companies distributing open source software at '\n", + " 'little or no cost to end users.',\n", + " 'Cyberattacks and security vulnerabilities could '\n", + " 'lead to reduced revenue, increased costs, liability '\n", + " 'claims, or harm to reputation or competitive '\n", + " 'position. Nation-state and state-sponsored cyber '\n", + " 'attacks. Security threats to IT, including evolving '\n", + " 'methods by hackers and organizations to gain '\n", + " 'unauthorized access. Security vulnerabilities in '\n", + " 'products and services, including data corruption, '\n", + " 'reduced performance, or misuse of personal data '\n", + " 'leading to reputational harm, legal exposure, or '\n", + " 'liability. Abuse of platforms such as '\n", + " 'impersonation, misinformation, objectionable or '\n", + " 'illegal content, and compliance with content '\n", + " 'moderation regulations. Defective, insecure, or '\n", + " 'ineffective products, including those involving AI '\n", + " 'or Internet of Things devices, leading to legal '\n", + " 'claims, reputational damage, or regulatory action.',\n", + " 'AI-related risks including flawed algorithms, '\n", + " 'biased datasets, harmful outputs, copyright or '\n", + " 'legal claims, regulatory changes, and '\n", + " 'ethical/societal impacts. 
Operational risks '\n", + " 'including outages, data losses, supply chain '\n", + " 'disruptions, datacenter and server component '\n", + " 'shortages, hardware and software quality issues. '\n", + " 'Government enforcement under competition laws, '\n", + " 'antitrust actions, new market regulations, and '\n", + " 'resulting fines, restrictions, or inability to '\n", + " 'monetize or operate products. Anti-corruption, '\n", + " 'trade, and export control laws and related '\n", + " 'compliance risks, potential for fines, penalties, '\n", + " 'operational bans, or reputational damage. Evolving '\n", + " 'laws and regulations relating to the handling of '\n", + " 'personal data (e.g. GDPR, Digital Markets Act, data '\n", + " 'localization), including cross-border transfer '\n", + " 'restrictions, legal challenges, and costs of '\n", + " 'compliance. Expanding legal, regulatory, and '\n", + " 'reporting requirements in areas such as user data '\n", + " 'privacy, digital accessibility, advertising, AI, '\n", + " 'and cybersecurity.',\n", + " 'Material legal claims, lawsuits, and uncertain '\n", + " 'litigation outcomes related to product releases, '\n", + " 'AI, government contracts, employment, and IP. '\n", + " 'Adverse tax determinations, audits (including IRS '\n", + " 'Notices of Proposed Adjustment), changing tax laws '\n", + " 'or international agreements affecting effective tax '\n", + " 'rate and liabilities. Sustainability and '\n", + " 'ESG-related legal requirements and fulfillment of '\n", + " 'public sustainability commitments, including risks '\n", + " 'of legal action, regulatory penalties, or '\n", + " 'reputational damage. Intellectual property risks '\n", + " 'including inability to protect IP, source code '\n", + " 'leaks, infringement claims (including regarding AI '\n", + " 'training), and exposure to royalty, damages, or '\n", + " 'injunctions. 
Reputation or brand harm from customer '\n", + " 'or stakeholder backlash, product/service issues, '\n", + " 'privacy/data breaches, or failures in responsible '\n", + " 'AI. Adverse economic conditions, inflation, '\n", + " 'recession, market instability, customer insolvency, '\n", + " 'and investment impairments.',\n", + " 'Catastrophic events, geopolitical risks, pandemics, '\n", + " 'climate change, and natural disasters disrupting '\n", + " 'operations or affecting financial condition. Risks '\n", + " 'related to attracting and retaining talented '\n", + " 'employees, workforce diversity, unionization, and '\n", + " 'compliance with changing employment laws. '\n", + " 'Cybersecurity threats. Challenges in evolving '\n", + " 'technology and business models. Rapid changes in '\n", + " 'customer device and form factor preferences. Global '\n", + " 'macroeconomic and geopolitical factors. '\n", + " 'Availability of land, energy, networking supplies, '\n", + " 'and servers for datacenter expansion. Dependence on '\n", + " 'qualified suppliers for certain device components. '\n", + " 'Ability to attract and retain qualified employees. '\n", + " 'Fluctuations in foreign exchange rates.',\n", + " 'Economic risk from foreign exchange rates, interest '\n", + " 'rates, credit risk, and equity prices. Exposure to '\n", + " 'foreign currency risk including Euro, Japanese yen, '\n", + " 'British pound, Canadian dollar, and Australian '\n", + " 'dollar. Certain forecasted transactions, assets, '\n", + " 'and liabilities are exposed to foreign currency '\n", + " 'risk. Securities held in our fixed-income portfolio '\n", + " 'are subject to different interest rate risks based '\n", + " 'on their maturities. Securities held in our equity '\n", + " 'investments portfolio are subject to market price '\n", + " 'risk. Our fixed-income portfolio is diversified and '\n", + " 'consists primarily of investment-grade securities. 
'\n", + " 'We use credit default swap contracts to manage '\n", + " 'credit exposures relative to broad-based indices.',\n", + " 'Pending and ongoing litigation, including U.S. cell '\n", + " 'phone litigation related to alleged adverse health '\n", + " 'effects from radio emissions. Risk of adjustments '\n", + " 'and high-value tax contingencies from U.S. IRS '\n", + " 'audits, particularly concerning intercompany '\n", + " 'transfer pricing and large proposed tax '\n", + " 'adjustments. Potential goodwill and intangible '\n", + " 'asset impairments. Integration risks and fair value '\n", + " 'allocation uncertainties associated with large '\n", + " 'acquisitions such as Activision Blizzard and '\n", + " 'Nuance. Uncertain or changing tax regulations, '\n", + " 'especially related to foreign earnings and deferred '\n", + " 'tax asset realization.']}\n", "\n", "==== 10-q ====\n", "{'company_name': 'Microsoft Corporation',\n", " 'material_changes': ['In August 2024, Microsoft announced changes to the '\n", - " 'composition of its segments to better align with '\n", - " 'current business management, notably consolidating the '\n", - " 'commercial components of Microsoft 365 within the '\n", - " 'Productivity and Business Processes segment. These '\n", - " 'segment changes, which began in fiscal year 2025, were '\n", - " 'also reflected retrospectively in prior period segment '\n", - " 'information. The changes impacted Note 8 – Goodwill, '\n", - " 'Note 12 – Unearned Revenue, and Note 17 – Segment '\n", - " 'Information and Geographic Data but did not affect the '\n", - " 'consolidated balance sheets, income statements, or cash '\n", - " 'flow statements.',\n", + " 'composition of its segments to align with current '\n", + " 'business management, notably consolidating commercial '\n", + " 'components of Microsoft 365 into the Productivity and '\n", + " 'Business Processes segment. 
Prior period segment '\n", + " 'information was recast for fiscal year 2025, impacting '\n", + " 'Note 8 – Goodwill, Note 12 – Unearned Revenue, and Note '\n", + " '17 – Segment Information and Geographic Data.',\n", + " 'In March 2024, Microsoft obtained a non-exclusive '\n", + " \"license to Inflection AI, Inc.'s intellectual property \"\n", + " 'under an agreement, with Reid Hoffman, a board member, '\n", + " 'being a co-founder and director of Inflection.',\n", " 'On October 13, 2023, Microsoft completed its '\n", " 'acquisition of Activision Blizzard, Inc. for $75.4 '\n", - " 'billion, primarily in cash. The purchase price '\n", - " 'allocation was completed by September 30, 2024. Due to '\n", - " 'the segment changes in fiscal year 2025, the reporting '\n", - " 'units and goodwill allocations were adjusted using a '\n", - " 'relative fair value approach. As of September 30, 2024, '\n", - " 'no commercial paper was issued or outstanding, '\n", - " 'following a prior $6.7 billion outstanding as of June '\n", - " '30, 2024. The Board of Directors approved a $60 billion '\n", - " 'share repurchase program on September 16, 2024, set to '\n", - " \"commence after the previous program's completion. Also, \"\n", - " 'in October 2024, the Irish Data Protection Commission '\n", - " 'issued a final GDPR decision involving LinkedIn, a '\n", - " 'Microsoft subsidiary.',\n", - " 'During the period, Microsoft Cloud revenues increased '\n", - " 'by 22% to $38.9 billion, with various components '\n", - " 'showing growth including Microsoft 365 Commercial '\n", - " '(13%), Microsoft 365 Consumer (5%), LinkedIn (10%), '\n", - " 'Dynamics (14%), Server products (23%), and Xbox content '\n", - " 'and services (61%) driven by the Activision '\n", - " 'acquisition. Search and news advertising revenues '\n", - " 'increased by 18%. 
Gross margin percentages decreased, '\n", - " 'particularly within Intelligent Cloud and Microsoft '\n", - " 'Cloud margins, partly due to scaling AI infrastructure. '\n", - " 'Operating expenses rose by $1.6 billion (12%), driven '\n", - " 'by Gaming and cloud engineering investments, while '\n", - " 'operating income increased by $3.7 billion (14%).',\n", - " 'Cash from operations increased by $3.6 billion to $34.2 '\n", - " 'billion for the three months ended September 30, 2024. '\n", - " 'Cash used in financing activities increased by $31.3 '\n", - " 'billion to $16.6 billion, primarily due to increased '\n", - " 'debt repayments. Cash used in investing rose by $15.7 '\n", - " 'billion to $15.2 billion. An $4.4 billion short-term '\n", - " 'transition tax installment is payable in fiscal year '\n", - " '2026. Share repurchases totaled 7 million shares for '\n", - " '$2.8 billion, and dividends declared amounted to $6.2 '\n", - " 'billion for the same quarter. As of September 30, 2024, '\n", - " '$7.5 billion remained available under the $60 billion '\n", - " 'share repurchase program. 
Capital expenditures are '\n", - " 'expected to rise to support cloud growth and AI '\n", - " 'infrastructure investments.',\n", - " 'Risks disclosed include security, privacy, and '\n", - " 'execution risks related to products and services '\n", - " 'including AI and IoT solutions; integration and '\n", - " 'reliability risks with third-party products; legal, '\n", - " 'regulatory, and litigation risks including antitrust '\n", - " 'and government competition law enforcement; supply '\n", - " 'chain and component quality issues; compliance risks '\n", - " 'with corruption, trade, and sanctions laws; evolving '\n", - " 'data privacy laws increasing compliance burdens; '\n", - " 'operational risks from outages and disruptions; '\n", - " 'increased scrutiny and regulation concerning AI, '\n", - " 'cybersecurity, and ESG requirements; tax risks '\n", - " 'including ongoing IRS audits for $28.9 billion plus '\n", - " 'penalties; sustainability regulatory requirements and '\n", - " 'commitments; intellectual property protection risks; '\n", - " 'general reputational risks from product issues and '\n", - " 'social scrutiny; adverse economic and market '\n", - " 'conditions; geopolitical, catastrophic, and '\n", - " 'climate-related risks; employment market and '\n", - " 'unionization challenges; and risks from global '\n", - " 'operations such as currency fluctuations and '\n", - " 'protectionist trends.'],\n", + " 'billion, mainly cash. Changes to business segments in '\n", + " 'fiscal year 2025 resulted in reallocation of goodwill '\n", + " 'using a relative fair value approach. Unrecognized tax '\n", + " 'benefits and other income tax liabilities totaled $25.9 '\n", + " 'billion as of September 30, 2024, with IRS seeking an '\n", + " 'additional $28.9 billion plus penalties and interest '\n", + " 'for 2004-2013 related to intercompany transfer pricing. 
'\n", + " 'On September 16, 2024, Microsoft approved a new $60 '\n", + " 'billion share repurchase program following the previous '\n", + " 'program. In October 2024, the Irish Data Protection '\n", + " 'Commission issued a final decision to LinkedIn '\n", + " 'regarding alleged GDPR violations.',\n", + " 'Microsoft reported financial performance highlights '\n", + " 'including a 22% increase in Cloud revenue to $38.9 '\n", + " 'billion, overall revenue up 16%, driven by growth in '\n", + " 'all segments: Intelligent Cloud, Productivity and '\n", + " 'Business Processes, and More Personal Computing. Gaming '\n", + " 'revenue increased 43%, with Xbox content and services '\n", + " 'revenue growing 61% largely due to the Activision '\n", + " 'Blizzard acquisition, while Xbox hardware revenue '\n", + " 'decreased 29%. Operating income increased 14%. Cost of '\n", + " 'revenue increased 23%, gross margin increased 13%, but '\n", + " 'gross margin percentage decreased due to Intelligent '\n", + " 'Cloud and Cloud AI infrastructure scaling. Expenses '\n", + " 'rose driven by Gaming and cloud engineering '\n", + " 'investments. Effective tax rate increased to 19% from '\n", + " \"18%, with the OECD's global minimum tax applicable from \"\n", + " 'fiscal year 2025. IRS Notices seek $28.9 billion plus '\n", + " 'penalties and interest.',\n", + " 'Cash from operations increased $3.6 billion to $34.2 '\n", + " 'billion for the quarter ended September 30, 2024. Cash '\n", + " 'used in financing increased $31.3 billion due to higher '\n", + " 'debt repayments. Cash used in investing increased $15.7 '\n", + " 'billion primarily due to lower investment cash flow and '\n", + " 'higher property and equipment additions. During the '\n", + " 'three months ended September 30, 2024, Microsoft '\n", + " 'repurchased 7 million shares for $2.8 billion and '\n", + " 'declared dividends totaling $6.2 billion. 
A $7.5 '\n", + " 'billion balance remains on the $60 billion share '\n", + " 'repurchase program. Security incidents occurred '\n", + " 'involving a password spray attack in late 2023 '\n", + " 'affecting some Microsoft systems, potentially harming '\n", + " 'reputation and operations.',\n", + " 'Microsoft faces risks including security, privacy, and '\n", + " 'operational execution in products and AI integration; '\n", + " 'potential flaws in AI development causing legal or '\n", + " 'reputational harm; operational infrastructure risks '\n", + " 'like outages and supply chain disruptions; legal, '\n", + " 'regulatory, and litigation challenges including '\n", + " 'significant IRS tax disputes; compliance with data '\n", + " 'privacy regulations such as GDPR; sustainability '\n", + " 'regulatory and commitment risks; intellectual property '\n", + " 'protection issues; and risks from catastrophic events, '\n", + " 'geopolitical conflicts, economic downturns, and '\n", + " 'workforce challenges.'],\n", " 'quarter_end': 'September 30, 2024',\n", " 'quarterly_revenue': 65585.0,\n", - " 'revenue_change_pct': 16.05}\n", + " 'revenue_change_pct': 16.04}\n", "\n", "==== 8-k ====\n", "{'company_name': 'Microsoft Corporation',\n", " 'event_date': 'August 21, 2024',\n", - " 'event_type': 'Regulation FD Disclosure – Updated Segment Reporting and FY25 '\n", - " 'Investor Metrics',\n", - " 'material_event_description': 'Microsoft Corporation announced changes to its '\n", - " 'segment structure and key metrics reporting, '\n", - " 'effective for Fiscal Year 2025. The company '\n", - " 'updated its reporting segments, notably '\n", - " 'consolidating Microsoft 365 Commercial revenue '\n", - " 'streams into the Productivity and Business '\n", - " 'Processes segment, reclassifying Copilot Pro '\n", - " 'and Nuance Enterprise revenue, and making '\n", - " 'corresponding adjustments to key performance '\n", - " 'indicators (KPIs). 
Microsoft also provided '\n", - " 'restated investor metrics and mechanical '\n", - " 'updates to its FY25 Q1 outlook solely to '\n", - " 'reflect the presentation changes. On August '\n", - " '21, 2024, Microsoft posted related '\n", - " 'presentation materials to its Investor '\n", - " 'Relations website.'}\n", + " 'event_type': 'Regulation FD Disclosure; Segment and Metric Changes',\n", + " 'material_event_description': 'Microsoft Corporation announced updates to its '\n", + " 'reporting segment structure and key '\n", + " 'performance metrics for Fiscal Year 2025. '\n", + " 'Changes include the creation of a new '\n", + " \"'Microsoft 365 Commercial products and cloud \"\n", + " \"services' segment within Productivity and \"\n", + " 'Business Processes, realignment of revenues '\n", + " 'for EMS, Power BI, Windows Commercial, Copilot '\n", + " 'Pro, and Nuance Enterprise, and revised '\n", + " \"metrics such as replacing 'Office Commercial \"\n", + " \"products and cloud services revenue growth' \"\n", + " \"with 'Microsoft 365 Commercial cloud revenue \"\n", + " \"growth.' 
Microsoft updated its Fiscal Year \"\n", + " '2025 Q1 financial outlook to reflect these '\n", + " 'structural changes.'}\n", "\n", "==== proxy ====\n", "{'ceo_name': 'Satya Nadella',\n", @@ -825,9 +982,17 @@ " pprint(subset)\n", "\n", "\n", - "for t, out in results.items():\n", + "for t, out in results_by_type.items():\n", " summarize(out[\"type\"], out[\"data\"])" ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1637e0b4", + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { From e97c46c0becc597ae8fc1cb3e9f523e25a61882e Mon Sep 17 00:00:00 2001 From: Jerry Liu Date: Fri, 12 Sep 2025 08:57:26 -0700 Subject: [PATCH 3/3] cr --- .../sec_filing_classify_extract.ipynb | 149 ++++-------------- 1 file changed, 30 insertions(+), 119 deletions(-) diff --git a/examples/classify/sec_filing_classify_extract.ipynb b/examples/classify/sec_filing_classify_extract.ipynb index 1613bc5f..da98dcef 100644 --- a/examples/classify/sec_filing_classify_extract.ipynb +++ b/examples/classify/sec_filing_classify_extract.ipynb @@ -331,7 +331,7 @@ "output_type": "stream", "text": [ "10-k\n", - "The document is titled 'FORM 10-K' and is labeled as an 'ANNUAL REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934' for the fiscal year ended June 30, 2024. It contains all the hallmarks of a 10-K filing, including a business overview, risk factors, management's discussion and analysis (MD&A), and references to audited financial statements. The index lists all required sections for a 10-K, such as Business, Risk Factors, MD&A, Financial Statements, and more. There is no indication that this is a quarterly report (10-Q), a current report (8-K), or a proxy statement (DEF 14A). The content and structure perfectly match the definition of a 10-K.\n" + "The document is titled 'FORM 10-K' and is labeled as an 'ANNUAL REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934' for the fiscal year ended June 30, 2024. 
It contains all the hallmark sections of a 10-K, including Business Overview, Risk Factors, Management’s Discussion and Analysis (MD&A), and audited financial statements. The index and content structure match the requirements for a 10-K filing, and there is explicit reference to the form throughout the document. There is no ambiguity or evidence suggesting it is any other type of SEC filing. Therefore, this is a perfect match for the 10-k category.\n" ] } ], @@ -355,7 +355,7 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": null, "id": "75ddf93a", "metadata": {}, "outputs": [], @@ -443,7 +443,7 @@ }, { "cell_type": "code", - "execution_count": 34, + "execution_count": null, "id": "a98463ec", "metadata": {}, "outputs": [ @@ -463,20 +463,18 @@ "name": "stderr", "output_type": "stream", "text": [ - "\n", - "\u001b[A\n", - "Uploading files: 100%|██████████| 1/1 [00:01<00:00, 1.31s/it]\n", - "\n", - "\u001b[A\n", - "Creating extraction jobs: 100%|██████████| 1/1 [00:00<00:00, 2.86it/s]\n", - "\n", - "\u001b[A" + "Uploading files: 100%|████████████████████████████████████████████| 1/1 [00:01<00:00, 1.23s/it]\n", + "Creating extraction jobs: 100%|███████████████████████████████████| 1/1 [00:00<00:00, 1.21it/s]\n", + "Extracting files: 100%|███████████████████████████████████████████| 1/1 [00:12<00:00, 12.11s/it]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ + "Step extract_by_type produced event ExtractedEvent\n", + "Running step format_output\n", + "Step format_output produced event StopEvent\n", "Step classify_file produced event ClassifiedEvent\n", "Running step extract_by_type\n" ] @@ -485,25 +483,18 @@ "name": "stderr", "output_type": "stream", "text": [ - "\n", - "\n", - "\u001b[A\u001b[A\n", - "\n", - "Uploading files: 100%|██████████| 1/1 [00:00<00:00, 1.67it/s]\n", - "\n", - "\n", - "\u001b[A\u001b[A\n", - "\n", - "Creating extraction jobs: 100%|██████████| 1/1 [00:00<00:00, 3.09it/s]\n", - "\n", - "\n", - 
"\u001b[A\u001b[A" + "Uploading files: 100%|████████████████████████████████████████████| 1/1 [00:00<00:00, 2.13it/s]\n", + "Creating extraction jobs: 100%|███████████████████████████████████| 1/1 [00:00<00:00, 1.13it/s]\n", + "Extracting files: 100%|███████████████████████████████████████████| 1/1 [00:17<00:00, 17.91s/it]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ + "Step extract_by_type produced event ExtractedEvent\n", + "Running step format_output\n", + "Step format_output produced event StopEvent\n", "Step classify_file produced event ClassifiedEvent\n", "Running step extract_by_type\n" ] @@ -512,24 +503,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "\n", - "\n", - "\n", - "\u001b[A\u001b[A\u001b[A\n", - "\n", - "\n", - "Uploading files: 100%|██████████| 1/1 [00:00<00:00, 1.56it/s]\n", - "\n", - "\n", - "\n", - "\u001b[A\u001b[A\u001b[A\n", - "\n", - "\n", - "Creating extraction jobs: 100%|██████████| 1/1 [00:01<00:00, 1.43s/it]\n", - "\n", - "\n", - "\n", - "\u001b[A\u001b[A\u001b[A" + "Uploading files: 0%| | 0/1 [00:00