diff --git a/examples/classify/sec_filing_classify_extract.ipynb b/examples/classify/sec_filing_classify_extract.ipynb new file mode 100644 index 00000000..da98dcef --- /dev/null +++ b/examples/classify/sec_filing_classify_extract.ipynb @@ -0,0 +1,929 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "3b349365", + "metadata": {}, + "source": [ + "# Classifying and Extracting from SEC Filings\n", + "\n", + "\"Open\n", + "\n", + "This notebook demonstrates how to classify and extract information from SEC filings using LlamaParse. We'll walk through the process of classifying a document as one of four filing types (10-K, 10-Q, 8-K, or DEF 14A proxy statement) and then extracting the information relevant to that type.\n", + "\n", + "**Note**: The classification module is currently in *beta*, so we are still ironing out some interface/implementation details. Please let us know your feedback!\n", + "\n", + "Status:\n", + "| Last Executed | Version | State |\n", + "|---------------|---------|------------|\n", + "| Sep-09-2025 | 0.6.65 | Maintained |" + ] + }, + { + "cell_type": "markdown", + "id": "22fb430e", + "metadata": {}, + "source": [ + "## Overview\n", + "\n", + "This notebook demonstrates a classify+extract workflow on SEC filings using LlamaCloud and LlamaIndex Workflows.\n", + "\n", + "- Classify each document as one of: 10-K, 10-Q, 8-K, Proxy (DEF 14A)\n", + "- Extract a type-specific schema depending on the classification\n", + "- Orchestrate via an event-driven LlamaIndex Workflow\n", + "\n", + "We also include public example documents, so anyone can run this end-to-end.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "12356c11", + "metadata": {}, + "outputs": [], + "source": [ + "# Install and imports\n", + "import os\n", + "from typing import List, Optional\n", + "from datetime import date\n", + "from decimal import Decimal\n", + "from pydantic import BaseModel, Field\n", + "\n", + "from dotenv import load_dotenv\n", + "\n", + "load_dotenv()\n", + "\n", + "# LlamaIndex workflow 
# Workflow imports (continuation of the notebook's import cell).
from llama_index.core.workflow import (
    Event,
    StartEvent,
    StopEvent,
    Context,
    Workflow,
    step,
)
from llama_index.core.prompts import ChatPromptTemplate
from llama_index.llms.openai import OpenAI

# LlamaCloud classify/extract
from llama_cloud.client import AsyncLlamaCloud
from llama_cloud.types import ClassifierRule, ClassifyParsingConfiguration
from llama_cloud_services.beta.classifier.client import ClassifyClient
from llama_cloud_services import LlamaExtract, ExtractionAgent
from llama_cloud import ExtractConfig
from llama_cloud.core.api_error import ApiError


# ---------------------------------------------------------------------------
# Sample documents
#
# Download four public Microsoft SEC filings (10-K, 10-Q, 8-K, Proxy) into a
# local `data/` directory (relative to the notebook's working directory) so
# the workflow can be run end-to-end.
# ---------------------------------------------------------------------------
import pathlib
import requests

DATA_DIR = pathlib.Path("data")
DATA_DIR.mkdir(parents=True, exist_ok=True)

# Filing label -> public URL of the PDF.
MSFT_DOCS = {
    "10-K": "https://microsoft.gcs-web.com/static-files/1c864583-06f7-40cc-a94d-d11400c83cc8",
    "10-Q": "https://microsoft.gcs-web.com/static-files/f96f7d38-36ce-4a26-9e29-61701cdca7a7",
    "8-K": "https://microsoft.gcs-web.com/static-files/dc50633a-2880-4303-bebb-bdca89149f65",
    "Proxy": "https://microsoft.gcs-web.com/static-files/d5ec87b3-e29d-4d33-9d84-5ce1f194dcf1",
}


def _download_if_missing(
    url: str, out_path: pathlib.Path, timeout: float = 60
) -> pathlib.Path:
    """Fetch ``url`` into ``out_path`` unless a non-empty file is already there.

    The body is streamed into a sibling ``*.part`` file and renamed into place
    only after the whole response has been written. An interrupted or failed
    download therefore never leaves a truncated PDF at ``out_path`` that a
    later run would mistake for a finished one.

    Args:
        url: Location of the document to download.
        out_path: Final destination on disk.
        timeout: Timeout in seconds handed to ``requests.get``.

    Returns:
        ``out_path``, for convenience.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    if out_path.exists() and out_path.stat().st_size > 0:
        return out_path

    tmp_path = out_path.with_name(out_path.name + ".part")
    try:
        with requests.get(url, timeout=timeout, stream=True) as response:
            response.raise_for_status()
            with open(tmp_path, "wb") as f:
                for chunk in response.iter_content(chunk_size=65536):
                    f.write(chunk)
        # Publish the file only once it is complete.
        tmp_path.replace(out_path)
    finally:
        # No-op after a successful rename; removes the partial file otherwise.
        tmp_path.unlink(missing_ok=True)
    return out_path


# Filing label -> local path, e.g. "10-K" -> "data/msft_10k.pdf".
local_files = {}
for doc_label, url in MSFT_DOCS.items():
    out_path = DATA_DIR / f"msft_{doc_label.replace('-', '').lower()}.pdf"
    local_files[doc_label] = str(_download_if_missing(url, out_path))

local_files


# ---------------------------------------------------------------------------
# Type-specific extraction schemas
#
# One concise Pydantic model per filing type. Fields without a default are
# required; the Optional fields default to None.
# NOTE: these classes are documented with comments rather than docstrings on
# purpose: Pydantic copies a model docstring into the generated JSON schema,
# which would change what is sent to the extraction service.
# ---------------------------------------------------------------------------


# Schema for annual reports (Form 10-K).
class Form10K(BaseModel):
    company_name: str
    fiscal_year_end: date
    annual_revenue: Optional[Decimal] = None
    net_income: Optional[Decimal] = None
    total_assets: Optional[Decimal] = None
    employee_count: Optional[int] = None
    business_description: str
    primary_risk_factors: List[str]
    business_segments: List[str]
    geographic_markets: List[str]


# Schema for quarterly reports (Form 10-Q).
class Form10Q(BaseModel):
    company_name: str
    quarter_end: date
    quarterly_revenue: Optional[Decimal] = None
    quarterly_net_income: Optional[Decimal] = None
    revenue_change_pct: Optional[float] = None
    material_changes: List[str]
    subsequent_events: List[str]


# Schema for current reports (Form 8-K).
class Form8K(BaseModel):
    company_name: str
    filing_date: date
    event_date: date
    event_type: str
    material_event_description: str
    financial_impact: Optional[Decimal] = None
    involved_parties: List[str]


# Schema for proxy statements (DEF 14A).
class ProxyStatement(BaseModel):
    company_name: str
    meeting_date: date
    ceo_name: str
    ceo_total_compensation: Optional[Decimal] = None
    board_members: List[str]
    executive_officers: List[str]
    shareholder_proposals: List[str]
    voting_matters: List[str]
    audit_firm: Optional[str] = None


# ---------------------------------------------------------------------------
# Classification rules
#
# Four ClassifierRule entries describing each SEC form in natural language.
# The classifier returns the `type` string of the best-matching rule.
# ---------------------------------------------------------------------------

# NOTE: the types need to be in lowercase
SEC_CLASSIFICATION_RULES: list[ClassifierRule] = [
    ClassifierRule(
        type="10-k",
        description=(
            "Annual report on Form 10-K, includes business overview, risk factors, management's"
            " discussion and analysis, audited financial statements for the fiscal year."
        ),
    ),
    ClassifierRule(
        type="10-q",
        description=(
            "Quarterly report on Form 10-Q, includes unaudited quarterly financial statements,"
            " MD&A for the quarter, and updates on risk factors."
        ),
    ),
    ClassifierRule(
        type="8-k",
        description=(
            "Current report on Form 8-K, discloses material events such as acquisitions,"
            " executive changes, earnings releases, or other significant occurrences."
        ),
    ),
    ClassifierRule(
        type="proxy",
        description=(
            "DEF 14A proxy statement for shareholder meetings including proposals and voting,"
            " board of directors, executive compensation (CD&A), and auditor information."
        ),
    ),
]
Initialize clients\n", + "\n", + "We create clients for classification and extraction. Set `LLAMA_CLOUD_API_KEY` in your environment. Optionally set `LLAMA_CLOUD_BASE_URL`, `LLAMA_CLOUD_PROJECT_ID`, `LLAMA_CLOUD_ORGANIZATION_ID`.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "446dbb35", + "metadata": {}, + "outputs": [], + "source": [ + "api_key = os.getenv(\"LLAMA_CLOUD_API_KEY\")\n", + "base_url = os.getenv(\"LLAMA_CLOUD_BASE_URL\")\n", + "project_id = os.getenv(\"LLAMA_CLOUD_PROJECT_ID\")\n", + "organization_id = os.getenv(\"LLAMA_CLOUD_ORGANIZATION_ID\")\n", + "\n", + "if not api_key:\n", + " raise ValueError(\"LLAMA_CLOUD_API_KEY not set. Please set it in your environment.\")\n", + "\n", + "async_client = AsyncLlamaCloud(token=api_key, base_url=base_url)\n", + "classify_client = ClassifyClient(\n", + " async_client, project_id=project_id, organization_id=organization_id\n", + ")\n", + "\n", + "extract_config = ExtractConfig(extraction_mode=\"BALANCED\")\n", + "llama_extract = LlamaExtract(project_id=project_id, organization_id=organization_id)\n", + "\n", + "# Model for LLM summarization in prompts if needed\n", + "llm = OpenAI(model=\"gpt-4o\")" + ] + }, + { + "cell_type": "markdown", + "id": "b21d50f0", + "metadata": {}, + "source": [ + "## Using Classify Module\n", + "\n", + "In this section we show you how to use the `ClassifyClient` module in a standalone manner (before using it in an e2e workflow).\n", + "\n", + "We run the classification module over the Microsoft 10-K file to verify that the output is in the correct class.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f356085d", + "metadata": {}, + "outputs": [], + "source": [ + "# set parsing configuration\n", + "parsing_config = ClassifyParsingConfiguration(max_pages=5)\n", + "\n", + "# classify file\n", + "results = await classify_client.aclassify_file_path(\n", + " rules=SEC_CLASSIFICATION_RULES,\n", + " 
file_input_path=\"data/msft_10k.pdf\",\n", + " parsing_configuration=parsing_config,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "2d02406d", + "metadata": {}, + "source": [ + "The result will not only contain the classification type, but also the reasoning." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "34249af0", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "10-k\n", + "The document is titled 'FORM 10-K' and is labeled as an 'ANNUAL REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934' for the fiscal year ended June 30, 2024. It contains all the hallmark sections of a 10-K, including Business Overview, Risk Factors, Management’s Discussion and Analysis (MD&A), and audited financial statements. The index and content structure match the requirements for a 10-K filing, and there is explicit reference to the form throughout the document. There is no ambiguity or evidence suggesting it is any other type of SEC filing. 
# The result carries both the predicted type and the classifier's reasoning.
print(results.items[0].result.type)
print(results.items[0].result.reasoning)


# ---------------------------------------------------------------------------
# Workflow: classify, then extract
#
# Steps:
#   classify_file   -> classify the document
#   extract_by_type -> create/select the agent for that type and extract
#   format_output   -> return a unified dict with `type`, `data` and `file`
# ---------------------------------------------------------------------------


class ClassifiedEvent(Event):
    """Emitted once a file has been assigned a document type."""

    file_path: str
    doc_type: str


class ExtractedEvent(Event):
    """Emitted once the type-specific fields have been extracted from a file."""

    file_path: str
    doc_type: str
    data: dict


def _schema_for_type(doc_type: str):
    """Return the Pydantic schema class registered for ``doc_type``.

    The label is matched case-insensitively and ignoring surrounding
    whitespace, consistent with ``_agent_name_for_type``.

    Raises:
        ValueError: If ``doc_type`` is not one of the supported labels
            (including when it is not a string at all, e.g. ``None``).
    """
    # Built on each call so the schema classes are looked up at call time,
    # exactly as in a chain of comparisons.
    schemas = {
        "10-k": Form10K,
        "10-q": Form10Q,
        "8-k": Form8K,
        "proxy": ProxyStatement,
    }
    key = doc_type.strip().lower() if isinstance(doc_type, str) else doc_type
    try:
        return schemas[key]
    except (KeyError, TypeError):
        # TypeError covers unhashable values; both mean "not a known label".
        raise ValueError(f"Unsupported doc_type: {doc_type}") from None


def _agent_name_for_type(doc_type: str) -> str:
    """Build the extraction-agent name for a label, e.g. ``sec-10-k-extractor``."""
    return f"sec-{doc_type.strip().lower()}-extractor"


class SECClassifyExtractWorkflow(Workflow):
    """Classify an SEC filing, then extract the schema matching its type.

    Extraction agents are created on first use and cached per document type
    in ``agent_registry`` for the lifetime of the workflow instance.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        # doc_type -> extraction agent created by this instance.
        self.agent_registry: dict[str, ExtractionAgent] = {}

    @step
    async def classify_file(self, ctx: Context, ev: StartEvent) -> ClassifiedEvent:
        """Classify ``ev.file_path`` against ``SEC_CLASSIFICATION_RULES``.

        Raises:
            ValueError: If the classifier returns no items or no type label.
        """
        file_path = ev.file_path
        parsing_config = ClassifyParsingConfiguration(max_pages=5)
        results = await classify_client.aclassify_file_path(
            rules=SEC_CLASSIFICATION_RULES,
            file_input_path=file_path,
            parsing_configuration=parsing_config,
        )

        # Surface an unusable response here with a clear message rather than
        # as an IndexError/AttributeError, or as a confusing failure in the
        # next step. NOTE(review): presumably `result`/`type` can be empty
        # when no rule matches - confirm against the classify API.
        if not results.items:
            raise ValueError(f"Classifier returned no results for {file_path!r}")
        doc_type = getattr(results.items[0].result, "type", None)
        if not doc_type:
            raise ValueError(f"Classifier could not assign a type to {file_path!r}")
        return ClassifiedEvent(file_path=file_path, doc_type=doc_type)

    def _get_or_create_agent(self, doc_type: str) -> ExtractionAgent:
        """Return the cached agent for ``doc_type``, (re)creating it on first use."""
        if doc_type not in self.agent_registry:
            agent_name = _agent_name_for_type(doc_type)
            schema = _schema_for_type(doc_type)
            # Drop any agent left over under this name so the new one is
            # created with the current schema and config.
            try:
                existing = llama_extract.get_agent(name=agent_name)
                if existing:
                    llama_extract.delete_agent(existing.id)
            except ApiError as e:
                # A 404 just means there was nothing to delete.
                if e.status_code != 404:
                    raise
            self.agent_registry[doc_type] = llama_extract.create_agent(
                name=agent_name, data_schema=schema, config=extract_config
            )
        return self.agent_registry[doc_type]

    @step
    async def extract_by_type(
        self, ctx: Context, ev: ClassifiedEvent
    ) -> ExtractedEvent:
        """Run the extraction agent for ``ev.doc_type`` over ``ev.file_path``.

        Raises:
            ValueError: If ``ev.doc_type`` has no registered schema.
        """
        # Validate the type up front, even when an agent is already cached.
        _schema_for_type(ev.doc_type)
        agent = self._get_or_create_agent(ev.doc_type)

        extraction = await agent.aextract(ev.file_path)
        # Use the extracted payload when it is a dict; otherwise fall back to
        # dumping the whole extraction object.
        data = (
            extraction.data
            if isinstance(extraction.data, dict)
            else extraction.model_dump()
        )
        return ExtractedEvent(file_path=ev.file_path, doc_type=ev.doc_type, data=data)

    @step
    async def format_output(self, ctx: Context, ev: ExtractedEvent) -> StopEvent:
        """Finish the run with a dict holding ``type``, ``data`` and ``file``."""
        return StopEvent(
            result={"type": ev.doc_type, "data": ev.data, "file": ev.file_path}
        )
extraction jobs: 100%|███████████████████████████████████| 1/1 [00:00<00:00, 1.21it/s]\n", + "Extracting files: 100%|███████████████████████████████████████████| 1/1 [00:12<00:00, 12.11s/it]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Step extract_by_type produced event ExtractedEvent\n", + "Running step format_output\n", + "Step format_output produced event StopEvent\n", + "Step classify_file produced event ClassifiedEvent\n", + "Running step extract_by_type\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Uploading files: 100%|████████████████████████████████████████████| 1/1 [00:00<00:00, 2.13it/s]\n", + "Creating extraction jobs: 100%|███████████████████████████████████| 1/1 [00:00<00:00, 1.13it/s]\n", + "Extracting files: 100%|███████████████████████████████████████████| 1/1 [00:17<00:00, 17.91s/it]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Step extract_by_type produced event ExtractedEvent\n", + "Running step format_output\n", + "Step format_output produced event StopEvent\n", + "Step classify_file produced event ClassifiedEvent\n", + "Running step extract_by_type\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Uploading files: 0%| | 0/1 [00:00