-
Notifications
You must be signed in to change notification settings - Fork 0
API Re‐Design
Eric-Fithian edited this page Nov 6, 2025
·
2 revisions
Purpose: Define the desired API through concrete usage examples.
Approach: Show exactly how users should interact with DELM in various scenarios.
from delm import DELM, variable, simple_schema
# Define what to extract
schema = simple_schema(
variable("company", "Company name mentioned in text", "string"),
variable("price", "Price value in USD", "number"),
)
# Initialize with just essentials
delm = DELM(
provider="openai",
model="gpt-4o-mini",
schema=schema,
)
# Extract from DataFrame
import pandas as pd
df = pd.DataFrame({
"text": [
"Apple stock is $150.",
"Microsoft revenue is $50B."
]
})
result = delm.extract(df)
# Access results
print(f"Extracted {result.num_records} records")
print(f"Cost: ${result.cost['total_cost']:.4f}")
print(result.data[["company", "price"]])

# config.yaml
provider: openai
model: gpt-4o-mini
schema:
type: simple
variables:
- name: company
description: "Company name mentioned"
data_type: string
- name: price
description: "Price value in USD"
data_type: number

from delm import DELM
# One line to initialize
delm = DELM.from_config("config.yaml")
# One line to extract
result = delm.extract(df)
print(result) # ExtractionResult(records=2, cost=$0.0023)

from delm import variable, simple_schema
# Helper function style
schema = simple_schema(
variable("company", "Company name", "string"),
variable("revenue", "Annual revenue", "number"),
variable("sector", "Business sector", "string",
allowed_values=["tech", "finance", "healthcare"]),
)

from delm import variable, nested_schema
# Extract multiple companies per text chunk
schema = nested_schema(
"companies", # Container name
variable("name", "Company name", "string", required=True),
variable("revenue", "Annual revenue in USD", "number"),
variable("products", "List of products", "[string]"),
variable("growth_rate", "Annual growth rate percentage", "number",
validate_in_text=True),
)

from delm import variable, nested_schema, multiple_schema
# Define separate schemas
products_schema = nested_schema(
"products",
variable("name", "Product name", "string", required=True),
variable("price", "Price in USD", "number"),
)
companies_schema = nested_schema(
"companies",
variable("name", "Company name", "string", required=True),
variable("revenue", "Revenue", "number"),
)
# Combine them
schema = multiple_schema(
products=products_schema,
companies=companies_schema,
)

from delm import Variable, NestedSchema
# Direct Pydantic usage for full type safety
schema = NestedSchema(
type="nested",
container_name="commodities",
variables=[
Variable(
name="commodity_type",
description="Type of commodity (oil, gas, gold, etc.)",
data_type="string",
required=True,
allowed_values=["oil", "gas", "gold", "silver", "copper"],
validate_in_text=True,
),
Variable(
name="price_value",
description="Numeric price value if mentioned",
data_type="number",
required=False,
),
],
)

# Quick prototyping with dicts
schema_dict = {
"type": "simple",
"variables": [
{
"name": "company",
"description": "Company name",
"data_type": "string",
},
{
"name": "price",
"description": "Price value",
"data_type": "number",
},
]
}
# Pass dict directly to DELM
delm = DELM(
provider="openai",
model="gpt-4o-mini",
schema=schema_dict, # Dict works too
)

# Option 1: Pass path to DELM
delm = DELM(
provider="openai",
model="gpt-4o-mini",
schema="schemas/my_schema.yaml", # Path to file
)
# Option 2: Load explicitly first
from delm import SimpleSchema
schema = SimpleSchema.parse_file("schemas/my_schema.yaml")
delm = DELM(provider="openai", model="gpt-4o-mini", schema=schema)

from delm import DELM, simple_schema, variable
schema = simple_schema(variable("name", "Name", "string"))
delm = DELM(
provider="openai",
model="gpt-4o-mini",
temperature=0.0,
batch_size=10,
max_workers=4,
max_budget=5.0,
schema=schema,
experiment="my_experiment",
)

delm = DELM(
provider="openai",
model="gpt-4o-mini",
schema=schema,
# Text splitting (string shortcut)
splitting="paragraph", # or "sentence", "fixed-window"
# Relevance scoring (keyword list)
scoring=["price", "revenue", "forecast"],
# Filter by score
score_filter="delm_score > 0.5",
# Which column to process
target_column="text",
)

from delm import DELM
from delm.strategies import FixedWindowSplit, FuzzyScorer
delm = DELM(
provider="openai",
model="gpt-4o-mini",
schema=schema,
# Custom splitting strategy
splitting=FixedWindowSplit(window=5, stride=2),
# Custom scoring strategy
scoring=FuzzyScorer(
keywords=["price", "revenue"]
),
)

# Load base config but override specific values
delm = DELM.from_config(
"config.yaml",
temperature=0.5, # Override config value
experiment="experiment_v2", # Override experiment name
max_budget=10.0, # Override budget
)

from delm import DELM
import pandas as pd
delm = DELM(provider="openai", model="gpt-4o-mini", schema=schema)
# Extract from DataFrame
df = pd.DataFrame({"text": ["Apple stock", "Microsoft revenue"]})
result = delm.extract(df)
# Result object has everything
print(result.data) # DataFrame with extracted data
print(result.cost) # Cost dictionary
print(result.num_records) # Number of records
print(result.num_chunks) # Number of chunks
print(result.num_errors) # Number of errors

# From parquet file
result = delm.extract("data/earnings_reports.parquet")
# From CSV file
result = delm.extract("data/articles.csv")
# From directory of files
from pathlib import Path
result = delm.extract(Path("data/text_files/"))

# Process only first 10 records (for testing)
result = delm.extract(df, sample_size=10)
print(f"Processed {result.num_records} records (sampled from {len(df)})")

# Step 1: Prep data (split, score, chunk)
prepped = delm.prep_data(df)
# Inspect before processing
print(prepped[["delm_chunk_id", "delm_score"]].head())
print(f"High score chunks: {len(prepped[prepped['delm_score'] > 0.7])}")
# Optionally filter
prepped_filtered = prepped[prepped["delm_score"] > 0.5]
# Step 2: Process through LLM
result_df = delm.process_via_llm()
# Get cost summary
cost = delm.get_cost_summary()
print(f"Total cost: ${cost['total_cost']:.4f}")

result = delm.extract(df)
# Result properties
print(result) # ExtractionResult(records=10, chunks=25, errors=0, cost=$0.0045)
# Access DataFrame
extracted_df = result.data
print(extracted_df[["company", "price", "delm_record_id"]])
# Access cost details
if result.cost:
print(f"Total: ${result.cost['total_cost']:.4f}")
print(f"Input tokens: {result.cost['input_tokens']}")
print(f"Output tokens: {result.cost['output_tokens']}")
# Check for errors
if result.num_errors > 0:
errors = result.data[result.data["delm_errors"].notna()]
print(f"Found {len(errors)} errors")

# Nested schema returns structured data
schema = nested_schema(
"companies",
variable("name", "Company name", "string"),
variable("revenue", "Revenue", "number"),
)
delm = DELM(provider="openai", model="gpt-4o-mini", schema=schema)
result = delm.extract(df)
# Access extracted data
# (Format depends on how nested data is stored - likely JSON column)
for _, row in result.data.iterrows():
extracted = row["delm_extracted_data"] # or JSON parsed version
print(f"Found {len(extracted['companies'])} companies")

result = delm.extract(df)
# Save to file
result.data.to_csv("results.csv", index=False)
result.data.to_parquet("results.parquet")
# Save cost report
import json
with open("cost_report.json", "w") as f:
json.dump(result.cost, f, indent=2)

from delm import DELM, variable, simple_schema
import pandas as pd
# 1. Define schema
schema = simple_schema(
variable("company", "Company name", "string"),
variable("price", "Stock price", "number"),
variable("date", "Date mentioned", "date"),
)
# 2. Initialize DELM
delm = DELM(
provider="openai",
model="gpt-4o-mini",
schema=schema,
experiment="stock_extraction",
)
# 3. Load data
df = pd.read_parquet("earnings_calls.parquet")
# 4. Extract
result = delm.extract(df)
# 5. Analyze results
print(f"Extracted {result.num_records} records")
print(f"Cost: ${result.cost['total_cost']:.4f}")
# 6. Save
result.data.to_csv("extracted_stock_data.csv")

from delm import DELM, variable, nested_schema, multiple_schema
# Define schemas for different entity types
products = nested_schema(
"products",
variable("name", "Product name", "string", required=True),
variable("price", "Price", "number"),
variable("category", "Category", "string"),
)
companies = nested_schema(
"companies",
variable("name", "Company name", "string", required=True),
variable("revenue", "Revenue", "number"),
)
# Combine
schema = multiple_schema(products=products, companies=companies)
# Extract
delm = DELM(provider="openai", model="gpt-4o-mini", schema=schema)
result = delm.extract(df)
# Access different schemas
print(result.data["products"])
print(result.data["companies"])

from delm import DELM, variable, simple_schema
schema = simple_schema(
variable("price", "Price mentioned", "number"),
variable("commodity", "Commodity type", "string"),
)
delm = DELM(
provider="openai",
model="gpt-4o-mini",
schema=schema,
splitting="paragraph",
scoring=["price", "cost", "value", "commodity"],
score_filter="delm_score > 0.5", # Only process relevant chunks
)
result = delm.extract(df)
print(f"Processed only high-relevance chunks")

from delm import DELM, variable, simple_schema
schema = simple_schema(variable("summary", "Summary", "string"))
delm = DELM(
provider="openai",
model="gpt-4o-mini",
schema=schema,
max_budget=1.0, # Stop at $1
batch_size=5, # Small batches
)
try:
result = delm.extract(large_df)
except BudgetExceededError:
print("Budget exceeded, partial results available")
result = delm.get_extraction_results()

# team_config.yaml (in version control)
provider: openai
model: gpt-4o-mini
temperature: 0.0
batch_size: 10
max_workers: 4
splitting: paragraph
scoring: [price, revenue, forecast, guidance]
schema:
type: nested
container_name: financial_metrics
variables:
- name: metric_name
description: "Name of financial metric"
data_type: string
required: true
- name: value
description: "Numeric value"
data_type: number
- name: period
description: "Time period"
data_type: string
experiment:
name: financial_extraction
directory: ./experiments

# Everyone on team uses same config
from delm import DELM
delm = DELM.from_config("team_config.yaml")
result = delm.extract(df)
# Override for specific experiments
delm_test = DELM.from_config(
"team_config.yaml",
experiment="test_run_v2",
temperature=0.5,
sample_size=10,
)

from delm import DELM, variable, simple_schema
schema = simple_schema(variable("sentiment", "Sentiment", "string"))
delm = DELM(
provider="openai",
model="gpt-4o-mini",
schema=schema,
prompt_template="""
Analyze the sentiment of this text carefully.
Extract: {variables}
Text to analyze:
{text}
""",
system_prompt="You are a sentiment analysis expert.",
)
result = delm.extract(df)

# Test with different providers
providers_config = [
{"provider": "openai", "model": "gpt-4o-mini"},
{"provider": "anthropic", "model": "claude-3-sonnet"},
{"provider": "google", "model": "gemini-1.5-flash"},
]
results = {}
for config in providers_config:
delm = DELM(schema=schema, **config)
results[config["provider"]] = delm.extract(df)
# Compare results and costs
for provider, result in results.items():
print(f"{provider}: {result.num_records} records, ${result.cost['total_cost']:.4f}")

from delm import DELM, DELMConfig
from pathlib import Path
# Old way with config object
config = DELMConfig.from_yaml("old_config.yaml")
delm = DELM(
config=config,
experiment="test",
experiment_dir=Path("./experiments"),
overwrite=True,
auto_checkpoint=True,
use_disk_storage=True,
)
# Old two-step workflow
delm.prep_data(df)
result_df = delm.process_via_llm()
cost = delm.get_cost_summary()

# Use old config with new methods
config = DELMConfig.from_yaml("old_config.yaml")
delm = DELM(config=config, experiment="test")
# But use new extract() method
result = delm.extract(df) # New method works!

# Quick test on sample data
delm = DELM(provider="openai", model="gpt-4o-mini", schema=schema)
test_result = delm.extract(df, sample_size=5)
print(f"Test cost: ${test_result.cost['total_cost']:.4f}")
print("Looks good, running on full dataset...")
# Full run
full_result = delm.extract(df)

# Prep and inspect
delm = DELM(provider="openai", model="gpt-4o-mini", schema=schema, scoring=["price"])
prepped = delm.prep_data(df)
# Check scores
print(prepped["delm_score"].describe())
# Filter low scores
high_quality = prepped[prepped["delm_score"] > 0.7]
print(f"Processing {len(high_quality)} high-quality chunks")
# Process
result_df = delm.process_via_llm()

# Start simple
simple = simple_schema(variable("company", "Company name", "string"))
delm = DELM(provider="openai", model="gpt-4o-mini", schema=simple)
result = delm.extract(df.head(10)) # Test on small sample
# Add complexity
detailed = simple_schema(
variable("company", "Company name", "string"),
variable("revenue", "Revenue", "number"),
variable("sector", "Sector", "string", allowed_values=["tech", "finance"]),
)
delm = DELM(provider="openai", model="gpt-4o-mini", schema=detailed)
result = delm.extract(df.head(10)) # Test again
# Scale up
result = delm.extract(df) # Full dataset

Simple things are simple:
delm = DELM(provider="openai", model="gpt-4o-mini", schema=schema)
result = delm.extract(df)

Complex things are possible:
delm = DELM(
provider="anthropic",
model="claude-3-sonnet",
temperature=0.1,
max_budget=10.0,
schema=complex_schema,
splitting=FixedWindowSplit(window=5, stride=2),
scoring=FuzzyScorer(keywords=["key"]),
score_filter="delm_score > 0.6",
)
result = delm.extract(df)

Everything is type-safe and validated:
# IDE autocomplete works
# Type errors caught before runtime
# Clear validation errors

Old code keeps working:
config = DELMConfig.from_yaml("old_config.yaml")
delm = DELM(config=config, experiment="test")
delm.prep_data(df)
result_df = delm.process_via_llm()

This is the API we're building toward! 🎯