Skip to content

Commit c59e207

Browse files
committed
First commit
1 parent e74c7af commit c59e207

File tree

17 files changed

+1425
-1
lines changed

17 files changed

+1425
-1
lines changed

.env.example

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
# OpenAI Configuration
2+
OPENAI_API_KEY=your_openai_api_key_here
3+
OPENAI_MODEL=gpt-4o-2024-08-06
4+
5+
# Logging Configuration
6+
LOG_LEVEL=INFO
7+
LOG_FORMAT=json
8+
9+
# Application Configuration
10+
MAX_RETRIES=3
11+
TIMEOUT_SECONDS=30

.gitignore

Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,65 @@
1+
# Python
2+
__pycache__/
3+
*.py[cod]
4+
*$py.class
5+
*.so
6+
.Python
7+
build/
8+
develop-eggs/
9+
dist/
10+
downloads/
11+
eggs/
12+
.eggs/
13+
lib/
14+
lib64/
15+
parts/
16+
sdist/
17+
var/
18+
wheels/
19+
*.egg-info/
20+
.installed.cfg
21+
*.egg
22+
MANIFEST
23+
24+
# Virtual environments
25+
.env
26+
.venv
27+
env/
28+
venv/
29+
ENV/
30+
env.bak/
31+
venv.bak/
32+
.uv-cache/
33+
34+
# IDE
35+
.vscode/
36+
.idea/
37+
*.swp
38+
*.swo
39+
*~
40+
41+
# Testing
42+
.coverage
43+
.pytest_cache/
44+
htmlcov/
45+
.tox/
46+
coverage.xml
47+
*.cover
48+
.hypothesis/
49+
50+
# Logs
51+
*.log
52+
logs/
53+
54+
# Docker
55+
.dockerignore
56+
57+
# OS
58+
.DS_Store
59+
Thumbs.db
60+
61+
# Project specific
62+
*.sqlite
63+
*.db
64+
temp/
65+
tmp/

.python-version

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
3.13

LICENSE

Whitespace-only changes.

README.md

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +0,0 @@
1-
# structured-output-cookbook

pyproject.toml

Lines changed: 103 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,103 @@
1+
[project]
2+
name = "structured-output-cookbook"
3+
version = "0.1.0"
4+
description = "LLM-powered structured output extraction with predefined and custom schemas"
5+
readme = "README.md"
6+
requires-python = ">=3.13"
7+
authors = [
8+
{ name = "Saverio Mazza", email = "[email protected]" }
9+
]
10+
license = { text = "MIT" }
11+
keywords = ["llm", "structured-output", "extraction", "ai", "openai"]
12+
classifiers = [
13+
"Development Status :: 4 - Beta",
14+
"Intended Audience :: Developers",
15+
"License :: OSI Approved :: MIT License",
16+
"Programming Language :: Python :: 3",
17+
"Programming Language :: Python :: 3.13",
18+
"Topic :: Software Development :: Libraries :: Python Modules",
19+
"Topic :: Scientific/Engineering :: Artificial Intelligence",
20+
]
21+
22+
dependencies = [
23+
"openai>=1.54.0",
24+
"pydantic>=2.9.0",
25+
"loguru>=0.7.2",
26+
"click>=8.1.7",
27+
"python-dotenv>=1.0.0",
28+
]
29+
30+
[project.optional-dependencies]
31+
dev = [
32+
"pytest>=8.0.0",
33+
"pytest-cov>=4.0.0",
34+
"pytest-mock>=3.12.0",
35+
"black>=24.0.0",
36+
"ruff>=0.8.0",
37+
"mypy>=1.13.0",
38+
"pre-commit>=4.0.0",
39+
]
40+
41+
[project.scripts]
42+
structured-output = "structured_output_cookbook.cli:main"
43+
44+
[project.urls]
45+
Homepage = "https://github.com/mazzasaverio/structured-output-cookbook"
46+
Repository = "https://github.com/mazzasaverio/structured-output-cookbook"
47+
Issues = "https://github.com/mazzasaverio/structured-output-cookbook/issues"
48+
49+
[build-system]
50+
requires = ["hatchling"]
51+
build-backend = "hatchling.build"
52+
53+
[tool.uv]
54+
dev-dependencies = [
55+
"pytest>=8.0.0",
56+
"pytest-cov>=4.0.0",
57+
"pytest-mock>=3.12.0",
58+
"black>=24.0.0",
59+
"ruff>=0.8.0",
60+
"mypy>=1.13.0",
61+
"pre-commit>=4.0.0",
62+
]
63+
64+
[tool.black]
65+
line-length = 88
66+
target-version = ['py313']
67+
68+
[tool.ruff]
target-version = "py313"
line-length = 88

# Rule selection belongs under `lint`; the top-level `select`/`ignore` keys are
# deprecated. ANN101/ANN102 were removed from Ruff in 0.8.0 (the minimum version
# required here), so they are no longer listed under `ignore`.
[tool.ruff.lint]
select = ["E", "W", "F", "I", "N", "UP", "ANN", "S", "B", "A", "COM", "C4", "DTZ", "T10", "EM", "EXE", "ISC", "ICN", "G", "INP", "PIE", "T20", "PYI", "PT", "Q", "RSE", "RET", "SLF", "SIM", "TID", "TCH", "INT", "ARG", "PTH", "ERA", "PD", "PGH", "PL", "TRY", "NPY", "RUF"]
ignore = ["S101", "PLR0913"]
73+
74+
[tool.mypy]
75+
python_version = "3.13"
76+
strict = true
77+
warn_return_any = true
78+
warn_unused_configs = true
79+
80+
[tool.pytest.ini_options]
81+
testpaths = ["tests"]
82+
python_files = ["test_*.py"]
83+
python_classes = ["Test*"]
84+
python_functions = ["test_*"]
85+
addopts = [
86+
"--cov=src/structured_output_cookbook",
87+
"--cov-report=term-missing",
88+
"--cov-report=html",
89+
"--cov-fail-under=80",
90+
"-v"
91+
]
92+
93+
[tool.coverage.run]
94+
source = ["src"]
95+
omit = ["*/tests/*", "*/test_*"]
96+
97+
[tool.coverage.report]
98+
exclude_lines = [
99+
"pragma: no cover",
100+
"def __repr__",
101+
"raise AssertionError",
102+
"raise NotImplementedError",
103+
]
Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
"""Structured Output Cookbook - LLM-powered structured data extraction."""
2+
3+
# Package metadata; keep in sync with the [project] table in pyproject.toml.
__version__ = "0.1.0"
__author__ = "Saverio Mazza"
5+
6+
from .extractor import StructuredExtractor
7+
from .schemas.base import BaseSchema, ExtractionResult
8+
from .templates.job_description import JobDescriptionSchema
9+
from .templates.recipe import RecipeSchema
10+
from .config import Config
11+
12+
# Names re-exported at package level; each one is imported above from its submodule.
__all__ = [
    "StructuredExtractor",
    "BaseSchema",
    "ExtractionResult",
    "JobDescriptionSchema",
    "RecipeSchema",
    "Config",
]
Lines changed: 176 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,176 @@
1+
"""Command line interface for structured output extraction."""
2+
3+
import json
4+
import sys
5+
from pathlib import Path
6+
from typing import Optional
7+
8+
import click
9+
from .config import Config
10+
from .extractor import StructuredExtractor
11+
from .logger import setup_logger, get_logger
12+
from .templates.job_description import JobDescriptionSchema
13+
from .templates.recipe import RecipeSchema
14+
15+
# Available predefined templates.
# Maps the template name accepted on the command line to the schema class used
# for extraction; the keys are also the allowed values of the `extract` command's
# TEMPLATE argument.
TEMPLATES = {
    "job": JobDescriptionSchema,
    "recipe": RecipeSchema,
}
20+
21+
22+
@click.group()
@click.option("--debug", is_flag=True, help="Enable debug logging")
@click.pass_context
def main(ctx: click.Context, debug: bool) -> None:
    """Structured Output Cookbook - Extract structured data from text using LLMs."""
    # The docstring above is rendered by click as the group's --help text.
    #
    # Make sure ctx.obj is a dict so subcommands can read shared state from it.
    ctx.ensure_object(dict)

    # Load settings from the environment; --debug overrides the log level.
    settings = Config.from_env()
    if debug:
        settings.log_level = "DEBUG"

    # Configure logging, then publish the settings and a logger to subcommands.
    setup_logger(settings)
    ctx.obj.update(config=settings, logger=get_logger(__name__))
36+
37+
38+
@main.command()
def list_templates() -> None:
    """List available predefined templates."""
    # Print a header followed by one "name: description" line per template.
    click.echo("Available templates:")
    for template_name in TEMPLATES:
        description = TEMPLATES[template_name].get_schema_description()
        click.echo(f" {template_name}: {description}")
44+
45+
46+
@main.command()
@click.argument("template", type=click.Choice(list(TEMPLATES.keys())))
@click.option(
    "--input-file",
    "-i",
    # dir_okay=False: a directory passes exists=True but cannot be read as text.
    type=click.Path(exists=True, dir_okay=False),
    help="Input text file",
)
@click.option("--text", "-t", help="Input text directly")
@click.option("--output", "-o", type=click.Path(), help="Output JSON file")
@click.option("--pretty", is_flag=True, help="Pretty print JSON output")
@click.pass_context
def extract(
    ctx: click.Context,
    template: str,
    input_file: Optional[str],
    text: Optional[str],
    output: Optional[str],
    pretty: bool,
) -> None:
    """Extract data using a predefined template."""
    # NOTE: the docstring above is rendered by click as the command's --help text,
    # so the detailed documentation lives in comments.
    #
    # Parameters:
    #   ctx        - click context; ctx.obj carries "config" and "logger" set by main().
    #   template   - key into TEMPLATES selecting the schema to extract with.
    #   input_file - path of a UTF-8 text file to read the input from.
    #   text       - input text given inline; used only when input_file is not set.
    #   output     - path to write the JSON result to; stdout when omitted.
    #   pretty     - indent the JSON output by two spaces.
    #
    # Exits with status 1 when no input is supplied, when a file cannot be read or
    # written, or when the extraction result reports failure.
    logger = ctx.obj["logger"]
    config = ctx.obj["config"]

    # Get input text (--input-file takes precedence over --text).
    if input_file:
        try:
            input_text = Path(input_file).read_text(encoding="utf-8")
        except (OSError, UnicodeDecodeError) as e:
            # Report unreadable or non-UTF-8 files cleanly instead of a traceback.
            click.echo(f"Error reading input file: {e}", err=True)
            sys.exit(1)
    elif text:
        input_text = text
    else:
        click.echo("Error: Must provide either --input-file or --text", err=True)
        sys.exit(1)

    # Extract data
    extractor = StructuredExtractor(config)
    schema = TEMPLATES[template]

    logger.info(f"Extracting using template: {template}")
    result = extractor.extract(input_text, schema)

    if not result.success:
        click.echo(f"Extraction failed: {result.error}", err=True)
        sys.exit(1)

    # Format output
    indent = 2 if pretty else None
    output_json = json.dumps(result.data, indent=indent, ensure_ascii=False)

    # Write output
    if output:
        try:
            Path(output).write_text(output_json, encoding="utf-8")
        except OSError as e:
            # E.g. missing parent directory or no write permission.
            click.echo(f"Error writing output file: {e}", err=True)
            sys.exit(1)
        click.echo(f"Results saved to {output}")
    else:
        click.echo(output_json)

    # Show stats
    if result.tokens_used:
        logger.info(f"Tokens used: {result.tokens_used}")
99+
100+
101+
@main.command()
@click.option(
    "--schema-file",
    "-s",
    # dir_okay=False: a directory passes exists=True but cannot be read as text.
    type=click.Path(exists=True, dir_okay=False),
    required=True,
    help="JSON schema file",
)
@click.option(
    "--prompt-file",
    "-p",
    type=click.Path(exists=True, dir_okay=False),
    help="System prompt file",
)
@click.option("--prompt", help="System prompt text")
@click.option(
    "--input-file",
    "-i",
    type=click.Path(exists=True, dir_okay=False),
    help="Input text file",
)
@click.option("--text", "-t", help="Input text directly")
@click.option("--output", "-o", type=click.Path(), help="Output JSON file")
@click.option("--pretty", is_flag=True, help="Pretty print JSON output")
@click.pass_context
def extract_custom(
    ctx: click.Context,
    schema_file: str,
    prompt_file: Optional[str],
    prompt: Optional[str],
    input_file: Optional[str],
    text: Optional[str],
    output: Optional[str],
    pretty: bool,
) -> None:
    """Extract data using a custom JSON schema."""
    # NOTE: the docstring above is rendered by click as the command's --help text,
    # so the detailed documentation lives in comments.
    #
    # Parameters:
    #   ctx         - click context; ctx.obj carries "config" and "logger" set by main().
    #   schema_file - path of a UTF-8 JSON file holding the schema.
    #   prompt_file - path of a UTF-8 text file holding the system prompt.
    #   prompt      - system prompt given inline; used only when prompt_file is not set.
    #   input_file  - path of a UTF-8 text file to read the input from.
    #   text        - input text given inline; used only when input_file is not set.
    #   output      - path to write the JSON result to; stdout when omitted.
    #   pretty      - indent the JSON output by two spaces.
    #
    # Exits with status 1 when the schema cannot be loaded, when a prompt or input
    # is missing, when a file cannot be read or written, or when the extraction
    # result reports failure.
    logger = ctx.obj["logger"]
    config = ctx.obj["config"]

    # Load schema. OSError covers missing/unreadable files; ValueError covers both
    # json.JSONDecodeError and UnicodeDecodeError.
    try:
        schema_dict = json.loads(Path(schema_file).read_text(encoding="utf-8"))
    except (OSError, ValueError) as e:
        click.echo(f"Error loading schema: {e}", err=True)
        sys.exit(1)

    # NOTE(review): assumes the extractor needs a JSON object here (the value is
    # passed on as `schema_dict`) - confirm against extract_with_custom_schema.
    if not isinstance(schema_dict, dict):
        click.echo(
            "Error loading schema: top-level JSON value must be an object",
            err=True,
        )
        sys.exit(1)

    # Get system prompt (--prompt-file takes precedence over --prompt).
    if prompt_file:
        try:
            system_prompt = Path(prompt_file).read_text(encoding="utf-8")
        except (OSError, UnicodeDecodeError) as e:
            click.echo(f"Error reading prompt file: {e}", err=True)
            sys.exit(1)
    elif prompt:
        system_prompt = prompt
    else:
        click.echo("Error: Must provide either --prompt-file or --prompt", err=True)
        sys.exit(1)

    # Get input text (--input-file takes precedence over --text).
    if input_file:
        try:
            input_text = Path(input_file).read_text(encoding="utf-8")
        except (OSError, UnicodeDecodeError) as e:
            click.echo(f"Error reading input file: {e}", err=True)
            sys.exit(1)
    elif text:
        input_text = text
    else:
        click.echo("Error: Must provide either --input-file or --text", err=True)
        sys.exit(1)

    # Extract data
    extractor = StructuredExtractor(config)

    logger.info("Extracting using custom schema")
    result = extractor.extract_with_custom_schema(
        input_text, schema_dict, system_prompt
    )

    if not result.success:
        click.echo(f"Extraction failed: {result.error}", err=True)
        sys.exit(1)

    # Format output
    indent = 2 if pretty else None
    output_json = json.dumps(result.data, indent=indent, ensure_ascii=False)

    # Write output
    if output:
        try:
            Path(output).write_text(output_json, encoding="utf-8")
        except OSError as e:
            # E.g. missing parent directory or no write permission.
            click.echo(f"Error writing output file: {e}", err=True)
            sys.exit(1)
        click.echo(f"Results saved to {output}")
    else:
        click.echo(output_json)

    # Show stats
    if result.tokens_used:
        logger.info(f"Tokens used: {result.tokens_used}")
173+
174+
175+
# Run the CLI group when this module is executed directly as a script.
if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)