mazzasaverio
diff --git a/‎.env.example‎
Lines changed: 11 additions & 0 deletions b/‎.env.example‎
Lines changed: 11 additions & 0 deletions
diff --git a/‎.gitignore‎
Lines changed: 65 additions & 0 deletions b/‎.gitignore‎
Lines changed: 65 additions & 0 deletions
diff --git a/‎.python-version‎
Lines changed: 1 addition & 0 deletions b/‎.python-version‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎LICENSE‎ b/‎LICENSE‎
diff --git a/‎README.md‎
Lines changed: 0 additions & 1 deletion b/‎README.md‎
Lines changed: 0 additions & 1 deletion
diff --git a/‎pyproject.toml‎
Lines changed: 103 additions & 0 deletions b/‎pyproject.toml‎
Lines changed: 103 additions & 0 deletions
diff --git a/‎src/structured_output_cookbook/__init__.py‎
Lines changed: 19 additions & 0 deletions b/‎src/structured_output_cookbook/__init__.py‎
Lines changed: 19 additions & 0 deletions
diff --git a/‎src/structured_output_cookbook/cli.py‎
Lines changed: 176 additions & 0 deletions b/‎src/structured_output_cookbook/cli.py‎
Lines changed: 176 additions & 0 deletions
@@ -0,0 +1,11 @@
+# OpenAI Configuration
+OPENAI_API_KEY=your_openai_api_key_here
+OPENAI_MODEL=gpt-4o-2024-08-06
+
+# Logging Configuration
+LOG_LEVEL=INFO
+LOG_FORMAT=json
+
+# Application Configuration
+MAX_RETRIES=3
+TIMEOUT_SECONDS=30
@@ -0,0 +1,65 @@
+# Python
+__pycache__/
+*.py[cod]
+*$py.class
+*.so
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# Virtual environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+.uv-cache/
+
+# IDE
+.vscode/
+.idea/
+*.swp
+*.swo
+*~
+
+# Testing
+.coverage
+.pytest_cache/
+htmlcov/
+.tox/
+coverage.xml
+*.cover
+.hypothesis/
+
+# Logs
+*.log
+logs/
+
+# Docker
+.dockerignore
+
+# OS
+.DS_Store
+Thumbs.db
+
+# Project specific
+*.sqlite
+*.db
+temp/
+tmp/
@@ -0,0 +1 @@
+3.13
@@ -1 +0,0 @@
-# structured-output-cookbook
@@ -0,0 +1,103 @@
+[project]
+name = "structured-output-cookbook"
+version = "0.1.0"
+description = "LLM-powered structured output extraction with predefined and custom schemas"
+readme = "README.md"
+requires-python = ">=3.13"
+authors = [
+    { name = "Saverio Mazza", email = "[email protected]" }
+]
+license = { text = "MIT" }
+keywords = ["llm", "structured-output", "extraction", "ai", "openai"]
+classifiers = [
+    "Development Status :: 4 - Beta",
+    "Intended Audience :: Developers",
+    "License :: OSI Approved :: MIT License",
+    "Programming Language :: Python :: 3",
+    "Programming Language :: Python :: 3.13",
+    "Topic :: Software Development :: Libraries :: Python Modules",
+    "Topic :: Scientific/Engineering :: Artificial Intelligence",
+]
+
+dependencies = [
+    "openai>=1.54.0",
+    "pydantic>=2.9.0",
+    "loguru>=0.7.2",
+    "click>=8.1.7",
+    "python-dotenv>=1.0.0",
+]
+
+[project.optional-dependencies]
+dev = [
+    "pytest>=8.0.0",
+    "pytest-cov>=4.0.0",
+    "pytest-mock>=3.12.0",
+    "black>=24.0.0",
+    "ruff>=0.8.0",
+    "mypy>=1.13.0",
+    "pre-commit>=4.0.0",
+]
+
+[project.scripts]
+structured-output = "structured_output_cookbook.cli:main"
+
+[project.urls]
+Homepage = "https://github.com/mazzasaverio/structured-output-cookbook"
+Repository = "https://github.com/mazzasaverio/structured-output-cookbook"
+Issues = "https://github.com/mazzasaverio/structured-output-cookbook/issues"
+
+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"
+
+[tool.uv]
+dev-dependencies = [
+    "pytest>=8.0.0",
+    "pytest-cov>=4.0.0",
+    "pytest-mock>=3.12.0",
+    "black>=24.0.0",
+    "ruff>=0.8.0",
+    "mypy>=1.13.0",
+    "pre-commit>=4.0.0",
+]
+
+[tool.black]
+line-length = 88
+target-version = ['py313']
+
+[tool.ruff]
+target-version = "py313"
+line-length = 88
+
+# Rule selection lives in the `lint` table; the top-level `select`/`ignore`
+# keys under `[tool.ruff]` are deprecated in current Ruff releases.
+[tool.ruff.lint]
+select = ["E", "W", "F", "I", "N", "UP", "ANN", "S", "B", "A", "COM", "C4", "DTZ", "T10", "EM", "EXE", "ISC", "ICN", "G", "INP", "PIE", "T20", "PYI", "PT", "Q", "RSE", "RET", "SLF", "SIM", "TID", "TCH", "INT", "ARG", "PTH", "ERA", "PD", "PGH", "PL", "TRY", "NPY", "RUF"]
+ignore = ["ANN101", "ANN102", "S101", "PLR0913"]
+
+[tool.mypy]
+python_version = "3.13"
+strict = true
+warn_return_any = true
+warn_unused_configs = true
+
+[tool.pytest.ini_options]
+testpaths = ["tests"]
+python_files = ["test_*.py"]
+python_classes = ["Test*"]
+python_functions = ["test_*"]
+addopts = [
+    "--cov=src/structured_output_cookbook",
+    "--cov-report=term-missing",
+    "--cov-report=html",
+    "--cov-fail-under=80",
+    "-v"
+]
+
+[tool.coverage.run]
+source = ["src"]
+omit = ["*/tests/*", "*/test_*"]
+
+[tool.coverage.report]
+exclude_lines = [
+    "pragma: no cover",
+    "def __repr__",
+    "raise AssertionError",
+    "raise NotImplementedError",
+]
@@ -0,0 +1,19 @@
+"""Structured Output Cookbook - LLM-powered structured data extraction."""
+
+__version__ = "0.1.0"
+# Keep in sync with the `authors` entry in pyproject.toml.
+__author__ = "Saverio Mazza"
+
+from .extractor import StructuredExtractor
+from .schemas.base import BaseSchema, ExtractionResult
+from .templates.job_description import JobDescriptionSchema
+from .templates.recipe import RecipeSchema
+from .config import Config
+
+# Names re-exported as the package's public API.
+__all__ = [
+    "StructuredExtractor",
+    "BaseSchema",
+    "ExtractionResult",
+    "JobDescriptionSchema",
+    "RecipeSchema",
+    "Config",
+]
@@ -0,0 +1,176 @@
+"""Command line interface for structured output extraction."""
+
+import json
+import sys
+from pathlib import Path
+from typing import Optional
+
+import click
+from .config import Config
+from .extractor import StructuredExtractor
+from .logger import setup_logger, get_logger
+from .templates.job_description import JobDescriptionSchema
+from .templates.recipe import RecipeSchema
+
+# Available predefined templates: maps each template name accepted by the
+# `extract` command to the schema that is handed to the extractor.
+TEMPLATES = {
+    "job": JobDescriptionSchema,
+    "recipe": RecipeSchema,
+}
+
+
+@click.group()
+@click.option("--debug", is_flag=True, help="Enable debug logging")
+@click.pass_context
+def main(ctx: click.Context, debug: bool) -> None:
+    """Structured Output Cookbook - Extract structured data from text using LLMs."""
+    # Make sure ctx.obj is a dict before anything is stored on it.
+    ctx.ensure_object(dict)
+
+    # Settings come from the environment; --debug overrides the log level.
+    settings = Config.from_env()
+    if debug:
+        settings.log_level = "DEBUG"
+    setup_logger(settings)
+
+    # Subcommands read the shared config and logger back out of ctx.obj.
+    ctx.obj.update(config=settings, logger=get_logger(__name__))
+
+
+@main.command()
+def list_templates() -> None:
+    """List available predefined templates."""
+    click.echo("Available templates:")
+    # One indented "name: description" line per registered template.
+    for template_name in TEMPLATES:
+        description = TEMPLATES[template_name].get_schema_description()
+        click.echo(f"  {template_name}: {description}")
+
+
+@main.command()
+@click.argument("template", type=click.Choice(list(TEMPLATES.keys())))
+@click.option("--input-file", "-i", type=click.Path(exists=True), help="Input text file")
+@click.option("--text", "-t", help="Input text directly")
+@click.option("--output", "-o", type=click.Path(), help="Output JSON file")
+@click.option("--pretty", is_flag=True, help="Pretty print JSON output")
+@click.pass_context
+def extract(
+    ctx: click.Context,
+    template: str,
+    input_file: Optional[str],
+    text: Optional[str],
+    output: Optional[str],
+    pretty: bool
+) -> None:
+    """Extract data using a predefined template."""
+    logger = ctx.obj["logger"]
+    config = ctx.obj["config"]
+    
+    # Get input text
+    if input_file:
+        input_text = Path(input_file).read_text(encoding="utf-8")
+    elif text:
+        input_text = text
+    else:
+        click.echo("Error: Must provide either --input-file or --text", err=True)
+        sys.exit(1)
+    
+    # Extract data
+    extractor = StructuredExtractor(config)
+    schema = TEMPLATES[template]
+    
+    logger.info(f"Extracting using template: {template}")
+    result = extractor.extract(input_text, schema)
+    
+    if not result.success:
+        click.echo(f"Extraction failed: {result.error}", err=True)
+        sys.exit(1)
+    
+    # Format output
+    indent = 2 if pretty else None
+    output_json = json.dumps(result.data, indent=indent, ensure_ascii=False)
+    
+    # Write output
+    if output:
+        Path(output).write_text(output_json, encoding="utf-8")
+        click.echo(f"Results saved to {output}")
+    else:
+        click.echo(output_json)
+    
+    # Show stats
+    if result.tokens_used:
+        logger.info(f"Tokens used: {result.tokens_used}")
+
+
+@main.command()
+@click.option("--schema-file", "-s", type=click.Path(exists=True), required=True, help="JSON schema file")
+@click.option("--prompt-file", "-p", type=click.Path(exists=True), help="System prompt file")
+@click.option("--prompt", help="System prompt text")
+@click.option("--input-file", "-i", type=click.Path(exists=True), help="Input text file")
+@click.option("--text", "-t", help="Input text directly")
+@click.option("--output", "-o", type=click.Path(), help="Output JSON file")
+@click.option("--pretty", is_flag=True, help="Pretty print JSON output")
+@click.pass_context
+def extract_custom(
+    ctx: click.Context,
+    schema_file: str,
+    prompt_file: Optional[str],
+    prompt: Optional[str],
+    input_file: Optional[str],
+    text: Optional[str],
+    output: Optional[str],
+    pretty: bool
+) -> None:
+    """Extract data using a custom JSON schema."""
+    # Shared objects stored on the context by the `main` group callback.
+    logger = ctx.obj["logger"]
+    config = ctx.obj["config"]
+
+    # Load schema. click.Path(exists=True) has already rejected missing paths,
+    # so the failures that actually reach this point are a directory path, a
+    # permission problem (both OSError), undecodable bytes (UnicodeDecodeError)
+    # or malformed JSON (json.JSONDecodeError); the last two are ValueError
+    # subclasses. All of them are reported as a clean CLI error, not a traceback.
+    try:
+        schema_dict = json.loads(Path(schema_file).read_text(encoding="utf-8"))
+    except (OSError, ValueError) as e:
+        click.echo(f"Error loading schema: {e}", err=True)
+        sys.exit(1)
+
+    # Get system prompt; a prompt file takes precedence over inline --prompt.
+    if prompt_file:
+        system_prompt = Path(prompt_file).read_text(encoding="utf-8")
+    elif prompt:
+        system_prompt = prompt
+    else:
+        click.echo("Error: Must provide either --prompt-file or --prompt", err=True)
+        sys.exit(1)
+
+    # Get input text; an input file takes precedence over inline --text.
+    if input_file:
+        input_text = Path(input_file).read_text(encoding="utf-8")
+    elif text:
+        input_text = text
+    else:
+        click.echo("Error: Must provide either --input-file or --text", err=True)
+        sys.exit(1)
+
+    # Extract data
+    extractor = StructuredExtractor(config)
+
+    logger.info("Extracting using custom schema")
+    result = extractor.extract_with_custom_schema(input_text, schema_dict, system_prompt)
+
+    if not result.success:
+        click.echo(f"Extraction failed: {result.error}", err=True)
+        sys.exit(1)
+
+    # Format output (indented only when --pretty was requested)
+    indent = 2 if pretty else None
+    output_json = json.dumps(result.data, indent=indent, ensure_ascii=False)
+
+    # Write output to the requested file, or to stdout when none was given
+    if output:
+        Path(output).write_text(output_json, encoding="utf-8")
+        click.echo(f"Results saved to {output}")
+    else:
+        click.echo(output_json)
+
+    # Show stats (only when the result reports a truthy token count)
+    if result.tokens_used:
+        logger.info(f"Tokens used: {result.tokens_used}")
+
+
+# Run the CLI group when this module is executed as a script
+# (e.g. `python -m structured_output_cookbook.cli`).
+if __name__ == "__main__":
+    main()