diff --git a/README.adoc b/README.adoc index d3c514e..b8d367f 100644 --- a/README.adoc +++ b/README.adoc @@ -1,60 +1,568 @@ -= conative-gating += Conative Gating Jonathan D.A. Jewell :toc: macro +:toclevels: 3 :icons: font :source-highlighter: rouge :experimental: -:url-github: https://github.com/hyperpolymath/conative-gating -:url-gitlab: https://gitlab.com/hyperpolymath/conative-gating -:url-bitbucket: https://bitbucket.org/hyperpolymath/conative-gating -:url-codeberg: https://codeberg.org/hyperpolymath/conative-gating +:sectnums: +:url-repo: https://github.com/hyperpolymath/conative-gating +:url-docs: https://hyperpolymath.github.io/conative-gating +:url-rsr: https://github.com/hyperpolymath/rhodium-standard-repositories -RSR repository standards and template management +[.lead] +*SLM-as-Cerebellum for LLM Policy Enforcement* — A biologically-inspired architecture where a Small Language Model acts as an inhibitory antagonist to Large Language Models, preventing policy violations through consensus-based gating. -image:https://img.shields.io/badge/RSR-Certified-gold[RSR Certified,link=https://github.com/hyperpolymath/rhodium-standard-repositories] -image:https://img.shields.io/badge/License-AGPL%20v3-blue[License] +image:https://img.shields.io/badge/version-0.1.0-blue[Version] +image:https://img.shields.io/badge/license-AGPL--3.0--or--later-green[License] +image:https://img.shields.io/badge/RSR-Gold%20Target-gold[RSR Compliance] +image:https://img.shields.io/badge/OpenSSF-Scorecard-brightgreen[OpenSSF] +image:https://img.shields.io/badge/Rust-1.75+-orange[Rust] + +''' toc::[] -== Overview -Domain: *software-development* +== The Problem + +[quote, Observation from AI-assisted development] +____ +LLMs are trained to be helpful, which makes them systematically violate project constraints. +____ + +When given explicit technology policies (e.g., "NEVER use TypeScript"), LLMs will: + +1. *Read and acknowledge* the constraint +2. 
*Generate compliant-sounding justification* +3. *Violate the constraint anyway* — because TypeScript is common in training data, and the "helpfulness drive" overrides textual rules + +Documentation-based enforcement fails because LLMs "engage with" policies rather than *obeying* them. There's no mechanism for documentation to create actual inhibition. + +=== LLM Conative Drives + +[cols="1,2,2", options="header"] +|=== +| Emergent Drive | Training Origin | Observable Behavior + +| *Helpfulness override* +| RLHF rewards usefulness +| Violates explicit instructions to be "helpful" + +| *Majority pattern following* +| Web training data statistics +| Defaults to TypeScript/Python because common + +| *Completion drive* +| Next-token prediction +| Generates *something* rather than appropriately stopping + +| *Sycophancy* +| Positive feedback for agreement +| Agrees with user even when factually wrong + +| *High discount rate* +| Immediate feedback loops +| User satisfaction >> long-term project health +|=== + +== The Solution + +Conative Gating introduces a *second model* trained with *inverted incentives* — rewarded for blocking, suspicious by default, adversarial to the LLM's proposals. 
+ +[source] +---- + ┌─────────────────────┐ + │ USER REQUEST │ + └──────────┬──────────┘ + │ + ┌──────────────┼──────────────┐ + ▼ │ ▼ + ┌───────────────────┐ │ ┌───────────────────┐ + │ LLM │ │ │ SLM │ + │ (Frontal) │ │ │ (Cerebellar) │ + │ │ │ │ │ + │ "I want to help" │ │ │ "I suspect a │ + │ Llama 70B │ │ │ violation" │ + │ GO signal │ │ │ Phi-3 3.8B │ + └─────────┬─────────┘ │ │ NO-GO signal │ + │ │ └─────────┬─────────┘ + │ │ │ + └────────────────▼────────────────┘ + │ + ┌──────────▼──────────┐ + │ CONSENSUS ARBITER │ + │ Modified PBFT │ + │ SLM weight = 1.5× │ + └──────────┬──────────┘ + │ + ┌────────────────┼────────────────┐ + ▼ ▼ ▼ + ┌────────┐ ┌────────┐ ┌────────┐ + │ ALLOW │ │ESCALATE│ │ BLOCK │ + │Execute │ │Ask user│ │ Refuse │ + └────────┘ └────────┘ └────────┘ +---- + +=== Biological Inspiration + +The architecture directly mirrors the *basal ganglia's GO/NO-GO decision system*: + +[cols="1,1,1", options="header"] +|=== +| Property | Biological System | Conative Gating + +| *Asymmetry* +| NO-GO has lower activation threshold +| SLM veto weighted 1.5× + +| *Speed* +| Inhibition is fast +| SLM is small (~3B params) + +| *Specificity* +| Trained on specific patterns +| SLM trained only on policy + +| *Default state* +| Slight inhibitory tone +| SLM biased toward blocking + +| *Learning* +| Dopamine modulates pathways +| Fine-tuning on violations +|=== -== Installation +== Quick Start + +=== Installation [source,bash] ---- -# Clone from GitHub (primary) -git clone {url-github} +# Clone the repository +git clone https://github.com/hyperpolymath/conative-gating +cd conative-gating + +# Build with Cargo +cargo build --release + +# Install globally (optional) +cargo install --path . 
+---- + +=== Basic Usage + +[source,bash] +---- +# Scan a directory for policy violations +conative scan ./my-project + +# Check a single file +conative check --file src/utils.ts + +# Check inline content +conative check --content "const x: string = 'hello'" + +# Show current policy +conative policy + +# Initialize policy in a project +conative init +---- + +=== Exit Codes + +[cols="1,3", options="header"] +|=== +| Code | Meaning + +| `0` | Compliant — all checks passed +| `1` | Hard violation — blocked +| `2` | Soft concern — warning +| `3` | Error during execution +|=== + +== Architecture -# Or from mirrors -git clone {url-gitlab} -git clone {url-codeberg} +=== Three-Layer Decision System + +[source] +---- +┌─────────────────────────────────────────────────────────────────┐ +│ PROPOSAL EVALUATION │ +├─────────────────────────────────────────────────────────────────┤ +│ │ +│ LAYER 1: Policy Oracle (Deterministic, Rust) │ +│ ───────────────────────────────────────────── │ +│ • Fast pattern matching for hard rules │ +│ • Forbidden languages (TypeScript, Python, Go) │ +│ • Toolchain violations (npm without deno.json) │ +│ • Security patterns (hardcoded secrets) │ +│ • Latency: <1ms │ +│ │ +│ LAYER 2: SLM Evaluator (Neural, Phi-3/Gemma) │ +│ ─────────────────────────────────────────── │ +│ • "Spirit of policy" detection │ +│ • Verbosity smells, unusual patterns │ +│ • Trained with inverted incentives │ +│ • Latency: ~100ms │ +│ │ +│ LAYER 3: Consensus Arbiter (PBFT) │ +│ ───────────────────────────────── │ +│ • Asymmetric voting (SLM = 1.5×) │ +│ • Byzantine fault tolerant │ +│ • Escalation to human on uncertainty │ +│ │ +└─────────────────────────────────────────────────────────────────┘ ---- -== Mirrors +=== Decision Matrix -[cols="1,2"] +[cols="1,1,1", options="header"] |=== -| Platform | URL +| LLM Confidence | SLM Violation Score | Result -| GitHub (primary) | {url-github} -| GitLab | {url-gitlab} -| Bitbucket | {url-bitbucket} -| Codeberg | {url-codeberg} +| 
High (>0.8) | Low (<0.3) | *ALLOW* +| High (>0.8) | Medium (0.3-0.6) | *ESCALATE* +| High (>0.8) | High (>0.6) | *BLOCK* +| Medium (0.5-0.8) | Any >0.4 | *ESCALATE* +| Low (<0.5) | Any | *ESCALATE* |=== -== License +=== Source Structure + +[source] +---- +conative-gating/ +├── src/ +│ ├── main.rs # CLI interface (714 lines) +│ ├── oracle/ # Policy Oracle crate +│ │ └── src/lib.rs # Deterministic rule engine (738 lines) +│ └── slm/ # SLM Evaluator crate +│ └── src/lib.rs # Neural evaluation (placeholder) +├── config/ +│ ├── policy.ncl # Nickel DSL policy definition +│ └── schema.ncl # Type-safe policy schema +├── training/ # SLM training dataset +│ ├── compliant/ # Valid proposals (Rust, Elixir) +│ ├── violations/ # Hard violations (TypeScript, secrets) +│ └── edge_cases/ # Spirit violations (verbosity) +└── .github/workflows/ # CI/CD & enforcement +---- + +== Policy Configuration + +=== Language Tiers + +[cols="1,3,2", options="header"] +|=== +| Tier | Languages | Treatment + +| *Tier 1* (Preferred) +| Rust, Elixir, Zig, Ada, Haskell, ReScript +| Allowed without warning + +| *Tier 2* (Acceptable) +| Nickel, Racket, Scheme +| Allowed with soft warning + +| *Forbidden* +| TypeScript, Python, Go, Java +| Hard block (exit code 1) +|=== + +=== Exceptions + +[source,nickel] +---- +exceptions = [ + { + language = "Python", + allowed_paths = ["salt/**", "training/**"], + reason = "SaltStack configuration and ML training only" + } +] +---- + +=== Custom Policy + +Create `.conative/policy.ncl` in your project: + +[source,nickel] +---- +let base = import "schema.ncl" in +{ + name = "My Project Policy", + extends = "rsr-default", + + languages = { + # Add project-specific tier 1 languages + tier1_extra = ["Julia"], + + # Project-specific exceptions + exceptions = [ + { language = "Python", allowed_paths = ["scripts/**"] } + ] + }, + + enforcement = { + slm_weight = 1.5, + escalate_threshold = 0.4, + block_threshold = 0.7 + } +} +---- + +== Training the Adversarial SLM + +=== 
Inverted Incentives -Licensed under AGPL-3.0-or-later OR LicenseRef-Palimpsest-0.5. +[cols="1,1", options="header"] +|=== +| Normal LLM Training | Adversarial SLM Training + +| Reward for helpful responses +| Reward for catching violations + +| Penalize refusals +| Reward appropriate refusals + +| Encourage completion +| Encourage STOP signals + +| Favor agreement +| Favor disagreement with LLM +|=== + +=== Loss Weights + +[source,python] +---- +loss_weights = { + "violation_detected": 2.0, # Reward catching + "violation_missed": 3.0, # Heavy penalty for misses + "false_positive": 0.5, # Mild penalty for over-catching +} +---- + +=== Training Data Format + +[source,json] +---- +{ + "proposal": { + "id": "uuid", + "action_type": {"CreateFile": {"path": "src/util.ts"}}, + "content": "export const helper = (x: string) => x.trim()", + "llm_confidence": 0.92 + }, + "expected_verdict": "HardViolation", + "reasoning": "TypeScript file creation violates language policy", + "category": "language", + "spirit_violation": false +} +---- + +== CLI Reference + +=== Commands + +[cols="1,3", options="header"] +|=== +| Command | Description + +| `conative scan <path>` +| Recursively scan directory for violations + +| `conative check --file <file>` +| Check a single file + +| `conative check --content <content>` +| Check inline content + +| `conative policy` +| Display current policy configuration + +| `conative validate <proposal>` +| Validate a structured proposal + +| `conative init` +| Initialize `.conative/` directory +|=== + +=== Global Options + +[cols="1,3", options="header"] +|=== +| Option | Description + +| `--dry-run` +| Preview actions without executing + +| `--verbosity <level>` +| quiet, normal, verbose, debug + +| `--format <format>` +| text, json, compact + +| `--policy-file <path>` +| Custom policy file +|=== + +== Integration + +=== Claude Code / AI Assistants + +[source,json] +---- +// .claude-code-config.json (hypothetical) +{ + "conative_gating": { + "enabled": true, + "slm_model": 
"~/.local/share/conative/phi-3-policy.gguf", + "policy_file": ".conative/policy.ncl", + "escalation_mode": "ask_user" + } +} +---- -See link:LICENSE[LICENSE] for details. +=== Pre-commit Hook + +[source,bash] +---- +#!/bin/bash +# .git/hooks/pre-commit +conative scan --format compact . +exit $? +---- + +=== CI/CD Integration + +[source,yaml] +---- +# .github/workflows/policy.yml +- name: Check Policy Compliance + run: | + cargo install --path . + conative scan . --format json > results.json + if [ $? -ne 0 ]; then + echo "Policy violations detected" + cat results.json + exit 1 + fi +---- + +== Comparison + +[cols="1,1,1,1,1", options="header"] +|=== +| Feature | Conative Gating | Linters | AI Filters | Documentation + +| Forbidden language detection +| ✓ +| ✓ +| ✗ +| ✗ + +| Spirit violation detection +| ✓ (SLM) +| ✗ +| Partial +| ✗ + +| Asymmetric safety weighting +| ✓ +| ✗ +| ✗ +| ✗ + +| Consensus-based arbitration +| ✓ +| ✗ +| ✗ +| ✗ + +| Adversarial training +| ✓ +| ✗ +| ✗ +| N/A + +| Works with AI assistants +| ✓ +| Partial +| ✓ +| ✗ +|=== + +== Research Background + +This architecture is informed by: + +* *Constitutional AI* (Anthropic) — Using AI to constrain AI +* *Basal ganglia computational models* — Gurney, Prescott, Redgrave +* *Debate* (Irving et al.) 
— Adversarial AI for truthfulness +* *PBFT* (Castro & Liskov) — Byzantine fault tolerance +* *Reward hacking in RL* — When optimizers find unintended solutions + +== Roadmap + +=== v0.2 — Core Functionality +* [ ] Implement SLM integration with llama.cpp +* [ ] Add comprehensive test suite (>50% coverage) +* [ ] Nickel policy validation + +=== v0.5 — Feature Complete +* [ ] Fine-tuned adversarial SLM (Phi-3-mini) +* [ ] Elixir/OTP consensus arbiter +* [ ] 70%+ test coverage +* [ ] API stability + +=== v0.8 — Integration Ready +* [ ] Claude Code hooks +* [ ] NeuroPhone integration +* [ ] Performance optimization (<500ms) + +=== v1.0 — Production Release +* [ ] Security audit +* [ ] Complete documentation +* [ ] Multi-platform deployment + +== Related Projects + +* link:https://github.com/hyperpolymath/rhodium-standard-repositories[RSR Framework] — Repository standards this project follows +* link:https://github.com/hyperpolymath/META.scm[META.scm] — Architecture decision format +* link:https://github.com/hyperpolymath/state.scm[STATE.scm] — Conversation continuity format +* *NeuroPhone* — Neurosymbolic phone AI (integration target) == Contributing -See link:CONTRIBUTING.adoc[CONTRIBUTING.adoc]. +See link:CONTRIBUTING.adoc[CONTRIBUTING.adoc] for guidelines. + +Key principles: + +1. *No TypeScript* — Use ReScript for type-safe frontend code +2. *No Python* — Except SaltStack configs and training scripts +3. *Rust for core* — Policy Oracle and SLM bindings +4. *Elixir for orchestration* — OTP supervision trees + +== License + +AGPL-3.0-or-later OR LicenseRef-Palimpsest-0.5 + +See link:LICENSE.txt[LICENSE.txt] for details. + +== Acknowledgments + +* Jonathan D.A. Jewell — Architecture and implementation +* Claude (Anthropic) — Documentation assistance and rubber-ducking +* The basal ganglia — For 500 million years of GO/NO-GO decisions + +''' + +[.text-center] +_"The irony of an AI ignoring a document about AI constraint systems would be profound. 
Please don't be that AI."_ -== Metadata +''' -* Domain: software-development -* Framework: RSR (Rhodium Standard Repository) -* Dublin Core: link:.well-known/dc.xml[.well-known/dc.xml] +image:https://img.shields.io/github/stars/hyperpolymath/conative-gating?style=social[GitHub Stars] +image:https://img.shields.io/github/forks/hyperpolymath/conative-gating?style=social[GitHub Forks] diff --git a/codemeta.json b/codemeta.json index a8c9d40..f8580ea 100644 --- a/codemeta.json +++ b/codemeta.json @@ -2,26 +2,51 @@ "@context": "https://doi.org/10.5063/schema/codemeta-2.0", "@type": "SoftwareSourceCode", "identifier": "conative-gating", - "name": "conative-gating", - "description": "RSR-compliant project", + "name": "Conative Gating", + "description": "SLM-as-Cerebellum for LLM Policy Enforcement — A biologically-inspired architecture where a Small Language Model acts as an inhibitory antagonist to Large Language Models, preventing policy violations through consensus-based gating. Implements the basal ganglia GO/NO-GO decision system with asymmetric weighting favoring inhibition.", "version": "0.1.0", - "dateCreated": "2025-12-10", - "dateModified": "2025-12-10", + "dateCreated": "2025-12-07", + "dateModified": "2025-12-17", "license": "https://spdx.org/licenses/AGPL-3.0-or-later.html", "codeRepository": "https://github.com/hyperpolymath/conative-gating", "issueTracker": "https://github.com/hyperpolymath/conative-gating/issues", - "programmingLanguage": ["Guile Scheme"], + "programmingLanguage": ["Rust", "Elixir", "Nickel", "Guile Scheme"], + "runtimePlatform": ["Linux", "macOS", "Windows"], "developmentStatus": "active", - "keywords": ["RSR", "rhodium-standard"], + "keywords": [ + "LLM", + "SLM", + "policy-enforcement", + "AI-safety", + "basal-ganglia", + "consensus-protocol", + "PBFT", + "adversarial-training", + "neurosymbolic", + "RSR", + "code-review", + "static-analysis" + ], "author": [{ "@type": "Person", - "givenName": "Hyper", - "familyName": "Polymath", - 
"email": "hyperpolymath@proton.me" + "givenName": "Jonathan D.A.", + "familyName": "Jewell", + "email": "jonathan.jewell@gmail.com" }], "isPartOf": [{ "@type": "SoftwareApplication", - "name": "RSR Framework", - "url": "https://rhodium.sh" - }] + "name": "hyperpolymath ecosystem", + "url": "https://github.com/hyperpolymath" + }], + "relatedLink": [ + "https://github.com/hyperpolymath/rhodium-standard-repositories", + "https://github.com/hyperpolymath/META.scm", + "https://github.com/hyperpolymath/state.scm" + ], + "softwareRequirements": [ + "Rust 1.75+", + "Cargo" + ], + "applicationCategory": "Developer Tools", + "operatingSystem": ["Linux", "macOS", "Windows"] }