|
| 1 | +# Author: Bradley R. Kinnard |
| 2 | +""" |
| 3 | +Response Validator - validates LLM responses against stored beliefs. |
| 4 | +Extracts claims from LLM output and checks for contradictions with the belief store. |
| 5 | +""" |
| 6 | + |
| 7 | +import logging |
| 8 | +import re |
| 9 | +from dataclasses import dataclass, field |
| 10 | +from typing import TYPE_CHECKING |
| 11 | + |
| 12 | +if TYPE_CHECKING: |
| 13 | + from ..core.models.belief import Belief |
| 14 | + |
| 15 | +logger = logging.getLogger(__name__) |
| 16 | + |
| 17 | +# lazy-loaded |
| 18 | +_nlp = None |
| 19 | + |
| 20 | + |
def _get_nlp():
    """Return the cached spacy pipeline, loading it on first use.

    Returns None (after logging a warning) when spacy or the
    ``en_core_web_sm`` model is unavailable; callers then fall back to
    regex-based extraction. A failed load is retried on the next call.
    """
    global _nlp
    if _nlp is None:
        try:
            import spacy
            _nlp = spacy.load("en_core_web_sm")
        except Exception as e:
            logger.warning(f"spacy unavailable for claim extraction: {e}")
            return None
    return _nlp
| 33 | + |
| 34 | + |
@dataclass
class ExtractedClaim:
    """A factual claim extracted from an LLM response.

    Produced by ``extract_claims``; note that it currently sets ``text``
    and ``sentence`` to the same full-sentence string.
    """
    text: str  # the claim text (the full sentence, as extracted)
    sentence: str  # the sentence the claim came from (same value as text today)
    confidence: float = 1.0  # how certain we are this is a factual claim
    is_hedged: bool = False  # "might be", "could be", etc.
| 42 | + |
| 43 | + |
@dataclass
class ValidationResult:
    """Result of validating an LLM response against stored beliefs.

    ``is_valid`` starts True and flips to False as soon as any
    contradiction is recorded by ``validate_response``.
    """
    is_valid: bool = True  # False once at least one contradiction was flagged
    contradictions: list[dict] = field(default_factory=list)  # one dict per claim/belief conflict
    claims_checked: int = 0  # number of claims extracted from the response
    flagged_claims: list[ExtractedClaim] = field(default_factory=list)  # claims involved in contradictions
| 51 | + |
| 52 | + |
# Hedging phrases: tentative language that lowers an extracted claim's
# confidence (hedged claims are also exempt from contradiction checks).
HEDGE_PHRASES = frozenset([
    "might", "may", "could", "possibly", "perhaps", "maybe",
    "i think", "i believe", "it seems", "appears to",
    "not sure", "uncertain", "likely", "probably",
    "in my opinion", "as far as i know", "i'm not certain",
])

# Phrases indicating the LLM is citing the user's own stored beliefs;
# such sentences are trusted and skipped during claim extraction.
CITATION_PHRASES = frozenset([
    "you mentioned", "you said", "you told me",
    "according to what you said", "based on what you told me",
    "from our conversation", "as you noted", "you indicated",
])
| 67 | + |
| 68 | + |
# Compiled once. Word-boundary anchors prevent short hedges such as
# "may" from matching inside unrelated words ("mayor", "dismayed").
_HEDGE_RE = re.compile(
    r"\b(?:" + "|".join(re.escape(p) for p in HEDGE_PHRASES) + r")\b"
)

# verbs/auxiliaries that usually signal a factual assertion (fallback path)
_ASSERTIVE_RE = re.compile(r"\b(is|are|was|were|has|have|had|will|can|does|did)\b")


def _is_hedged(lower_sent: str) -> bool:
    """True if the (already lowercased) sentence contains a hedging phrase."""
    return _HEDGE_RE.search(lower_sent) is not None


def _extract_claims_fallback(response: str) -> list[ExtractedClaim]:
    """Regex-only claim extraction, used when spacy is unavailable."""
    claims: list[ExtractedClaim] = []
    # Split AFTER terminal punctuation so each sentence keeps its '.', '!'
    # or '?'. The previous split on [.!?]+ discarded the punctuation,
    # which made the question check below unreachable.
    for sent in re.split(r"(?<=[.!?])\s+", response):
        sent = sent.strip()
        if len(sent) < 10:
            continue  # too short to be a meaningful claim
        if sent.endswith("?"):
            continue  # skip questions

        lower_sent = sent.lower()

        # skip if citing the user's own statements (trusted)
        if any(c in lower_sent for c in CITATION_PHRASES):
            continue

        is_hedged = _is_hedged(lower_sent)

        # basic factual claim detection: assertive verb or any digit
        if _ASSERTIVE_RE.search(lower_sent) or re.search(r"\d", sent):
            claims.append(ExtractedClaim(
                text=sent,
                sentence=sent,
                confidence=0.5 if is_hedged else 0.8,
                is_hedged=is_hedged,
            ))
    return claims


def extract_claims(response: str) -> list[ExtractedClaim]:
    """
    Extract factual claims from LLM response text.

    Filters out questions, sentences citing the user's own statements,
    and short fragments. Hedged statements are kept but down-weighted
    (confidence 0.5) so validators can choose to skip them.

    Args:
        response: Raw LLM response text.

    Returns:
        List of ExtractedClaim (empty if nothing claim-like was found).
    """
    nlp = _get_nlp()
    if nlp is None:
        # spacy unavailable: degrade to regex-based sentence handling
        return _extract_claims_fallback(response)

    claims: list[ExtractedClaim] = []
    doc = nlp(response)

    for sent in doc.sents:
        sent_text = sent.text.strip()
        if len(sent_text) < 10:
            continue
        if sent_text.endswith("?"):
            continue  # skip questions

        lower_sent = sent_text.lower()

        # skip if citing the user's own statements (trusted)
        if any(c in lower_sent for c in CITATION_PHRASES):
            continue

        is_hedged = _is_hedged(lower_sent)

        # factual if it has a subject+verb clause, a named entity, or a number
        has_verb = any(tok.pos_ == "VERB" for tok in sent)
        has_subj = any(tok.dep_ in ("nsubj", "nsubjpass") for tok in sent)
        has_entity = any(
            ent.label_ in ("PERSON", "ORG", "GPE", "DATE", "TIME", "MONEY", "QUANTITY", "PERCENT")
            for ent in sent.ents
        )
        has_number = any(tok.like_num for tok in sent)

        if (has_verb and has_subj) or has_entity or has_number:
            claims.append(ExtractedClaim(
                text=sent_text,
                sentence=sent_text,
                confidence=0.5 if is_hedged else 0.9,
                is_hedged=is_hedged,
            ))

    return claims
| 148 | + |
| 149 | + |
def validate_response(
    response: str,
    beliefs: list["Belief"],
    contradiction_threshold: float = 0.6,
) -> ValidationResult:
    """
    Validate LLM response against stored beliefs.

    Extracts claims from response and checks each against beliefs
    for contradictions. Hedged claims are skipped: tentative language
    is deliberately allowed to disagree with stored beliefs.

    Args:
        response: LLM response text
        beliefs: List of user beliefs to check against
        contradiction_threshold: Min confidence to flag contradiction

    Returns:
        ValidationResult with any contradictions found
    """
    result = ValidationResult()

    if not beliefs:
        return result  # nothing to validate against

    claims = extract_claims(response)
    result.claims_checked = len(claims)

    if not claims:
        return result  # no factual claims to check

    # Imported lazily, and only once there is real work to do, so the
    # common no-beliefs / no-claims paths never pay the import cost of
    # the contradiction model.
    from backend.core.bel.semantic_contradiction import check_contradiction

    # check each claim against each belief; a claim may be recorded once
    # per contradicting belief
    for claim in claims:
        if claim.is_hedged:
            continue  # skip hedged claims

        for belief in beliefs:
            contra_result = check_contradiction(claim.text, belief.content)

            if contra_result.label != "contradiction":
                continue
            if contra_result.confidence < contradiction_threshold:
                continue

            result.is_valid = False
            result.contradictions.append({
                "claim": claim.text,
                "belief_id": str(belief.id),
                "belief_content": belief.content,
                "confidence": contra_result.confidence,
                "reason_codes": contra_result.reason_codes,
            })
            result.flagged_claims.append(claim)
            # lazy %-args: formatting is skipped when WARNING is disabled
            logger.warning(
                "LLM claim contradicts belief: '%s...' vs '%s...'",
                claim.text[:50],
                belief.content[:50],
            )

    return result
| 205 | + |
| 206 | + |
def get_correction_prompt(
    original_response: str,
    contradictions: list[dict],
    beliefs: list["Belief"],
) -> str:
    """
    Build a retry prompt asking the LLM to fix a response that
    contradicted the user's stored beliefs.

    Args:
        original_response: The offending LLM response (truncated to 500 chars).
        contradictions: Contradiction dicts from validate_response (first 5 shown).
        beliefs: User beliefs to restate as trusted context (first 10 shown).

    Returns:
        Prompt text ready to send back to the LLM.
    """
    # cap the number of beliefs/contradictions shown to keep the prompt bounded
    belief_lines = []
    for belief in beliefs[:10]:
        belief_lines.append(f"- {belief.content} (confidence: {belief.confidence:.0%})")

    contra_lines = []
    for contra in contradictions[:5]:
        claim_snippet = contra["claim"][:60]
        belief_snippet = contra["belief_content"][:60]
        contra_lines.append(f"- Your claim '{claim_snippet}...' contradicts: '{belief_snippet}...'")

    sections = [
        "Your previous response contained claims that contradict what the user has told you.",
        "",
        "WHAT THE USER HAS TOLD YOU (trust these):",
        "\n".join(belief_lines),
        "",
        "CONTRADICTIONS FOUND:",
        "\n".join(contra_lines),
        "",
        "Please regenerate your response, ensuring you don't contradict the user's stated facts. If you're uncertain about something, acknowledge that uncertainty rather than stating incorrect facts.",
        "",
        "Original response to fix:",
        f"{original_response[:500]}...",
    ]
    return "\n".join(sections)
| 237 | + |
| 238 | + |
# explicit public API for `from ... import *` and documentation tools
__all__ = [
    "ExtractedClaim",
    "ValidationResult",
    "extract_claims",
    "validate_response",
    "get_correction_prompt",
]
0 commit comments