From 1eace4c53f13d3c752ad4c99c356d04f22cdfc30 Mon Sep 17 00:00:00 2001 From: leochlon Date: Fri, 3 Oct 2025 01:54:22 +0100 Subject: [PATCH 1/5] Add local LLM support and uncertainty estimation testing for PyG RAG --- examples/llm/txt2kg_rag.py | 98 ++++++++-- test/test_uncertainty_techqa.py | 268 +++++++++++++++++++++++++++ torch_geometric/llm/models/llm.py | 199 +++++++++++++++++++- torch_geometric/llm/models/txt2kg.py | 16 +- 4 files changed, 558 insertions(+), 23 deletions(-) create mode 100644 test/test_uncertainty_techqa.py diff --git a/examples/llm/txt2kg_rag.py b/examples/llm/txt2kg_rag.py index 09310ac4802e..66b2e6e88765 100644 --- a/examples/llm/txt2kg_rag.py +++ b/examples/llm/txt2kg_rag.py @@ -39,6 +39,8 @@ LLMJudge, SentenceTransformer, ) +# EDFL planner + universal backends (vendor these, or pip-install if you packaged them): +# from hallucination_toolkit import OpenAIPlanner, OpenAIItem # imported internally by LLM now from torch_geometric.llm.models.txt2kg import _chunk_text from torch_geometric.llm.utils.backend_utils import ( create_graph_from_triples, @@ -81,6 +83,11 @@ def parse_args(): '--ENDPOINT_URL', type=str, default=DEFAULT_ENDPOINT_URL, help="The URL hosting your model, \ in case you are not using the public NIM.") + parser.add_argument('--use_local_txt2kg', action="store_true", + help="Use local LLM for TXT2KG instead of NVIDIA NIM") + parser.add_argument('--txt2kg_model', type=str, + default="mistralai/Mistral-7B-Instruct-v0.3", + help="Local model for TXT2KG") parser.add_argument( '--kg_chunk_size', type=int, default=KG_CHUNK_SIZE_DEFAULT, help="When splitting context documents for txt2kg,\ @@ -304,10 +311,19 @@ def get_data(args): def index_kg(args, context_docs): - kg_maker = TXT2KG(NVIDIA_NIM_MODEL=args.NV_NIM_MODEL, - NVIDIA_API_KEY=args.NV_NIM_KEY, - ENDPOINT_URL=args.ENDPOINT_URL, - chunk_size=args.kg_chunk_size) + if args.use_local_txt2kg: + kg_maker = TXT2KG( + local_LM=True, + local_LM_model_name=args.txt2kg_model, + chunk_size=args.kg_chunk_size, + ) + else: + kg_maker = TXT2KG( + NVIDIA_NIM_MODEL=args.NV_NIM_MODEL, + NVIDIA_API_KEY=args.NV_NIM_KEY, + ENDPOINT_URL=args.ENDPOINT_URL, + chunk_size=args.kg_chunk_size, + ) print( "Note that if the TXT2KG process is too slow for you're liking using " "the public NIM, consider deploying yourself using local_lm flag of " @@ -328,9 +344,14 @@ def index_kg(args, context_docs): checkpoint_file = checkpoint_file[0] checkpoint_model_name = checkpoint_file.name.split('--')[0] # check if triples generation are using the correct model - if args.NV_NIM_MODEL.split('/')[-1] != checkpoint_model_name: - raise RuntimeError( - "Error: stored triples were generated using a different model") + if args.use_local_txt2kg: + if checkpoint_model_name != "local": + raise RuntimeError( + "Error: stored triples were generated using a different model") + else: + if args.NV_NIM_MODEL.split('/')[-1] != checkpoint_model_name: + raise RuntimeError( + "Error: stored triples were generated using a different model") saved_relevant_triples = torch.load(checkpoint_file, weights_only=False) kg_maker.relevant_triples = saved_relevant_triples @@ -437,15 +458,21 @@ def make_dataset(args): raw_triples_file = raw_triples_file[0] stored_model_name = raw_triples_file.name.split('--')[0] - if args.NV_NIM_MODEL.split('/')[-1] != stored_model_name: - raise RuntimeError( - "Error: stored triples were generated using a different model") + # Check if stored triples match current model configuration + if args.use_local_txt2kg: + if stored_model_name != 
"local": + raise RuntimeError( + "Error: stored triples were generated using a different model") + else: + if args.NV_NIM_MODEL.split('/')[-1] != stored_model_name: + raise RuntimeError( + "Error: stored triples were generated using a different model") print(f" -> Saved triples generated with: {stored_model_name}") triples = torch.load(raw_triples_file) else: triples = index_kg(args, context_docs) - + print("Number of triples in our GraphDB =", len(triples)) device = torch.device("cuda" if torch.cuda.is_available() else "cpu") @@ -574,6 +601,29 @@ def train(args, train_loader, val_loader): hidden_channels = args.gnn_hidden_channels num_gnn_layers = args.num_gnn_layers + def _uncertainty_kwargs(): + return dict( + uncertainty_estim=True, + uncertainty_cfg={ + "h_star": 0.05, + "isr_threshold": 1.0, + "m": 6, + "n_samples": 3, + "B_clip": 12.0, + "clip_mode": "one-sided", + "skeleton_policy": "auto", + "q_floor": None, + "temperature": 0.5, + "max_tokens_decision": 8, + "backend": "hf", # or "ollama" / "anthropic" + "mask_refusals_in_loss": True, + }, + decision_backend_kwargs=dict( + # HF backend (default): reuse the same model id; using a separate pipeline under the hood. + # If using Ollama or Anthropic instead, pass those credentials/args here. + ), + ) + if args.num_gnn_layers > 0: if args.gnn_model == "GAT": gnn = GAT(in_channels=768, hidden_channels=hidden_channels, @@ -589,15 +639,29 @@ def train(args, train_loader, val_loader): gnn = None if args.llm_generator_mode == "full": - llm = LLM(model_name=args.llm_generator_name, sys_prompt=sys_prompt, - n_gpus=args.num_gpus) + llm = LLM( + model_name=args.llm_generator_name, + sys_prompt=sys_prompt, + n_gpus=args.num_gpus, + **_uncertainty_kwargs(), + ) elif args.llm_generator_mode == "lora": - llm = LLM(model_name=args.llm_generator_name, sys_prompt=sys_prompt, - dtype=torch.float32, n_gpus=args.num_gpus) + llm = LLM( + model_name=args.llm_generator_name, + sys_prompt=sys_prompt, + dtype=torch.float32, + n_gpus=args.num_gpus, + **_uncertainty_kwargs(), + ) else: # frozen - llm = LLM(model_name=args.llm_generator_name, sys_prompt=sys_prompt, - dtype=torch.float32, n_gpus=args.num_gpus).eval() + llm = LLM( + model_name=args.llm_generator_name, + sys_prompt=sys_prompt, + dtype=torch.float32, + n_gpus=args.num_gpus, + **_uncertainty_kwargs(), + ).eval() for _, p in llm.named_parameters(): p.requires_grad = False diff --git a/test/test_uncertainty_techqa.py b/test/test_uncertainty_techqa.py new file mode 100644 index 000000000000..bb5d75085db4 --- /dev/null +++ b/test/test_uncertainty_techqa.py @@ -0,0 +1,268 @@ +import argparse +import json +import os +import sys +import zipfile +from pathlib import Path + +import torch +from huggingface_hub import hf_hub_download +from tqdm import tqdm + +# Add pytorch_geometric to path if needed +sys.path.insert(0, '/content/pytorch_geometric') + +from torch_geometric.llm.models import LLM, GRetriever + + +def download_techqa_data(dataset_dir="techqa"): + """Download TechQA dataset if it doesn't exist.""" + json_path = Path(dataset_dir) / "train.json" + corpus_path = Path(dataset_dir) / "corpus" + + if json_path.exists() and corpus_path.exists(): + print(f"Dataset already exists in {dataset_dir}") + return + + print("Downloading TechQA dataset...") + os.makedirs(dataset_dir, exist_ok=True) + + # Download corpus zip + zip_path = hf_hub_download( + repo_id="nvidia/TechQA-RAG-Eval", + repo_type="dataset", + filename="corpus.zip", + ) + + # Download train.json + json_download_path = hf_hub_download( + 
repo_id="nvidia/TechQA-RAG-Eval", + repo_type="dataset", + filename="train.json", + ) + + # Extract corpus + with zipfile.ZipFile(zip_path, 'r') as zip_ref: + zip_ref.extractall(dataset_dir) + + # Copy train.json + import shutil + shutil.copy(json_download_path, json_path) + + print(f"Dataset downloaded to {dataset_dir}") + + +def test_uncertainty_rag(args): + """Test LLM with uncertainty estimation on TechQA dataset.""" + + # Download data if needed + download_techqa_data(args.dataset_dir) + + # Load Q&A pairs + json_path = Path(args.dataset_dir) / "train.json" + with open(json_path) as f: + data = json.load(f) + + # Setup LLM with uncertainty + print(f"Setting up LLM: {args.model_name}") + print(f"Using {args.num_gpus} GPU(s)") + + llm = LLM( + model_name=args.model_name, + sys_prompt=args.sys_prompt, + n_gpus=args.num_gpus, + dtype=torch.bfloat16, + uncertainty_estim=args.use_uncertainty, + uncertainty_cfg={ + "h_star": args.h_star, + "isr_threshold": args.isr_threshold, + "m": args.m, + "n_samples": args.n_samples, + "B_clip": args.b_clip, + "clip_mode": "one-sided", + "skeleton_policy": args.skeleton_policy, + "temperature": args.temperature, + "max_tokens_decision": 8, + "backend": args.backend, + "mask_refusals_in_loss": True, + } if args.use_uncertainty else None, + decision_backend_kwargs={} if args.use_uncertainty else None, + ) + + print(f"Uncertainty enabled: {llm.uncertainty_estim}") + + # Create GRetriever (no GNN for this simple test) + model = GRetriever(llm=llm, gnn=None, use_lora=False) + + # Filter out impossible questions + valid_data = [item for item in data if not item.get("is_impossible", False)] + test_data = valid_data[:args.num_questions] + + print(f"\nTesting on {len(test_data)} questions...") + print("=" * 80) + + results = [] + + for item in tqdm(test_data, desc="Processing questions"): + question = item["question"] + answer = item["answer"] + + # Format question with mock context (in real scenario, use actual retrieved docs) + if args.use_mock_context: + context = f"Technical documentation: {answer[:args.context_length]}" + else: + context = "No context provided (zero-shot)" + + formatted_q = f""" +[QUESTION] +{question} +[END_QUESTION] + +[RETRIEVED_CONTEXTS] +{context} +[END_RETRIEVED_CONTEXTS] +""" + + # Run inference + result = model.llm.inference( + question=[formatted_q], + context=[""], + max_tokens=args.max_tokens, + return_uncertainty=args.use_uncertainty, + abstain_on_low_ISR=args.abstain_on_low_isr, + ) + + # Parse result + if args.use_uncertainty and isinstance(result, tuple): + texts, uncertainties = result + generated = texts[0] if texts else "" + uncertainty = uncertainties[0] if uncertainties else None + + result_dict = { + "question": question[:100], + "expected": answer[:100], + "generated": generated[:100], + "isr": uncertainty.isr if uncertainty else None, + "decision": "ANSWER" if (uncertainty and uncertainty.decision_answer) else "REFUSE", + "full_question": question, + "full_answer": answer, + "full_generated": generated, + } + else: + generated = result[0] if isinstance(result, list) else str(result) + result_dict = { + "question": question[:100], + "expected": answer[:100], + "generated": generated[:100], + "full_question": question, + "full_answer": answer, + "full_generated": generated, + } + + results.append(result_dict) + + if args.verbose: + print(f"\nQ: {result_dict['question']}") + print(f"Expected: {result_dict['expected']}") + print(f"Generated: {result_dict['generated']}") + if args.use_uncertainty: + print(f"ISR: 
{result_dict['isr']:.3f}, Decision: {result_dict['decision']}") + print("-" * 80) + + # Summary statistics + print("\n" + "=" * 80) + print("SUMMARY") + print("=" * 80) + + if args.use_uncertainty: + isr_values = [r['isr'] for r in results if r['isr'] is not None] + answers = [r for r in results if r['decision'] == 'ANSWER'] + refusals = [r for r in results if r['decision'] == 'REFUSE'] + + print(f"Total questions: {len(results)}") + print(f"Answered: {len(answers)} ({len(answers)/len(results)*100:.1f}%)") + print(f"Refused: {len(refusals)} ({len(refusals)/len(results)*100:.1f}%)") + print(f"Average ISR: {sum(isr_values)/len(isr_values):.3f}") + print(f"Min ISR: {min(isr_values):.3f}") + print(f"Max ISR: {max(isr_values):.3f}") + else: + print(f"Total questions processed: {len(results)}") + + # Save results if requested + if args.output_file: + with open(args.output_file, 'w') as f: + json.dump(results, f, indent=2) + print(f"\nResults saved to {args.output_file}") + + return results + + +def main(): + parser = argparse.ArgumentParser( + description="Test LLM with uncertainty estimation on TechQA dataset" + ) + + # Dataset args + parser.add_argument('--dataset_dir', type=str, default='techqa', + help='Directory to store/load TechQA dataset') + parser.add_argument('--num_questions', type=int, default=10, + help='Number of questions to test') + + # Model args + parser.add_argument('--model_name', type=str, + default='mistralai/Mistral-7B-Instruct-v0.3', + help='HuggingFace model name') + parser.add_argument('--num_gpus', type=int, default=1, + help='Number of GPUs to use') + parser.add_argument('--sys_prompt', type=str, + default='Answer questions based on the provided context.', + help='System prompt for the LLM') + parser.add_argument('--max_tokens', type=int, default=50, + help='Maximum tokens to generate') + + # Uncertainty args + parser.add_argument('--use_uncertainty', action='store_true', + help='Enable uncertainty estimation') + parser.add_argument('--h_star', type=float, default=0.05, + help='Entropy threshold for uncertainty') + parser.add_argument('--isr_threshold', type=float, default=1.0, + help='ISR threshold for abstention') + parser.add_argument('--m', type=int, default=6, + help='Number of semantic sets') + parser.add_argument('--n_samples', type=int, default=3, + help='Number of samples per semantic set') + parser.add_argument('--b_clip', type=float, default=12.0, + help='Clipping value for importance sampling') + parser.add_argument('--skeleton_policy', type=str, default='auto', + choices=['auto', 'evidence_erase', 'none'], + help='Skeleton generation policy') + parser.add_argument('--temperature', type=float, default=0.5, + help='Sampling temperature') + parser.add_argument('--backend', type=str, default='hf', + choices=['hf', 'ollama', 'anthropic'], + help='Backend for decision model') + parser.add_argument('--abstain_on_low_isr', action='store_true', + help='Abstain when ISR is below threshold') + + # Context args + parser.add_argument('--use_mock_context', action='store_true', + help='Use partial answer as mock context (for testing)') + parser.add_argument('--context_length', type=int, default=100, + help='Length of mock context to use') + + # Output args + parser.add_argument('--output_file', type=str, default=None, + help='JSON file to save results') + parser.add_argument('--verbose', action='store_true', + help='Print detailed results for each question') + + args = parser.parse_args() + + # Run test + results = test_uncertainty_rag(args) + + print("\nTest 
complete!") + + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/torch_geometric/llm/models/llm.py b/torch_geometric/llm/models/llm.py index 1a3b2d22eb7b..21932d4bc33e 100644 --- a/torch_geometric/llm/models/llm.py +++ b/torch_geometric/llm/models/llm.py @@ -1,6 +1,8 @@ import warnings from contextlib import nullcontext -from typing import Any, Dict, List, Optional +from dataclasses import dataclass, asdict +import json, re +from typing import Any, Dict, List, Optional, Tuple, Union import torch from torch import Tensor @@ -13,6 +15,40 @@ IGNORE_INDEX = -100 MAX_TXT_LEN = 512 MAX_NEW_TOKENS = 128 + +# ---- Uncertainty (EDFL/B2T/ISR) glue ------------------------------------ +@dataclass +class _Uncertainty: + decision_answer: bool + delta_bar: float + b2t: float + isr: float + roh_bound: float + q_avg: float + q_conservative: float + y_label: str + rationale: str + + +def _parse_decision_json(text: str) -> str: + # Compatible with your toolkit's tiny-JSON head. (answer|refuse) + try: + obj = json.loads(text) + d = str(obj.get("decision", "")).strip().lower() + if d in ("answer", "refuse"): + return d + except Exception: + pass + m = re.search(r'{"\s*decision\s*"\s*:\s*"(answer|refuse)"}', text, + flags=re.I) + if m: + return m.group(1).lower() + t = text.strip().lower() + if "refuse" in t and "answer" not in t: + return "refuse" + if "answer" in t and "refuse" not in t: + return "answer" + return "refuse" # default-safe PAD_TOKEN_ID = 0 PADDING_SIDE = 'left' @@ -74,6 +110,10 @@ def __init__( n_gpus: Optional[int] = None, dtype: Optional[torch.dtype] = torch.bfloat16, sys_prompt: Optional[str] = None, + # --- new --- + uncertainty_estim: bool = False, + uncertainty_cfg: Optional[Dict[str, Any]] = None, + decision_backend_kwargs: Optional[Dict[str, Any]] = None, ) -> None: super().__init__() @@ -152,6 +192,133 @@ def __init__( else: self.autocast_context = torch.amp.autocast('cuda', dtype=dtype) + # ---- Uncertainty config / backends (opt-in) ---------------------- + self.uncertainty_estim = bool(uncertainty_estim) + self._uncfg: Dict[str, Any] = { + "h_star": 0.05, + "isr_threshold": 1.0, + "m": 6, + "n_samples": 3, + "B_clip": 12.0, + "clip_mode": "one-sided", + "skeleton_policy": "auto", + "q_floor": None, + "temperature": 0.5, + "max_tokens_decision": 8, + "backend": "hf", # "hf" | "ollama" | "anthropic" + "mask_refusals_in_loss": True, + } + if uncertainty_cfg: + self._uncfg.update(uncertainty_cfg) + self._dec_backend_kwargs = decision_backend_kwargs or {} + self._last_uncertainty: Optional[List[_Uncertainty]] = None + + # Try to import your planner/backends; fall back to a local adapter: + self._planner = None + self._backend = None + if self.uncertainty_estim: + try: + from hallbayes import OpenAIPlanner, OpenAIItem + + self._OpenAIPlanner = OpenAIPlanner + self._OpenAIItem = OpenAIItem + try: + from hallbayes import HuggingFaceBackend, OllamaBackend, AnthropicBackend + self._HFBackend = HuggingFaceBackend + self._OllamaBackend = OllamaBackend + self._AnthropicBackend = AnthropicBackend + except Exception: + self._HFBackend = self._OllamaBackend = self._AnthropicBackend = None + except Exception: + self.uncertainty_estim = False + warnings.warn( + "Uncertainty requested but `hallucination_toolkit` not importable. 
" + "Proceeding without uncertainty.", + stacklevel=2, + ) + + # ---- helper: turn (question, context) into the string for decision --- + def _decision_user_text(self, q: str, ctx: Optional[str]) -> str: + if ctx and len(ctx) > 0: + return f"{ctx} - {q}" + return q + + # ---- build the decision backend on first use ------------------------- + def _get_decision_backend(self): + if self._backend is not None: + return self._backend + if not self.uncertainty_estim: + return None + backend_kind = str(self._uncfg.get("backend", "hf")).lower() + if backend_kind == "hf" and getattr(self, "_HFBackend", None) is not None: + self._backend = self._HFBackend( + mode="transformers", + model_id=self.model_name, + device_map="auto", + trust_remote_code=True, + **self._dec_backend_kwargs, + ) + elif backend_kind == "ollama" and getattr(self, "_OllamaBackend", None) is not None: + self._backend = self._OllamaBackend(**self._dec_backend_kwargs) + elif backend_kind == "anthropic" and getattr(self, "_AnthropicBackend", None) is not None: + self._backend = self._AnthropicBackend(**self._dec_backend_kwargs) + else: + self._backend = None + return self._backend + + # ---- compute EDFL/B2T/ISR metrics per item --------------------------- + def _compute_uncertainty( + self, + questions: List[str], + context: Optional[List[str]] = None, + ) -> List[_Uncertainty]: + backend = self._get_decision_backend() + Planner = getattr(self, "_OpenAIPlanner", None) + Item = getattr(self, "_OpenAIItem", None) + if backend is None or Planner is None or Item is None: + return [] + planner = Planner( + backend=backend, + temperature=float(self._uncfg["temperature"]), + max_tokens_decision=int(self._uncfg["max_tokens_decision"]), + q_floor=self._uncfg["q_floor"], + ) + items = [] + for i, q in enumerate(questions): + ctx_i = None if context is None else context[i] + user_txt = self._decision_user_text(q, ctx_i) + items.append( + Item( + prompt=user_txt, + n_samples=int(self._uncfg["n_samples"]), + m=int(self._uncfg["m"]), + skeleton_policy=str(self._uncfg["skeleton_policy"]), + ) + ) + metrics = planner.run( + items, + h_star=float(self._uncfg["h_star"]), + isr_threshold=float(self._uncfg["isr_threshold"]), + B_clip=float(self._uncfg["B_clip"]), + clip_mode=str(self._uncfg["clip_mode"]), + ) + outs: List[_Uncertainty] = [] + for m in metrics: + outs.append( + _Uncertainty( + decision_answer=bool(m.decision_answer), + delta_bar=float(m.delta_bar), + b2t=float(m.b2t), + isr=float(m.isr), + roh_bound=float(m.roh_bound), + q_avg=float(m.q_avg), + q_conservative=float(m.q_conservative), + y_label=str(m.meta.get("y_label", "") if m.meta else ""), + rationale=str(m.rationale), + ) + ) + return outs + # legacy function - used for Llama 2 style prompting def _encode_inputs( self, @@ -420,6 +587,16 @@ def forward( inputs_embeds, attention_mask, label_input_ids = self._get_embeds( question, context, embedding, answer) + # --- Uncertainty-aware training: mask labels for ISR<1 ------------- + if self.uncertainty_estim and label_input_ids is not None: + unc = self._compute_uncertainty(question, context) + self._last_uncertainty = unc + if (self._uncfg.get("mask_refusals_in_loss", True) + and len(unc) == len(question)): + for i, u in enumerate(unc): + if not u.decision_answer: + label_input_ids[i, :] = IGNORE_INDEX + with self.autocast_context: outputs = self.llm( inputs_embeds=inputs_embeds, @@ -436,7 +613,10 @@ def inference( context: Optional[List[str]] = None, embedding: Optional[List[Tensor]] = None, max_tokens: Optional[int] = MAX_NEW_TOKENS, 
- ) -> List[str]: + # --- new --- + return_uncertainty: bool = False, + abstain_on_low_ISR: bool = False, + ) -> Union[List[str], Tuple[List[str], List[_Uncertainty]]]: r"""The inference pass. Args: @@ -454,6 +634,11 @@ def inference( inputs_embeds, attention_mask, _ = self._get_embeds( question, context, embedding) + unc: List[_Uncertainty] = [] + if self.uncertainty_estim: + unc = self._compute_uncertainty(question, context) + self._last_uncertainty = unc + with self.autocast_context: outputs = self.llm.generate( inputs_embeds=inputs_embeds, @@ -464,7 +649,15 @@ def inference( use_cache=True, ) - return self.tokenizer.batch_decode(outputs, skip_special_tokens=True) + texts = self.tokenizer.batch_decode(outputs, skip_special_tokens=True) + if self.uncertainty_estim and unc and abstain_on_low_ISR: + texts = [ + t if u.decision_answer else "[ABSTAIN]" + for t, u in zip(texts, unc) + ] + if return_uncertainty and (self.uncertainty_estim and unc): + return texts, unc + return texts def __repr__(self) -> str: return f'{self.__class__.__name__}({self.model_name})' diff --git a/torch_geometric/llm/models/txt2kg.py b/torch_geometric/llm/models/txt2kg.py index ac011b15d248..7b413acb7a0a 100644 --- a/torch_geometric/llm/models/txt2kg.py +++ b/torch_geometric/llm/models/txt2kg.py @@ -43,6 +43,9 @@ class TXT2KG(): than deploying your own private NIM endpoint. This flag is mainly recommended for dev/debug. (default: False). + local_LM_model_name : str, optional + Model name to load when `local_LM` is enabled. + (default: "VAGOsolutions/SauerkrautLM-v2-14b-DPO"). chunk_size : int, optional The size of the chunks in which the text data is processed (default: 512). @@ -54,9 +57,13 @@ def __init__( NVIDIA_API_KEY: Optional[str] = "", ENDPOINT_URL: Optional[str] = "https://integrate.api.nvidia.com/v1", local_LM: bool = False, + local_LM_model_name: Optional[str] = "VAGOsolutions/SauerkrautLM-v2-14b-DPO", chunk_size: int = 512, + n_gpus=None ) -> None: + self.n_gpus = n_gpus self.local_LM = local_LM + self.local_LM_model_name = local_LM_model_name # Initialize the local LM flag and the NIM model info accordingly if self.local_LM: # If using a local LM, set the initd_LM flag to False @@ -91,9 +98,12 @@ def _chunk_to_triples_str_local(self, txt: str) -> str: # call LLM on text chunk_start_time = time.time() if not self.initd_LM: - from torch_geometric.nn.nlp import LLM - LM_name = "VAGOsolutions/SauerkrautLM-v2-14b-DPO" - self.model = LLM(LM_name).eval() + from torch_geometric.llm.models import LLM + LM_name = self.local_LM_model_name + if self.n_gpus is not None: + self.model = LLM(LM_name, n_gpus=self.n_gpus).eval() + else: + self.model = LLM(LM_name).eval() self.initd_LM = True out_str = self.model.inference(question=[txt + '\n' + SYSTEM_PROMPT], max_tokens=self.chunk_size)[0] From cc4d749158127ed763be92f1905451d88516ede8 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 3 Oct 2025 01:00:41 +0000 Subject: [PATCH 2/5] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- examples/llm/txt2kg_rag.py | 14 ++- test/test_uncertainty_techqa.py | 136 +++++++++++++++------------ torch_geometric/llm/models/llm.py | 31 +++--- torch_geometric/llm/models/txt2kg.py | 18 ++-- 4 files changed, 112 insertions(+), 87 deletions(-) diff --git a/examples/llm/txt2kg_rag.py b/examples/llm/txt2kg_rag.py index 66b2e6e88765..fe4c2385cffa 100644 --- a/examples/llm/txt2kg_rag.py +++ b/examples/llm/txt2kg_rag.py @@ 
-347,11 +347,13 @@ def index_kg(args, context_docs): if args.use_local_txt2kg: if checkpoint_model_name != "local": raise RuntimeError( - "Error: stored triples were generated using a different model") + "Error: stored triples were generated using a different model" + ) else: if args.NV_NIM_MODEL.split('/')[-1] != checkpoint_model_name: raise RuntimeError( - "Error: stored triples were generated using a different model") + "Error: stored triples were generated using a different model" + ) saved_relevant_triples = torch.load(checkpoint_file, weights_only=False) kg_maker.relevant_triples = saved_relevant_triples @@ -462,17 +464,19 @@ def make_dataset(args): if args.use_local_txt2kg: if stored_model_name != "local": raise RuntimeError( - "Error: stored triples were generated using a different model") + "Error: stored triples were generated using a different model" + ) else: if args.NV_NIM_MODEL.split('/')[-1] != stored_model_name: raise RuntimeError( - "Error: stored triples were generated using a different model") + "Error: stored triples were generated using a different model" + ) print(f" -> Saved triples generated with: {stored_model_name}") triples = torch.load(raw_triples_file) else: triples = index_kg(args, context_docs) - + print("Number of triples in our GraphDB =", len(triples)) device = torch.device("cuda" if torch.cuda.is_available() else "cpu") diff --git a/test/test_uncertainty_techqa.py b/test/test_uncertainty_techqa.py index bb5d75085db4..69316ee95052 100644 --- a/test/test_uncertainty_techqa.py +++ b/test/test_uncertainty_techqa.py @@ -19,54 +19,53 @@ def download_techqa_data(dataset_dir="techqa"): """Download TechQA dataset if it doesn't exist.""" json_path = Path(dataset_dir) / "train.json" corpus_path = Path(dataset_dir) / "corpus" - + if json_path.exists() and corpus_path.exists(): print(f"Dataset already exists in {dataset_dir}") return - + print("Downloading TechQA dataset...") os.makedirs(dataset_dir, exist_ok=True) - + # Download corpus zip zip_path = hf_hub_download( repo_id="nvidia/TechQA-RAG-Eval", repo_type="dataset", filename="corpus.zip", ) - + # Download train.json json_download_path = hf_hub_download( repo_id="nvidia/TechQA-RAG-Eval", repo_type="dataset", filename="train.json", ) - + # Extract corpus with zipfile.ZipFile(zip_path, 'r') as zip_ref: zip_ref.extractall(dataset_dir) - + # Copy train.json import shutil shutil.copy(json_download_path, json_path) - + print(f"Dataset downloaded to {dataset_dir}") def test_uncertainty_rag(args): """Test LLM with uncertainty estimation on TechQA dataset.""" - # Download data if needed download_techqa_data(args.dataset_dir) - + # Load Q&A pairs json_path = Path(args.dataset_dir) / "train.json" with open(json_path) as f: data = json.load(f) - + # Setup LLM with uncertainty print(f"Setting up LLM: {args.model_name}") print(f"Using {args.num_gpus} GPU(s)") - + llm = LLM( model_name=args.model_name, sys_prompt=args.sys_prompt, @@ -88,31 +87,33 @@ def test_uncertainty_rag(args): } if args.use_uncertainty else None, decision_backend_kwargs={} if args.use_uncertainty else None, ) - + print(f"Uncertainty enabled: {llm.uncertainty_estim}") - + # Create GRetriever (no GNN for this simple test) model = GRetriever(llm=llm, gnn=None, use_lora=False) - + # Filter out impossible questions - valid_data = [item for item in data if not item.get("is_impossible", False)] + valid_data = [ + item for item in data if not item.get("is_impossible", False) + ] test_data = valid_data[:args.num_questions] - + print(f"\nTesting on {len(test_data)} 
questions...") print("=" * 80) - + results = [] - + for item in tqdm(test_data, desc="Processing questions"): question = item["question"] answer = item["answer"] - + # Format question with mock context (in real scenario, use actual retrieved docs) if args.use_mock_context: context = f"Technical documentation: {answer[:args.context_length]}" else: context = "No context provided (zero-shot)" - + formatted_q = f""" [QUESTION] {question} @@ -122,7 +123,7 @@ def test_uncertainty_rag(args): {context} [END_RETRIEVED_CONTEXTS] """ - + # Run inference result = model.llm.inference( question=[formatted_q], @@ -131,22 +132,31 @@ def test_uncertainty_rag(args): return_uncertainty=args.use_uncertainty, abstain_on_low_ISR=args.abstain_on_low_isr, ) - + # Parse result if args.use_uncertainty and isinstance(result, tuple): texts, uncertainties = result generated = texts[0] if texts else "" uncertainty = uncertainties[0] if uncertainties else None - + result_dict = { - "question": question[:100], - "expected": answer[:100], - "generated": generated[:100], - "isr": uncertainty.isr if uncertainty else None, - "decision": "ANSWER" if (uncertainty and uncertainty.decision_answer) else "REFUSE", - "full_question": question, - "full_answer": answer, - "full_generated": generated, + "question": + question[:100], + "expected": + answer[:100], + "generated": + generated[:100], + "isr": + uncertainty.isr if uncertainty else None, + "decision": + "ANSWER" if + (uncertainty and uncertainty.decision_answer) else "REFUSE", + "full_question": + question, + "full_answer": + answer, + "full_generated": + generated, } else: generated = result[0] if isinstance(result, list) else str(result) @@ -158,68 +168,73 @@ def test_uncertainty_rag(args): "full_answer": answer, "full_generated": generated, } - + results.append(result_dict) - + if args.verbose: print(f"\nQ: {result_dict['question']}") print(f"Expected: {result_dict['expected']}") print(f"Generated: {result_dict['generated']}") if args.use_uncertainty: - print(f"ISR: {result_dict['isr']:.3f}, Decision: {result_dict['decision']}") + print( + f"ISR: {result_dict['isr']:.3f}, Decision: {result_dict['decision']}" + ) print("-" * 80) - + # Summary statistics print("\n" + "=" * 80) print("SUMMARY") print("=" * 80) - + if args.use_uncertainty: isr_values = [r['isr'] for r in results if r['isr'] is not None] answers = [r for r in results if r['decision'] == 'ANSWER'] refusals = [r for r in results if r['decision'] == 'REFUSE'] - + print(f"Total questions: {len(results)}") - print(f"Answered: {len(answers)} ({len(answers)/len(results)*100:.1f}%)") - print(f"Refused: {len(refusals)} ({len(refusals)/len(results)*100:.1f}%)") + print( + f"Answered: {len(answers)} ({len(answers)/len(results)*100:.1f}%)") + print( + f"Refused: {len(refusals)} ({len(refusals)/len(results)*100:.1f}%)" + ) print(f"Average ISR: {sum(isr_values)/len(isr_values):.3f}") print(f"Min ISR: {min(isr_values):.3f}") print(f"Max ISR: {max(isr_values):.3f}") else: print(f"Total questions processed: {len(results)}") - + # Save results if requested if args.output_file: with open(args.output_file, 'w') as f: json.dump(results, f, indent=2) print(f"\nResults saved to {args.output_file}") - + return results def main(): parser = argparse.ArgumentParser( - description="Test LLM with uncertainty estimation on TechQA dataset" - ) - + description="Test LLM with uncertainty estimation on TechQA dataset") + # Dataset args parser.add_argument('--dataset_dir', type=str, default='techqa', help='Directory to store/load TechQA 
dataset') parser.add_argument('--num_questions', type=int, default=10, help='Number of questions to test') - + # Model args parser.add_argument('--model_name', type=str, default='mistralai/Mistral-7B-Instruct-v0.3', help='HuggingFace model name') parser.add_argument('--num_gpus', type=int, default=1, help='Number of GPUs to use') - parser.add_argument('--sys_prompt', type=str, - default='Answer questions based on the provided context.', - help='System prompt for the LLM') + parser.add_argument( + '--sys_prompt', type=str, + default='Answer questions based on the provided context.', + help='System prompt for the LLM') parser.add_argument('--max_tokens', type=int, default=50, help='Maximum tokens to generate') - + # Uncertainty args parser.add_argument('--use_uncertainty', action='store_true', help='Enable uncertainty estimation') @@ -234,8 +249,8 @@ def main(): parser.add_argument('--b_clip', type=float, default=12.0, help='Clipping value for importance sampling') parser.add_argument('--skeleton_policy', type=str, default='auto', - choices=['auto', 'evidence_erase', 'none'], - help='Skeleton generation policy') + choices=['auto', 'evidence_erase', + 'none'], help='Skeleton generation policy') parser.add_argument('--temperature', type=float, default=0.5, help='Sampling temperature') parser.add_argument('--backend', type=str, default='hf', @@ -243,26 +258,27 @@ def main(): help='Backend for decision model') parser.add_argument('--abstain_on_low_isr', action='store_true', help='Abstain when ISR is below threshold') - + # Context args - parser.add_argument('--use_mock_context', action='store_true', - help='Use partial answer as mock context (for testing)') + parser.add_argument( + '--use_mock_context', action='store_true', + help='Use partial answer as mock context (for testing)') parser.add_argument('--context_length', type=int, default=100, help='Length of mock context to use') - + # Output args parser.add_argument('--output_file', type=str, default=None, help='JSON file to save results') parser.add_argument('--verbose', action='store_true', help='Print detailed results for each question') - + args = parser.parse_args() - + # Run test - results = test_uncertainty_rag(args) - + test_uncertainty_rag(args) + print("\nTest complete!") if __name__ == '__main__': - main() \ No newline at end of file + main() diff --git a/torch_geometric/llm/models/llm.py b/torch_geometric/llm/models/llm.py index 21932d4bc33e..47a27dce3c2e 100644 --- a/torch_geometric/llm/models/llm.py +++ b/torch_geometric/llm/models/llm.py @@ -1,7 +1,8 @@ +import json +import re import warnings from contextlib import nullcontext -from dataclasses import dataclass, asdict -import json, re +from dataclasses import dataclass from typing import Any, Dict, List, Optional, Tuple, Union import torch @@ -16,6 +17,7 @@ MAX_TXT_LEN = 512 MAX_NEW_TOKENS = 128 + # ---- Uncertainty (EDFL/B2T/ISR) glue ------------------------------------ @dataclass class _Uncertainty: @@ -49,6 +51,8 @@ def _parse_decision_json(text: str) -> str: if "answer" in t and "refuse" not in t: return "answer" return "refuse" # default-safe + + PAD_TOKEN_ID = 0 PADDING_SIDE = 'left' @@ -218,12 +222,16 @@ def __init__( self._backend = None if self.uncertainty_estim: try: - from hallbayes import OpenAIPlanner, OpenAIItem + from hallbayes import OpenAIItem, OpenAIPlanner self._OpenAIPlanner = OpenAIPlanner self._OpenAIItem = OpenAIItem try: - from hallbayes import HuggingFaceBackend, OllamaBackend, AnthropicBackend + from hallbayes import ( + AnthropicBackend, + 
HuggingFaceBackend, + OllamaBackend, + ) self._HFBackend = HuggingFaceBackend self._OllamaBackend = OllamaBackend self._AnthropicBackend = AnthropicBackend @@ -250,7 +258,8 @@ def _get_decision_backend(self): if not self.uncertainty_estim: return None backend_kind = str(self._uncfg.get("backend", "hf")).lower() - if backend_kind == "hf" and getattr(self, "_HFBackend", None) is not None: + if backend_kind == "hf" and getattr(self, "_HFBackend", + None) is not None: self._backend = self._HFBackend( mode="transformers", model_id=self.model_name, @@ -258,9 +267,11 @@ def _get_decision_backend(self): trust_remote_code=True, **self._dec_backend_kwargs, ) - elif backend_kind == "ollama" and getattr(self, "_OllamaBackend", None) is not None: + elif backend_kind == "ollama" and getattr(self, "_OllamaBackend", + None) is not None: self._backend = self._OllamaBackend(**self._dec_backend_kwargs) - elif backend_kind == "anthropic" and getattr(self, "_AnthropicBackend", None) is not None: + elif backend_kind == "anthropic" and getattr(self, "_AnthropicBackend", + None) is not None: self._backend = self._AnthropicBackend(**self._dec_backend_kwargs) else: self._backend = None @@ -293,8 +304,7 @@ def _compute_uncertainty( n_samples=int(self._uncfg["n_samples"]), m=int(self._uncfg["m"]), skeleton_policy=str(self._uncfg["skeleton_policy"]), - ) - ) + )) metrics = planner.run( items, h_star=float(self._uncfg["h_star"]), @@ -315,8 +325,7 @@ def _compute_uncertainty( q_conservative=float(m.q_conservative), y_label=str(m.meta.get("y_label", "") if m.meta else ""), rationale=str(m.rationale), - ) - ) + )) return outs # legacy function - used for Llama 2 style prompting diff --git a/torch_geometric/llm/models/txt2kg.py b/torch_geometric/llm/models/txt2kg.py index 7b413acb7a0a..f94143157b35 100644 --- a/torch_geometric/llm/models/txt2kg.py +++ b/torch_geometric/llm/models/txt2kg.py @@ -50,17 +50,13 @@ class TXT2KG(): The size of the chunks in which the text data is processed (default: 512). """ - def __init__( - self, - NVIDIA_NIM_MODEL: Optional[ - str] = "nvidia/llama-3.1-nemotron-70b-instruct", - NVIDIA_API_KEY: Optional[str] = "", - ENDPOINT_URL: Optional[str] = "https://integrate.api.nvidia.com/v1", - local_LM: bool = False, - local_LM_model_name: Optional[str] = "VAGOsolutions/SauerkrautLM-v2-14b-DPO", - chunk_size: int = 512, - n_gpus=None - ) -> None: + def __init__(self, NVIDIA_NIM_MODEL: Optional[ + str] = "nvidia/llama-3.1-nemotron-70b-instruct", + NVIDIA_API_KEY: Optional[str] = "", ENDPOINT_URL: Optional[ + str] = "https://integrate.api.nvidia.com/v1", + local_LM: bool = False, local_LM_model_name: Optional[ + str] = "VAGOsolutions/SauerkrautLM-v2-14b-DPO", + chunk_size: int = 512, n_gpus=None) -> None: self.n_gpus = n_gpus self.local_LM = local_LM self.local_LM_model_name = local_LM_model_name From 5ca988f6259dbabe872400663eb11e1598610282 Mon Sep 17 00:00:00 2001 From: leochlon Date: Fri, 3 Oct 2025 03:10:00 +0100 Subject: [PATCH 3/5] Add uncertainty test, fix linting, and update changelog --- CHANGELOG.md | 9 + test/llm/models/test_llm.py | 60 ++++++- test/test_uncertainty_techqa.py | 284 -------------------------------- 3 files changed, 67 insertions(+), 286 deletions(-) delete mode 100644 test/test_uncertainty_techqa.py diff --git a/CHANGELOG.md b/CHANGELOG.md index a590afe5e190..8f742bf26408 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,15 @@ All notable changes to this project will be documented in this file. 
The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). +## [Unreleased] + +### Added +- Added local LLM support for TXT2KG knowledge graph extraction ([#10479](https://github.com/pyg-team/pytorch_geometric/pull/10479)) +- Added uncertainty estimation test for LLM class ([#10479](https://github.com/pyg-team/pytorch_geometric/pull/10479)) + +### Fixed +- Fixed LLM import path in TXT2KG from `torch_geometric.nn` to `torch_geometric.llm.models` ([#10479](https://github.com/pyg-team/pytorch_geometric/pull/10479)) + ## [2.7.0] - 2025-MM-DD ### Fixed diff --git a/test/llm/models/test_llm.py b/test/llm/models/test_llm.py index a6a2608551a8..fa13f0eaf526 100644 --- a/test/llm/models/test_llm.py +++ b/test/llm/models/test_llm.py @@ -10,8 +10,8 @@ @onlyRAG @withPackage('transformers', 'accelerate') def test_llm() -> None: - question = ["Is PyG the best open-source GNN library?"] - answer = ["yes!"] + question = ['Is PyG the best open-source GNN library?'] + answer = ['yes!'] model = LLM( model_name='HuggingFaceTB/SmolLM-360M', @@ -30,3 +30,59 @@ def test_llm() -> None: del model gc.collect() torch.cuda.empty_cache() + + +@onlyRAG +@withPackage('transformers', 'accelerate') +def test_llm_uncertainty() -> None: + """Test LLM with uncertainty estimation.""" + question = ['What is the capital of France?'] + answer = ['Paris'] + + # Test with uncertainty enabled + model = LLM( + model_name='HuggingFaceTB/SmolLM-360M', + num_params=1, + dtype=torch.float16, + uncertainty_estim=True, + uncertainty_cfg={ + 'h_star': 0.05, + 'isr_threshold': 1.0, + 'm': 3, + 'n_samples': 2, + 'B_clip': 12.0, + 'clip_mode': 'one-sided', + 'skeleton_policy': 'auto', + 'temperature': 0.5, + 'max_tokens_decision': 8, + 'backend': 'hf', + 'mask_refusals_in_loss': True, + }, + ) + + assert model.uncertainty_estim is True + + # Test inference with uncertainty + result = model.inference( + question=question, + context=[''], + max_tokens=10, + return_uncertainty=True, + ) + + # Should return tuple of (texts, uncertainties) + assert isinstance(result, tuple) + texts, uncertainties = result + assert len(texts) == 1 + assert len(uncertainties) == 1 + + # Check uncertainty object has expected attributes + uncertainty = uncertainties[0] + assert hasattr(uncertainty, 'isr') + assert hasattr(uncertainty, 'decision_answer') + assert isinstance(uncertainty.isr, float) + assert isinstance(uncertainty.decision_answer, bool) + + del model + gc.collect() + torch.cuda.empty_cache() diff --git a/test/test_uncertainty_techqa.py b/test/test_uncertainty_techqa.py deleted file mode 100644 index 69316ee95052..000000000000 --- a/test/test_uncertainty_techqa.py +++ /dev/null @@ -1,284 +0,0 @@ -import argparse -import json -import os -import sys -import zipfile -from pathlib import Path - -import torch -from huggingface_hub import hf_hub_download -from tqdm import tqdm - -# Add pytorch_geometric to path if needed -sys.path.insert(0, '/content/pytorch_geometric') - -from torch_geometric.llm.models import LLM, GRetriever - - -def download_techqa_data(dataset_dir="techqa"): - """Download TechQA dataset if it doesn't exist.""" - json_path = Path(dataset_dir) / "train.json" - corpus_path = Path(dataset_dir) / "corpus" - - if json_path.exists() and corpus_path.exists(): - print(f"Dataset already exists in {dataset_dir}") - return - - print("Downloading TechQA dataset...") - os.makedirs(dataset_dir, exist_ok=True) - - # Download corpus zip - zip_path = hf_hub_download( - repo_id="nvidia/TechQA-RAG-Eval", - repo_type="dataset", - 
filename="corpus.zip", - ) - - # Download train.json - json_download_path = hf_hub_download( - repo_id="nvidia/TechQA-RAG-Eval", - repo_type="dataset", - filename="train.json", - ) - - # Extract corpus - with zipfile.ZipFile(zip_path, 'r') as zip_ref: - zip_ref.extractall(dataset_dir) - - # Copy train.json - import shutil - shutil.copy(json_download_path, json_path) - - print(f"Dataset downloaded to {dataset_dir}") - - -def test_uncertainty_rag(args): - """Test LLM with uncertainty estimation on TechQA dataset.""" - # Download data if needed - download_techqa_data(args.dataset_dir) - - # Load Q&A pairs - json_path = Path(args.dataset_dir) / "train.json" - with open(json_path) as f: - data = json.load(f) - - # Setup LLM with uncertainty - print(f"Setting up LLM: {args.model_name}") - print(f"Using {args.num_gpus} GPU(s)") - - llm = LLM( - model_name=args.model_name, - sys_prompt=args.sys_prompt, - n_gpus=args.num_gpus, - dtype=torch.bfloat16, - uncertainty_estim=args.use_uncertainty, - uncertainty_cfg={ - "h_star": args.h_star, - "isr_threshold": args.isr_threshold, - "m": args.m, - "n_samples": args.n_samples, - "B_clip": args.b_clip, - "clip_mode": "one-sided", - "skeleton_policy": args.skeleton_policy, - "temperature": args.temperature, - "max_tokens_decision": 8, - "backend": args.backend, - "mask_refusals_in_loss": True, - } if args.use_uncertainty else None, - decision_backend_kwargs={} if args.use_uncertainty else None, - ) - - print(f"Uncertainty enabled: {llm.uncertainty_estim}") - - # Create GRetriever (no GNN for this simple test) - model = GRetriever(llm=llm, gnn=None, use_lora=False) - - # Filter out impossible questions - valid_data = [ - item for item in data if not item.get("is_impossible", False) - ] - test_data = valid_data[:args.num_questions] - - print(f"\nTesting on {len(test_data)} questions...") - print("=" * 80) - - results = [] - - for item in tqdm(test_data, desc="Processing questions"): - question = item["question"] - answer = item["answer"] - - # Format question with mock context (in real scenario, use actual retrieved docs) - if args.use_mock_context: - context = f"Technical documentation: {answer[:args.context_length]}" - else: - context = "No context provided (zero-shot)" - - formatted_q = f""" -[QUESTION] -{question} -[END_QUESTION] - -[RETRIEVED_CONTEXTS] -{context} -[END_RETRIEVED_CONTEXTS] -""" - - # Run inference - result = model.llm.inference( - question=[formatted_q], - context=[""], - max_tokens=args.max_tokens, - return_uncertainty=args.use_uncertainty, - abstain_on_low_ISR=args.abstain_on_low_isr, - ) - - # Parse result - if args.use_uncertainty and isinstance(result, tuple): - texts, uncertainties = result - generated = texts[0] if texts else "" - uncertainty = uncertainties[0] if uncertainties else None - - result_dict = { - "question": - question[:100], - "expected": - answer[:100], - "generated": - generated[:100], - "isr": - uncertainty.isr if uncertainty else None, - "decision": - "ANSWER" if - (uncertainty and uncertainty.decision_answer) else "REFUSE", - "full_question": - question, - "full_answer": - answer, - "full_generated": - generated, - } - else: - generated = result[0] if isinstance(result, list) else str(result) - result_dict = { - "question": question[:100], - "expected": answer[:100], - "generated": generated[:100], - "full_question": question, - "full_answer": answer, - "full_generated": generated, - } - - results.append(result_dict) - - if args.verbose: - print(f"\nQ: {result_dict['question']}") - print(f"Expected: 
{result_dict['expected']}") - print(f"Generated: {result_dict['generated']}") - if args.use_uncertainty: - print( - f"ISR: {result_dict['isr']:.3f}, Decision: {result_dict['decision']}" - ) - print("-" * 80) - - # Summary statistics - print("\n" + "=" * 80) - print("SUMMARY") - print("=" * 80) - - if args.use_uncertainty: - isr_values = [r['isr'] for r in results if r['isr'] is not None] - answers = [r for r in results if r['decision'] == 'ANSWER'] - refusals = [r for r in results if r['decision'] == 'REFUSE'] - - print(f"Total questions: {len(results)}") - print( - f"Answered: {len(answers)} ({len(answers)/len(results)*100:.1f}%)") - print( - f"Refused: {len(refusals)} ({len(refusals)/len(results)*100:.1f}%)" - ) - print(f"Average ISR: {sum(isr_values)/len(isr_values):.3f}") - print(f"Min ISR: {min(isr_values):.3f}") - print(f"Max ISR: {max(isr_values):.3f}") - else: - print(f"Total questions processed: {len(results)}") - - # Save results if requested - if args.output_file: - with open(args.output_file, 'w') as f: - json.dump(results, f, indent=2) - print(f"\nResults saved to {args.output_file}") - - return results - - -def main(): - parser = argparse.ArgumentParser( - description="Test LLM with uncertainty estimation on TechQA dataset") - - # Dataset args - parser.add_argument('--dataset_dir', type=str, default='techqa', - help='Directory to store/load TechQA dataset') - parser.add_argument('--num_questions', type=int, default=10, - help='Number of questions to test') - - # Model args - parser.add_argument('--model_name', type=str, - default='mistralai/Mistral-7B-Instruct-v0.3', - help='HuggingFace model name') - parser.add_argument('--num_gpus', type=int, default=1, - help='Number of GPUs to use') - parser.add_argument( - '--sys_prompt', type=str, - default='Answer questions based on the provided context.', - help='System prompt for the LLM') - parser.add_argument('--max_tokens', type=int, default=50, - help='Maximum tokens to generate') - - # Uncertainty args - parser.add_argument('--use_uncertainty', action='store_true', - help='Enable uncertainty estimation') - parser.add_argument('--h_star', type=float, default=0.05, - help='Entropy threshold for uncertainty') - parser.add_argument('--isr_threshold', type=float, default=1.0, - help='ISR threshold for abstention') - parser.add_argument('--m', type=int, default=6, - help='Number of semantic sets') - parser.add_argument('--n_samples', type=int, default=3, - help='Number of samples per semantic set') - parser.add_argument('--b_clip', type=float, default=12.0, - help='Clipping value for importance sampling') - parser.add_argument('--skeleton_policy', type=str, default='auto', - choices=['auto', 'evidence_erase', - 'none'], help='Skeleton generation policy') - parser.add_argument('--temperature', type=float, default=0.5, - help='Sampling temperature') - parser.add_argument('--backend', type=str, default='hf', - choices=['hf', 'ollama', 'anthropic'], - help='Backend for decision model') - parser.add_argument('--abstain_on_low_isr', action='store_true', - help='Abstain when ISR is below threshold') - - # Context args - parser.add_argument( - '--use_mock_context', action='store_true', - help='Use partial answer as mock context (for testing)') - parser.add_argument('--context_length', type=int, default=100, - help='Length of mock context to use') - - # Output args - parser.add_argument('--output_file', type=str, default=None, - help='JSON file to save results') - parser.add_argument('--verbose', action='store_true', - help='Print detailed 
results for each question') - - args = parser.parse_args() - - # Run test - test_uncertainty_rag(args) - - print("\nTest complete!") - - -if __name__ == '__main__': - main() From b9c61abf39d3752a91f93518424fe208a8bcc48e Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 3 Oct 2025 02:15:56 +0000 Subject: [PATCH 4/5] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- CHANGELOG.md | 2 ++ test/llm/models/test_llm.py | 1 - 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8f742bf26408..d9f780d15522 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,10 +6,12 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). ## [Unreleased] ### Added + - Added local LLM support for TXT2KG knowledge graph extraction ([#10479](https://github.com/pyg-team/pytorch_geometric/pull/10479)) - Added uncertainty estimation test for LLM class ([#10479](https://github.com/pyg-team/pytorch_geometric/pull/10479)) ### Fixed + - Fixed LLM import path in TXT2KG from `torch_geometric.nn` to `torch_geometric.llm.models` ([#10479](https://github.com/pyg-team/pytorch_geometric/pull/10479)) ## [2.7.0] - 2025-MM-DD diff --git a/test/llm/models/test_llm.py b/test/llm/models/test_llm.py index fa13f0eaf526..f2a3ec49fd09 100644 --- a/test/llm/models/test_llm.py +++ b/test/llm/models/test_llm.py @@ -37,7 +37,6 @@ def test_llm() -> None: def test_llm_uncertainty() -> None: """Test LLM with uncertainty estimation.""" question = ['What is the capital of France?'] - answer = ['Paris'] # Test with uncertainty enabled model = LLM( From acad83c4fa93175d6c4d46056a066462e2a5d325 Mon Sep 17 00:00:00 2001 From: leochlon Date: Fri, 3 Oct 2025 03:52:16 +0100 Subject: [PATCH 5/5] Fix line lengths --- examples/llm/txt2kg_rag.py | 443 +++++++++++++++++++----------- test/llm/models/test_llm.py | 2 +- torch_geometric/llm/models/llm.py | 127 +++++---- 3 files changed, 365 insertions(+), 207 deletions(-) diff --git a/examples/llm/txt2kg_rag.py b/examples/llm/txt2kg_rag.py index fe4c2385cffa..ede39a46c6ba 100644 --- a/examples/llm/txt2kg_rag.py +++ b/examples/llm/txt2kg_rag.py @@ -14,6 +14,7 @@ try: import wandb + wandb_available = True except ImportError: wandb_available = False @@ -39,8 +40,10 @@ LLMJudge, SentenceTransformer, ) -# EDFL planner + universal backends (vendor these, or pip-install if you packaged them): -# from hallucination_toolkit import OpenAIPlanner, OpenAIItem # imported internally by LLM now +# EDFL planner + universal backends (vendor these, +# or pip-install if you packaged them): +# from hallucination_toolkit import OpenAIPlanner, OpenAIItem # imported +# internally by LLM now from torch_geometric.llm.models.txt2kg import _chunk_text from torch_geometric.llm.utils.backend_utils import ( create_graph_from_triples, @@ -72,97 +75,162 @@ def parse_args(): parser = argparse.ArgumentParser() - parser.add_argument('--gnn_model', type=str, default="GAT", - choices=["GAT", "SGFormer"], - help="The GNN model to use. Default is GAT.") - parser.add_argument('--NV_NIM_MODEL', type=str, - default=NV_NIM_MODEL_DEFAULT, - help="The NIM LLM to use for TXT2KG for LLMJudge") - parser.add_argument('--NV_NIM_KEY', type=str, help="NVIDIA API key") parser.add_argument( - '--ENDPOINT_URL', type=str, default=DEFAULT_ENDPOINT_URL, + "--gnn_model", + type=str, + default="GAT", + choices=["GAT", "SGFormer"], + help="The GNN model to use. 
Default is GAT.", + ) + parser.add_argument( + "--NV_NIM_MODEL", + type=str, + default=NV_NIM_MODEL_DEFAULT, + help="The NIM LLM to use for TXT2KG for LLMJudge", + ) + parser.add_argument("--NV_NIM_KEY", type=str, help="NVIDIA API key") + parser.add_argument( + "--ENDPOINT_URL", + type=str, + default=DEFAULT_ENDPOINT_URL, help="The URL hosting your model, \ - in case you are not using the public NIM.") - parser.add_argument('--use_local_txt2kg', action="store_true", - help="Use local LLM for TXT2KG instead of NVIDIA NIM") - parser.add_argument('--txt2kg_model', type=str, - default="mistralai/Mistral-7B-Instruct-v0.3", - help="Local model for TXT2KG") + in case you are not using the public NIM.", + ) + parser.add_argument( + "--use_local_txt2kg", + action="store_true", + help="Use local LLM for TXT2KG instead of NVIDIA NIM", + ) + parser.add_argument( + "--txt2kg_model", + type=str, + default="mistralai/Mistral-7B-Instruct-v0.3", + help="Local model for TXT2KG", + ) parser.add_argument( - '--kg_chunk_size', type=int, default=KG_CHUNK_SIZE_DEFAULT, + "--kg_chunk_size", + type=int, + default=KG_CHUNK_SIZE_DEFAULT, help="When splitting context documents for txt2kg,\ - the maximum number of characters per chunk.") - parser.add_argument('--gnn_hidden_channels', type=int, - default=GNN_HID_CHANNELS_DEFAULT, - help="Hidden channels for GNN") - parser.add_argument('--num_gnn_layers', type=int, - default=GNN_LAYERS_DEFAULT, - help="Number of GNN layers") - parser.add_argument('--lr', type=float, default=LR_DEFAULT, + the maximum number of characters per chunk.", + ) + parser.add_argument( + "--gnn_hidden_channels", + type=int, + default=GNN_HID_CHANNELS_DEFAULT, + help="Hidden channels for GNN", + ) + parser.add_argument( + "--num_gnn_layers", + type=int, + default=GNN_LAYERS_DEFAULT, + help="Number of GNN layers", + ) + parser.add_argument("--lr", type=float, default=LR_DEFAULT, help="Learning rate") - parser.add_argument('--epochs', type=int, default=EPOCHS_DEFAULT, + parser.add_argument("--epochs", type=int, default=EPOCHS_DEFAULT, help="Number of epochs") - parser.add_argument('--batch_size', type=int, default=BATCH_SIZE_DEFAULT, + parser.add_argument("--batch_size", type=int, default=BATCH_SIZE_DEFAULT, help="Batch size") - parser.add_argument('--eval_batch_size', type=int, - default=EVAL_BATCH_SIZE_DEFAULT, - help="Evaluation batch size") - parser.add_argument('--llm_generator_name', type=str, - default=LLM_GENERATOR_NAME_DEFAULT, - help="The LLM to use for Generation") parser.add_argument( - '--llm_generator_mode', type=str, default=LLM_GEN_MODE_DEFAULT, - choices=["frozen", "lora", - "full"], help="Whether to freeze the Generator LLM,\ - use LORA, or fully finetune") - parser.add_argument('--dont_save_model', action="store_true", - help="Whether to skip model saving.") - parser.add_argument('--log_steps', type=int, default=30, + "--eval_batch_size", + type=int, + default=EVAL_BATCH_SIZE_DEFAULT, + help="Evaluation batch size", + ) + parser.add_argument( + "--llm_generator_name", + type=str, + default=LLM_GENERATOR_NAME_DEFAULT, + help="The LLM to use for Generation", + ) + parser.add_argument( + "--llm_generator_mode", + type=str, + default=LLM_GEN_MODE_DEFAULT, + choices=["frozen", "lora", "full"], + help="Whether to freeze the Generator LLM,\ + use LORA, or fully finetune", + ) + parser.add_argument( + "--dont_save_model", + action="store_true", + help="Whether to skip model saving.", + ) + parser.add_argument("--log_steps", type=int, default=30, help="Log to wandb every N steps") - 
parser.add_argument('--wandb_project', type=str, default="techqa", - help="Weights & Biases project name") - parser.add_argument('--wandb', action="store_true", + parser.add_argument( + "--wandb_project", + type=str, + default="techqa", + help="Weights & Biases project name", + ) + parser.add_argument("--wandb", action="store_true", help="Enable wandb logging") parser.add_argument( - '--num_gpus', type=int, default=None, + "--num_gpus", + type=int, + default=None, help="Number of GPUs to use. If not specified," - "will determine automatically based on model size.") - parser.add_argument('--regenerate_dataset', action="store_true", - help="Regenerate the dataset") + "will determine automatically based on model size.", + ) + parser.add_argument( + "--regenerate_dataset", + action="store_true", + help="Regenerate the dataset", + ) parser.add_argument( - '--doc_parsing_mode', type=str, default=None, - choices=["paragraph", - "file"], help="How to parse documents: 'paragraph' splits " + "--doc_parsing_mode", + type=str, + default=None, + choices=["paragraph", "file"], + help="How to parse documents: 'paragraph' splits " "files by paragraphs, 'file' treats each file as" "one document. " - "This will override any value set in the config file.") + "This will override any value set in the config file.", + ) parser.add_argument( - '--k_for_docs', type=int, default=None, + "--k_for_docs", + type=int, + default=None, help="Number of docs to retrieve for each question. " - "This will override any value set in the config file.") + "This will override any value set in the config file.", + ) parser.add_argument( - '--doc_chunk_size', type=int, default=None, + "--doc_chunk_size", + type=int, + default=None, help="The chunk size to use VectorRAG (document retrieval). " - "This will override any value set in the config file.") + "This will override any value set in the config file.", + ) parser.add_argument( - '--dataset', type=str, default="techqa", help="Dataset folder name, " + "--dataset", + type=str, + default="techqa", + help="Dataset folder name, " "should contain corpus and train.json files." "extracted triples, processed dataset, " "document retriever, and model checkpoints " - "will be saved in the dataset folder") + "will be saved in the dataset folder", + ) parser.add_argument( - '--skip_graph_rag', action="store_true", + "--skip_graph_rag", + action="store_true", help="Skip the graph RAG step. 
" - "Used to compare the performance of Vector+Graph RAG vs Vector RAG.") + "Used to compare the performance of Vector+Graph RAG vs Vector RAG.", + ) parser.add_argument( - '--use_x_percent_corpus', default=100.0, type=float, + "--use_x_percent_corpus", + default=100.0, + type=float, help="Debug flag that allows user to only use a random percentage " - "of available knowledge base corpus for RAG") + "of available knowledge base corpus for RAG", + ) args = parser.parse_args() assert args.NV_NIM_KEY, "NVIDIA API key is required for TXT2KG and eval" - assert args.use_x_percent_corpus <= 100 and \ - args.use_x_percent_corpus > 0, "Please provide a value in (0,100]" + assert (args.use_x_percent_corpus <= 100 and args.use_x_percent_corpus + > 0), "Please provide a value in (0,100]" if args.skip_graph_rag: print("Skipping graph RAG step, setting GNN layers to 0...") args.num_gnn_layers = 0 @@ -176,7 +244,9 @@ def parse_args(): if config is not None: # Use a loop to check and apply config values for each parameter config_params = [ - 'doc_parsing_mode', 'doc_chunk_size', 'k_for_docs' + "doc_parsing_mode", + "doc_chunk_size", + "k_for_docs", ] for param in config_params: if param in config and getattr(args, param) is None: @@ -221,7 +291,7 @@ def _process_and_chunk_text(text, chunk_size, doc_parsing_mode): if multiple paragraphs are detected. """ if doc_parsing_mode == "paragraph": - paragraphs = re.split(r'\n{2,}', text) + paragraphs = re.split(r"\n{2,}", text) else: # doc_parsing_mode == 'file' or doc_parsing_mode is None paragraphs = [text] @@ -265,9 +335,11 @@ def get_data(args): if not os.path.exists(args.dataset): os.mkdir(args.dataset) import zipfile - with zipfile.ZipFile(zip_path, 'r') as zip_ref: + + with zipfile.ZipFile(zip_path, "r") as zip_ref: zip_ref.extractall(args.dataset) import shutil + shutil.copy(json_path, os.path.join(args.dataset, "train.json")) elif user_input.lower() == "n" or user_input.lower() == "no": sys.exit("No selected, no data to work with... 
exiting.") @@ -292,9 +364,11 @@ def get_data(args): raise ValueError(f"Bad extraction for {file_path}, expecting " f"text only but got {doc_type}") text_contexts.extend( - _process_and_chunk_text(data[0]["metadata"]["content"], - args.doc_chunk_size, - args.doc_parsing_mode)) + _process_and_chunk_text( + data[0]["metadata"]["content"], + args.doc_chunk_size, + args.doc_parsing_mode, + )) else: for file_path in glob(os.path.join(args.dataset, "corpus", "*")): with open(file_path, "r+") as f: @@ -327,12 +401,13 @@ def index_kg(args, context_docs): print( "Note that if the TXT2KG process is too slow for you're liking using " "the public NIM, consider deploying yourself using local_lm flag of " - "TXT2KG or using https://build.nvidia.com/nvidia/llama-3_1-nemotron-70b-instruct " # noqa + "TXT2KG or using " + "https://build.nvidia.com/nvidia/llama-3_1-nemotron-70b-instruct " "to deploy to a private endpoint, which you can pass to this script " "w/ --ENDPOINT_URL flag.") - print( - "Guide for deploying NIM: https://developer.nvidia.com/blog/a-simple-guide-to-deploying-generative-ai-with-nvidia-nim/" # noqa - ) + print("Guide for deploying NIM: " + "https://developer.nvidia.com/blog/" + "a-simple-guide-to-deploying-generative-ai-with-nvidia-nim/") total_tqdm_count = len(context_docs) initial_tqdm_count = 0 checkpoint_file = list(Path(args.dataset).glob("*--*--checkpoint_kg.pt")) @@ -342,18 +417,16 @@ def index_kg(args, context_docs): if len(checkpoint_file) == 1: print("Restoring KG from checkpoint") checkpoint_file = checkpoint_file[0] - checkpoint_model_name = checkpoint_file.name.split('--')[0] + checkpoint_model_name = checkpoint_file.name.split("--")[0] # check if triples generation are using the correct model if args.use_local_txt2kg: if checkpoint_model_name != "local": raise RuntimeError( - "Error: stored triples were generated using a different model" - ) + "Error: stored triples generated using a different model") else: - if args.NV_NIM_MODEL.split('/')[-1] != checkpoint_model_name: + if args.NV_NIM_MODEL.split("/")[-1] != checkpoint_model_name: raise RuntimeError( - "Error: stored triples were generated using a different model" - ) + "Error: stored triples generated using a different model") saved_relevant_triples = torch.load(checkpoint_file, weights_only=False) kg_maker.relevant_triples = saved_relevant_triples @@ -363,16 +436,19 @@ def index_kg(args, context_docs): chkpt_interval = 10 chkpt_count = 0 - for context_doc in tqdm(context_docs, total=total_tqdm_count, - initial=initial_tqdm_count, - desc="Extracting KG triples"): + for context_doc in tqdm( + context_docs, + total=total_tqdm_count, + initial=initial_tqdm_count, + desc="Extracting KG triples", + ): kg_maker.add_doc_2_KG(txt=context_doc) chkpt_count += 1 if chkpt_count == chkpt_interval: chkpt_count = 0 path = args.dataset + "/{m}--{t}--checkpoint_kg.pt" - model = kg_maker.NIM_MODEL.split( - '/')[-1] if not kg_maker.local_LM else "local" + model = (kg_maker.NIM_MODEL.split("/")[-1] + if not kg_maker.local_LM else "local") path = path.format(m=model, t=datetime.now().strftime("%Y%m%d_%H%M%S")) torch.save(kg_maker.relevant_triples, path) @@ -384,12 +460,13 @@ def index_kg(args, context_docs): triples = list(dict.fromkeys(triples)) raw_triples_path = args.dataset + "/{m}--{t}--raw_triples.pt" - model_name = kg_maker.NIM_MODEL.split( - '/')[-1] if not kg_maker.local_LM else "local" + model_name = (kg_maker.NIM_MODEL.split("/")[-1] + if not kg_maker.local_LM else "local") torch.save( triples, raw_triples_path.format(m=model_name, 
- t=datetime.now().strftime("%Y%m%d_%H%M%S"))) + t=datetime.now().strftime("%Y%m%d_%H%M%S")), + ) for old_checkpoint_file in Path( args.dataset).glob("*--*--checkpoint_kg.pt"): @@ -402,8 +479,8 @@ def update_data_lists(args, data_lists): device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # creating the embedding model sent_trans_batch_size = 256 - model = SentenceTransformer( - model_name=ENCODER_MODEL_NAME_DEFAULT).to(device).eval() + model = (SentenceTransformer( + model_name=ENCODER_MODEL_NAME_DEFAULT).to(device).eval()) model_kwargs = { "output_device": device, "batch_size": int(sent_trans_batch_size / 4), @@ -458,19 +535,17 @@ def make_dataset(args): raise RuntimeError("Error: multiple raw_triples files found") if len(raw_triples_file) == 1: raw_triples_file = raw_triples_file[0] - stored_model_name = raw_triples_file.name.split('--')[0] + stored_model_name = raw_triples_file.name.split("--")[0] # Check if stored triples match current model configuration if args.use_local_txt2kg: if stored_model_name != "local": raise RuntimeError( - "Error: stored triples were generated using a different model" - ) + "Error: stored triples generated using a different model") else: - if args.NV_NIM_MODEL.split('/')[-1] != stored_model_name: + if args.NV_NIM_MODEL.split("/")[-1] != stored_model_name: raise RuntimeError( - "Error: stored triples were generated using a different model" - ) + "Error: stored triples generated using a different model") print(f" -> Saved triples generated with: {stored_model_name}") triples = torch.load(raw_triples_file) @@ -488,18 +563,23 @@ def make_dataset(args): print("Creating the graph data from raw triples...") # create the graph data from raw triples graph_data = create_graph_from_triples( - triples=triples, embedding_model=model.encode, + triples=triples, + embedding_model=model.encode, embedding_method_kwargs={ "batch_size": min(len(triples), sent_trans_batch_size), - "verbose": True - }, pre_transform=preprocess_triplet) + "verbose": True, + }, + pre_transform=preprocess_triplet, + ) print("Creating the graph and feature stores...") # creating the graph and feature stores fs, gs = create_remote_backend_from_graph_data( - graph_data=graph_data, path="backend", + graph_data=graph_data, + path="backend", graph_db=NeighborSamplingRAGGraphStore, - feature_db=KNNRAGFeatureStore).load() + feature_db=KNNRAGFeatureStore, + ).load() """ NOTE: these retriever hyperparams are very important. Tuning may be needed for custom data... 
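For readers tracing the checkpoint handling in the hunks above: index_kg() and make_dataset() both tag saved triples with the model that produced them, writing "<model>--<timestamp>--checkpoint_kg.pt" and "<model>--<timestamp>--raw_triples.pt", where the tag is the literal string "local" whenever --use_local_txt2kg is set and otherwise the basename of --NV_NIM_MODEL. A minimal sketch of that convention follows; the helper names are illustrative and not part of the patch.

from datetime import datetime
from pathlib import Path


def model_tag(use_local_txt2kg: bool, nv_nim_model: str) -> str:
    # Local TXT2KG runs are tagged "local"; NIM runs use the model basename.
    return "local" if use_local_txt2kg else nv_nim_model.split("/")[-1]


def checkpoint_path(dataset_dir: str, tag: str) -> str:
    # Same "{model}--{timestamp}--checkpoint_kg.pt" pattern as index_kg().
    stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    return f"{dataset_dir}/{tag}--{stamp}--checkpoint_kg.pt"


def check_saved_triples(dataset_dir: str, tag: str) -> None:
    # Mirrors the guard in make_dataset(): refuse to reuse triples that were
    # produced by a different model than the one currently configured.
    for f in Path(dataset_dir).glob("*--*--raw_triples.pt"):
        if f.name.split("--")[0] != tag:
            raise RuntimeError(
                "Error: stored triples generated using a different model")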
@@ -508,7 +588,7 @@ def make_dataset(args): model_kwargs = { "output_device": device, "batch_size": int(sent_trans_batch_size / 4), - "verbose": True + "verbose": True, } doc_retriever_path = os.path.join(args.dataset, "document_retriever.pt") @@ -521,10 +601,12 @@ def make_dataset(args): vector_retriever.k_for_docs = args.k_for_docs else: print("Creating document retriever...") - vector_retriever = DocumentRetriever(context_docs, - k_for_docs=args.k_for_docs, - model=model.encode, - model_kwargs=model_kwargs) + vector_retriever = DocumentRetriever( + context_docs, + k_for_docs=args.k_for_docs, + model=model.encode, + model_kwargs=model_kwargs, + ) vector_retriever.save(doc_retriever_path) subgraph_filter = make_pcst_filter( @@ -532,8 +614,9 @@ def make_dataset(args): model, topk=5, # nodes topk_e=5, # edges - cost_e=.5, # edge cost - num_clusters=10) # num clusters + cost_e=0.5, # edge cost + num_clusters=10, + ) # num clusters # number of neighbors for each seed node selected by KNN fanout = 100 @@ -552,10 +635,12 @@ def make_dataset(args): print("Now to retrieve context for each query from " "our Vector and Graph DBs...") - query_loader = RAGQueryLoader(graph_data=(fs, gs), - subgraph_filter=subgraph_filter, - vector_retriever=vector_retriever, - config=query_loader_config) + query_loader = RAGQueryLoader( + graph_data=(fs, gs), + subgraph_filter=subgraph_filter, + vector_retriever=vector_retriever, + config=query_loader_config, + ) # pre-process the dataset total_data_list = [] @@ -578,15 +663,17 @@ def make_dataset(args): # stats print("Min # of Retrieved Triples =", min(extracted_triple_sizes)) print("Max # of Retrieved Triples =", max(extracted_triple_sizes)) - print("Average # of Retrieved Triples =", - sum(extracted_triple_sizes) / len(extracted_triple_sizes)) + print( + "Average # of Retrieved Triples =", + sum(extracted_triple_sizes) / len(extracted_triple_sizes), + ) # 60:20:20 split - data_lists["train"] = total_data_list[:int(.6 * len(total_data_list))] - data_lists["validation"] = total_data_list[int(.6 * len(total_data_list) - ):int(.8 * + data_lists["train"] = total_data_list[:int(0.6 * len(total_data_list))] + data_lists["validation"] = total_data_list[int(0.6 * len(total_data_list) + ):int(0.8 * len(total_data_list))] - data_lists["test"] = total_data_list[int(.8 * len(total_data_list)):] + data_lists["test"] = total_data_list[int(0.8 * len(total_data_list)):] dataset_name = os.path.basename(args.dataset) dataset_path = os.path.join(args.dataset, f"{dataset_name}.pt") @@ -599,9 +686,11 @@ def make_dataset(args): def train(args, train_loader, val_loader): if args.wandb: - wandb.init(project=args.wandb_project, - name=f"run_{datetime.now().strftime('%Y-%m-%d_%H:%M')}", - config=vars(args)) + wandb.init( + project=args.wandb_project, + name=f"run_{datetime.now().strftime('%Y-%m-%d_%H:%M')}", + config=vars(args), + ) hidden_channels = args.gnn_hidden_channels num_gnn_layers = args.num_gnn_layers @@ -623,20 +712,32 @@ def _uncertainty_kwargs(): "mask_refusals_in_loss": True, }, decision_backend_kwargs=dict( - # HF backend (default): reuse the same model id; using a separate pipeline under the hood. - # If using Ollama or Anthropic instead, pass those credentials/args here. + # HF backend (default): reuse the same model id; using a + # separate pipeline under the hood. + # If using Ollama or Anthropic instead, pass those + # credentials/args here. 
), ) if args.num_gnn_layers > 0: if args.gnn_model == "GAT": - gnn = GAT(in_channels=768, hidden_channels=hidden_channels, - out_channels=1024, num_layers=num_gnn_layers, heads=4) + gnn = GAT( + in_channels=768, + hidden_channels=hidden_channels, + out_channels=1024, + num_layers=num_gnn_layers, + heads=4, + ) elif args.gnn_model == "SGFormer": - gnn = SGFormer(in_channels=768, hidden_channels=hidden_channels, - out_channels=1024, trans_num_heads=1, - trans_dropout=0.5, gnn_num_layers=num_gnn_layers, - gnn_dropout=0.5) + gnn = SGFormer( + in_channels=768, + hidden_channels=hidden_channels, + out_channels=1024, + trans_num_heads=1, + trans_dropout=0.5, + gnn_num_layers=num_gnn_layers, + gnn_dropout=0.5, + ) else: raise ValueError(f"Invalid GNN model: {args.gnn_model}") else: @@ -685,17 +786,20 @@ def _uncertainty_kwargs(): else: params = [p for _, p in model.named_parameters() if p.requires_grad] lr = args.lr - optimizer = torch.optim.AdamW([{ - 'params': params, - 'lr': lr, - 'weight_decay': 0.05 - }], betas=(0.9, 0.95)) + optimizer = torch.optim.AdamW( + [{ + "params": params, + "lr": lr, + "weight_decay": 0.05 + }], + betas=(0.9, 0.95), + ) num_oom_errors = 0 for epoch in range(args.epochs): model.train() epoch_loss = 0 - epoch_str = f'Epoch: {epoch + 1}|{args.epochs}' + epoch_str = f"Epoch: {epoch + 1}|{args.epochs}" loader = tqdm(train_loader, desc=epoch_str) for step, batch in enumerate(loader): new_qs = [] @@ -704,7 +808,8 @@ def _uncertainty_kwargs(): new_qs.append( prompt_template.format( question=q, - context="\n".join(batch.text_context[i]))) + context="\n".join(batch.text_context[i]), + )) batch.question = new_qs if args.skip_graph_rag: @@ -714,37 +819,44 @@ def _uncertainty_kwargs(): try: loss = get_loss(model, batch) loss.backward() - clip_grad_norm_(optimizer.param_groups[0]['params'], 0.1) + clip_grad_norm_(optimizer.param_groups[0]["params"], 0.1) if (step + 1) % 2 == 0: - adjust_learning_rate(optimizer.param_groups[0], lr, - step / len(train_loader) + epoch, - args.epochs) + adjust_learning_rate( + optimizer.param_groups[0], + lr, + step / len(train_loader) + epoch, + args.epochs, + ) optimizer.step() epoch_loss += float(loss.detach()) if args.wandb and (step + 1) % args.log_steps == 0: wandb.log({ "train/loss": float(loss.detach()), - "train/lr": optimizer.param_groups[0]['lr'], + "train/lr": optimizer.param_groups[0]["lr"], }) if (step + 1) % 2 == 0: - lr = optimizer.param_groups[0]['lr'] + lr = optimizer.param_groups[0]["lr"] except torch.cuda.OutOfMemoryError: torch.cuda.empty_cache() - print("Sequence length of last batch: ", - model.seq_length_stats[-1]) + print( + "Sequence length of last batch: ", + model.seq_length_stats[-1], + ) # TODO: Implement CPU fallback (WIP) num_oom_errors += 1 print("Sequence length stats: ") - print("seq_len avg: ", - sum(model.seq_length_stats) / len(model.seq_length_stats)) + print( + "seq_len avg: ", + sum(model.seq_length_stats) / len(model.seq_length_stats), + ) print("seq_len min: ", min(model.seq_length_stats)) print("seq_len max: ", max(model.seq_length_stats)) print("Percent of OOM errors: ", num_oom_errors / len(train_loader)) train_loss = epoch_loss / len(train_loader) - print(epoch_str + f', Train Loss: {train_loss:4f}') + print(epoch_str + f", Train Loss: {train_loss:4f}") # Eval Step val_loss = 0 @@ -757,7 +869,8 @@ def _uncertainty_kwargs(): new_qs.append( prompt_template.format( question=q, - context="\n".join(batch.text_context[i]))) + context="\n".join(batch.text_context[i]), + )) batch.question = new_qs if 
args.skip_graph_rag: batch.desc = "" @@ -770,7 +883,7 @@ def _uncertainty_kwargs(): wandb.log({ "val/loss": val_loss, "train/epoch_loss": train_loss, - "epoch": epoch + 1 + "epoch": epoch + 1, }) if args.wandb: @@ -804,8 +917,8 @@ def eval(question: str, pred: str, correct_answer: str): test_batch.question = new_qs if args.skip_graph_rag: test_batch.desc = "" - preds = (inference_step(model, test_batch, - max_out_tokens=max_chars_in_train_answer / 2)) + preds = inference_step(model, test_batch, + max_out_tokens=max_chars_in_train_answer / 2) for question, pred, label in zip(raw_qs, preds, test_batch.label): eval_tuples.append((question, pred, label)) for question, pred, label in tqdm(eval_tuples, desc="Eval"): @@ -817,7 +930,7 @@ def eval(question: str, pred: str, correct_answer: str): print("Improvement of this estimation process is WIP...") -if __name__ == '__main__': +if __name__ == "__main__": # for reproducibility seed_everything(50) @@ -848,13 +961,27 @@ def eval(question: str, pred: str, correct_answer: str): data_lists = make_dataset(args) batch_size = args.batch_size eval_batch_size = args.eval_batch_size - train_loader = DataLoader(data_lists["train"], batch_size=batch_size, - drop_last=True, pin_memory=True, shuffle=True) - val_loader = DataLoader(data_lists["validation"], - batch_size=eval_batch_size, drop_last=False, - pin_memory=True, shuffle=False) - test_loader = DataLoader(data_lists["test"], batch_size=eval_batch_size, - drop_last=False, pin_memory=True, shuffle=False) + train_loader = DataLoader( + data_lists["train"], + batch_size=batch_size, + drop_last=True, + pin_memory=True, + shuffle=True, + ) + val_loader = DataLoader( + data_lists["validation"], + batch_size=eval_batch_size, + drop_last=False, + pin_memory=True, + shuffle=False, + ) + test_loader = DataLoader( + data_lists["test"], + batch_size=eval_batch_size, + drop_last=False, + pin_memory=True, + shuffle=False, + ) model = train(args, train_loader, val_loader) test(model, test_loader, args) diff --git a/test/llm/models/test_llm.py b/test/llm/models/test_llm.py index f2a3ec49fd09..07283775e73a 100644 --- a/test/llm/models/test_llm.py +++ b/test/llm/models/test_llm.py @@ -36,7 +36,7 @@ def test_llm() -> None: @withPackage('transformers', 'accelerate') def test_llm_uncertainty() -> None: """Test LLM with uncertainty estimation.""" - question = ['What is the capital of France?'] + question = ['Who won the Nobel Prize in Physics in 2050?'] # Test with uncertainty enabled model = LLM( diff --git a/torch_geometric/llm/models/llm.py b/torch_geometric/llm/models/llm.py index 47a27dce3c2e..8596046073ee 100644 --- a/torch_geometric/llm/models/llm.py +++ b/torch_geometric/llm/models/llm.py @@ -54,12 +54,12 @@ def _parse_decision_json(text: str) -> str: PAD_TOKEN_ID = 0 -PADDING_SIDE = 'left' +PADDING_SIDE = "left" # legacy constants - used for Llama 2 style prompting -BOS = '[INST]' -EOS_USER = '[/INST]' -EOS = '[/s]' +BOS = "[INST]" +EOS_USER = "[/INST]" +EOS = "[/s]" def get_llm_kwargs(required_memory: int, dtype=torch.dtype) -> Dict[str, Any]: @@ -75,15 +75,15 @@ def get_llm_kwargs(required_memory: int, dtype=torch.dtype) -> Dict[str, Any]: if sum(gpu_memory) < required_memory: gpu_memory = [] # If not enough VRAM, use pure CPU. 
- kwargs = dict(revision='main') + kwargs = dict(revision="main") if len(gpu_memory) > 0: - kwargs['max_memory'] = { - i: f'{memory}GiB' + kwargs["max_memory"] = { + i: f"{memory}GiB" for i, memory in enumerate(gpu_memory) } - kwargs['low_cpu_mem_usage'] = True - kwargs['device_map'] = 'auto' - kwargs['torch_dtype'] = dtype + kwargs["low_cpu_mem_usage"] = True + kwargs["device_map"] = "auto" + kwargs["torch_dtype"] = dtype return kwargs @@ -124,9 +124,11 @@ def __init__( self.model_name = model_name from transformers import AutoModelForCausalLM, AutoTokenizer + if n_gpus is None: if num_params is None: from huggingface_hub import get_safetensors_metadata + safetensors_metadata = get_safetensors_metadata(model_name) param_count = safetensors_metadata.parameter_count num_params = float(list(param_count.values())[0] // 10**9) @@ -139,14 +141,14 @@ def __init__( gpu_memory: List[int] = [] for i in range(n_gpus): gpu_memory.append(torch.cuda.mem_get_info(i)[0] // 1024**3) - kwargs = dict(revision='main') - kwargs['max_memory'] = { - i: f'{memory}GiB' + kwargs = dict(revision="main") + kwargs["max_memory"] = { + i: f"{memory}GiB" for i, memory in enumerate(gpu_memory) } - kwargs['low_cpu_mem_usage'] = True - kwargs['device_map'] = 'auto' - kwargs['torch_dtype'] = dtype + kwargs["low_cpu_mem_usage"] = True + kwargs["device_map"] = "auto" + kwargs["torch_dtype"] = dtype print(f"Setting up '{model_name}' with configuration: {kwargs}") self.tokenizer = AutoTokenizer.from_pretrained( @@ -179,22 +181,24 @@ def __init__( self.sys_prompt = sys_prompt else: self.sys_prompt = "" - if 'max_memory' not in kwargs: # Pure CPU: + if "max_memory" not in kwargs: # Pure CPU: warnings.warn( "LLM is being used on CPU, which may be slow. This decision " "was made by a rough hueristic that assumes your GPU set up " "does not have enough GPU RAM. This is done to avoid GPU OOM " "errors. If you think this is a mistake, please initialize " "your LLM with the n_gpus param to dictate how many gpus to " - "use for the LLM.", stacklevel=2) - self.device = torch.device('cpu') + "use for the LLM.", + stacklevel=2, + ) + self.device = torch.device("cpu") self.autocast_context = nullcontext() else: self.device = self.llm.device if dtype == torch.float32: self.autocast_context = nullcontext() else: - self.autocast_context = torch.amp.autocast('cuda', dtype=dtype) + self.autocast_context = torch.amp.autocast("cuda", dtype=dtype) # ---- Uncertainty config / backends (opt-in) ---------------------- self.uncertainty_estim = bool(uncertainty_estim) @@ -232,16 +236,18 @@ def __init__( HuggingFaceBackend, OllamaBackend, ) + self._HFBackend = HuggingFaceBackend self._OllamaBackend = OllamaBackend self._AnthropicBackend = AnthropicBackend except Exception: - self._HFBackend = self._OllamaBackend = self._AnthropicBackend = None + self._HFBackend = self._OllamaBackend = ( + self._AnthropicBackend) = None except Exception: self.uncertainty_estim = False warnings.warn( - "Uncertainty requested but `hallucination_toolkit` not importable. " - "Proceeding without uncertainty.", + "Uncertainty requested but `hallucination_toolkit` not " + "importable. 
Proceeding without uncertainty.", stacklevel=2, ) @@ -258,8 +264,8 @@ def _get_decision_backend(self): if not self.uncertainty_estim: return None backend_kind = str(self._uncfg.get("backend", "hf")).lower() - if backend_kind == "hf" and getattr(self, "_HFBackend", - None) is not None: + if (backend_kind == "hf" + and getattr(self, "_HFBackend", None) is not None): self._backend = self._HFBackend( mode="transformers", model_id=self.model_name, @@ -267,11 +273,11 @@ def _get_decision_backend(self): trust_remote_code=True, **self._dec_backend_kwargs, ) - elif backend_kind == "ollama" and getattr(self, "_OllamaBackend", - None) is not None: + elif (backend_kind == "ollama" + and getattr(self, "_OllamaBackend", None) is not None): self._backend = self._OllamaBackend(**self._dec_backend_kwargs) - elif backend_kind == "anthropic" and getattr(self, "_AnthropicBackend", - None) is not None: + elif (backend_kind == "anthropic" + and getattr(self, "_AnthropicBackend", None) is not None): self._backend = self._AnthropicBackend(**self._dec_backend_kwargs) else: self._backend = None @@ -340,17 +346,23 @@ def _encode_inputs( context = self.tokenizer(context, add_special_tokens=False) eos_user_tokens = self.tokenizer(EOS_USER, add_special_tokens=False) - bos_token = self.tokenizer( + bos_token = (self.tokenizer( BOS, add_special_tokens=False, - return_tensors='pt', - ).input_ids[0].to(self.device) + return_tensors="pt", + ).input_ids[0].to(self.device)) bos_embeds = self.word_embedding(bos_token) pad_token = torch.tensor(self.tokenizer.pad_token_id, device=self.device) pad_embeds = self.word_embedding(pad_token).unsqueeze(0) - return (batch_size, questions, context, eos_user_tokens, bos_embeds, - pad_embeds) + return ( + batch_size, + questions, + context, + eos_user_tokens, + bos_embeds, + pad_embeds, + ) def _label_input_ids( self, @@ -445,8 +457,14 @@ def _get_embeds_old( embedding: Optional[List[Tensor]] = None, answer: Optional[List[str]] = None, ) -> tuple: - (batch_size, question, context, eos_user_tokens, bos_embeds, - pad_embeds) = self._encode_inputs(question, context) + ( + batch_size, + question, + context, + eos_user_tokens, + bos_embeds, + pad_embeds, + ) = self._encode_inputs(question, context) batch_label_input_ids = None if answer is not None: @@ -480,8 +498,11 @@ def _get_embeds_old( ) inputs_embeds, attention_mask, label_input_ids = self._pad_embeds( - pad_embeds, batch_inputs_embeds, batch_attention_mask, - batch_label_input_ids) + pad_embeds, + batch_inputs_embeds, + batch_attention_mask, + batch_label_input_ids, + ) return inputs_embeds, attention_mask, label_input_ids @@ -497,7 +518,9 @@ def _get_embeds( f"HuggingFace model {self.model_name} is not using a " "chat template, using Llama 2 style prompting. 
Please " "consider using a more recent model and initialize the " - "LLM with `sys_prompt`.", stacklevel=2) + "LLM with `sys_prompt`.", + stacklevel=2, + ) return self._get_embeds_old(question, context, embedding, answer) batch_label_input_ids = None if answer is not None: @@ -535,11 +558,11 @@ def _get_embeds( else: label_input_ids = None - bos_token = self.tokenizer( + bos_token = (self.tokenizer( self.tokenizer.bos_token, add_special_tokens=False, - return_tensors='pt', - ).input_ids[0].to(self.device) + return_tensors="pt", + ).input_ids[0].to(self.device)) bos_embeds = self.word_embedding(bos_token) @@ -569,8 +592,11 @@ def _get_embeds( pad_embeds = self.word_embedding(pad_token).unsqueeze(0) inputs_embeds, attention_mask, label_input_ids = self._pad_embeds( - pad_embeds, batch_inputs_embeds, batch_attention_mask, - batch_label_input_ids) + pad_embeds, + batch_inputs_embeds, + batch_attention_mask, + batch_label_input_ids, + ) return inputs_embeds, attention_mask, label_input_ids @@ -600,8 +626,8 @@ def forward( if self.uncertainty_estim and label_input_ids is not None: unc = self._compute_uncertainty(question, context) self._last_uncertainty = unc - if (self._uncfg.get("mask_refusals_in_loss", True) - and len(unc) == len(question)): + if self._uncfg.get("mask_refusals_in_loss", + True) and len(unc) == len(question): for i, u in enumerate(unc): if not u.decision_answer: label_input_ids[i, :] = IGNORE_INDEX @@ -630,7 +656,6 @@ def inference( Args: question (list[str]): The questions/prompts. - answer (list[str]): The answers/labels. context (list[str], optional): Additional context to give to the LLM, such as textified knowledge graphs. (default: :obj:`None`) embedding (list[torch.Tensor], optional): RAG embedding @@ -639,6 +664,12 @@ def inference( both. (default: :obj:`None`) max_tokens (int, optional): How many tokens for the LLM to generate. (default: :obj:`32`) + return_uncertainty (bool, optional): Whether to also return the + uncertainty estimates computed for each item. (default: + :obj:`False`) + abstain_on_low_ISR (bool, optional): Whether to replace outputs + with ``"[ABSTAIN]"`` when the ISR-driven decision flags the + answer as a refusal. (default: :obj:`False`) """ inputs_embeds, attention_mask, _ = self._get_embeds( question, context, embedding) @@ -669,4 +700,4 @@ def inference( return texts def __repr__(self) -> str: - return f'{self.__class__.__name__}({self.model_name})' + return f"{self.__class__.__name__}({self.model_name})"
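The uncertainty path added to llm.py is opt-in. A minimal usage sketch, assuming the vendored hallucination_toolkit backends are importable and mirroring the kwargs that txt2kg_rag.py builds in _uncertainty_kwargs(); the model id is only an example and any HF chat model can stand in.

from torch_geometric.llm.models import LLM

llm = LLM(
    model_name="meta-llama/Llama-3.1-8B-Instruct",  # example HF chat model
    sys_prompt="Answer strictly from the provided context.",
    uncertainty_estim=True,
    uncertainty_cfg={
        "h_star": 0.05,              # target hallucination rate
        "isr_threshold": 1.0,        # decisions below this ISR are refusals
        "m": 6,
        "n_samples": 3,
        "B_clip": 12.0,
        "clip_mode": "one-sided",
        "skeleton_policy": "auto",
        "q_floor": None,
        "temperature": 0.5,
        "max_tokens_decision": 8,
        "backend": "hf",             # or "ollama" / "anthropic"
        "mask_refusals_in_loss": True,
    },
    # decision_backend_kwargs=dict(...)  # credentials/args for non-HF backends
)

out = llm.inference(
    question=["Who won the Nobel Prize in Physics in 2050?"],
    max_tokens=32,
    return_uncertainty=True,   # also surface the per-item estimates
    abstain_on_low_ISR=True,   # low-ISR generations come back as "[ABSTAIN]"
)
# With return_uncertainty=False the call returns a plain list of strings;
# with it enabled, expect the texts together with the uncertainty records.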
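Continuing the sketch above on the training side: when labels are supplied, forward() caches the per-item decisions on _last_uncertainty and, with mask_refusals_in_loss enabled, overwrites the labels of refused items with IGNORE_INDEX so they contribute no gradient. The snippet below surfaces that behaviour during fine-tuning; it assumes the usual LLM.forward(question, answer, ...) training call and the decision_answer field shown in the forward() hunk, with toy data that is purely illustrative.

questions = ["Who won the Nobel Prize in Physics in 2050?"]
answers = ["No such prize has been awarded."]

loss = llm(questions, answers)  # supervised forward pass with labels
records = getattr(llm, "_last_uncertainty", None) or []
for q, rec in zip(questions, records):
    verdict = "answered" if rec.decision_answer else "refused -> masked in loss"
    print(f"{q!r}: {verdict}")
loss.backward()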