From 1eace4c53f13d3c752ad4c99c356d04f22cdfc30 Mon Sep 17 00:00:00 2001 From: leochlon Date: Fri, 3 Oct 2025 01:54:22 +0100 Subject: [PATCH 1/5] Add local LLM support and uncertainty estimation testing for PyG RAG --- examples/llm/txt2kg_rag.py | 98 ++++++++-- test/test_uncertainty_techqa.py | 268 +++++++++++++++++++++++++++ torch_geometric/llm/models/llm.py | 199 +++++++++++++++++++- torch_geometric/llm/models/txt2kg.py | 16 +- 4 files changed, 558 insertions(+), 23 deletions(-) create mode 100644 test/test_uncertainty_techqa.py diff --git a/examples/llm/txt2kg_rag.py b/examples/llm/txt2kg_rag.py index 09310ac4802e..66b2e6e88765 100644 --- a/examples/llm/txt2kg_rag.py +++ b/examples/llm/txt2kg_rag.py @@ -39,6 +39,8 @@ LLMJudge, SentenceTransformer, ) +# EDFL planner + universal backends (vendor these, or pip-install if you packaged them): +# from hallucination_toolkit import OpenAIPlanner, OpenAIItem # imported internally by LLM now from torch_geometric.llm.models.txt2kg import _chunk_text from torch_geometric.llm.utils.backend_utils import ( create_graph_from_triples, @@ -81,6 +83,11 @@ def parse_args(): '--ENDPOINT_URL', type=str, default=DEFAULT_ENDPOINT_URL, help="The URL hosting your model, \ in case you are not using the public NIM.") + parser.add_argument('--use_local_txt2kg', action="store_true", + help="Use local LLM for TXT2KG instead of NVIDIA NIM") + parser.add_argument('--txt2kg_model', type=str, + default="mistralai/Mistral-7B-Instruct-v0.3", + help="Local model for TXT2KG") parser.add_argument( '--kg_chunk_size', type=int, default=KG_CHUNK_SIZE_DEFAULT, help="When splitting context documents for txt2kg,\ @@ -304,10 +311,19 @@ def get_data(args): def index_kg(args, context_docs): - kg_maker = TXT2KG(NVIDIA_NIM_MODEL=args.NV_NIM_MODEL, - NVIDIA_API_KEY=args.NV_NIM_KEY, - ENDPOINT_URL=args.ENDPOINT_URL, - chunk_size=args.kg_chunk_size) + if args.use_local_txt2kg: + kg_maker = TXT2KG( + local_LM=True, + local_LM_model_name=args.txt2kg_model, + chunk_size=args.kg_chunk_size, + ) + else: + kg_maker = TXT2KG( + NVIDIA_NIM_MODEL=args.NV_NIM_MODEL, + NVIDIA_API_KEY=args.NV_NIM_KEY, + ENDPOINT_URL=args.ENDPOINT_URL, + chunk_size=args.kg_chunk_size, + ) print( "Note that if the TXT2KG process is too slow for you're liking using " "the public NIM, consider deploying yourself using local_lm flag of " @@ -328,9 +344,14 @@ def index_kg(args, context_docs): checkpoint_file = checkpoint_file[0] checkpoint_model_name = checkpoint_file.name.split('--')[0] # check if triples generation are using the correct model - if args.NV_NIM_MODEL.split('/')[-1] != checkpoint_model_name: - raise RuntimeError( - "Error: stored triples were generated using a different model") + if args.use_local_txt2kg: + if checkpoint_model_name != "local": + raise RuntimeError( + "Error: stored triples were generated using a different model") + else: + if args.NV_NIM_MODEL.split('/')[-1] != checkpoint_model_name: + raise RuntimeError( + "Error: stored triples were generated using a different model") saved_relevant_triples = torch.load(checkpoint_file, weights_only=False) kg_maker.relevant_triples = saved_relevant_triples @@ -437,15 +458,21 @@ def make_dataset(args): raw_triples_file = raw_triples_file[0] stored_model_name = raw_triples_file.name.split('--')[0] - if args.NV_NIM_MODEL.split('/')[-1] != stored_model_name: - raise RuntimeError( - "Error: stored triples were generated using a different model") + # Check if stored triples match current model configuration + if args.use_local_txt2kg: + if stored_model_name != 
"local": + raise RuntimeError( + "Error: stored triples were generated using a different model") + else: + if args.NV_NIM_MODEL.split('/')[-1] != stored_model_name: + raise RuntimeError( + "Error: stored triples were generated using a different model") print(f" -> Saved triples generated with: {stored_model_name}") triples = torch.load(raw_triples_file) else: triples = index_kg(args, context_docs) - + print("Number of triples in our GraphDB =", len(triples)) device = torch.device("cuda" if torch.cuda.is_available() else "cpu") @@ -574,6 +601,29 @@ def train(args, train_loader, val_loader): hidden_channels = args.gnn_hidden_channels num_gnn_layers = args.num_gnn_layers + def _uncertainty_kwargs(): + return dict( + uncertainty_estim=True, + uncertainty_cfg={ + "h_star": 0.05, + "isr_threshold": 1.0, + "m": 6, + "n_samples": 3, + "B_clip": 12.0, + "clip_mode": "one-sided", + "skeleton_policy": "auto", + "q_floor": None, + "temperature": 0.5, + "max_tokens_decision": 8, + "backend": "hf", # or "ollama" / "anthropic" + "mask_refusals_in_loss": True, + }, + decision_backend_kwargs=dict( + # HF backend (default): reuse the same model id; using a separate pipeline under the hood. + # If using Ollama or Anthropic instead, pass those credentials/args here. + ), + ) + if args.num_gnn_layers > 0: if args.gnn_model == "GAT": gnn = GAT(in_channels=768, hidden_channels=hidden_channels, @@ -589,15 +639,29 @@ def train(args, train_loader, val_loader): gnn = None if args.llm_generator_mode == "full": - llm = LLM(model_name=args.llm_generator_name, sys_prompt=sys_prompt, - n_gpus=args.num_gpus) + llm = LLM( + model_name=args.llm_generator_name, + sys_prompt=sys_prompt, + n_gpus=args.num_gpus, + **_uncertainty_kwargs(), + ) elif args.llm_generator_mode == "lora": - llm = LLM(model_name=args.llm_generator_name, sys_prompt=sys_prompt, - dtype=torch.float32, n_gpus=args.num_gpus) + llm = LLM( + model_name=args.llm_generator_name, + sys_prompt=sys_prompt, + dtype=torch.float32, + n_gpus=args.num_gpus, + **_uncertainty_kwargs(), + ) else: # frozen - llm = LLM(model_name=args.llm_generator_name, sys_prompt=sys_prompt, - dtype=torch.float32, n_gpus=args.num_gpus).eval() + llm = LLM( + model_name=args.llm_generator_name, + sys_prompt=sys_prompt, + dtype=torch.float32, + n_gpus=args.num_gpus, + **_uncertainty_kwargs(), + ).eval() for _, p in llm.named_parameters(): p.requires_grad = False diff --git a/test/test_uncertainty_techqa.py b/test/test_uncertainty_techqa.py new file mode 100644 index 000000000000..bb5d75085db4 --- /dev/null +++ b/test/test_uncertainty_techqa.py @@ -0,0 +1,268 @@ +import argparse +import json +import os +import sys +import zipfile +from pathlib import Path + +import torch +from huggingface_hub import hf_hub_download +from tqdm import tqdm + +# Add pytorch_geometric to path if needed +sys.path.insert(0, '/content/pytorch_geometric') + +from torch_geometric.llm.models import LLM, GRetriever + + +def download_techqa_data(dataset_dir="techqa"): + """Download TechQA dataset if it doesn't exist.""" + json_path = Path(dataset_dir) / "train.json" + corpus_path = Path(dataset_dir) / "corpus" + + if json_path.exists() and corpus_path.exists(): + print(f"Dataset already exists in {dataset_dir}") + return + + print("Downloading TechQA dataset...") + os.makedirs(dataset_dir, exist_ok=True) + + # Download corpus zip + zip_path = hf_hub_download( + repo_id="nvidia/TechQA-RAG-Eval", + repo_type="dataset", + filename="corpus.zip", + ) + + # Download train.json + json_download_path = hf_hub_download( + 
repo_id="nvidia/TechQA-RAG-Eval", + repo_type="dataset", + filename="train.json", + ) + + # Extract corpus + with zipfile.ZipFile(zip_path, 'r') as zip_ref: + zip_ref.extractall(dataset_dir) + + # Copy train.json + import shutil + shutil.copy(json_download_path, json_path) + + print(f"Dataset downloaded to {dataset_dir}") + + +def test_uncertainty_rag(args): + """Test LLM with uncertainty estimation on TechQA dataset.""" + + # Download data if needed + download_techqa_data(args.dataset_dir) + + # Load Q&A pairs + json_path = Path(args.dataset_dir) / "train.json" + with open(json_path) as f: + data = json.load(f) + + # Setup LLM with uncertainty + print(f"Setting up LLM: {args.model_name}") + print(f"Using {args.num_gpus} GPU(s)") + + llm = LLM( + model_name=args.model_name, + sys_prompt=args.sys_prompt, + n_gpus=args.num_gpus, + dtype=torch.bfloat16, + uncertainty_estim=args.use_uncertainty, + uncertainty_cfg={ + "h_star": args.h_star, + "isr_threshold": args.isr_threshold, + "m": args.m, + "n_samples": args.n_samples, + "B_clip": args.b_clip, + "clip_mode": "one-sided", + "skeleton_policy": args.skeleton_policy, + "temperature": args.temperature, + "max_tokens_decision": 8, + "backend": args.backend, + "mask_refusals_in_loss": True, + } if args.use_uncertainty else None, + decision_backend_kwargs={} if args.use_uncertainty else None, + ) + + print(f"Uncertainty enabled: {llm.uncertainty_estim}") + + # Create GRetriever (no GNN for this simple test) + model = GRetriever(llm=llm, gnn=None, use_lora=False) + + # Filter out impossible questions + valid_data = [item for item in data if not item.get("is_impossible", False)] + test_data = valid_data[:args.num_questions] + + print(f"\nTesting on {len(test_data)} questions...") + print("=" * 80) + + results = [] + + for item in tqdm(test_data, desc="Processing questions"): + question = item["question"] + answer = item["answer"] + + # Format question with mock context (in real scenario, use actual retrieved docs) + if args.use_mock_context: + context = f"Technical documentation: {answer[:args.context_length]}" + else: + context = "No context provided (zero-shot)" + + formatted_q = f""" +[QUESTION] +{question} +[END_QUESTION] + +[RETRIEVED_CONTEXTS] +{context} +[END_RETRIEVED_CONTEXTS] +""" + + # Run inference + result = model.llm.inference( + question=[formatted_q], + context=[""], + max_tokens=args.max_tokens, + return_uncertainty=args.use_uncertainty, + abstain_on_low_ISR=args.abstain_on_low_isr, + ) + + # Parse result + if args.use_uncertainty and isinstance(result, tuple): + texts, uncertainties = result + generated = texts[0] if texts else "" + uncertainty = uncertainties[0] if uncertainties else None + + result_dict = { + "question": question[:100], + "expected": answer[:100], + "generated": generated[:100], + "isr": uncertainty.isr if uncertainty else None, + "decision": "ANSWER" if (uncertainty and uncertainty.decision_answer) else "REFUSE", + "full_question": question, + "full_answer": answer, + "full_generated": generated, + } + else: + generated = result[0] if isinstance(result, list) else str(result) + result_dict = { + "question": question[:100], + "expected": answer[:100], + "generated": generated[:100], + "full_question": question, + "full_answer": answer, + "full_generated": generated, + } + + results.append(result_dict) + + if args.verbose: + print(f"\nQ: {result_dict['question']}") + print(f"Expected: {result_dict['expected']}") + print(f"Generated: {result_dict['generated']}") + if args.use_uncertainty: + print(f"ISR: 
{result_dict['isr']:.3f}, Decision: {result_dict['decision']}") + print("-" * 80) + + # Summary statistics + print("\n" + "=" * 80) + print("SUMMARY") + print("=" * 80) + + if args.use_uncertainty: + isr_values = [r['isr'] for r in results if r['isr'] is not None] + answers = [r for r in results if r['decision'] == 'ANSWER'] + refusals = [r for r in results if r['decision'] == 'REFUSE'] + + print(f"Total questions: {len(results)}") + print(f"Answered: {len(answers)} ({len(answers)/len(results)*100:.1f}%)") + print(f"Refused: {len(refusals)} ({len(refusals)/len(results)*100:.1f}%)") + print(f"Average ISR: {sum(isr_values)/len(isr_values):.3f}") + print(f"Min ISR: {min(isr_values):.3f}") + print(f"Max ISR: {max(isr_values):.3f}") + else: + print(f"Total questions processed: {len(results)}") + + # Save results if requested + if args.output_file: + with open(args.output_file, 'w') as f: + json.dump(results, f, indent=2) + print(f"\nResults saved to {args.output_file}") + + return results + + +def main(): + parser = argparse.ArgumentParser( + description="Test LLM with uncertainty estimation on TechQA dataset" + ) + + # Dataset args + parser.add_argument('--dataset_dir', type=str, default='techqa', + help='Directory to store/load TechQA dataset') + parser.add_argument('--num_questions', type=int, default=10, + help='Number of questions to test') + + # Model args + parser.add_argument('--model_name', type=str, + default='mistralai/Mistral-7B-Instruct-v0.3', + help='HuggingFace model name') + parser.add_argument('--num_gpus', type=int, default=1, + help='Number of GPUs to use') + parser.add_argument('--sys_prompt', type=str, + default='Answer questions based on the provided context.', + help='System prompt for the LLM') + parser.add_argument('--max_tokens', type=int, default=50, + help='Maximum tokens to generate') + + # Uncertainty args + parser.add_argument('--use_uncertainty', action='store_true', + help='Enable uncertainty estimation') + parser.add_argument('--h_star', type=float, default=0.05, + help='Entropy threshold for uncertainty') + parser.add_argument('--isr_threshold', type=float, default=1.0, + help='ISR threshold for abstention') + parser.add_argument('--m', type=int, default=6, + help='Number of semantic sets') + parser.add_argument('--n_samples', type=int, default=3, + help='Number of samples per semantic set') + parser.add_argument('--b_clip', type=float, default=12.0, + help='Clipping value for importance sampling') + parser.add_argument('--skeleton_policy', type=str, default='auto', + choices=['auto', 'evidence_erase', 'none'], + help='Skeleton generation policy') + parser.add_argument('--temperature', type=float, default=0.5, + help='Sampling temperature') + parser.add_argument('--backend', type=str, default='hf', + choices=['hf', 'ollama', 'anthropic'], + help='Backend for decision model') + parser.add_argument('--abstain_on_low_isr', action='store_true', + help='Abstain when ISR is below threshold') + + # Context args + parser.add_argument('--use_mock_context', action='store_true', + help='Use partial answer as mock context (for testing)') + parser.add_argument('--context_length', type=int, default=100, + help='Length of mock context to use') + + # Output args + parser.add_argument('--output_file', type=str, default=None, + help='JSON file to save results') + parser.add_argument('--verbose', action='store_true', + help='Print detailed results for each question') + + args = parser.parse_args() + + # Run test + results = test_uncertainty_rag(args) + + print("\nTest 
complete!") + + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/torch_geometric/llm/models/llm.py b/torch_geometric/llm/models/llm.py index 1a3b2d22eb7b..21932d4bc33e 100644 --- a/torch_geometric/llm/models/llm.py +++ b/torch_geometric/llm/models/llm.py @@ -1,6 +1,8 @@ import warnings from contextlib import nullcontext -from typing import Any, Dict, List, Optional +from dataclasses import dataclass, asdict +import json, re +from typing import Any, Dict, List, Optional, Tuple, Union import torch from torch import Tensor @@ -13,6 +15,40 @@ IGNORE_INDEX = -100 MAX_TXT_LEN = 512 MAX_NEW_TOKENS = 128 + +# ---- Uncertainty (EDFL/B2T/ISR) glue ------------------------------------ +@dataclass +class _Uncertainty: + decision_answer: bool + delta_bar: float + b2t: float + isr: float + roh_bound: float + q_avg: float + q_conservative: float + y_label: str + rationale: str + + +def _parse_decision_json(text: str) -> str: + # Compatible with your toolkit's tiny-JSON head. (answer|refuse) + try: + obj = json.loads(text) + d = str(obj.get("decision", "")).strip().lower() + if d in ("answer", "refuse"): + return d + except Exception: + pass + m = re.search(r'{"\s*decision\s*"\s*:\s*"(answer|refuse)"}', text, + flags=re.I) + if m: + return m.group(1).lower() + t = text.strip().lower() + if "refuse" in t and "answer" not in t: + return "refuse" + if "answer" in t and "refuse" not in t: + return "answer" + return "refuse" # default-safe PAD_TOKEN_ID = 0 PADDING_SIDE = 'left' @@ -74,6 +110,10 @@ def __init__( n_gpus: Optional[int] = None, dtype: Optional[torch.dtype] = torch.bfloat16, sys_prompt: Optional[str] = None, + # --- new --- + uncertainty_estim: bool = False, + uncertainty_cfg: Optional[Dict[str, Any]] = None, + decision_backend_kwargs: Optional[Dict[str, Any]] = None, ) -> None: super().__init__() @@ -152,6 +192,133 @@ def __init__( else: self.autocast_context = torch.amp.autocast('cuda', dtype=dtype) + # ---- Uncertainty config / backends (opt-in) ---------------------- + self.uncertainty_estim = bool(uncertainty_estim) + self._uncfg: Dict[str, Any] = { + "h_star": 0.05, + "isr_threshold": 1.0, + "m": 6, + "n_samples": 3, + "B_clip": 12.0, + "clip_mode": "one-sided", + "skeleton_policy": "auto", + "q_floor": None, + "temperature": 0.5, + "max_tokens_decision": 8, + "backend": "hf", # "hf" | "ollama" | "anthropic" + "mask_refusals_in_loss": True, + } + if uncertainty_cfg: + self._uncfg.update(uncertainty_cfg) + self._dec_backend_kwargs = decision_backend_kwargs or {} + self._last_uncertainty: Optional[List[_Uncertainty]] = None + + # Try to import your planner/backends; fall back to a local adapter: + self._planner = None + self._backend = None + if self.uncertainty_estim: + try: + from hallbayes import OpenAIPlanner, OpenAIItem + + self._OpenAIPlanner = OpenAIPlanner + self._OpenAIItem = OpenAIItem + try: + from hallbayes import HuggingFaceBackend, OllamaBackend, AnthropicBackend + self._HFBackend = HuggingFaceBackend + self._OllamaBackend = OllamaBackend + self._AnthropicBackend = AnthropicBackend + except Exception: + self._HFBackend = self._OllamaBackend = self._AnthropicBackend = None + except Exception: + self.uncertainty_estim = False + warnings.warn( + "Uncertainty requested but `hallucination_toolkit` not importable. 
" + "Proceeding without uncertainty.", + stacklevel=2, + ) + + # ---- helper: turn (question, context) into the string for decision --- + def _decision_user_text(self, q: str, ctx: Optional[str]) -> str: + if ctx and len(ctx) > 0: + return f"{ctx} - {q}" + return q + + # ---- build the decision backend on first use ------------------------- + def _get_decision_backend(self): + if self._backend is not None: + return self._backend + if not self.uncertainty_estim: + return None + backend_kind = str(self._uncfg.get("backend", "hf")).lower() + if backend_kind == "hf" and getattr(self, "_HFBackend", None) is not None: + self._backend = self._HFBackend( + mode="transformers", + model_id=self.model_name, + device_map="auto", + trust_remote_code=True, + **self._dec_backend_kwargs, + ) + elif backend_kind == "ollama" and getattr(self, "_OllamaBackend", None) is not None: + self._backend = self._OllamaBackend(**self._dec_backend_kwargs) + elif backend_kind == "anthropic" and getattr(self, "_AnthropicBackend", None) is not None: + self._backend = self._AnthropicBackend(**self._dec_backend_kwargs) + else: + self._backend = None + return self._backend + + # ---- compute EDFL/B2T/ISR metrics per item --------------------------- + def _compute_uncertainty( + self, + questions: List[str], + context: Optional[List[str]] = None, + ) -> List[_Uncertainty]: + backend = self._get_decision_backend() + Planner = getattr(self, "_OpenAIPlanner", None) + Item = getattr(self, "_OpenAIItem", None) + if backend is None or Planner is None or Item is None: + return [] + planner = Planner( + backend=backend, + temperature=float(self._uncfg["temperature"]), + max_tokens_decision=int(self._uncfg["max_tokens_decision"]), + q_floor=self._uncfg["q_floor"], + ) + items = [] + for i, q in enumerate(questions): + ctx_i = None if context is None else context[i] + user_txt = self._decision_user_text(q, ctx_i) + items.append( + Item( + prompt=user_txt, + n_samples=int(self._uncfg["n_samples"]), + m=int(self._uncfg["m"]), + skeleton_policy=str(self._uncfg["skeleton_policy"]), + ) + ) + metrics = planner.run( + items, + h_star=float(self._uncfg["h_star"]), + isr_threshold=float(self._uncfg["isr_threshold"]), + B_clip=float(self._uncfg["B_clip"]), + clip_mode=str(self._uncfg["clip_mode"]), + ) + outs: List[_Uncertainty] = [] + for m in metrics: + outs.append( + _Uncertainty( + decision_answer=bool(m.decision_answer), + delta_bar=float(m.delta_bar), + b2t=float(m.b2t), + isr=float(m.isr), + roh_bound=float(m.roh_bound), + q_avg=float(m.q_avg), + q_conservative=float(m.q_conservative), + y_label=str(m.meta.get("y_label", "") if m.meta else ""), + rationale=str(m.rationale), + ) + ) + return outs + # legacy function - used for Llama 2 style prompting def _encode_inputs( self, @@ -420,6 +587,16 @@ def forward( inputs_embeds, attention_mask, label_input_ids = self._get_embeds( question, context, embedding, answer) + # --- Uncertainty-aware training: mask labels for ISR<1 ------------- + if self.uncertainty_estim and label_input_ids is not None: + unc = self._compute_uncertainty(question, context) + self._last_uncertainty = unc + if (self._uncfg.get("mask_refusals_in_loss", True) + and len(unc) == len(question)): + for i, u in enumerate(unc): + if not u.decision_answer: + label_input_ids[i, :] = IGNORE_INDEX + with self.autocast_context: outputs = self.llm( inputs_embeds=inputs_embeds, @@ -436,7 +613,10 @@ def inference( context: Optional[List[str]] = None, embedding: Optional[List[Tensor]] = None, max_tokens: Optional[int] = MAX_NEW_TOKENS, 
- ) -> List[str]: + # --- new --- + return_uncertainty: bool = False, + abstain_on_low_ISR: bool = False, + ) -> Union[List[str], Tuple[List[str], List[_Uncertainty]]]: r"""The inference pass. Args: @@ -454,6 +634,11 @@ def inference( inputs_embeds, attention_mask, _ = self._get_embeds( question, context, embedding) + unc: List[_Uncertainty] = [] + if self.uncertainty_estim: + unc = self._compute_uncertainty(question, context) + self._last_uncertainty = unc + with self.autocast_context: outputs = self.llm.generate( inputs_embeds=inputs_embeds, @@ -464,7 +649,15 @@ def inference( use_cache=True, ) - return self.tokenizer.batch_decode(outputs, skip_special_tokens=True) + texts = self.tokenizer.batch_decode(outputs, skip_special_tokens=True) + if self.uncertainty_estim and unc and abstain_on_low_ISR: + texts = [ + t if u.decision_answer else "[ABSTAIN]" + for t, u in zip(texts, unc) + ] + if return_uncertainty and (self.uncertainty_estim and unc): + return texts, unc + return texts def __repr__(self) -> str: return f'{self.__class__.__name__}({self.model_name})' diff --git a/torch_geometric/llm/models/txt2kg.py b/torch_geometric/llm/models/txt2kg.py index ac011b15d248..7b413acb7a0a 100644 --- a/torch_geometric/llm/models/txt2kg.py +++ b/torch_geometric/llm/models/txt2kg.py @@ -43,6 +43,9 @@ class TXT2KG(): than deploying your own private NIM endpoint. This flag is mainly recommended for dev/debug. (default: False). + local_LM_model_name : str, optional + Model name to load when `local_LM` is enabled. + (default: "VAGOsolutions/SauerkrautLM-v2-14b-DPO"). chunk_size : int, optional The size of the chunks in which the text data is processed (default: 512). @@ -54,9 +57,13 @@ def __init__( NVIDIA_API_KEY: Optional[str] = "", ENDPOINT_URL: Optional[str] = "https://integrate.api.nvidia.com/v1", local_LM: bool = False, + local_LM_model_name: Optional[str] = "VAGOsolutions/SauerkrautLM-v2-14b-DPO", chunk_size: int = 512, + n_gpus=None ) -> None: + self.n_gpus = n_gpus self.local_LM = local_LM + self.local_LM_model_name = local_LM_model_name # Initialize the local LM flag and the NIM model info accordingly if self.local_LM: # If using a local LM, set the initd_LM flag to False @@ -91,9 +98,12 @@ def _chunk_to_triples_str_local(self, txt: str) -> str: # call LLM on text chunk_start_time = time.time() if not self.initd_LM: - from torch_geometric.nn.nlp import LLM - LM_name = "VAGOsolutions/SauerkrautLM-v2-14b-DPO" - self.model = LLM(LM_name).eval() + from torch_geometric.llm.models import LLM + LM_name = self.local_LM_model_name + if self.n_gpus is not None: + self.model = LLM(LM_name, n_gpus=self.n_gpus).eval() + else: + self.model = LLM(LM_name).eval() self.initd_LM = True out_str = self.model.inference(question=[txt + '\n' + SYSTEM_PROMPT], max_tokens=self.chunk_size)[0] From cc4d749158127ed763be92f1905451d88516ede8 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 3 Oct 2025 01:00:41 +0000 Subject: [PATCH 2/5] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- examples/llm/txt2kg_rag.py | 14 ++- test/test_uncertainty_techqa.py | 136 +++++++++++++++------------ torch_geometric/llm/models/llm.py | 31 +++--- torch_geometric/llm/models/txt2kg.py | 18 ++-- 4 files changed, 112 insertions(+), 87 deletions(-) diff --git a/examples/llm/txt2kg_rag.py b/examples/llm/txt2kg_rag.py index 66b2e6e88765..fe4c2385cffa 100644 --- a/examples/llm/txt2kg_rag.py +++ b/examples/llm/txt2kg_rag.py @@ 
-347,11 +347,13 @@ def index_kg(args, context_docs): if args.use_local_txt2kg: if checkpoint_model_name != "local": raise RuntimeError( - "Error: stored triples were generated using a different model") + "Error: stored triples were generated using a different model" + ) else: if args.NV_NIM_MODEL.split('/')[-1] != checkpoint_model_name: raise RuntimeError( - "Error: stored triples were generated using a different model") + "Error: stored triples were generated using a different model" + ) saved_relevant_triples = torch.load(checkpoint_file, weights_only=False) kg_maker.relevant_triples = saved_relevant_triples @@ -462,17 +464,19 @@ def make_dataset(args): if args.use_local_txt2kg: if stored_model_name != "local": raise RuntimeError( - "Error: stored triples were generated using a different model") + "Error: stored triples were generated using a different model" + ) else: if args.NV_NIM_MODEL.split('/')[-1] != stored_model_name: raise RuntimeError( - "Error: stored triples were generated using a different model") + "Error: stored triples were generated using a different model" + ) print(f" -> Saved triples generated with: {stored_model_name}") triples = torch.load(raw_triples_file) else: triples = index_kg(args, context_docs) - + print("Number of triples in our GraphDB =", len(triples)) device = torch.device("cuda" if torch.cuda.is_available() else "cpu") diff --git a/test/test_uncertainty_techqa.py b/test/test_uncertainty_techqa.py index bb5d75085db4..69316ee95052 100644 --- a/test/test_uncertainty_techqa.py +++ b/test/test_uncertainty_techqa.py @@ -19,54 +19,53 @@ def download_techqa_data(dataset_dir="techqa"): """Download TechQA dataset if it doesn't exist.""" json_path = Path(dataset_dir) / "train.json" corpus_path = Path(dataset_dir) / "corpus" - + if json_path.exists() and corpus_path.exists(): print(f"Dataset already exists in {dataset_dir}") return - + print("Downloading TechQA dataset...") os.makedirs(dataset_dir, exist_ok=True) - + # Download corpus zip zip_path = hf_hub_download( repo_id="nvidia/TechQA-RAG-Eval", repo_type="dataset", filename="corpus.zip", ) - + # Download train.json json_download_path = hf_hub_download( repo_id="nvidia/TechQA-RAG-Eval", repo_type="dataset", filename="train.json", ) - + # Extract corpus with zipfile.ZipFile(zip_path, 'r') as zip_ref: zip_ref.extractall(dataset_dir) - + # Copy train.json import shutil shutil.copy(json_download_path, json_path) - + print(f"Dataset downloaded to {dataset_dir}") def test_uncertainty_rag(args): """Test LLM with uncertainty estimation on TechQA dataset.""" - # Download data if needed download_techqa_data(args.dataset_dir) - + # Load Q&A pairs json_path = Path(args.dataset_dir) / "train.json" with open(json_path) as f: data = json.load(f) - + # Setup LLM with uncertainty print(f"Setting up LLM: {args.model_name}") print(f"Using {args.num_gpus} GPU(s)") - + llm = LLM( model_name=args.model_name, sys_prompt=args.sys_prompt, @@ -88,31 +87,33 @@ def test_uncertainty_rag(args): } if args.use_uncertainty else None, decision_backend_kwargs={} if args.use_uncertainty else None, ) - + print(f"Uncertainty enabled: {llm.uncertainty_estim}") - + # Create GRetriever (no GNN for this simple test) model = GRetriever(llm=llm, gnn=None, use_lora=False) - + # Filter out impossible questions - valid_data = [item for item in data if not item.get("is_impossible", False)] + valid_data = [ + item for item in data if not item.get("is_impossible", False) + ] test_data = valid_data[:args.num_questions] - + print(f"\nTesting on {len(test_data)} 
questions...") print("=" * 80) - + results = [] - + for item in tqdm(test_data, desc="Processing questions"): question = item["question"] answer = item["answer"] - + # Format question with mock context (in real scenario, use actual retrieved docs) if args.use_mock_context: context = f"Technical documentation: {answer[:args.context_length]}" else: context = "No context provided (zero-shot)" - + formatted_q = f""" [QUESTION] {question} @@ -122,7 +123,7 @@ def test_uncertainty_rag(args): {context} [END_RETRIEVED_CONTEXTS] """ - + # Run inference result = model.llm.inference( question=[formatted_q], @@ -131,22 +132,31 @@ def test_uncertainty_rag(args): return_uncertainty=args.use_uncertainty, abstain_on_low_ISR=args.abstain_on_low_isr, ) - + # Parse result if args.use_uncertainty and isinstance(result, tuple): texts, uncertainties = result generated = texts[0] if texts else "" uncertainty = uncertainties[0] if uncertainties else None - + result_dict = { - "question": question[:100], - "expected": answer[:100], - "generated": generated[:100], - "isr": uncertainty.isr if uncertainty else None, - "decision": "ANSWER" if (uncertainty and uncertainty.decision_answer) else "REFUSE", - "full_question": question, - "full_answer": answer, - "full_generated": generated, + "question": + question[:100], + "expected": + answer[:100], + "generated": + generated[:100], + "isr": + uncertainty.isr if uncertainty else None, + "decision": + "ANSWER" if + (uncertainty and uncertainty.decision_answer) else "REFUSE", + "full_question": + question, + "full_answer": + answer, + "full_generated": + generated, } else: generated = result[0] if isinstance(result, list) else str(result) @@ -158,68 +168,73 @@ def test_uncertainty_rag(args): "full_answer": answer, "full_generated": generated, } - + results.append(result_dict) - + if args.verbose: print(f"\nQ: {result_dict['question']}") print(f"Expected: {result_dict['expected']}") print(f"Generated: {result_dict['generated']}") if args.use_uncertainty: - print(f"ISR: {result_dict['isr']:.3f}, Decision: {result_dict['decision']}") + print( + f"ISR: {result_dict['isr']:.3f}, Decision: {result_dict['decision']}" + ) print("-" * 80) - + # Summary statistics print("\n" + "=" * 80) print("SUMMARY") print("=" * 80) - + if args.use_uncertainty: isr_values = [r['isr'] for r in results if r['isr'] is not None] answers = [r for r in results if r['decision'] == 'ANSWER'] refusals = [r for r in results if r['decision'] == 'REFUSE'] - + print(f"Total questions: {len(results)}") - print(f"Answered: {len(answers)} ({len(answers)/len(results)*100:.1f}%)") - print(f"Refused: {len(refusals)} ({len(refusals)/len(results)*100:.1f}%)") + print( + f"Answered: {len(answers)} ({len(answers)/len(results)*100:.1f}%)") + print( + f"Refused: {len(refusals)} ({len(refusals)/len(results)*100:.1f}%)" + ) print(f"Average ISR: {sum(isr_values)/len(isr_values):.3f}") print(f"Min ISR: {min(isr_values):.3f}") print(f"Max ISR: {max(isr_values):.3f}") else: print(f"Total questions processed: {len(results)}") - + # Save results if requested if args.output_file: with open(args.output_file, 'w') as f: json.dump(results, f, indent=2) print(f"\nResults saved to {args.output_file}") - + return results def main(): parser = argparse.ArgumentParser( - description="Test LLM with uncertainty estimation on TechQA dataset" - ) - + description="Test LLM with uncertainty estimation on TechQA dataset") + # Dataset args parser.add_argument('--dataset_dir', type=str, default='techqa', help='Directory to store/load TechQA 
dataset') parser.add_argument('--num_questions', type=int, default=10, help='Number of questions to test') - + # Model args parser.add_argument('--model_name', type=str, default='mistralai/Mistral-7B-Instruct-v0.3', help='HuggingFace model name') parser.add_argument('--num_gpus', type=int, default=1, help='Number of GPUs to use') - parser.add_argument('--sys_prompt', type=str, - default='Answer questions based on the provided context.', - help='System prompt for the LLM') + parser.add_argument( + '--sys_prompt', type=str, + default='Answer questions based on the provided context.', + help='System prompt for the LLM') parser.add_argument('--max_tokens', type=int, default=50, help='Maximum tokens to generate') - + # Uncertainty args parser.add_argument('--use_uncertainty', action='store_true', help='Enable uncertainty estimation') @@ -234,8 +249,8 @@ def main(): parser.add_argument('--b_clip', type=float, default=12.0, help='Clipping value for importance sampling') parser.add_argument('--skeleton_policy', type=str, default='auto', - choices=['auto', 'evidence_erase', 'none'], - help='Skeleton generation policy') + choices=['auto', 'evidence_erase', + 'none'], help='Skeleton generation policy') parser.add_argument('--temperature', type=float, default=0.5, help='Sampling temperature') parser.add_argument('--backend', type=str, default='hf', @@ -243,26 +258,27 @@ def main(): help='Backend for decision model') parser.add_argument('--abstain_on_low_isr', action='store_true', help='Abstain when ISR is below threshold') - + # Context args - parser.add_argument('--use_mock_context', action='store_true', - help='Use partial answer as mock context (for testing)') + parser.add_argument( + '--use_mock_context', action='store_true', + help='Use partial answer as mock context (for testing)') parser.add_argument('--context_length', type=int, default=100, help='Length of mock context to use') - + # Output args parser.add_argument('--output_file', type=str, default=None, help='JSON file to save results') parser.add_argument('--verbose', action='store_true', help='Print detailed results for each question') - + args = parser.parse_args() - + # Run test - results = test_uncertainty_rag(args) - + test_uncertainty_rag(args) + print("\nTest complete!") if __name__ == '__main__': - main() \ No newline at end of file + main() diff --git a/torch_geometric/llm/models/llm.py b/torch_geometric/llm/models/llm.py index 21932d4bc33e..47a27dce3c2e 100644 --- a/torch_geometric/llm/models/llm.py +++ b/torch_geometric/llm/models/llm.py @@ -1,7 +1,8 @@ +import json +import re import warnings from contextlib import nullcontext -from dataclasses import dataclass, asdict -import json, re +from dataclasses import dataclass from typing import Any, Dict, List, Optional, Tuple, Union import torch @@ -16,6 +17,7 @@ MAX_TXT_LEN = 512 MAX_NEW_TOKENS = 128 + # ---- Uncertainty (EDFL/B2T/ISR) glue ------------------------------------ @dataclass class _Uncertainty: @@ -49,6 +51,8 @@ def _parse_decision_json(text: str) -> str: if "answer" in t and "refuse" not in t: return "answer" return "refuse" # default-safe + + PAD_TOKEN_ID = 0 PADDING_SIDE = 'left' @@ -218,12 +222,16 @@ def __init__( self._backend = None if self.uncertainty_estim: try: - from hallbayes import OpenAIPlanner, OpenAIItem + from hallbayes import OpenAIItem, OpenAIPlanner self._OpenAIPlanner = OpenAIPlanner self._OpenAIItem = OpenAIItem try: - from hallbayes import HuggingFaceBackend, OllamaBackend, AnthropicBackend + from hallbayes import ( + AnthropicBackend, + 
HuggingFaceBackend, + OllamaBackend, + ) self._HFBackend = HuggingFaceBackend self._OllamaBackend = OllamaBackend self._AnthropicBackend = AnthropicBackend @@ -250,7 +258,8 @@ def _get_decision_backend(self): if not self.uncertainty_estim: return None backend_kind = str(self._uncfg.get("backend", "hf")).lower() - if backend_kind == "hf" and getattr(self, "_HFBackend", None) is not None: + if backend_kind == "hf" and getattr(self, "_HFBackend", + None) is not None: self._backend = self._HFBackend( mode="transformers", model_id=self.model_name, @@ -258,9 +267,11 @@ def _get_decision_backend(self): trust_remote_code=True, **self._dec_backend_kwargs, ) - elif backend_kind == "ollama" and getattr(self, "_OllamaBackend", None) is not None: + elif backend_kind == "ollama" and getattr(self, "_OllamaBackend", + None) is not None: self._backend = self._OllamaBackend(**self._dec_backend_kwargs) - elif backend_kind == "anthropic" and getattr(self, "_AnthropicBackend", None) is not None: + elif backend_kind == "anthropic" and getattr(self, "_AnthropicBackend", + None) is not None: self._backend = self._AnthropicBackend(**self._dec_backend_kwargs) else: self._backend = None @@ -293,8 +304,7 @@ def _compute_uncertainty( n_samples=int(self._uncfg["n_samples"]), m=int(self._uncfg["m"]), skeleton_policy=str(self._uncfg["skeleton_policy"]), - ) - ) + )) metrics = planner.run( items, h_star=float(self._uncfg["h_star"]), @@ -315,8 +325,7 @@ def _compute_uncertainty( q_conservative=float(m.q_conservative), y_label=str(m.meta.get("y_label", "") if m.meta else ""), rationale=str(m.rationale), - ) - ) + )) return outs # legacy function - used for Llama 2 style prompting diff --git a/torch_geometric/llm/models/txt2kg.py b/torch_geometric/llm/models/txt2kg.py index 7b413acb7a0a..f94143157b35 100644 --- a/torch_geometric/llm/models/txt2kg.py +++ b/torch_geometric/llm/models/txt2kg.py @@ -50,17 +50,13 @@ class TXT2KG(): The size of the chunks in which the text data is processed (default: 512). """ - def __init__( - self, - NVIDIA_NIM_MODEL: Optional[ - str] = "nvidia/llama-3.1-nemotron-70b-instruct", - NVIDIA_API_KEY: Optional[str] = "", - ENDPOINT_URL: Optional[str] = "https://integrate.api.nvidia.com/v1", - local_LM: bool = False, - local_LM_model_name: Optional[str] = "VAGOsolutions/SauerkrautLM-v2-14b-DPO", - chunk_size: int = 512, - n_gpus=None - ) -> None: + def __init__(self, NVIDIA_NIM_MODEL: Optional[ + str] = "nvidia/llama-3.1-nemotron-70b-instruct", + NVIDIA_API_KEY: Optional[str] = "", ENDPOINT_URL: Optional[ + str] = "https://integrate.api.nvidia.com/v1", + local_LM: bool = False, local_LM_model_name: Optional[ + str] = "VAGOsolutions/SauerkrautLM-v2-14b-DPO", + chunk_size: int = 512, n_gpus=None) -> None: self.n_gpus = n_gpus self.local_LM = local_LM self.local_LM_model_name = local_LM_model_name From 5ca988f6259dbabe872400663eb11e1598610282 Mon Sep 17 00:00:00 2001 From: leochlon Date: Fri, 3 Oct 2025 03:10:00 +0100 Subject: [PATCH 3/5] Add uncertainty test, fix linting, and update changelog --- CHANGELOG.md | 9 + test/llm/models/test_llm.py | 60 ++++++- test/test_uncertainty_techqa.py | 284 -------------------------------- 3 files changed, 67 insertions(+), 286 deletions(-) delete mode 100644 test/test_uncertainty_techqa.py diff --git a/CHANGELOG.md b/CHANGELOG.md index a590afe5e190..8f742bf26408 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,15 @@ All notable changes to this project will be documented in this file. 
The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). +## [Unreleased] + +### Added +- Added local LLM support for TXT2KG knowledge graph extraction ([#10479](https://github.com/pyg-team/pytorch_geometric/pull/10479)) +- Added uncertainty estimation test for LLM class ([#10479](https://github.com/pyg-team/pytorch_geometric/pull/10479)) + +### Fixed +- Fixed LLM import path in TXT2KG from `torch_geometric.nn` to `torch_geometric.llm.models` ([#10479](https://github.com/pyg-team/pytorch_geometric/pull/10479)) + ## [2.7.0] - 2025-MM-DD ### Fixed diff --git a/test/llm/models/test_llm.py b/test/llm/models/test_llm.py index a6a2608551a8..fa13f0eaf526 100644 --- a/test/llm/models/test_llm.py +++ b/test/llm/models/test_llm.py @@ -10,8 +10,8 @@ @onlyRAG @withPackage('transformers', 'accelerate') def test_llm() -> None: - question = ["Is PyG the best open-source GNN library?"] - answer = ["yes!"] + question = ['Is PyG the best open-source GNN library?'] + answer = ['yes!'] model = LLM( model_name='HuggingFaceTB/SmolLM-360M', @@ -30,3 +30,59 @@ def test_llm() -> None: del model gc.collect() torch.cuda.empty_cache() + + +@onlyRAG +@withPackage('transformers', 'accelerate') +def test_llm_uncertainty() -> None: + """Test LLM with uncertainty estimation.""" + question = ['What is the capital of France?'] + answer = ['Paris'] + + # Test with uncertainty enabled + model = LLM( + model_name='HuggingFaceTB/SmolLM-360M', + num_params=1, + dtype=torch.float16, + uncertainty_estim=True, + uncertainty_cfg={ + 'h_star': 0.05, + 'isr_threshold': 1.0, + 'm': 3, + 'n_samples': 2, + 'B_clip': 12.0, + 'clip_mode': 'one-sided', + 'skeleton_policy': 'auto', + 'temperature': 0.5, + 'max_tokens_decision': 8, + 'backend': 'hf', + 'mask_refusals_in_loss': True, + }, + ) + + assert model.uncertainty_estim is True + + # Test inference with uncertainty + result = model.inference( + question=question, + context=[''], + max_tokens=10, + return_uncertainty=True, + ) + + # Should return tuple of (texts, uncertainties) + assert isinstance(result, tuple) + texts, uncertainties = result + assert len(texts) == 1 + assert len(uncertainties) == 1 + + # Check uncertainty object has expected attributes + uncertainty = uncertainties[0] + assert hasattr(uncertainty, 'isr') + assert hasattr(uncertainty, 'decision_answer') + assert isinstance(uncertainty.isr, float) + assert isinstance(uncertainty.decision_answer, bool) + + del model + gc.collect() + torch.cuda.empty_cache() diff --git a/test/test_uncertainty_techqa.py b/test/test_uncertainty_techqa.py deleted file mode 100644 index 69316ee95052..000000000000 --- a/test/test_uncertainty_techqa.py +++ /dev/null @@ -1,284 +0,0 @@ -import argparse -import json -import os -import sys -import zipfile -from pathlib import Path - -import torch -from huggingface_hub import hf_hub_download -from tqdm import tqdm - -# Add pytorch_geometric to path if needed -sys.path.insert(0, '/content/pytorch_geometric') - -from torch_geometric.llm.models import LLM, GRetriever - - -def download_techqa_data(dataset_dir="techqa"): - """Download TechQA dataset if it doesn't exist.""" - json_path = Path(dataset_dir) / "train.json" - corpus_path = Path(dataset_dir) / "corpus" - - if json_path.exists() and corpus_path.exists(): - print(f"Dataset already exists in {dataset_dir}") - return - - print("Downloading TechQA dataset...") - os.makedirs(dataset_dir, exist_ok=True) - - # Download corpus zip - zip_path = hf_hub_download( - repo_id="nvidia/TechQA-RAG-Eval", - repo_type="dataset", - 
filename="corpus.zip", - ) - - # Download train.json - json_download_path = hf_hub_download( - repo_id="nvidia/TechQA-RAG-Eval", - repo_type="dataset", - filename="train.json", - ) - - # Extract corpus - with zipfile.ZipFile(zip_path, 'r') as zip_ref: - zip_ref.extractall(dataset_dir) - - # Copy train.json - import shutil - shutil.copy(json_download_path, json_path) - - print(f"Dataset downloaded to {dataset_dir}") - - -def test_uncertainty_rag(args): - """Test LLM with uncertainty estimation on TechQA dataset.""" - # Download data if needed - download_techqa_data(args.dataset_dir) - - # Load Q&A pairs - json_path = Path(args.dataset_dir) / "train.json" - with open(json_path) as f: - data = json.load(f) - - # Setup LLM with uncertainty - print(f"Setting up LLM: {args.model_name}") - print(f"Using {args.num_gpus} GPU(s)") - - llm = LLM( - model_name=args.model_name, - sys_prompt=args.sys_prompt, - n_gpus=args.num_gpus, - dtype=torch.bfloat16, - uncertainty_estim=args.use_uncertainty, - uncertainty_cfg={ - "h_star": args.h_star, - "isr_threshold": args.isr_threshold, - "m": args.m, - "n_samples": args.n_samples, - "B_clip": args.b_clip, - "clip_mode": "one-sided", - "skeleton_policy": args.skeleton_policy, - "temperature": args.temperature, - "max_tokens_decision": 8, - "backend": args.backend, - "mask_refusals_in_loss": True, - } if args.use_uncertainty else None, - decision_backend_kwargs={} if args.use_uncertainty else None, - ) - - print(f"Uncertainty enabled: {llm.uncertainty_estim}") - - # Create GRetriever (no GNN for this simple test) - model = GRetriever(llm=llm, gnn=None, use_lora=False) - - # Filter out impossible questions - valid_data = [ - item for item in data if not item.get("is_impossible", False) - ] - test_data = valid_data[:args.num_questions] - - print(f"\nTesting on {len(test_data)} questions...") - print("=" * 80) - - results = [] - - for item in tqdm(test_data, desc="Processing questions"): - question = item["question"] - answer = item["answer"] - - # Format question with mock context (in real scenario, use actual retrieved docs) - if args.use_mock_context: - context = f"Technical documentation: {answer[:args.context_length]}" - else: - context = "No context provided (zero-shot)" - - formatted_q = f""" -[QUESTION] -{question} -[END_QUESTION] - -[RETRIEVED_CONTEXTS] -{context} -[END_RETRIEVED_CONTEXTS] -""" - - # Run inference - result = model.llm.inference( - question=[formatted_q], - context=[""], - max_tokens=args.max_tokens, - return_uncertainty=args.use_uncertainty, - abstain_on_low_ISR=args.abstain_on_low_isr, - ) - - # Parse result - if args.use_uncertainty and isinstance(result, tuple): - texts, uncertainties = result - generated = texts[0] if texts else "" - uncertainty = uncertainties[0] if uncertainties else None - - result_dict = { - "question": - question[:100], - "expected": - answer[:100], - "generated": - generated[:100], - "isr": - uncertainty.isr if uncertainty else None, - "decision": - "ANSWER" if - (uncertainty and uncertainty.decision_answer) else "REFUSE", - "full_question": - question, - "full_answer": - answer, - "full_generated": - generated, - } - else: - generated = result[0] if isinstance(result, list) else str(result) - result_dict = { - "question": question[:100], - "expected": answer[:100], - "generated": generated[:100], - "full_question": question, - "full_answer": answer, - "full_generated": generated, - } - - results.append(result_dict) - - if args.verbose: - print(f"\nQ: {result_dict['question']}") - print(f"Expected: 
{result_dict['expected']}") - print(f"Generated: {result_dict['generated']}") - if args.use_uncertainty: - print( - f"ISR: {result_dict['isr']:.3f}, Decision: {result_dict['decision']}" - ) - print("-" * 80) - - # Summary statistics - print("\n" + "=" * 80) - print("SUMMARY") - print("=" * 80) - - if args.use_uncertainty: - isr_values = [r['isr'] for r in results if r['isr'] is not None] - answers = [r for r in results if r['decision'] == 'ANSWER'] - refusals = [r for r in results if r['decision'] == 'REFUSE'] - - print(f"Total questions: {len(results)}") - print( - f"Answered: {len(answers)} ({len(answers)/len(results)*100:.1f}%)") - print( - f"Refused: {len(refusals)} ({len(refusals)/len(results)*100:.1f}%)" - ) - print(f"Average ISR: {sum(isr_values)/len(isr_values):.3f}") - print(f"Min ISR: {min(isr_values):.3f}") - print(f"Max ISR: {max(isr_values):.3f}") - else: - print(f"Total questions processed: {len(results)}") - - # Save results if requested - if args.output_file: - with open(args.output_file, 'w') as f: - json.dump(results, f, indent=2) - print(f"\nResults saved to {args.output_file}") - - return results - - -def main(): - parser = argparse.ArgumentParser( - description="Test LLM with uncertainty estimation on TechQA dataset") - - # Dataset args - parser.add_argument('--dataset_dir', type=str, default='techqa', - help='Directory to store/load TechQA dataset') - parser.add_argument('--num_questions', type=int, default=10, - help='Number of questions to test') - - # Model args - parser.add_argument('--model_name', type=str, - default='mistralai/Mistral-7B-Instruct-v0.3', - help='HuggingFace model name') - parser.add_argument('--num_gpus', type=int, default=1, - help='Number of GPUs to use') - parser.add_argument( - '--sys_prompt', type=str, - default='Answer questions based on the provided context.', - help='System prompt for the LLM') - parser.add_argument('--max_tokens', type=int, default=50, - help='Maximum tokens to generate') - - # Uncertainty args - parser.add_argument('--use_uncertainty', action='store_true', - help='Enable uncertainty estimation') - parser.add_argument('--h_star', type=float, default=0.05, - help='Entropy threshold for uncertainty') - parser.add_argument('--isr_threshold', type=float, default=1.0, - help='ISR threshold for abstention') - parser.add_argument('--m', type=int, default=6, - help='Number of semantic sets') - parser.add_argument('--n_samples', type=int, default=3, - help='Number of samples per semantic set') - parser.add_argument('--b_clip', type=float, default=12.0, - help='Clipping value for importance sampling') - parser.add_argument('--skeleton_policy', type=str, default='auto', - choices=['auto', 'evidence_erase', - 'none'], help='Skeleton generation policy') - parser.add_argument('--temperature', type=float, default=0.5, - help='Sampling temperature') - parser.add_argument('--backend', type=str, default='hf', - choices=['hf', 'ollama', 'anthropic'], - help='Backend for decision model') - parser.add_argument('--abstain_on_low_isr', action='store_true', - help='Abstain when ISR is below threshold') - - # Context args - parser.add_argument( - '--use_mock_context', action='store_true', - help='Use partial answer as mock context (for testing)') - parser.add_argument('--context_length', type=int, default=100, - help='Length of mock context to use') - - # Output args - parser.add_argument('--output_file', type=str, default=None, - help='JSON file to save results') - parser.add_argument('--verbose', action='store_true', - help='Print detailed 
results for each question') - - args = parser.parse_args() - - # Run test - test_uncertainty_rag(args) - - print("\nTest complete!") - - -if __name__ == '__main__': - main() From b9c61abf39d3752a91f93518424fe208a8bcc48e Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 3 Oct 2025 02:15:56 +0000 Subject: [PATCH 4/5] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- CHANGELOG.md | 2 ++ test/llm/models/test_llm.py | 1 - 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8f742bf26408..d9f780d15522 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,10 +6,12 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). ## [Unreleased] ### Added + - Added local LLM support for TXT2KG knowledge graph extraction ([#10479](https://github.com/pyg-team/pytorch_geometric/pull/10479)) - Added uncertainty estimation test for LLM class ([#10479](https://github.com/pyg-team/pytorch_geometric/pull/10479)) ### Fixed + - Fixed LLM import path in TXT2KG from `torch_geometric.nn` to `torch_geometric.llm.models` ([#10479](https://github.com/pyg-team/pytorch_geometric/pull/10479)) ## [2.7.0] - 2025-MM-DD diff --git a/test/llm/models/test_llm.py b/test/llm/models/test_llm.py index fa13f0eaf526..f2a3ec49fd09 100644 --- a/test/llm/models/test_llm.py +++ b/test/llm/models/test_llm.py @@ -37,7 +37,6 @@ def test_llm() -> None: def test_llm_uncertainty() -> None: """Test LLM with uncertainty estimation.""" question = ['What is the capital of France?'] - answer = ['Paris'] # Test with uncertainty enabled model = LLM( From acad83c4fa93175d6c4d46056a066462e2a5d325 Mon Sep 17 00:00:00 2001 From: leochlon Date: Fri, 3 Oct 2025 03:52:16 +0100 Subject: [PATCH 5/5] Fix line lengths --- examples/llm/txt2kg_rag.py | 443 +++++++++++++++++++----------- test/llm/models/test_llm.py | 2 +- torch_geometric/llm/models/llm.py | 127 +++++---- 3 files changed, 365 insertions(+), 207 deletions(-) diff --git a/examples/llm/txt2kg_rag.py b/examples/llm/txt2kg_rag.py index fe4c2385cffa..ede39a46c6ba 100644 --- a/examples/llm/txt2kg_rag.py +++ b/examples/llm/txt2kg_rag.py @@ -14,6 +14,7 @@ try: import wandb + wandb_available = True except ImportError: wandb_available = False @@ -39,8 +40,10 @@ LLMJudge, SentenceTransformer, ) -# EDFL planner + universal backends (vendor these, or pip-install if you packaged them): -# from hallucination_toolkit import OpenAIPlanner, OpenAIItem # imported internally by LLM now +# EDFL planner + universal backends (vendor these, +# or pip-install if you packaged them): +# from hallucination_toolkit import OpenAIPlanner, OpenAIItem # imported +# internally by LLM now from torch_geometric.llm.models.txt2kg import _chunk_text from torch_geometric.llm.utils.backend_utils import ( create_graph_from_triples, @@ -72,97 +75,162 @@ def parse_args(): parser = argparse.ArgumentParser() - parser.add_argument('--gnn_model', type=str, default="GAT", - choices=["GAT", "SGFormer"], - help="The GNN model to use. Default is GAT.") - parser.add_argument('--NV_NIM_MODEL', type=str, - default=NV_NIM_MODEL_DEFAULT, - help="The NIM LLM to use for TXT2KG for LLMJudge") - parser.add_argument('--NV_NIM_KEY', type=str, help="NVIDIA API key") parser.add_argument( - '--ENDPOINT_URL', type=str, default=DEFAULT_ENDPOINT_URL, + "--gnn_model", + type=str, + default="GAT", + choices=["GAT", "SGFormer"], + help="The GNN model to use. 
Default is GAT.", + ) + parser.add_argument( + "--NV_NIM_MODEL", + type=str, + default=NV_NIM_MODEL_DEFAULT, + help="The NIM LLM to use for TXT2KG for LLMJudge", + ) + parser.add_argument("--NV_NIM_KEY", type=str, help="NVIDIA API key") + parser.add_argument( + "--ENDPOINT_URL", + type=str, + default=DEFAULT_ENDPOINT_URL, help="The URL hosting your model, \ - in case you are not using the public NIM.") - parser.add_argument('--use_local_txt2kg', action="store_true", - help="Use local LLM for TXT2KG instead of NVIDIA NIM") - parser.add_argument('--txt2kg_model', type=str, - default="mistralai/Mistral-7B-Instruct-v0.3", - help="Local model for TXT2KG") + in case you are not using the public NIM.", + ) + parser.add_argument( + "--use_local_txt2kg", + action="store_true", + help="Use local LLM for TXT2KG instead of NVIDIA NIM", + ) + parser.add_argument( + "--txt2kg_model", + type=str, + default="mistralai/Mistral-7B-Instruct-v0.3", + help="Local model for TXT2KG", + ) parser.add_argument( - '--kg_chunk_size', type=int, default=KG_CHUNK_SIZE_DEFAULT, + "--kg_chunk_size", + type=int, + default=KG_CHUNK_SIZE_DEFAULT, help="When splitting context documents for txt2kg,\ - the maximum number of characters per chunk.") - parser.add_argument('--gnn_hidden_channels', type=int, - default=GNN_HID_CHANNELS_DEFAULT, - help="Hidden channels for GNN") - parser.add_argument('--num_gnn_layers', type=int, - default=GNN_LAYERS_DEFAULT, - help="Number of GNN layers") - parser.add_argument('--lr', type=float, default=LR_DEFAULT, + the maximum number of characters per chunk.", + ) + parser.add_argument( + "--gnn_hidden_channels", + type=int, + default=GNN_HID_CHANNELS_DEFAULT, + help="Hidden channels for GNN", + ) + parser.add_argument( + "--num_gnn_layers", + type=int, + default=GNN_LAYERS_DEFAULT, + help="Number of GNN layers", + ) + parser.add_argument("--lr", type=float, default=LR_DEFAULT, help="Learning rate") - parser.add_argument('--epochs', type=int, default=EPOCHS_DEFAULT, + parser.add_argument("--epochs", type=int, default=EPOCHS_DEFAULT, help="Number of epochs") - parser.add_argument('--batch_size', type=int, default=BATCH_SIZE_DEFAULT, + parser.add_argument("--batch_size", type=int, default=BATCH_SIZE_DEFAULT, help="Batch size") - parser.add_argument('--eval_batch_size', type=int, - default=EVAL_BATCH_SIZE_DEFAULT, - help="Evaluation batch size") - parser.add_argument('--llm_generator_name', type=str, - default=LLM_GENERATOR_NAME_DEFAULT, - help="The LLM to use for Generation") parser.add_argument( - '--llm_generator_mode', type=str, default=LLM_GEN_MODE_DEFAULT, - choices=["frozen", "lora", - "full"], help="Whether to freeze the Generator LLM,\ - use LORA, or fully finetune") - parser.add_argument('--dont_save_model', action="store_true", - help="Whether to skip model saving.") - parser.add_argument('--log_steps', type=int, default=30, + "--eval_batch_size", + type=int, + default=EVAL_BATCH_SIZE_DEFAULT, + help="Evaluation batch size", + ) + parser.add_argument( + "--llm_generator_name", + type=str, + default=LLM_GENERATOR_NAME_DEFAULT, + help="The LLM to use for Generation", + ) + parser.add_argument( + "--llm_generator_mode", + type=str, + default=LLM_GEN_MODE_DEFAULT, + choices=["frozen", "lora", "full"], + help="Whether to freeze the Generator LLM,\ + use LORA, or fully finetune", + ) + parser.add_argument( + "--dont_save_model", + action="store_true", + help="Whether to skip model saving.", + ) + parser.add_argument("--log_steps", type=int, default=30, help="Log to wandb every N steps") - 
parser.add_argument('--wandb_project', type=str, default="techqa", - help="Weights & Biases project name") - parser.add_argument('--wandb', action="store_true", + parser.add_argument( + "--wandb_project", + type=str, + default="techqa", + help="Weights & Biases project name", + ) + parser.add_argument("--wandb", action="store_true", help="Enable wandb logging") parser.add_argument( - '--num_gpus', type=int, default=None, + "--num_gpus", + type=int, + default=None, help="Number of GPUs to use. If not specified," - "will determine automatically based on model size.") - parser.add_argument('--regenerate_dataset', action="store_true", - help="Regenerate the dataset") + "will determine automatically based on model size.", + ) + parser.add_argument( + "--regenerate_dataset", + action="store_true", + help="Regenerate the dataset", + ) parser.add_argument( - '--doc_parsing_mode', type=str, default=None, - choices=["paragraph", - "file"], help="How to parse documents: 'paragraph' splits " + "--doc_parsing_mode", + type=str, + default=None, + choices=["paragraph", "file"], + help="How to parse documents: 'paragraph' splits " "files by paragraphs, 'file' treats each file as" "one document. " - "This will override any value set in the config file.") + "This will override any value set in the config file.", + ) parser.add_argument( - '--k_for_docs', type=int, default=None, + "--k_for_docs", + type=int, + default=None, help="Number of docs to retrieve for each question. " - "This will override any value set in the config file.") + "This will override any value set in the config file.", + ) parser.add_argument( - '--doc_chunk_size', type=int, default=None, + "--doc_chunk_size", + type=int, + default=None, help="The chunk size to use VectorRAG (document retrieval). " - "This will override any value set in the config file.") + "This will override any value set in the config file.", + ) parser.add_argument( - '--dataset', type=str, default="techqa", help="Dataset folder name, " + "--dataset", + type=str, + default="techqa", + help="Dataset folder name, " "should contain corpus and train.json files." "extracted triples, processed dataset, " "document retriever, and model checkpoints " - "will be saved in the dataset folder") + "will be saved in the dataset folder", + ) parser.add_argument( - '--skip_graph_rag', action="store_true", + "--skip_graph_rag", + action="store_true", help="Skip the graph RAG step. 
" - "Used to compare the performance of Vector+Graph RAG vs Vector RAG.") + "Used to compare the performance of Vector+Graph RAG vs Vector RAG.", + ) parser.add_argument( - '--use_x_percent_corpus', default=100.0, type=float, + "--use_x_percent_corpus", + default=100.0, + type=float, help="Debug flag that allows user to only use a random percentage " - "of available knowledge base corpus for RAG") + "of available knowledge base corpus for RAG", + ) args = parser.parse_args() assert args.NV_NIM_KEY, "NVIDIA API key is required for TXT2KG and eval" - assert args.use_x_percent_corpus <= 100 and \ - args.use_x_percent_corpus > 0, "Please provide a value in (0,100]" + assert (args.use_x_percent_corpus <= 100 and args.use_x_percent_corpus + > 0), "Please provide a value in (0,100]" if args.skip_graph_rag: print("Skipping graph RAG step, setting GNN layers to 0...") args.num_gnn_layers = 0 @@ -176,7 +244,9 @@ def parse_args(): if config is not None: # Use a loop to check and apply config values for each parameter config_params = [ - 'doc_parsing_mode', 'doc_chunk_size', 'k_for_docs' + "doc_parsing_mode", + "doc_chunk_size", + "k_for_docs", ] for param in config_params: if param in config and getattr(args, param) is None: @@ -221,7 +291,7 @@ def _process_and_chunk_text(text, chunk_size, doc_parsing_mode): if multiple paragraphs are detected. """ if doc_parsing_mode == "paragraph": - paragraphs = re.split(r'\n{2,}', text) + paragraphs = re.split(r"\n{2,}", text) else: # doc_parsing_mode == 'file' or doc_parsing_mode is None paragraphs = [text] @@ -265,9 +335,11 @@ def get_data(args): if not os.path.exists(args.dataset): os.mkdir(args.dataset) import zipfile - with zipfile.ZipFile(zip_path, 'r') as zip_ref: + + with zipfile.ZipFile(zip_path, "r") as zip_ref: zip_ref.extractall(args.dataset) import shutil + shutil.copy(json_path, os.path.join(args.dataset, "train.json")) elif user_input.lower() == "n" or user_input.lower() == "no": sys.exit("No selected, no data to work with... 
exiting.") @@ -292,9 +364,11 @@ def get_data(args): raise ValueError(f"Bad extraction for {file_path}, expecting " f"text only but got {doc_type}") text_contexts.extend( - _process_and_chunk_text(data[0]["metadata"]["content"], - args.doc_chunk_size, - args.doc_parsing_mode)) + _process_and_chunk_text( + data[0]["metadata"]["content"], + args.doc_chunk_size, + args.doc_parsing_mode, + )) else: for file_path in glob(os.path.join(args.dataset, "corpus", "*")): with open(file_path, "r+") as f: @@ -327,12 +401,13 @@ def index_kg(args, context_docs): print( "Note that if the TXT2KG process is too slow for you're liking using " "the public NIM, consider deploying yourself using local_lm flag of " - "TXT2KG or using https://build.nvidia.com/nvidia/llama-3_1-nemotron-70b-instruct " # noqa + "TXT2KG or using " + "https://build.nvidia.com/nvidia/llama-3_1-nemotron-70b-instruct " "to deploy to a private endpoint, which you can pass to this script " "w/ --ENDPOINT_URL flag.") - print( - "Guide for deploying NIM: https://developer.nvidia.com/blog/a-simple-guide-to-deploying-generative-ai-with-nvidia-nim/" # noqa - ) + print("Guide for deploying NIM: " + "https://developer.nvidia.com/blog/" + "a-simple-guide-to-deploying-generative-ai-with-nvidia-nim/") total_tqdm_count = len(context_docs) initial_tqdm_count = 0 checkpoint_file = list(Path(args.dataset).glob("*--*--checkpoint_kg.pt")) @@ -342,18 +417,16 @@ def index_kg(args, context_docs): if len(checkpoint_file) == 1: print("Restoring KG from checkpoint") checkpoint_file = checkpoint_file[0] - checkpoint_model_name = checkpoint_file.name.split('--')[0] + checkpoint_model_name = checkpoint_file.name.split("--")[0] # check if triples generation are using the correct model if args.use_local_txt2kg: if checkpoint_model_name != "local": raise RuntimeError( - "Error: stored triples were generated using a different model" - ) + "Error: stored triples generated using a different model") else: - if args.NV_NIM_MODEL.split('/')[-1] != checkpoint_model_name: + if args.NV_NIM_MODEL.split("/")[-1] != checkpoint_model_name: raise RuntimeError( - "Error: stored triples were generated using a different model" - ) + "Error: stored triples generated using a different model") saved_relevant_triples = torch.load(checkpoint_file, weights_only=False) kg_maker.relevant_triples = saved_relevant_triples @@ -363,16 +436,19 @@ def index_kg(args, context_docs): chkpt_interval = 10 chkpt_count = 0 - for context_doc in tqdm(context_docs, total=total_tqdm_count, - initial=initial_tqdm_count, - desc="Extracting KG triples"): + for context_doc in tqdm( + context_docs, + total=total_tqdm_count, + initial=initial_tqdm_count, + desc="Extracting KG triples", + ): kg_maker.add_doc_2_KG(txt=context_doc) chkpt_count += 1 if chkpt_count == chkpt_interval: chkpt_count = 0 path = args.dataset + "/{m}--{t}--checkpoint_kg.pt" - model = kg_maker.NIM_MODEL.split( - '/')[-1] if not kg_maker.local_LM else "local" + model = (kg_maker.NIM_MODEL.split("/")[-1] + if not kg_maker.local_LM else "local") path = path.format(m=model, t=datetime.now().strftime("%Y%m%d_%H%M%S")) torch.save(kg_maker.relevant_triples, path) @@ -384,12 +460,13 @@ def index_kg(args, context_docs): triples = list(dict.fromkeys(triples)) raw_triples_path = args.dataset + "/{m}--{t}--raw_triples.pt" - model_name = kg_maker.NIM_MODEL.split( - '/')[-1] if not kg_maker.local_LM else "local" + model_name = (kg_maker.NIM_MODEL.split("/")[-1] + if not kg_maker.local_LM else "local") torch.save( triples, raw_triples_path.format(m=model_name, 
- t=datetime.now().strftime("%Y%m%d_%H%M%S"))) + t=datetime.now().strftime("%Y%m%d_%H%M%S")), + ) for old_checkpoint_file in Path( args.dataset).glob("*--*--checkpoint_kg.pt"): @@ -402,8 +479,8 @@ def update_data_lists(args, data_lists): device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # creating the embedding model sent_trans_batch_size = 256 - model = SentenceTransformer( - model_name=ENCODER_MODEL_NAME_DEFAULT).to(device).eval() + model = (SentenceTransformer( + model_name=ENCODER_MODEL_NAME_DEFAULT).to(device).eval()) model_kwargs = { "output_device": device, "batch_size": int(sent_trans_batch_size / 4), @@ -458,19 +535,17 @@ def make_dataset(args): raise RuntimeError("Error: multiple raw_triples files found") if len(raw_triples_file) == 1: raw_triples_file = raw_triples_file[0] - stored_model_name = raw_triples_file.name.split('--')[0] + stored_model_name = raw_triples_file.name.split("--")[0] # Check if stored triples match current model configuration if args.use_local_txt2kg: if stored_model_name != "local": raise RuntimeError( - "Error: stored triples were generated using a different model" - ) + "Error: stored triples generated using a different model") else: - if args.NV_NIM_MODEL.split('/')[-1] != stored_model_name: + if args.NV_NIM_MODEL.split("/")[-1] != stored_model_name: raise RuntimeError( - "Error: stored triples were generated using a different model" - ) + "Error: stored triples generated using a different model") print(f" -> Saved triples generated with: {stored_model_name}") triples = torch.load(raw_triples_file) @@ -488,18 +563,23 @@ def make_dataset(args): print("Creating the graph data from raw triples...") # create the graph data from raw triples graph_data = create_graph_from_triples( - triples=triples, embedding_model=model.encode, + triples=triples, + embedding_model=model.encode, embedding_method_kwargs={ "batch_size": min(len(triples), sent_trans_batch_size), - "verbose": True - }, pre_transform=preprocess_triplet) + "verbose": True, + }, + pre_transform=preprocess_triplet, + ) print("Creating the graph and feature stores...") # creating the graph and feature stores fs, gs = create_remote_backend_from_graph_data( - graph_data=graph_data, path="backend", + graph_data=graph_data, + path="backend", graph_db=NeighborSamplingRAGGraphStore, - feature_db=KNNRAGFeatureStore).load() + feature_db=KNNRAGFeatureStore, + ).load() """ NOTE: these retriever hyperparams are very important. Tuning may be needed for custom data... 
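For readers tracing the checkpoint handling in the hunks above: index_kg() and make_dataset() both tag saved triples with the model that produced them, writing "<model>--<timestamp>--checkpoint_kg.pt" and "<model>--<timestamp>--raw_triples.pt", where the tag is the literal string "local" whenever --use_local_txt2kg is set and otherwise the basename of --NV_NIM_MODEL. A minimal sketch of that convention follows; the helper names are illustrative and not part of the patch.

from datetime import datetime
from pathlib import Path


def model_tag(use_local_txt2kg: bool, nv_nim_model: str) -> str:
    # Local TXT2KG runs are tagged "local"; NIM runs use the model basename.
    return "local" if use_local_txt2kg else nv_nim_model.split("/")[-1]


def checkpoint_path(dataset_dir: str, tag: str) -> str:
    # Same "{model}--{timestamp}--checkpoint_kg.pt" pattern as index_kg().
    stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    return f"{dataset_dir}/{tag}--{stamp}--checkpoint_kg.pt"


def check_saved_triples(dataset_dir: str, tag: str) -> None:
    # Mirrors the guard in make_dataset(): refuse to reuse triples that were
    # produced by a different model than the one currently configured.
    for f in Path(dataset_dir).glob("*--*--raw_triples.pt"):
        if f.name.split("--")[0] != tag:
            raise RuntimeError(
                "Error: stored triples generated using a different model")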
@@ -508,7 +588,7 @@ def make_dataset(args): model_kwargs = { "output_device": device, "batch_size": int(sent_trans_batch_size / 4), - "verbose": True + "verbose": True, } doc_retriever_path = os.path.join(args.dataset, "document_retriever.pt") @@ -521,10 +601,12 @@ def make_dataset(args): vector_retriever.k_for_docs = args.k_for_docs else: print("Creating document retriever...") - vector_retriever = DocumentRetriever(context_docs, - k_for_docs=args.k_for_docs, - model=model.encode, - model_kwargs=model_kwargs) + vector_retriever = DocumentRetriever( + context_docs, + k_for_docs=args.k_for_docs, + model=model.encode, + model_kwargs=model_kwargs, + ) vector_retriever.save(doc_retriever_path) subgraph_filter = make_pcst_filter( @@ -532,8 +614,9 @@ def make_dataset(args): model, topk=5, # nodes topk_e=5, # edges - cost_e=.5, # edge cost - num_clusters=10) # num clusters + cost_e=0.5, # edge cost + num_clusters=10, + ) # num clusters # number of neighbors for each seed node selected by KNN fanout = 100 @@ -552,10 +635,12 @@ def make_dataset(args): print("Now to retrieve context for each query from " "our Vector and Graph DBs...") - query_loader = RAGQueryLoader(graph_data=(fs, gs), - subgraph_filter=subgraph_filter, - vector_retriever=vector_retriever, - config=query_loader_config) + query_loader = RAGQueryLoader( + graph_data=(fs, gs), + subgraph_filter=subgraph_filter, + vector_retriever=vector_retriever, + config=query_loader_config, + ) # pre-process the dataset total_data_list = [] @@ -578,15 +663,17 @@ def make_dataset(args): # stats print("Min # of Retrieved Triples =", min(extracted_triple_sizes)) print("Max # of Retrieved Triples =", max(extracted_triple_sizes)) - print("Average # of Retrieved Triples =", - sum(extracted_triple_sizes) / len(extracted_triple_sizes)) + print( + "Average # of Retrieved Triples =", + sum(extracted_triple_sizes) / len(extracted_triple_sizes), + ) # 60:20:20 split - data_lists["train"] = total_data_list[:int(.6 * len(total_data_list))] - data_lists["validation"] = total_data_list[int(.6 * len(total_data_list) - ):int(.8 * + data_lists["train"] = total_data_list[:int(0.6 * len(total_data_list))] + data_lists["validation"] = total_data_list[int(0.6 * len(total_data_list) + ):int(0.8 * len(total_data_list))] - data_lists["test"] = total_data_list[int(.8 * len(total_data_list)):] + data_lists["test"] = total_data_list[int(0.8 * len(total_data_list)):] dataset_name = os.path.basename(args.dataset) dataset_path = os.path.join(args.dataset, f"{dataset_name}.pt") @@ -599,9 +686,11 @@ def make_dataset(args): def train(args, train_loader, val_loader): if args.wandb: - wandb.init(project=args.wandb_project, - name=f"run_{datetime.now().strftime('%Y-%m-%d_%H:%M')}", - config=vars(args)) + wandb.init( + project=args.wandb_project, + name=f"run_{datetime.now().strftime('%Y-%m-%d_%H:%M')}", + config=vars(args), + ) hidden_channels = args.gnn_hidden_channels num_gnn_layers = args.num_gnn_layers @@ -623,20 +712,32 @@ def _uncertainty_kwargs(): "mask_refusals_in_loss": True, }, decision_backend_kwargs=dict( - # HF backend (default): reuse the same model id; using a separate pipeline under the hood. - # If using Ollama or Anthropic instead, pass those credentials/args here. + # HF backend (default): reuse the same model id; using a + # separate pipeline under the hood. + # If using Ollama or Anthropic instead, pass those + # credentials/args here. 
), ) if args.num_gnn_layers > 0: if args.gnn_model == "GAT": - gnn = GAT(in_channels=768, hidden_channels=hidden_channels, - out_channels=1024, num_layers=num_gnn_layers, heads=4) + gnn = GAT( + in_channels=768, + hidden_channels=hidden_channels, + out_channels=1024, + num_layers=num_gnn_layers, + heads=4, + ) elif args.gnn_model == "SGFormer": - gnn = SGFormer(in_channels=768, hidden_channels=hidden_channels, - out_channels=1024, trans_num_heads=1, - trans_dropout=0.5, gnn_num_layers=num_gnn_layers, - gnn_dropout=0.5) + gnn = SGFormer( + in_channels=768, + hidden_channels=hidden_channels, + out_channels=1024, + trans_num_heads=1, + trans_dropout=0.5, + gnn_num_layers=num_gnn_layers, + gnn_dropout=0.5, + ) else: raise ValueError(f"Invalid GNN model: {args.gnn_model}") else: @@ -685,17 +786,20 @@ def _uncertainty_kwargs(): else: params = [p for _, p in model.named_parameters() if p.requires_grad] lr = args.lr - optimizer = torch.optim.AdamW([{ - 'params': params, - 'lr': lr, - 'weight_decay': 0.05 - }], betas=(0.9, 0.95)) + optimizer = torch.optim.AdamW( + [{ + "params": params, + "lr": lr, + "weight_decay": 0.05 + }], + betas=(0.9, 0.95), + ) num_oom_errors = 0 for epoch in range(args.epochs): model.train() epoch_loss = 0 - epoch_str = f'Epoch: {epoch + 1}|{args.epochs}' + epoch_str = f"Epoch: {epoch + 1}|{args.epochs}" loader = tqdm(train_loader, desc=epoch_str) for step, batch in enumerate(loader): new_qs = [] @@ -704,7 +808,8 @@ def _uncertainty_kwargs(): new_qs.append( prompt_template.format( question=q, - context="\n".join(batch.text_context[i]))) + context="\n".join(batch.text_context[i]), + )) batch.question = new_qs if args.skip_graph_rag: @@ -714,37 +819,44 @@ def _uncertainty_kwargs(): try: loss = get_loss(model, batch) loss.backward() - clip_grad_norm_(optimizer.param_groups[0]['params'], 0.1) + clip_grad_norm_(optimizer.param_groups[0]["params"], 0.1) if (step + 1) % 2 == 0: - adjust_learning_rate(optimizer.param_groups[0], lr, - step / len(train_loader) + epoch, - args.epochs) + adjust_learning_rate( + optimizer.param_groups[0], + lr, + step / len(train_loader) + epoch, + args.epochs, + ) optimizer.step() epoch_loss += float(loss.detach()) if args.wandb and (step + 1) % args.log_steps == 0: wandb.log({ "train/loss": float(loss.detach()), - "train/lr": optimizer.param_groups[0]['lr'], + "train/lr": optimizer.param_groups[0]["lr"], }) if (step + 1) % 2 == 0: - lr = optimizer.param_groups[0]['lr'] + lr = optimizer.param_groups[0]["lr"] except torch.cuda.OutOfMemoryError: torch.cuda.empty_cache() - print("Sequence length of last batch: ", - model.seq_length_stats[-1]) + print( + "Sequence length of last batch: ", + model.seq_length_stats[-1], + ) # TODO: Implement CPU fallback (WIP) num_oom_errors += 1 print("Sequence length stats: ") - print("seq_len avg: ", - sum(model.seq_length_stats) / len(model.seq_length_stats)) + print( + "seq_len avg: ", + sum(model.seq_length_stats) / len(model.seq_length_stats), + ) print("seq_len min: ", min(model.seq_length_stats)) print("seq_len max: ", max(model.seq_length_stats)) print("Percent of OOM errors: ", num_oom_errors / len(train_loader)) train_loss = epoch_loss / len(train_loader) - print(epoch_str + f', Train Loss: {train_loss:4f}') + print(epoch_str + f", Train Loss: {train_loss:4f}") # Eval Step val_loss = 0 @@ -757,7 +869,8 @@ def _uncertainty_kwargs(): new_qs.append( prompt_template.format( question=q, - context="\n".join(batch.text_context[i]))) + context="\n".join(batch.text_context[i]), + )) batch.question = new_qs if 
args.skip_graph_rag: batch.desc = "" @@ -770,7 +883,7 @@ def _uncertainty_kwargs(): wandb.log({ "val/loss": val_loss, "train/epoch_loss": train_loss, - "epoch": epoch + 1 + "epoch": epoch + 1, }) if args.wandb: @@ -804,8 +917,8 @@ def eval(question: str, pred: str, correct_answer: str): test_batch.question = new_qs if args.skip_graph_rag: test_batch.desc = "" - preds = (inference_step(model, test_batch, - max_out_tokens=max_chars_in_train_answer / 2)) + preds = inference_step(model, test_batch, + max_out_tokens=max_chars_in_train_answer / 2) for question, pred, label in zip(raw_qs, preds, test_batch.label): eval_tuples.append((question, pred, label)) for question, pred, label in tqdm(eval_tuples, desc="Eval"): @@ -817,7 +930,7 @@ def eval(question: str, pred: str, correct_answer: str): print("Improvement of this estimation process is WIP...") -if __name__ == '__main__': +if __name__ == "__main__": # for reproducibility seed_everything(50) @@ -848,13 +961,27 @@ def eval(question: str, pred: str, correct_answer: str): data_lists = make_dataset(args) batch_size = args.batch_size eval_batch_size = args.eval_batch_size - train_loader = DataLoader(data_lists["train"], batch_size=batch_size, - drop_last=True, pin_memory=True, shuffle=True) - val_loader = DataLoader(data_lists["validation"], - batch_size=eval_batch_size, drop_last=False, - pin_memory=True, shuffle=False) - test_loader = DataLoader(data_lists["test"], batch_size=eval_batch_size, - drop_last=False, pin_memory=True, shuffle=False) + train_loader = DataLoader( + data_lists["train"], + batch_size=batch_size, + drop_last=True, + pin_memory=True, + shuffle=True, + ) + val_loader = DataLoader( + data_lists["validation"], + batch_size=eval_batch_size, + drop_last=False, + pin_memory=True, + shuffle=False, + ) + test_loader = DataLoader( + data_lists["test"], + batch_size=eval_batch_size, + drop_last=False, + pin_memory=True, + shuffle=False, + ) model = train(args, train_loader, val_loader) test(model, test_loader, args) diff --git a/test/llm/models/test_llm.py b/test/llm/models/test_llm.py index f2a3ec49fd09..07283775e73a 100644 --- a/test/llm/models/test_llm.py +++ b/test/llm/models/test_llm.py @@ -36,7 +36,7 @@ def test_llm() -> None: @withPackage('transformers', 'accelerate') def test_llm_uncertainty() -> None: """Test LLM with uncertainty estimation.""" - question = ['What is the capital of France?'] + question = ['Who won the Nobel Prize in Physics in 2050?'] # Test with uncertainty enabled model = LLM( diff --git a/torch_geometric/llm/models/llm.py b/torch_geometric/llm/models/llm.py index 47a27dce3c2e..8596046073ee 100644 --- a/torch_geometric/llm/models/llm.py +++ b/torch_geometric/llm/models/llm.py @@ -54,12 +54,12 @@ def _parse_decision_json(text: str) -> str: PAD_TOKEN_ID = 0 -PADDING_SIDE = 'left' +PADDING_SIDE = "left" # legacy constants - used for Llama 2 style prompting -BOS = '[INST]' -EOS_USER = '[/INST]' -EOS = '[/s]' +BOS = "[INST]" +EOS_USER = "[/INST]" +EOS = "[/s]" def get_llm_kwargs(required_memory: int, dtype=torch.dtype) -> Dict[str, Any]: @@ -75,15 +75,15 @@ def get_llm_kwargs(required_memory: int, dtype=torch.dtype) -> Dict[str, Any]: if sum(gpu_memory) < required_memory: gpu_memory = [] # If not enough VRAM, use pure CPU. 
- kwargs = dict(revision='main') + kwargs = dict(revision="main") if len(gpu_memory) > 0: - kwargs['max_memory'] = { - i: f'{memory}GiB' + kwargs["max_memory"] = { + i: f"{memory}GiB" for i, memory in enumerate(gpu_memory) } - kwargs['low_cpu_mem_usage'] = True - kwargs['device_map'] = 'auto' - kwargs['torch_dtype'] = dtype + kwargs["low_cpu_mem_usage"] = True + kwargs["device_map"] = "auto" + kwargs["torch_dtype"] = dtype return kwargs @@ -124,9 +124,11 @@ def __init__( self.model_name = model_name from transformers import AutoModelForCausalLM, AutoTokenizer + if n_gpus is None: if num_params is None: from huggingface_hub import get_safetensors_metadata + safetensors_metadata = get_safetensors_metadata(model_name) param_count = safetensors_metadata.parameter_count num_params = float(list(param_count.values())[0] // 10**9) @@ -139,14 +141,14 @@ def __init__( gpu_memory: List[int] = [] for i in range(n_gpus): gpu_memory.append(torch.cuda.mem_get_info(i)[0] // 1024**3) - kwargs = dict(revision='main') - kwargs['max_memory'] = { - i: f'{memory}GiB' + kwargs = dict(revision="main") + kwargs["max_memory"] = { + i: f"{memory}GiB" for i, memory in enumerate(gpu_memory) } - kwargs['low_cpu_mem_usage'] = True - kwargs['device_map'] = 'auto' - kwargs['torch_dtype'] = dtype + kwargs["low_cpu_mem_usage"] = True + kwargs["device_map"] = "auto" + kwargs["torch_dtype"] = dtype print(f"Setting up '{model_name}' with configuration: {kwargs}") self.tokenizer = AutoTokenizer.from_pretrained( @@ -179,22 +181,24 @@ def __init__( self.sys_prompt = sys_prompt else: self.sys_prompt = "" - if 'max_memory' not in kwargs: # Pure CPU: + if "max_memory" not in kwargs: # Pure CPU: warnings.warn( "LLM is being used on CPU, which may be slow. This decision " "was made by a rough hueristic that assumes your GPU set up " "does not have enough GPU RAM. This is done to avoid GPU OOM " "errors. If you think this is a mistake, please initialize " "your LLM with the n_gpus param to dictate how many gpus to " - "use for the LLM.", stacklevel=2) - self.device = torch.device('cpu') + "use for the LLM.", + stacklevel=2, + ) + self.device = torch.device("cpu") self.autocast_context = nullcontext() else: self.device = self.llm.device if dtype == torch.float32: self.autocast_context = nullcontext() else: - self.autocast_context = torch.amp.autocast('cuda', dtype=dtype) + self.autocast_context = torch.amp.autocast("cuda", dtype=dtype) # ---- Uncertainty config / backends (opt-in) ---------------------- self.uncertainty_estim = bool(uncertainty_estim) @@ -232,16 +236,18 @@ def __init__( HuggingFaceBackend, OllamaBackend, ) + self._HFBackend = HuggingFaceBackend self._OllamaBackend = OllamaBackend self._AnthropicBackend = AnthropicBackend except Exception: - self._HFBackend = self._OllamaBackend = self._AnthropicBackend = None + self._HFBackend = self._OllamaBackend = ( + self._AnthropicBackend) = None except Exception: self.uncertainty_estim = False warnings.warn( - "Uncertainty requested but `hallucination_toolkit` not importable. " - "Proceeding without uncertainty.", + "Uncertainty requested but `hallucination_toolkit` not " + "importable. 
Proceeding without uncertainty.", stacklevel=2, ) @@ -258,8 +264,8 @@ def _get_decision_backend(self): if not self.uncertainty_estim: return None backend_kind = str(self._uncfg.get("backend", "hf")).lower() - if backend_kind == "hf" and getattr(self, "_HFBackend", - None) is not None: + if (backend_kind == "hf" + and getattr(self, "_HFBackend", None) is not None): self._backend = self._HFBackend( mode="transformers", model_id=self.model_name, @@ -267,11 +273,11 @@ def _get_decision_backend(self): trust_remote_code=True, **self._dec_backend_kwargs, ) - elif backend_kind == "ollama" and getattr(self, "_OllamaBackend", - None) is not None: + elif (backend_kind == "ollama" + and getattr(self, "_OllamaBackend", None) is not None): self._backend = self._OllamaBackend(**self._dec_backend_kwargs) - elif backend_kind == "anthropic" and getattr(self, "_AnthropicBackend", - None) is not None: + elif (backend_kind == "anthropic" + and getattr(self, "_AnthropicBackend", None) is not None): self._backend = self._AnthropicBackend(**self._dec_backend_kwargs) else: self._backend = None @@ -340,17 +346,23 @@ def _encode_inputs( context = self.tokenizer(context, add_special_tokens=False) eos_user_tokens = self.tokenizer(EOS_USER, add_special_tokens=False) - bos_token = self.tokenizer( + bos_token = (self.tokenizer( BOS, add_special_tokens=False, - return_tensors='pt', - ).input_ids[0].to(self.device) + return_tensors="pt", + ).input_ids[0].to(self.device)) bos_embeds = self.word_embedding(bos_token) pad_token = torch.tensor(self.tokenizer.pad_token_id, device=self.device) pad_embeds = self.word_embedding(pad_token).unsqueeze(0) - return (batch_size, questions, context, eos_user_tokens, bos_embeds, - pad_embeds) + return ( + batch_size, + questions, + context, + eos_user_tokens, + bos_embeds, + pad_embeds, + ) def _label_input_ids( self, @@ -445,8 +457,14 @@ def _get_embeds_old( embedding: Optional[List[Tensor]] = None, answer: Optional[List[str]] = None, ) -> tuple: - (batch_size, question, context, eos_user_tokens, bos_embeds, - pad_embeds) = self._encode_inputs(question, context) + ( + batch_size, + question, + context, + eos_user_tokens, + bos_embeds, + pad_embeds, + ) = self._encode_inputs(question, context) batch_label_input_ids = None if answer is not None: @@ -480,8 +498,11 @@ def _get_embeds_old( ) inputs_embeds, attention_mask, label_input_ids = self._pad_embeds( - pad_embeds, batch_inputs_embeds, batch_attention_mask, - batch_label_input_ids) + pad_embeds, + batch_inputs_embeds, + batch_attention_mask, + batch_label_input_ids, + ) return inputs_embeds, attention_mask, label_input_ids @@ -497,7 +518,9 @@ def _get_embeds( f"HuggingFace model {self.model_name} is not using a " "chat template, using Llama 2 style prompting. 
Please " "consider using a more recent model and initialize the " - "LLM with `sys_prompt`.", stacklevel=2) + "LLM with `sys_prompt`.", + stacklevel=2, + ) return self._get_embeds_old(question, context, embedding, answer) batch_label_input_ids = None if answer is not None: @@ -535,11 +558,11 @@ def _get_embeds( else: label_input_ids = None - bos_token = self.tokenizer( + bos_token = (self.tokenizer( self.tokenizer.bos_token, add_special_tokens=False, - return_tensors='pt', - ).input_ids[0].to(self.device) + return_tensors="pt", + ).input_ids[0].to(self.device)) bos_embeds = self.word_embedding(bos_token) @@ -569,8 +592,11 @@ def _get_embeds( pad_embeds = self.word_embedding(pad_token).unsqueeze(0) inputs_embeds, attention_mask, label_input_ids = self._pad_embeds( - pad_embeds, batch_inputs_embeds, batch_attention_mask, - batch_label_input_ids) + pad_embeds, + batch_inputs_embeds, + batch_attention_mask, + batch_label_input_ids, + ) return inputs_embeds, attention_mask, label_input_ids @@ -600,8 +626,8 @@ def forward( if self.uncertainty_estim and label_input_ids is not None: unc = self._compute_uncertainty(question, context) self._last_uncertainty = unc - if (self._uncfg.get("mask_refusals_in_loss", True) - and len(unc) == len(question)): + if self._uncfg.get("mask_refusals_in_loss", + True) and len(unc) == len(question): for i, u in enumerate(unc): if not u.decision_answer: label_input_ids[i, :] = IGNORE_INDEX @@ -630,7 +656,6 @@ def inference( Args: question (list[str]): The questions/prompts. - answer (list[str]): The answers/labels. context (list[str], optional): Additional context to give to the LLM, such as textified knowledge graphs. (default: :obj:`None`) embedding (list[torch.Tensor], optional): RAG embedding @@ -639,6 +664,12 @@ def inference( both. (default: :obj:`None`) max_tokens (int, optional): How many tokens for the LLM to generate. (default: :obj:`32`) + return_uncertainty (bool, optional): Whether to also return the + uncertainty estimates computed for each item. (default: + :obj:`False`) + abstain_on_low_ISR (bool, optional): Whether to replace outputs + with ``"[ABSTAIN]"`` when the ISR-driven decision flags the + answer as a refusal. (default: :obj:`False`) """ inputs_embeds, attention_mask, _ = self._get_embeds( question, context, embedding) @@ -669,4 +700,4 @@ def inference( return texts def __repr__(self) -> str: - return f'{self.__class__.__name__}({self.model_name})' + return f"{self.__class__.__name__}({self.model_name})"
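The uncertainty path added to llm.py is opt-in. A minimal usage sketch, assuming the vendored hallucination_toolkit backends are importable and mirroring the kwargs that txt2kg_rag.py builds in _uncertainty_kwargs(); the model id is only an example and any HF chat model can stand in.

from torch_geometric.llm.models import LLM

llm = LLM(
    model_name="meta-llama/Llama-3.1-8B-Instruct",  # example HF chat model
    sys_prompt="Answer strictly from the provided context.",
    uncertainty_estim=True,
    uncertainty_cfg={
        "h_star": 0.05,              # target hallucination rate
        "isr_threshold": 1.0,        # decisions below this ISR are refusals
        "m": 6,
        "n_samples": 3,
        "B_clip": 12.0,
        "clip_mode": "one-sided",
        "skeleton_policy": "auto",
        "q_floor": None,
        "temperature": 0.5,
        "max_tokens_decision": 8,
        "backend": "hf",             # or "ollama" / "anthropic"
        "mask_refusals_in_loss": True,
    },
    # decision_backend_kwargs=dict(...)  # credentials/args for non-HF backends
)

out = llm.inference(
    question=["Who won the Nobel Prize in Physics in 2050?"],
    max_tokens=32,
    return_uncertainty=True,   # also surface the per-item estimates
    abstain_on_low_ISR=True,   # low-ISR generations come back as "[ABSTAIN]"
)
# With return_uncertainty=False the call returns a plain list of strings;
# with it enabled, expect the texts together with the uncertainty records.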
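Continuing the sketch above on the training side: when labels are supplied, forward() caches the per-item decisions on _last_uncertainty and, with mask_refusals_in_loss enabled, overwrites the labels of refused items with IGNORE_INDEX so they contribute no gradient. The snippet below surfaces that behaviour during fine-tuning; it assumes the usual LLM.forward(question, answer, ...) training call and the decision_answer field shown in the forward() hunk, with toy data that is purely illustrative.

questions = ["Who won the Nobel Prize in Physics in 2050?"]
answers = ["No such prize has been awarded."]

loss = llm(questions, answers)  # supervised forward pass with labels
records = getattr(llm, "_last_uncertainty", None) or []
for q, rec in zip(questions, records):
    verdict = "answered" if rec.decision_answer else "refused -> masked in loss"
    print(f"{q!r}: {verdict}")
loss.backward()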