From 913c55bf08eb02a29b94601845a418885e6c9e6d Mon Sep 17 00:00:00 2001
From: degenfabian
Date: Mon, 18 Aug 2025 20:00:11 +0200
Subject: [PATCH] updated loading in llama 2 demo to use transformer bridge

---
 .github/workflows/checks.yml     |  2 +-
 demos/LLaMA2_GPU_Quantized.ipynb | 14 ++++++++------
 2 files changed, 9 insertions(+), 7 deletions(-)

diff --git a/.github/workflows/checks.yml b/.github/workflows/checks.yml
index 4de51026c..a6f7935c9 100644
--- a/.github/workflows/checks.yml
+++ b/.github/workflows/checks.yml
@@ -154,7 +154,7 @@ jobs:
           # - "Head_Detector_Demo"
           - "Interactive_Neuroscope"
           # - "LLaMA"
-          # - "LLaMA2_GPU_Quantized"
+          - "LLaMA2_GPU_Quantized"
           - "Main_Demo"
           # - "No_Position_Experiment"
           - "Othello_GPT"
diff --git a/demos/LLaMA2_GPU_Quantized.ipynb b/demos/LLaMA2_GPU_Quantized.ipynb
index 685e6803d..2ce0a9f86 100644
--- a/demos/LLaMA2_GPU_Quantized.ipynb
+++ b/demos/LLaMA2_GPU_Quantized.ipynb
@@ -212,7 +212,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": null,
    "metadata": {
     "id": "P8zS3MPkCUsR"
    },
@@ -232,7 +232,7 @@
     "from transformer_lens.hook_points import (\n",
     "    HookPoint,\n",
     ")  # Hooking utilities\n",
-    "from transformer_lens import HookedTransformer\n",
+    "from transformer_lens.model_bridge import TransformerBridge\n",
     "\n",
     "torch.set_grad_enabled(False)\n",
     "\n",
@@ -291,7 +291,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": null,
    "metadata": {
     "id": "RdJ0AuW_CUsS"
    },
@@ -303,7 +303,8 @@
     "    tokenizer = LlamaTokenizer.from_pretrained(MODEL_PATH)\n",
     "    hf_model = LlamaForCausalLM.from_pretrained(MODEL_PATH, low_cpu_mem_usage=True)\n",
     "\n",
-    "    model = HookedTransformer.from_pretrained(\"llama-7b\", hf_model=hf_model, device=\"cpu\", fold_ln=False, center_writing_weights=False, center_unembed=False, tokenizer=tokenizer)\n",
+    "    model = TransformerBridge.boot_transformers(\"llama-7b\", hf_model=hf_model, device=\"cpu\", fold_ln=False, center_writing_weights=False, center_unembed=False, tokenizer=tokenizer)\n",
+    "    model.enable_compatibility_mode()\n",
     "\n",
     "    model = model.to(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
     "    model.generate(\"The capital of Germany is\", max_new_tokens=20, temperature=0)"
@@ -406,7 +407,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": null,
    "metadata": {
     "colab": {
      "base_uri": "https://localhost:8080/",
@@ -730,7 +731,7 @@
     "\n",
     "tokenizer = AutoTokenizer.from_pretrained(LLAMA_2_7B_CHAT_PATH)\n",
     "\n",
-    "model = HookedTransformer.from_pretrained(LLAMA_2_7B_CHAT_PATH,\n",
+    "model = TransformerBridge.boot_transformers(LLAMA_2_7B_CHAT_PATH,\n",
     "                                          hf_model=hf_model,\n",
     "                                          dtype=inference_dtype,\n",
     "                                          fold_ln=False,\n",
@@ -738,6 +739,7 @@
     "                                          center_writing_weights=False,\n",
     "                                          center_unembed=False,\n",
     "                                          tokenizer=tokenizer)\n",
+    "model.enable_compatibility_mode()\n",
     "\n",
     "model.generate(\"The capital of Germany is\", max_new_tokens=2, temperature=0)\n",
     "\n"
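
For reference, the net effect of the notebook changes is the loading flow below. This is a minimal, self-contained sketch assembled from the patched cells, not a file in the patch itself; MODEL_PATH is a placeholder for a local LLaMA-7B checkpoint path, while the TransformerBridge calls and weight-processing flags are exactly the ones the diff introduces.

    import torch
    from transformers import LlamaForCausalLM, LlamaTokenizer

    from transformer_lens.model_bridge import TransformerBridge

    torch.set_grad_enabled(False)  # the demo only runs inference

    MODEL_PATH = "path/to/llama-7b"  # placeholder: local converted checkpoint

    tokenizer = LlamaTokenizer.from_pretrained(MODEL_PATH)
    hf_model = LlamaForCausalLM.from_pretrained(MODEL_PATH, low_cpu_mem_usage=True)

    # Boot the bridge from the pre-loaded Hugging Face model, passing the same
    # flags the demo previously gave HookedTransformer.from_pretrained.
    model = TransformerBridge.boot_transformers(
        "llama-7b",
        hf_model=hf_model,
        device="cpu",
        fold_ln=False,
        center_writing_weights=False,
        center_unembed=False,
        tokenizer=tokenizer,
    )
    # Enable compatibility mode immediately after booting, as both patched
    # cells do, so existing HookedTransformer-style code keeps working.
    model.enable_compatibility_mode()

    model = model.to("cuda" if torch.cuda.is_available() else "cpu")
    model.generate("The capital of Germany is", max_new_tokens=20, temperature=0)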