diff --git a/examples/ModernBERT_NER.ipynb b/examples/ModernBERT_NER.ipynb new file mode 100644 index 00000000..2a0e441a --- /dev/null +++ b/examples/ModernBERT_NER.ipynb @@ -0,0 +1,715 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + } + }, + "cells": [ + { + "cell_type": "code", + "source": [ + "# !pip install transformers datasets torch accelerate seqeval\n", + "# For GPU training\n", + "# pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118" + ], + "metadata": { + "id": "N1_GgA-_uQRb" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "!pip install seqeval" + ], + "metadata": { + "id": "Vs4d2KX8xOtd" + }, + "execution_count": 1, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "!pip install -U datasets\n" + ], + "metadata": { + "id": "IPJ3CXlIxlDC" + }, + "execution_count": 2, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "import torch\n", + "from datasets import load_dataset, Dataset\n", + "from transformers import (\n", + " AutoTokenizer,\n", + " AutoModelForTokenClassification,\n", + " TrainingArguments,\n", + " Trainer,\n", + " DataCollatorForTokenClassification\n", + ")\n", + "from seqeval.metrics import accuracy_score, classification_report, f1_score\n", + "import numpy as np\n" + ], + "metadata": { + "id": "l3o0vdx1uRbZ" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "\n", + "# Load ModernBERT tokenizer and model\n", + "model_name = \"answerdotai/ModernBERT-base\"\n", + "tokenizer = AutoTokenizer.from_pretrained(model_name)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "LCdaBaN2uRd2", + "outputId": "4413ecc2-2066-4f1a-eba2-ba89ee147f01" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "/usr/local/lib/python3.11/dist-packages/huggingface_hub/utils/_auth.py:94: UserWarning: \n", + "The secret `HF_TOKEN` does not exist in your Colab secrets.\n", + "To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.\n", + "You will be able to reuse this secret in all of your notebooks.\n", + "Please note that authentication is recommended but still optional to access public models or datasets.\n", + " warnings.warn(\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "\n", + "# Example: Load CoNLL-2003 dataset for NER\n", + "# You can replace this with your own dataset\n", + "dataset = load_dataset(\"conll2003\")" + ], + "metadata": { + "id": "4qUTkIOJuRgc" + }, + "execution_count": 3, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "dataset" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "A_oLinbVuRii", + "outputId": "06ccf7fc-f134-40e3-df2d-8c4957d9eb48" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "DatasetDict({\n", + " train: Dataset({\n", + " features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],\n", + " num_rows: 14041\n", + " })\n", + " validation: Dataset({\n", + " features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],\n", + " num_rows: 3250\n", + " })\n", + " test: Dataset({\n", + " features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],\n", + " num_rows: 3453\n", + " })\n", + "})" + ] + }, + "metadata": {}, + "execution_count": 4 + } + ] + }, + { + "cell_type": "code", + "source": [ + "# Define label names and create label mappings\n", + "label_list = dataset[\"train\"].features[\"ner_tags\"].feature.names\n", + "label_to_id = {label: i for i, label in enumerate(label_list)}\n", + "id_to_label = {i: label for i, label in enumerate(label_list)}\n", + "\n", + "print(f\"Labels: {label_list}\")\n", + "print(f\"Number of labels: {len(label_list)}\")" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "eG81u-sOuRkp", + "outputId": "99b2a3dc-af98-45d0-fcf8-baefceaf6208" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Labels: ['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']\n", + "Number of labels: 9\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "# Initialize model for token classification\n", + "model = AutoModelForTokenClassification.from_pretrained(\n", + " model_name,\n", + " num_labels=len(label_list),\n", + " id2label=id_to_label,\n", + " label2id=label_to_id\n", + ")" + ], + "metadata": { + "id": "1gxMM1yZuRms" + }, + "execution_count": 4, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "def tokenize_and_align_labels(examples):\n", + " \"\"\"Tokenize inputs and align labels with tokenized tokens\"\"\"\n", + " tokenized_inputs = tokenizer(\n", + " examples[\"tokens\"],\n", + " truncation=True,\n", + " is_split_into_words=True,\n", + " padding=False\n", + " )\n", + "\n", + " labels = []\n", + " for i, label in enumerate(examples[\"ner_tags\"]):\n", + " word_ids = tokenized_inputs.word_ids(batch_index=i)\n", + " previous_word_idx = None\n", + " label_ids = []\n", + "\n", + " for word_idx in word_ids:\n", + " if word_idx is None:\n", + " # Special tokens get -100 label (ignored in loss)\n", + " label_ids.append(-100)\n", + " elif word_idx != previous_word_idx:\n", + " # First token of a word gets the label\n", + " label_ids.append(label[word_idx])\n", + " else:\n", + " # Subsequent tokens of same word get -100 (or same label)\n", + " label_ids.append(-100)\n", + " previous_word_idx = word_idx\n", + "\n", + " labels.append(label_ids)\n", + "\n", + " tokenized_inputs[\"labels\"] = labels\n", + " return tokenized_inputs" + ], + "metadata": { + "id": "RzZyq2iHyA36" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# Tokenize datasets\n", + "tokenized_train = dataset[\"train\"].map(\n", + " tokenize_and_align_labels,\n", + " batched=True,\n", + " remove_columns=dataset[\"train\"].column_names\n", + ")\n", + "tokenized_valid = dataset[\"validation\"].map(\n", + " tokenize_and_align_labels,\n", + " batched=True,\n", + " remove_columns=dataset[\"validation\"].column_names\n", + ")" + ], + "metadata": { + "id": "J4wFi1SRyGZj" + }, + "execution_count": 5, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "\n", + "# Data collator\n", + "data_collator = DataCollatorForTokenClassification(\n", + " tokenizer=tokenizer,\n", + " padding=True\n", + ")\n" + ], + "metadata": { + "id": "cL1Bv-qvyKRe" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "def compute_metrics(eval_pred):\n", + " \"\"\"Compute seqeval metrics for NER evaluation\"\"\"\n", + " predictions, labels = eval_pred\n", + " predictions = np.argmax(predictions, axis=2)\n", + "\n", + " # Remove ignored index (special tokens)\n", + " true_predictions = [\n", + " [id_to_label[p] for (p, l) in zip(prediction, label) if l != -100]\n", + " for prediction, label in zip(predictions, labels)\n", + " ]\n", + " true_labels = [\n", + " [id_to_label[l] for (p, l) in zip(prediction, label) if l != -100]\n", + " for prediction, label in zip(predictions, labels)\n", + " ]\n", + "\n", + " return {\n", + " \"accuracy\": accuracy_score(true_labels, true_predictions),\n", + " \"f1\": f1_score(true_labels, true_predictions),\n", + " }" + ], + "metadata": { + "id": "5-4vG0ZMyNWn" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "\n", + "\n", + "# Training arguments\n", + "training_args = TrainingArguments(\n", + " output_dir=\"./modernbert-ner\",\n", + " eval_strategy=\"steps\",\n", + " eval_steps=500,\n", + " save_strategy=\"steps\",\n", + " save_steps=500,\n", + " logging_steps=100,\n", + " num_train_epochs=3,\n", + " per_device_train_batch_size=16,\n", + " per_device_eval_batch_size=16,\n", + " warmup_steps=500,\n", + " weight_decay=0.01,\n", + " learning_rate=2e-5,\n", + " fp16=True, # Use mixed precision if you have a compatible GPU\n", + " dataloader_pin_memory=False,\n", + " remove_unused_columns=True, # Change from False to True\n", + " push_to_hub=False, # Set to True if you want to push to HuggingFace Hub\n", + ")\n" + ], + "metadata": { + "id": "K12n7AxvyOdu" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# Initialize trainer\n", + "trainer = Trainer(\n", + " model=model,\n", + " args=training_args,\n", + " train_dataset=tokenized_train,\n", + " eval_dataset=tokenized_valid,\n", + " tokenizer=tokenizer,\n", + " data_collator=data_collator,\n", + " compute_metrics=compute_metrics,\n", + ")\n" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "ol9KtDKPygu-", + "outputId": "50240e0a-c7ac-4622-fa21-1dfcb8c40776" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "/tmp/ipython-input-39-4177626116.py:2: FutureWarning: `tokenizer` is deprecated and will be removed in version 5.0.0 for `Trainer.__init__`. Use `processing_class` instead.\n", + " trainer = Trainer(\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "# Train the model\n", + "print(\"Starting training...\")\n", + "trainer.train()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 301 + }, + "id": "iiQkq2H0ykKG", + "outputId": "765cdc35-dd7e-402f-c7ac-c1a2c91692c6" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Starting training...\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "W0628 18:37:21.148000 1914 torch/_inductor/utils.py:1137] [1/0] Not enough SMs to use max_autotune_gemm mode\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "text/html": [ + "\n", + "
\n", + " \n", + " \n", + " [2634/2634 10:55, Epoch 3/3]\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
StepTraining LossValidation LossAccuracyF1
5000.1179000.1049120.9710290.824140
10000.0398000.0575820.9853980.910766
15000.0278000.0512130.9880260.929259
20000.0123000.0492200.9888830.934884
25000.0113000.0467810.9891550.936273

" + ] + }, + "metadata": {} + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "TrainOutput(global_step=2634, training_loss=0.10961505901931178, metrics={'train_runtime': 668.867, 'train_samples_per_second': 62.977, 'train_steps_per_second': 3.938, 'total_flos': 1577408010395238.0, 'train_loss': 0.10961505901931178, 'epoch': 3.0})" + ] + }, + "metadata": {}, + "execution_count": 40 + } + ] + }, + { + "cell_type": "code", + "source": [ + "# Save the model\n", + "trainer.save_model()\n", + "tokenizer.save_pretrained(\"./modernbert-ner\")" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "1rwVhSWyymtS", + "outputId": "c5b27a01-1c4c-4f3d-b511-144fc040075a" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "('./modernbert-ner/tokenizer_config.json',\n", + " './modernbert-ner/special_tokens_map.json',\n", + " './modernbert-ner/tokenizer.json')" + ] + }, + "metadata": {}, + "execution_count": 41 + } + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 92 + }, + "id": "5-vK4SCMuOaV", + "outputId": "dc568d98-1c2c-4de0-cdf8-32def8a55d21" + }, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" + ], + "text/html": [ + "\n", + "

\n", + " \n", + " \n", + " [204/204 00:16]\n", + "
\n", + " " + ] + }, + "metadata": {} + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Test results: {'eval_loss': 0.04691711440682411, 'eval_accuracy': 0.9892527549550251, 'eval_f1': 0.9363408521303258, 'eval_runtime': 16.8501, 'eval_samples_per_second': 192.877, 'eval_steps_per_second': 12.107, 'epoch': 3.0}\n", + "Training completed!\n" + ] + } + ], + "source": [ + "# Evaluate on test set\n", + "test_results = trainer.evaluate(tokenized_valid)\n", + "print(f\"Test results: {test_results}\")\n", + "\n", + "print(\"Training completed!\")" + ] + }, + { + "cell_type": "code", + "source": [ + "# Inference on new text\n", + "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n", + "model.to(device)\n", + "test_text = \"Amazon founder Jeff Bezos, sourish and Tesla CEO Elon Musk attended the World Economic Forum in Davos, Switzerland, where they discussed space exploration with NASA administrator Bill Nelson and European Space Agency director Josef Aschbacher.\"\n", + "inputs = tokenizer(test_text, return_tensors=\"pt\", truncation=True, padding=True)\n", + "inputs = {k: v.to(device) for k, v in inputs.items()} # Move inputs to same device as model\n", + "with torch.no_grad():\n", + " outputs = model(**inputs)\n", + " predictions = torch.argmax(outputs.logits, dim=2)\n", + " tokens = tokenizer.convert_ids_to_tokens(inputs[\"input_ids\"][0])\n", + " predicted_labels = [id_to_label[pred.item()] for pred in predictions[0]]\n", + "\n", + " # Better aligned output\n", + " print(f\"\\nInput text: {test_text}\")\n", + " print(\"-\" * 50)\n", + " print(f\"{'Token':<15} {'Label':<10}\")\n", + " print(\"-\" * 50)\n", + " for token, label in zip(tokens, predicted_labels):\n", + " print(f\"{token:<15} {label:<10}\")" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "jyCvDUGe8D14", + "outputId": "153b738f-87be-4aba-946f-b4161f5e2d86" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "\n", + "Input text: Amazon founder Jeff Bezos, sourish and Tesla CEO Elon Musk attended the World Economic Forum in Davos, Switzerland, where they discussed space exploration with NASA administrator Bill Nelson and European Space Agency director Josef Aschbacher.\n", + "--------------------------------------------------\n", + "Token Label \n", + "--------------------------------------------------\n", + "[CLS] O \n", + "Amazon B-ORG \n", + "Ġfounder O \n", + "ĠJeff B-PER \n", + "ĠBe I-PER \n", + "zos I-PER \n", + ", O \n", + "Ġsour B-PER \n", + "ish O \n", + "Ġand O \n", + "ĠTesla B-ORG \n", + "ĠCEO O \n", + "ĠEl B-PER \n", + "on I-PER \n", + "ĠMusk I-PER \n", + "Ġattended O \n", + "Ġthe O \n", + "ĠWorld B-MISC \n", + "ĠEconomic I-MISC \n", + "ĠForum I-MISC \n", + "Ġin O \n", + "ĠDav B-LOC \n", + "os I-LOC \n", + ", O \n", + "ĠSwitzerland B-LOC \n", + ", O \n", + "Ġwhere O \n", + "Ġthey O \n", + "Ġdiscussed O \n", + "Ġspace O \n", + "Ġexploration O \n", + "Ġwith O \n", + "ĠNASA B-ORG \n", + "Ġadministrator O \n", + "ĠBill B-PER \n", + "ĠNelson I-PER \n", + "Ġand O \n", + "ĠEuropean B-ORG \n", + "ĠSpace I-ORG \n", + "ĠAgency I-ORG \n", + "Ġdirector O \n", + "ĠJose B-PER \n", + "f I-PER \n", + "ĠAs I-PER \n", + "ch I-PER \n", + "b I-PER \n", + "acher O \n", + ". O \n", + "[SEP] I-ORG \n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "# ignore first and last special characters" + ], + "metadata": { + "id": "-hLU3syz8hbO" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline\n", + "\n", + "# Load model and tokenizer\n", + "tokenizer = AutoTokenizer.from_pretrained(\"joe-xhedi/ModernBERT-NER\")\n", + "model = AutoModelForTokenClassification.from_pretrained(\"joe-xhedi/ModernBERT-NER\")\n", + "\n", + "# Create NER pipeline\n", + "ner_pipeline = pipeline(\"ner\",\n", + " model=model,\n", + " tokenizer=tokenizer,\n", + " aggregation_strategy=\"simple\")\n", + "\n", + "# Example usage\n", + "text = \"John Doe works at OpenAI in San Francisco.\"\n", + "results = ner_pipeline(text)\n", + "print(results)\n" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "SB4RTqAN-27M", + "outputId": "f3026953-92b2-463e-e50e-8cc1edbf4466" + }, + "execution_count": 7, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Device set to use cuda:0 \n", + "\n", + "[{'entity_group': 'PER', 'score': np.float32(0.9963356), 'word': 'John Doe', 'start': 0, 'end': 8}, {'entity_group': 'ORG', 'score': np.float32(0.98997074), 'word': ' OpenAI', 'start': 17, 'end': 24}, {'entity_group': 'LOC', 'score': np.float32(0.82463753), 'word': ' San Francisco', 'start': 27, 'end': 41}]\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [], + "metadata": { + "id": "lj5c6Xxg-3Xb" + }, + "execution_count": null, + "outputs": [] + } + ] +} \ No newline at end of file