
Commit 24e2c24

SONARPY-2082 Jupyter notebooks should not have their fileLinesContext filled (#1936)
1 parent ff4c4aa commit 24e2c24

File tree

2 files changed: +5 -2 lines changed


sonar-python-plugin/src/main/java/org/sonar/plugins/python/PythonScanner.java

Lines changed: 4 additions & 2 deletions

@@ -334,8 +334,10 @@ private void saveMeasures(PythonInputFile inputFile, PythonVisitorContext visito
     saveMetricOnFile(inputFile, CoreMetrics.COMMENT_LINES, fileLinesVisitor.getCommentLineCount());
 
     FileLinesContext fileLinesContext = fileLinesContextFactory.createFor(inputFile.wrappedFile());
-    for (int line : linesOfCode) {
-      fileLinesContext.setIntValue(CoreMetrics.NCLOC_DATA_KEY, line, 1);
+    if (inputFile.kind() == PythonInputFile.Kind.PYTHON) {
+      for (int line : linesOfCode) {
+        fileLinesContext.setIntValue(CoreMetrics.NCLOC_DATA_KEY, line, 1);
+      }
     }
     for (int line : fileLinesVisitor.getExecutableLines()) {
       fileLinesContext.setIntValue(CoreMetrics.EXECUTABLE_LINES_DATA_KEY, line, 1);
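The guard restricts per-line NCLOC data to regular Python files: a notebook's Python code is reconstructed from its JSON cells, so the line numbers the scanner sees do not map back to lines of the original .ipynb file, and reporting them would attach data to the wrong lines. Below is a minimal, self-contained sketch of the gating pattern; Kind.PYTHON mirrors PythonInputFile.Kind.PYTHON from the diff, while the IPYTHON constant and the LineSink interface are illustrative stand-ins (only Kind.PYTHON appears in the commit):

import java.util.Set;

// Minimal sketch of the gating pattern in this commit. LineSink stands in
// for the real FileLinesContext type in sonar-python.
public class NclocGateSketch {

  enum Kind { PYTHON, IPYTHON }  // IPYTHON is an assumed name for notebooks

  interface LineSink {
    void setIntValue(String metricKey, int line, int value);
  }

  static void saveNclocData(Kind kind, Set<Integer> linesOfCode, LineSink sink) {
    // Per-line NCLOC data only makes sense for plain .py files: a notebook's
    // code is rebuilt from JSON cells, so scanner line numbers do not map
    // back to lines of the original .ipynb file.
    if (kind == Kind.PYTHON) {
      for (int line : linesOfCode) {
        sink.setIntValue("ncloc_data", line, 1);
      }
    }
  }

  public static void main(String[] args) {
    LineSink printer = (key, line, value) ->
      System.out.printf("%s line %d -> %d%n", key, line, value);
    saveNclocData(Kind.PYTHON, Set.of(1, 2, 3), printer);  // prints three entries
    saveNclocData(Kind.IPYTHON, Set.of(1, 2, 3), printer); // prints nothing
  }
}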
Lines changed: 1 addition & 0 deletions

@@ -0,0 +1 @@
{"cells":[{"cell_type":"markdown","metadata":{"id":"66BqbnsyeJhy"},"source":["# Classification de token (TensorFlow)"]},{"cell_type":"markdown","metadata":{"id":"jYFSj10BeJh0"},"source":["Installez les bibliothèques 🤗 *Datasets*, 🤗 *Transformers* et 🤗 *Accelerate* pour exécuter ce *notebook*."]},{"cell_type":"code","execution_count":null,"metadata":{"id":"2nlx8iiUeJh2"},"outputs":[],"source":["!pip install datasets transformers[sentencepiece]\n","!apt install git-lfs"]},{"cell_type":"markdown","metadata":{"id":"1zy96qUbeJh5"},"source":["Vous aurez besoin de configurer git, adaptez votre email et votre nom dans la cellule suivante."]},{"cell_type":"code","execution_count":null,"metadata":{"id":"REUDT7bEeJh7"},"outputs":[],"source":["!git config --global user.email \"[email protected]\"\n","!git config --global user.name \"Your Name\""]},{"cell_type":"markdown","metadata":{"id":"e9w47W4aeJh8"},"source":["Vous devrez également être connecté au Hub d'Hugging Face. Exécutez ce qui suit et entrez vos informations d'identification."]},{"cell_type":"code","execution_count":null,"metadata":{"id":"87pR9AHWeJh9"},"outputs":[],"source":["from huggingface_hub import notebook_login\n","\n","notebook_login()"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"DKUp34igeJh9"},"outputs":[],"source":["from datasets import load_dataset\n","\n","raw_datasets = load_dataset(\"wikiann\",\"fr\")"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"JOSGslMOeJh-"},"outputs":[],"source":["raw_datasets"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"2LIzeE4deJiA"},"outputs":[],"source":["raw_datasets[\"train\"][0][\"tokens\"]"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"pE7bU0fOeJiA"},"outputs":[],"source":["raw_datasets[\"train\"][0][\"ner_tags\"]"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"kAC5v8DHeJiB"},"outputs":[],"source":["ner_feature = raw_datasets[\"train\"].features[\"ner_tags\"]\n","ner_feature"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"QQuJ6sOSeJiC"},"outputs":[],"source":["label_names = ner_feature.feature.names\n","label_names"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"dhzU6DQgeJiD"},"outputs":[],"source":["words = raw_datasets[\"train\"][0][\"tokens\"]\n","labels = raw_datasets[\"train\"][0][\"ner_tags\"]\n","line1 = \"\"\n","line2 = \"\"\n","for word, label in zip(words, labels):\n"," full_label = label_names[label]\n"," max_length = max(len(word), len(full_label))\n"," line1 += word + \" \" * (max_length - len(word) + 1)\n"," line2 += full_label + \" \" * (max_length - len(full_label) + 1)\n","\n","print(line1)\n","print(line2)"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"1ukb6erAeJiE"},"outputs":[],"source":["from transformers import AutoTokenizer\n","\n","model_checkpoint = \"camembert-base\"\n","tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"hldTWv0-eJiF"},"outputs":[],"source":["tokenizer.is_fast"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"3K9rk_DMeJiF"},"outputs":[],"source":["inputs = tokenizer(raw_datasets[\"train\"][0][\"tokens\"], is_split_into_words=True)\n","inputs.tokens()"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"9y2rOgoXeJiH"},"outputs":[],"source":["inputs.word_ids()"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"qbuxqHKJeJiH"},"outputs":[],"source":["def align_labels_with_tokens(labels, word_ids):\n"," new_labels = 
[]\n"," current_word = None\n"," for word_id in word_ids:\n"," if word_id != current_word:\n"," # Début d'un nouveau mot !\n"," current_word = word_id\n"," label = -100 if word_id is None else labels[word_id]\n"," new_labels.append(label)\n"," elif word_id is None:\n"," # Token special\n"," new_labels.append(-100)\n"," else:\n"," # Même mot que le token précédent\n"," label = labels[word_id]\n"," # Si l'étiquette est B-XXX, nous la changeons en I-XXX\n"," if label % 2 == 1:\n"," label += 1\n"," new_labels.append(label)\n","\n"," return new_labels"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"7BSr1BhoeJiI"},"outputs":[],"source":["labels = raw_datasets[\"train\"][0][\"ner_tags\"]\n","word_ids = inputs.word_ids()\n","print(labels)\n","print(align_labels_with_tokens(labels, word_ids))"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"Ny0elDVCeJiI"},"outputs":[],"source":["def tokenize_and_align_labels(examples):\n"," tokenized_inputs = tokenizer(\n"," examples[\"tokens\"], truncation=True, is_split_into_words=True\n"," )\n"," all_labels = examples[\"ner_tags\"]\n"," new_labels = []\n"," for i, labels in enumerate(all_labels):\n"," word_ids = tokenized_inputs.word_ids(i)\n"," new_labels.append(align_labels_with_tokens(labels, word_ids))\n","\n"," tokenized_inputs[\"labels\"] = new_labels\n"," return tokenized_inputs"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"DegRuotBeJiJ"},"outputs":[],"source":["tokenized_datasets = raw_datasets.map(\n"," tokenize_and_align_labels,\n"," batched=True,\n"," remove_columns=raw_datasets[\"train\"].column_names,\n",")"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"i4m1_SpZeJiJ"},"outputs":[],"source":["from transformers import DataCollatorForTokenClassification\n","\n","data_collator = DataCollatorForTokenClassification(\n"," tokenizer=tokenizer, return_tensors=\"tf\"\n",")"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"FhmQ-gNfeJiK"},"outputs":[],"source":["batch = data_collator([tokenized_datasets[\"train\"][i] for i in range(2)])\n","batch[\"labels\"]"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"qH9hhenQeJiL"},"outputs":[],"source":["for i in range(2):\n"," print(tokenized_datasets[\"train\"][i][\"labels\"])"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"BIZ3zwumeJiL"},"outputs":[],"source":["tf_train_dataset = tokenized_datasets[\"train\"].to_tf_dataset(\n"," columns=[\"attention_mask\", \"input_ids\", \"labels\"],\n"," collate_fn=data_collator,\n"," shuffle=True,\n"," batch_size=16,\n",")\n","\n","tf_eval_dataset = tokenized_datasets[\"validation\"].to_tf_dataset(\n"," columns=[\"attention_mask\", \"input_ids\", \"labels\"],\n"," collate_fn=data_collator,\n"," shuffle=False,\n"," batch_size=16,\n",")"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"HlLRZuczeJiL"},"outputs":[],"source":["id2label = {str(i): label for i, label in enumerate(label_names)}\n","label2id = {v: k for k, v in id2label.items()}"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"HRg3NkpieJiM"},"outputs":[],"source":["from transformers import TFAutoModelForTokenClassification\n","\n","model = TFAutoModelForTokenClassification.from_pretrained(\n"," model_checkpoint,\n"," id2label=id2label,\n"," 
label2id=label2id,\n",")"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"DDmxHRz_eJiM"},"outputs":[],"source":["model.config.num_labels"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"nzXgLZYveJiN"},"outputs":[],"source":["from huggingface_hub import notebook_login\n","\n","notebook_login()"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"eq8ORfWEeJiN"},"outputs":[],"source":["from transformers import create_optimizer\n","import tensorflow as tf\n","\n","# Train in mixed-precision float16\n","# Commentez cette ligne si vous utilisez un GPU qui ne bénéficiera pas de cette fonction.\n","tf.keras.mixed_precision.set_global_policy(\"mixed_float16\")\n","\n","# Le nombre d'étapes d'entraînement est le nombre d'échantillons dans le jeu de données, divisé par la taille du batch puis multiplié\n","# par le nombre total d'époques. Notez que le jeu de données tf_train_dataset est ici un lot de données tf.data.Dataset,\n","# pas le jeu de données original Hugging Face, donc son len() est déjà num_samples // batch_size.\n","num_epochs = 3\n","num_train_steps = len(tf_train_dataset) * num_epochs\n","\n","optimizer, schedule = create_optimizer(\n"," init_lr=2e-5,\n"," num_warmup_steps=0,\n"," num_train_steps=num_train_steps,\n"," weight_decay_rate=0.01,\n",")\n","model.compile(optimizer=optimizer)"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"PcwfRwJQeJiN"},"outputs":[],"source":["from transformers.keras_callbacks import PushToHubCallback\n","\n","callback = PushToHubCallback(output_dir=\"camembert-finetuned-ner\", tokenizer=tokenizer)\n","\n","model.fit(\n"," tf_train_dataset,\n"," validation_data=tf_eval_dataset,\n"," callbacks=[callback],\n"," epochs=num_epochs,\n",")"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"pvCebi_5eJiO"},"outputs":[],"source":["!pip install seqeval"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"UsD5VprAeJiP"},"outputs":[],"source":["from datasets import load_metric\n","\n","metric = load_metric(\"seqeval\")"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"N4aw3s1neJiP"},"outputs":[],"source":["labels = raw_datasets[\"train\"][0][\"ner_tags\"]\n","labels = [label_names[i] for i in labels]\n","labels"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"kr6RQD5KeJiQ"},"outputs":[],"source":["predictions = labels.copy()\n","predictions[2] = \"O\"\n","metric.compute(predictions=[predictions], references=[labels])"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"025gksjzeJiQ"},"outputs":[],"source":["import numpy as np\n","\n","all_predictions = []\n","all_labels = []\n","for batch in tf_eval_dataset:\n"," logits = model.predict_on_batch(batch)[\"logits\"]\n"," labels = batch[\"labels\"]\n"," predictions = np.argmax(logits, axis=-1)\n"," for prediction, label in zip(predictions, labels):\n"," for predicted_idx, label_idx in zip(prediction, label):\n"," if label_idx == -100:\n"," continue\n"," all_predictions.append(label_names[predicted_idx])\n"," all_labels.append(label_names[label_idx])\n","metric.compute(predictions=[all_predictions], references=[all_labels])"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"H4caxFcfeJiQ"},"outputs":[],"source":["from transformers import pipeline\n","\n","# Remplacez par votre propre checkpoint\n","model_checkpoint = \"huggingface-course/camembert-finetuned-ner\"\n","token_classifier = pipeline(\n"," \"token-classification\", model=model_checkpoint, 
aggregation_strategy=\"simple\"\n",")\n","token_classifier(\"Je m'appelle Sylvain et je travaille à Hugging Face à Brooklyn.\")"]}],"metadata":{"colab":{"provenance":[],"collapsed_sections":[]},"kernelspec":{"display_name":"Python 3","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.8.5"},"accelerator":"GPU"},"nbformat":4,"nbformat_minor":0}
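The added file is a French Hugging Face course notebook (token classification with TensorFlow) committed as a single-line JSON fixture, presumably so tests can exercise the scanner on a real .ipynb input. A hypothetical verification, assuming Mockito and a helper that runs PythonScanner over the fixture (neither is shown in this commit), might look like:

import static org.mockito.ArgumentMatchers.anyInt;
import static org.mockito.ArgumentMatchers.eq;
import static org.mockito.Mockito.mock;
import static org.mockito.Mockito.never;
import static org.mockito.Mockito.verify;

import org.junit.jupiter.api.Test;
import org.sonar.api.measures.CoreMetrics;
import org.sonar.api.measures.FileLinesContext;

class NotebookNclocDataTest {

  @Test
  void notebook_does_not_fill_ncloc_data() {
    FileLinesContext fileLinesContext = mock(FileLinesContext.class);

    // scanNotebookFixture(...) is a hypothetical helper that would run
    // PythonScanner over the notebook fixture with the mocked context wired
    // in; the commit does not show the actual test, so the call is sketched.
    // scanNotebookFixture("token_classification_tf.ipynb", fileLinesContext);

    // The assertion matching the production change: no per-line NCLOC data
    // may be recorded for a notebook input file.
    verify(fileLinesContext, never())
      .setIntValue(eq(CoreMetrics.NCLOC_DATA_KEY), anyInt(), anyInt());
  }
}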
