|
| 1 | +import os |
| 2 | +os.chdir("ape") |
| 3 | +import pandas as pd |
| 4 | + |
# Import the NAF classification table.
# NOTE(review): the path says .parquet but pd.read_excel is used (with
# skiprows, an Excel-style argument) — one of the two is likely wrong;
# confirm the actual file format before running.
naf = pd.read_excel("data/naf.parquet", skiprows = 2)
# Import training data
train = pd.read_parquet("data/data.parquet")
# train = train.sample(10000)

# Merge classification info.
# Normalize NAF codes ("01.23" -> "0123") so they match the `nace` column.
# regex=False is essential: the pre-pandas-2.0 default was regex=True, under
# which the pattern "." matches every character and empties the whole code.
naf['Code'] = naf['Code'].str.replace(".", "", regex=False)
train = train.merge(naf, left_on = "nace", right_on = "Code")
train.head(5)
| 15 | + |
| 16 | + |
| 17 | + |
def filter_train_data(train_data, sequence):
    """Return the rows of `train_data` whose 'text' column contains `sequence`.

    The lookup upper-cases `sequence` first (the corpus text is assumed to be
    upper-cased — TODO confirm). Also prints the number of matching rows.

    Args:
        train_data: DataFrame with a string 'text' column (may contain NaN).
        sequence: substring to search for; interpreted as a regex by
            `str.contains` (unchanged from the original behavior).

    Returns:
        The filtered DataFrame.
    """
    sequence_capitalized = sequence.upper()
    # na=False: rows with missing text must not match — a NaN in the mask
    # would break both the count below and the boolean .loc indexing.
    mask = train_data['text'].str.contains(sequence_capitalized, na=False)
    nb_occurrence = int(mask.sum())
    print(
        f"Nombre d'occurrences de la séquence '{sequence}': {nb_occurrence}"
    )
    return train_data.loc[mask]
| 26 | + |
| 27 | + |
from nltk.tokenize import word_tokenize
import spacy

# Make sure the French spaCy model is available. Run the download with the
# current interpreter and an argument list (no shell string) instead of
# os.system, which shelled out to whatever "python" happens to be on PATH.
import subprocess
import sys

subprocess.run(
    [sys.executable, "-m", "spacy", "download", "fr_core_news_sm"],
    check=False,  # best-effort, matching the original os.system behavior
)
nlp = spacy.load("fr_core_news_sm")
# French stop-word list used by remove_stopwords below.
stop_words = set(nlp.Defaults.stop_words)

import nltk
nltk.download('punkt_tab')   # tokenizer data for word_tokenize
nltk.download('stopwords')
| 39 | + |
def remove_stopwords(text):
    """Drop French stop words from `text`, preserving the original casing.

    Tokens are compared lower-cased against the module-level `stop_words`
    set; the surviving tokens are re-joined with single spaces.
    """
    kept = (tok for tok in word_tokenize(text) if tok.lower() not in stop_words)
    return ' '.join(kept)
| 45 | + |
def remove_single_letters(text):
    """Strip one-character tokens (stray letters, punctuation) from `text`."""
    tokens = word_tokenize(text)
    return ' '.join(tok for tok in tokens if len(tok) > 1)
| 50 | + |
# Clean the 'text' column: strip stop words, then drop one-letter tokens.
train['text_clean'] = train['text'].apply(
    lambda t: remove_single_letters(remove_stopwords(t))
)
| 56 | + |
| 57 | + |
| 58 | + |
| 59 | + |
# Project-local preprocessing helper (implementation not shown here).
from processor import Preprocessor
preprocessor = Preprocessor()


# Preprocess data before training and testing
TEXT_FEATURE = "text"  # column holding the raw activity description
Y = "nace"  # target column: NACE/APE activity code

df = train.copy()
# NOTE(review): clean_text presumably rewrites TEXT_FEATURE itself, making the
# earlier 'text_clean' column redundant (hence the drop) — confirm against
# the Preprocessor implementation.
df = preprocessor.clean_text(df, TEXT_FEATURE).drop('text_clean', axis = "columns")
df.head(2)

# Keep only rows that have both a label and a text.
df = df.dropna(subset = [Y, TEXT_FEATURE])
X = df[TEXT_FEATURE].values
y = df[Y].values
| 75 | + |
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y_encoded = le.fit_transform(y)  # maps labels to ints, e.g. ["cat", "dog"] → [0, 1]

# First split: train (80%) + test (20%)
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    test_size=0.2,
    random_state=0,  # fixed seed for reproducibility
    shuffle=True,
)
# Second split, yielding: train (60% = 80% * 75%) + val (20% = 80% * 25%) + test (20%)

X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.25,
    random_state=0,
    shuffle=True,
)
| 99 | + |
from torchTextClassifiers.tokenizers.ngram import NGramTokenizer

tokenizer = NGramTokenizer(
    min_count=2,  # keep a word only if it appears at least twice in the corpus
    min_n=2,
    max_n=4,  # character n-grams of sizes 2, 3 and 4
    len_word_ngrams=2,  # word 2-grams
    num_tokens=10000,  # maximum number of tokens considered
    training_text=X,
)
| 110 | + |
from torchTextClassifiers import ModelConfig
import numpy as np

# Embedding dimension of the text representation.
embedding_dim = 64

# One output class per distinct NACE code. Counted on the full label vector
# (not only the training split) so no class is missed by the split.
# The original also requested per-class counts that were never used.
num_unique = len(np.unique(y))

model_config = ModelConfig(
    embedding_dim=embedding_dim,
    num_classes=num_unique
)

from torchTextClassifiers import torchTextClassifiers

# Bundle tokenizer + model configuration into the classifier.
classifier = torchTextClassifiers(
    tokenizer=tokenizer,
    model_config=model_config,
)
| 132 | + |
| 133 | + |
| 134 | +import torch |
| 135 | +# s3_path = "s3://projet-formation/nouvelles-sources/model_ape.pth" |
| 136 | + |
| 137 | +# state_dict = torch.load("model_ape.pth", map_location="cpu") |
| 138 | + |
| 139 | + |
| 140 | + |
from torchTextClassifiers import TrainingConfig

# Training params (torch style)
training_config = TrainingConfig(
    num_epochs=30,
    batch_size=8,
    lr=1e-3,
    patience_early_stopping=7,  # stop after 7 epochs without val improvement
    num_workers=0,  # no DataLoader worker processes
    trainer_params={'deterministic': True},  # reproducible training runs
    save_path="model_ape"
)

# Train, monitoring the validation split for early stopping.
classifier.train(
    X_train,
    y_train,
    training_config,
    X_val,
    y_val,
    verbose=True
)
| 163 | + |
# Inference on the test set.
result = classifier.predict(X_test)
# NOTE(review): assumes result["prediction"] is a torch tensor with a
# squeezable trailing dimension (e.g. (n_samples, 1)) — confirm against the
# torchTextClassifiers predict API.
predictions = result["prediction"].squeeze().numpy()

# Evaluate: share of exact matches against the encoded test labels.
accuracy = (predictions == y_test).mean()
print(f"Test accuracy: {accuracy:.3f}")
| 171 | + |
| 172 | + |
| 173 | +# mc cp --recursive ape/model_ape s3/projet-formation/nouvelles-sources/ |
0 commit comments