Skip to content

Commit f4e22eb

Browse files
authored
Merge pull request #11 from InseeFrLab/maj-25
Update application nlp
2 parents fe2b1a2 + 6c38a08 commit f4e22eb

File tree

8 files changed

+578
-226
lines changed

8 files changed

+578
-226
lines changed

.gitignore

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,3 +16,8 @@ data/
1616
*.gpkgdata/
1717

1818
**/*.quarto_ipynb
19+
applications/data/
20+
applications/model_ape/
21+
*.pkl
22+
*.ckpt
23+
hparams.yaml

applications/ape.qmd

Lines changed: 355 additions & 218 deletions
Large diffs are not rendered by default.

applications/ape_train.py

Lines changed: 173 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,173 @@
1+
import os

os.chdir("ape")

import pandas as pd

# Import the NAF classification.
# NOTE(review): despite the .parquet name this is an .xls workbook (see
# app2-init.sh, which downloads naf2008_liste_n5.xls to this path), hence
# read_excel. skiprows=2 skips the header banner rows of the workbook.
naf = pd.read_excel("data/naf.parquet", skiprows=2)
# Import training data (firm activity descriptions with their NAF code).
train = pd.read_parquet("data/data.parquet")
# train = train.sample(10000)

# Normalise the NAF codes: remove the dot (e.g. "01.23" -> "0123") so they
# match the `nace` column. regex=False is the fix: on pandas < 1.4 the
# default was regex=True, so "." matched EVERY character and wiped the codes;
# being explicit makes the literal replacement unambiguous on all versions.
naf['Code'] = naf['Code'].str.replace(".", "", regex=False)
train = train.merge(naf, left_on="nace", right_on="Code")
train.head(5)
15+
16+
17+
18+
def filter_train_data(train_data, sequence):
    """Return the rows of *train_data* whose 'text' column contains *sequence*.

    Parameters
    ----------
    train_data : pandas.DataFrame
        Must have a 'text' column; values are assumed to be upper-cased
        already — TODO confirm against the loading step.
    sequence : str
        Literal character sequence to search for (NOT a regex); it is
        upper-cased before matching.

    Returns
    -------
    pandas.DataFrame
        The matching rows. Also prints the number of occurrences.
    """
    sequence_capitalized = sequence.upper()
    # regex=False: the original relied on the regex default, so a sequence
    # containing '+', '(' etc. would be misinterpreted or raise.
    # na=False: rows with missing text simply do not match instead of
    # producing NaN in the boolean mask.
    mask = train_data['text'].str.contains(sequence_capitalized, regex=False, na=False)
    nb_occurrence = int(mask.sum())
    print(
        f"Nombre d'occurrences de la séquence '{sequence}': {nb_occurrence}"
    )
    return train_data.loc[mask]
26+
27+
28+
from nltk.tokenize import word_tokenize
import spacy

# Fetch the small French spaCy model (no-op if already installed), then
# load it to get the French stop-word list.
os.system("python -m spacy download fr_core_news_sm")
nlp = spacy.load("fr_core_news_sm")
stop_words = set(nlp.Defaults.stop_words)

import nltk

# Resources required by nltk.word_tokenize.
nltk.download('punkt_tab')
nltk.download('stopwords')
39+
40+
# Function to remove stopwords
41+
def remove_stopwords(text):
42+
word_tokens = word_tokenize(text)
43+
filtered_text = [word for word in word_tokens if word.lower() not in stop_words]
44+
return ' '.join(filtered_text)
45+
46+
def remove_single_letters(text):
    """Remove every single-character token from *text*; return the rest space-joined."""
    kept = [tok for tok in word_tokenize(text) if len(tok) > 1]
    return ' '.join(kept)
50+
51+
# Build the cleaned text column: strip stop words first, then drop the
# single-letter tokens left behind.
without_stopwords = train['text'].apply(remove_stopwords)
train['text_clean'] = without_stopwords.apply(remove_single_letters)
56+
57+
58+
59+
60+
from processor import Preprocessor

preprocessor = Preprocessor()

# Columns used for training: the raw description and the NAF target code.
TEXT_FEATURE = "text"
Y = "nace"

# Clean the text with the project preprocessor; the exploratory
# 'text_clean' column is no longer needed afterwards.
df = train.copy()
df = preprocessor.clean_text(df, TEXT_FEATURE).drop('text_clean', axis="columns")
df.head(2)

# Keep only rows where both the target and the text are present.
df = df.dropna(subset=[Y, TEXT_FEATURE])
X = df[TEXT_FEATURE].values
y = df[Y].values

from sklearn.preprocessing import LabelEncoder

# Map the string labels to contiguous integer ids (e.g. ["cat", "dog"] -> [0, 1]).
le = LabelEncoder()
y_encoded = le.fit_transform(y)

from sklearn.model_selection import train_test_split

# First split: train (80%) / test (20%).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    test_size=0.2,
    random_state=0,
    shuffle=True,
)

# Second split of the 80%: train (60% = 80% * 75%) / val (20% = 80% * 25%);
# the test set stays at 20%.
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.25,
    random_state=0,
    shuffle=True,
)
99+
100+
from torchTextClassifiers.tokenizers.ngram import NGramTokenizer

tokenizer = NGramTokenizer(
    min_count=2,        # keep a word only if it appears at least twice in the corpus
    min_n=2,
    max_n=4,            # character n-grams of sizes 2, 3 and 4
    len_word_ngrams=2,  # word bigrams
    num_tokens=10000,   # maximum number of tokens considered
    training_text=X,
)

from torchTextClassifiers import ModelConfig
import numpy as np

# Embedding dimension.
embedding_dim = 64

# One output class per distinct label in y.
unique_values, counts = np.unique(y, return_counts=True)
num_unique = len(unique_values)

model_config = ModelConfig(
    embedding_dim=embedding_dim,
    num_classes=num_unique
)

from torchTextClassifiers import torchTextClassifiers

classifier = torchTextClassifiers(
    tokenizer=tokenizer,
    model_config=model_config,
)
132+
133+
134+
import torch

# Optional: restore a previously trained model instead of retraining.
# s3_path = "s3://projet-formation/nouvelles-sources/model_ape.pth"
# state_dict = torch.load("model_ape.pth", map_location="cpu")

from torchTextClassifiers import TrainingConfig

# Training hyper-parameters (torch style).
training_config = TrainingConfig(
    num_epochs=30,
    batch_size=8,
    lr=1e-3,
    patience_early_stopping=7,
    num_workers=0,
    trainer_params={'deterministic': True},
    save_path="model_ape"
)

# Train on the train split, monitoring the validation split.
classifier.train(
    X_train,
    y_train,
    training_config,
    X_val,
    y_val,
    verbose=True
)

# Inference on the held-out test set.
result = classifier.predict(X_test)
predictions = result["prediction"].squeeze().numpy()

# Evaluate: share of exact label matches.
accuracy = (predictions == y_test).mean()
print(f"Test accuracy: {accuracy:.3f}")

# To publish the trained model:
# mc cp --recursive ape/model_ape s3/projet-formation/nouvelles-sources/

applications/app2-init.sh

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
#!/bin/bash
# Bootstrap the APE application: create the working directory, fetch the
# data and helper module, and install the Python dependencies.
#
# Fail fast: without this, a failed `cd ape` would make every download and
# install land in the wrong directory, and unset variables or failed curls
# would go unnoticed.
set -euo pipefail

# -p: idempotent, so re-running the script does not abort on "already exists".
mkdir -p ape
cd ape

mkdir -p data
echo "data/" >> .gitignore

# -f: fail on HTTP errors instead of silently saving an error page as data.
curl -f https://minio.lab.sspcloud.fr/projet-formation/diffusion/mlops/data/firm_activity_data.parquet --output data/data.parquet
# NOTE(review): this downloads an .xls workbook under a .parquet name;
# ape_train.py reads it with pd.read_excel at exactly this path, so keep the
# two in sync if you ever rename it.
curl -f https://minio.lab.sspcloud.fr/projet-formation/nouvelles-sources/data/naf2008_liste_n5.xls --output data/naf.parquet
curl -f -O https://raw.githubusercontent.com/InseeFrLab/cours-nouvelles-donnees-site/main/applications/processor.py

pip install wordcloud
pip install xlrd
pip install spacy
pip install nltk
pip install unidecode
pip install pytorch_lightning
pip install torchTextClassifiers[huggingface]
pip install scikit-learn

# Seed script with an interactive cell marker for the hands-on session.
echo -e "# Script du TP\nCi-dessous une balise pour voir les résultats en intéractif\n# %%" > script_tp.py

applications/vscode.png

28.4 KB
Loading

download_nlp_reqs.sh

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,4 +4,10 @@ import nltk
44
nltk.download('stopwords')
55
nltk.download('punkt_tab')
66
END_SCRIPT
7-
python -m spacy download fr_core_news_sm
7+
python -m spacy download fr_core_news_sm
8+
9+
# App APE
10+
mkdir applications/data
11+
12+
curl https://minio.lab.sspcloud.fr/projet-formation/diffusion/mlops/data/firm_activity_data.parquet --output applications/data/data.parquet
13+
curl https://minio.lab.sspcloud.fr/projet-formation/nouvelles-sources/data/naf2008_liste_n5.xls --output applications/data/naf.parquet

preview_site.sh

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
#!/bin/bash
# Build and preview the site locally. Run from the repo root, e.g.:
# cd cours-nouvelles-donnees-site/
# git checkout maj-25
#
# Fail fast: no point starting the preview if a dependency step failed.
set -e

pip install -r requirements.txt
Rscript -e "renv::restore()"
bash download_nlp_reqs.sh
quarto preview

requirements.txt

Lines changed: 10 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,16 @@
11
pandas
2-
nltk
3-
unidecode
42
matplotlib
5-
wordcloud
6-
spacy
7-
scikit-learn
8-
fasttext
93
plotnine
10-
xlrd
114
jupyter
125
pyarrow
136
loguru
7+
8+
# Application APE
9+
wordcloud
10+
xlrd
11+
spacy
12+
nltk
13+
unidecode
14+
pytorch_lightning
15+
torchTextClassifiers
16+
scikit-learn

0 commit comments

Comments
 (0)