Skip to content

Commit 0206ad7

Browse files
Merge pull request #2378 from smty2018/BERT
NLP with BERT (Bidirectional Encoder Representations from Transformers) for Movie Review Sentiment Analysis
2 parents 58fe07e + 2704891 commit 0206ad7

File tree

2 files changed

+369
-0
lines changed

2 files changed

+369
-0
lines changed
Lines changed: 369 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,369 @@
1+
{
2+
"nbformat": 4,
3+
"nbformat_minor": 0,
4+
"metadata": {
5+
"colab": {
6+
"provenance": []
7+
},
8+
"kernelspec": {
9+
"name": "python3",
10+
"display_name": "Python 3"
11+
},
12+
"language_info": {
13+
"name": "python"
14+
}
15+
},
16+
"cells": [
17+
{
18+
"cell_type": "markdown",
19+
"source": [
20+
"#Libraries"
21+
],
22+
"metadata": {
23+
"id": "XmjSOfm5C7Y3"
24+
}
25+
},
26+
{
27+
"cell_type": "code",
28+
"execution_count": null,
29+
"metadata": {
30+
"colab": {
31+
"base_uri": "https://localhost:8080/"
32+
},
33+
"id": "3syypoOe4SZ0",
34+
"outputId": "b319cd48-1f8c-46aa-8e76-721f90fb13b9"
35+
},
36+
"outputs": [
37+
{
38+
"output_type": "stream",
39+
"name": "stdout",
40+
"text": [
41+
"Requirement already satisfied: ktrain in /usr/local/lib/python3.10/dist-packages (0.37.6)\n",
42+
"Requirement already satisfied: scikit-learn in /usr/local/lib/python3.10/dist-packages (from ktrain) (1.2.2)\n",
43+
"Requirement already satisfied: matplotlib>=3.0.0 in /usr/local/lib/python3.10/dist-packages (from ktrain) (3.7.1)\n",
44+
"Requirement already satisfied: pandas>=1.0.1 in /usr/local/lib/python3.10/dist-packages (from ktrain) (1.5.3)\n",
45+
"Requirement already satisfied: fastprogress>=0.1.21 in /usr/local/lib/python3.10/dist-packages (from ktrain) (1.0.3)\n",
46+
"Requirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from ktrain) (2.27.1)\n",
47+
"Requirement already satisfied: joblib in /usr/local/lib/python3.10/dist-packages (from ktrain) (1.3.1)\n",
48+
"Requirement already satisfied: packaging in /usr/local/lib/python3.10/dist-packages (from ktrain) (23.1)\n",
49+
"Requirement already satisfied: langdetect in /usr/local/lib/python3.10/dist-packages (from ktrain) (1.0.9)\n",
50+
"Requirement already satisfied: jieba in /usr/local/lib/python3.10/dist-packages (from ktrain) (0.42.1)\n",
51+
"Requirement already satisfied: cchardet in /usr/local/lib/python3.10/dist-packages (from ktrain) (2.1.7)\n",
52+
"Requirement already satisfied: chardet in /usr/local/lib/python3.10/dist-packages (from ktrain) (4.0.0)\n",
53+
"Requirement already satisfied: syntok>1.3.3 in /usr/local/lib/python3.10/dist-packages (from ktrain) (1.4.4)\n",
54+
"Requirement already satisfied: tika in /usr/local/lib/python3.10/dist-packages (from ktrain) (2.6.0)\n",
55+
"Requirement already satisfied: transformers>=4.17.0 in /usr/local/lib/python3.10/dist-packages (from ktrain) (4.31.0)\n",
56+
"Requirement already satisfied: sentencepiece in /usr/local/lib/python3.10/dist-packages (from ktrain) (0.1.99)\n",
57+
"Requirement already satisfied: keras-bert>=0.86.0 in /usr/local/lib/python3.10/dist-packages (from ktrain) (0.89.0)\n",
58+
"Requirement already satisfied: whoosh in /usr/local/lib/python3.10/dist-packages (from ktrain) (2.7.4)\n",
59+
"Requirement already satisfied: numpy in /usr/local/lib/python3.10/dist-packages (from keras-bert>=0.86.0->ktrain) (1.22.4)\n",
60+
"Requirement already satisfied: keras-transformer==0.40.0 in /usr/local/lib/python3.10/dist-packages (from keras-bert>=0.86.0->ktrain) (0.40.0)\n",
61+
"Requirement already satisfied: keras-pos-embd==0.13.0 in /usr/local/lib/python3.10/dist-packages (from keras-transformer==0.40.0->keras-bert>=0.86.0->ktrain) (0.13.0)\n",
62+
"Requirement already satisfied: keras-multi-head==0.29.0 in /usr/local/lib/python3.10/dist-packages (from keras-transformer==0.40.0->keras-bert>=0.86.0->ktrain) (0.29.0)\n",
63+
"Requirement already satisfied: keras-layer-normalization==0.16.0 in /usr/local/lib/python3.10/dist-packages (from keras-transformer==0.40.0->keras-bert>=0.86.0->ktrain) (0.16.0)\n",
64+
"Requirement already satisfied: keras-position-wise-feed-forward==0.8.0 in /usr/local/lib/python3.10/dist-packages (from keras-transformer==0.40.0->keras-bert>=0.86.0->ktrain) (0.8.0)\n",
65+
"Requirement already satisfied: keras-embed-sim==0.10.0 in /usr/local/lib/python3.10/dist-packages (from keras-transformer==0.40.0->keras-bert>=0.86.0->ktrain) (0.10.0)\n",
66+
"Requirement already satisfied: keras-self-attention==0.51.0 in /usr/local/lib/python3.10/dist-packages (from keras-multi-head==0.29.0->keras-transformer==0.40.0->keras-bert>=0.86.0->ktrain) (0.51.0)\n",
67+
"Requirement already satisfied: contourpy>=1.0.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib>=3.0.0->ktrain) (1.1.0)\n",
68+
"Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.10/dist-packages (from matplotlib>=3.0.0->ktrain) (0.11.0)\n",
69+
"Requirement already satisfied: fonttools>=4.22.0 in /usr/local/lib/python3.10/dist-packages (from matplotlib>=3.0.0->ktrain) (4.41.0)\n",
70+
"Requirement already satisfied: kiwisolver>=1.0.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib>=3.0.0->ktrain) (1.4.4)\n",
71+
"Requirement already satisfied: pillow>=6.2.0 in /usr/local/lib/python3.10/dist-packages (from matplotlib>=3.0.0->ktrain) (8.4.0)\n",
72+
"Requirement already satisfied: pyparsing>=2.3.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib>=3.0.0->ktrain) (3.1.0)\n",
73+
"Requirement already satisfied: python-dateutil>=2.7 in /usr/local/lib/python3.10/dist-packages (from matplotlib>=3.0.0->ktrain) (2.8.2)\n",
74+
"Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.10/dist-packages (from pandas>=1.0.1->ktrain) (2022.7.1)\n",
75+
"Requirement already satisfied: regex>2016 in /usr/local/lib/python3.10/dist-packages (from syntok>1.3.3->ktrain) (2022.10.31)\n",
76+
"Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from transformers>=4.17.0->ktrain) (3.12.2)\n",
77+
"Requirement already satisfied: huggingface-hub<1.0,>=0.14.1 in /usr/local/lib/python3.10/dist-packages (from transformers>=4.17.0->ktrain) (0.16.4)\n",
78+
"Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.10/dist-packages (from transformers>=4.17.0->ktrain) (6.0.1)\n",
79+
"Requirement already satisfied: tokenizers!=0.11.3,<0.14,>=0.11.1 in /usr/local/lib/python3.10/dist-packages (from transformers>=4.17.0->ktrain) (0.13.3)\n",
80+
"Requirement already satisfied: safetensors>=0.3.1 in /usr/local/lib/python3.10/dist-packages (from transformers>=4.17.0->ktrain) (0.3.1)\n",
81+
"Requirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.10/dist-packages (from transformers>=4.17.0->ktrain) (4.65.0)\n",
82+
"Requirement already satisfied: six in /usr/local/lib/python3.10/dist-packages (from langdetect->ktrain) (1.16.0)\n",
83+
"Requirement already satisfied: urllib3<1.27,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests->ktrain) (1.26.16)\n",
84+
"Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests->ktrain) (2023.5.7)\n",
85+
"Requirement already satisfied: charset-normalizer~=2.0.0 in /usr/local/lib/python3.10/dist-packages (from requests->ktrain) (2.0.12)\n",
86+
"Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests->ktrain) (3.4)\n",
87+
"Requirement already satisfied: scipy>=1.3.2 in /usr/local/lib/python3.10/dist-packages (from scikit-learn->ktrain) (1.10.1)\n",
88+
"Requirement already satisfied: threadpoolctl>=2.0.0 in /usr/local/lib/python3.10/dist-packages (from scikit-learn->ktrain) (3.2.0)\n",
89+
"Requirement already satisfied: setuptools in /usr/local/lib/python3.10/dist-packages (from tika->ktrain) (67.7.2)\n",
90+
"Requirement already satisfied: fsspec in /usr/local/lib/python3.10/dist-packages (from huggingface-hub<1.0,>=0.14.1->transformers>=4.17.0->ktrain) (2023.6.0)\n",
91+
"Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub<1.0,>=0.14.1->transformers>=4.17.0->ktrain) (4.7.1)\n"
92+
]
93+
}
94+
],
95+
"source": [
96+
"!pip3 install ktrain"
97+
]
98+
},
99+
{
100+
"cell_type": "code",
101+
"source": [
102+
"import os.path\n",
103+
"import numpy as np\n",
104+
"import ktrain\n",
105+
"from ktrain import text\n",
106+
"import tensorflow"
107+
],
108+
"metadata": {
109+
"id": "0ZejN0MU6dnb"
110+
},
111+
"execution_count": null,
112+
"outputs": []
113+
},
114+
{
115+
"cell_type": "markdown",
116+
"source": [
117+
"#Dataset"
118+
],
119+
"metadata": {
120+
"id": "oSJh43dYC_I4"
121+
}
122+
},
123+
{
124+
"cell_type": "code",
125+
"source": [
126+
"data=tensorflow.keras.utils.get_file(fname=\"aclImdb_v1.tar.gz\",origin=\"http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz\",extract=True)"
127+
],
128+
"metadata": {
129+
"id": "navTD1Nu7NMH"
130+
},
131+
"execution_count": null,
132+
"outputs": []
133+
},
134+
{
135+
"cell_type": "code",
136+
"source": [
137+
"dir=os.path.join(os.path.dirname(data),\"aclImdb\")"
138+
],
139+
"metadata": {
140+
"id": "DJD9_h829wMX"
141+
},
142+
"execution_count": null,
143+
"outputs": []
144+
},
145+
{
146+
"cell_type": "code",
147+
"source": [
148+
"(x_train,y_train),(x_test,y_test),preproc=text.texts_from_folder(datadir=dir,classes=[\"pos\",\"neg\"],train_test_names=[\"train\",\"test\"],preprocess_mode=\"bert\")"
149+
],
150+
"metadata": {
151+
"colab": {
152+
"base_uri": "https://localhost:8080/",
153+
"height": 161
154+
},
155+
"id": "M84oU3gM-1zZ",
156+
"outputId": "4cfe9061-cd3f-4d21-8826-c78853d4e090"
157+
},
158+
"execution_count": null,
159+
"outputs": [
160+
{
161+
"output_type": "stream",
162+
"name": "stdout",
163+
"text": [
164+
"detected encoding: utf-8\n",
165+
"preprocessing train...\n",
166+
"language: en\n"
167+
]
168+
},
169+
{
170+
"output_type": "display_data",
171+
"data": {
172+
"text/plain": [
173+
"<IPython.core.display.HTML object>"
174+
],
175+
"text/html": [
176+
"\n",
177+
"<style>\n",
178+
" /* Turns off some styling */\n",
179+
" progress {\n",
180+
" /* gets rid of default border in Firefox and Opera. */\n",
181+
" border: none;\n",
182+
" /* Needs to be in here for Safari polyfill so background images work as expected. */\n",
183+
" background-size: auto;\n",
184+
" }\n",
185+
" progress:not([value]), progress:not([value])::-webkit-progress-bar {\n",
186+
" background: repeating-linear-gradient(45deg, #7e7e7e, #7e7e7e 10px, #5c5c5c 10px, #5c5c5c 20px);\n",
187+
" }\n",
188+
" .progress-bar-interrupted, .progress-bar-interrupted::-webkit-progress-bar {\n",
189+
" background: #F44336;\n",
190+
" }\n",
191+
"</style>\n"
192+
]
193+
},
194+
"metadata": {}
195+
},
196+
{
197+
"output_type": "display_data",
198+
"data": {
199+
"text/plain": [
200+
"<IPython.core.display.HTML object>"
201+
],
202+
"text/html": [
203+
"done."
204+
]
205+
},
206+
"metadata": {}
207+
},
208+
{
209+
"output_type": "stream",
210+
"name": "stdout",
211+
"text": [
212+
"Is Multi-Label? False\n",
213+
"preprocessing test...\n",
214+
"language: en\n"
215+
]
216+
},
217+
{
218+
"output_type": "display_data",
219+
"data": {
220+
"text/plain": [
221+
"<IPython.core.display.HTML object>"
222+
],
223+
"text/html": [
224+
"\n",
225+
"<style>\n",
226+
" /* Turns off some styling */\n",
227+
" progress {\n",
228+
" /* gets rid of default border in Firefox and Opera. */\n",
229+
" border: none;\n",
230+
" /* Needs to be in here for Safari polyfill so background images work as expected. */\n",
231+
" background-size: auto;\n",
232+
" }\n",
233+
" progress:not([value]), progress:not([value])::-webkit-progress-bar {\n",
234+
" background: repeating-linear-gradient(45deg, #7e7e7e, #7e7e7e 10px, #5c5c5c 10px, #5c5c5c 20px);\n",
235+
" }\n",
236+
" .progress-bar-interrupted, .progress-bar-interrupted::-webkit-progress-bar {\n",
237+
" background: #F44336;\n",
238+
" }\n",
239+
"</style>\n"
240+
]
241+
},
242+
"metadata": {}
243+
},
244+
{
245+
"output_type": "display_data",
246+
"data": {
247+
"text/plain": [
248+
"<IPython.core.display.HTML object>"
249+
],
250+
"text/html": [
251+
"done."
252+
]
253+
},
254+
"metadata": {}
255+
}
256+
]
257+
},
258+
{
259+
"cell_type": "markdown",
260+
"source": [
261+
"#BERT Model(Bidirectional Encoder Representations from Transformers)"
262+
],
263+
"metadata": {
264+
"id": "HsD1RIeyDDHi"
265+
}
266+
},
267+
{
268+
"cell_type": "code",
269+
"source": [
270+
"model=text.text_classifier(name=\"bert\",train_data=(x_train,y_train),preproc=preproc)"
271+
],
272+
"metadata": {
273+
"id": "egXY63ExDBG9",
274+
"colab": {
275+
"base_uri": "https://localhost:8080/"
276+
},
277+
"outputId": "9fec6679-1aeb-4098-e9d4-57cb869765cd"
278+
},
279+
"execution_count": null,
280+
"outputs": [
281+
{
282+
"output_type": "stream",
283+
"name": "stdout",
284+
"text": [
285+
"Is Multi-Label? False\n",
286+
"maxlen is 400\n"
287+
]
288+
},
289+
{
290+
"output_type": "stream",
291+
"name": "stderr",
292+
"text": [
293+
"/usr/local/lib/python3.10/dist-packages/keras/initializers/initializers.py:120: UserWarning: The initializer GlorotNormal is unseeded and being called multiple times, which will return identical values each time (even if the initializer is unseeded). Please update your code to provide a seed to the initializer, or avoid using the same initalizer instance more than once.\n",
294+
" warnings.warn(\n"
295+
]
296+
},
297+
{
298+
"output_type": "stream",
299+
"name": "stdout",
300+
"text": [
301+
"done.\n"
302+
]
303+
}
304+
]
305+
},
306+
{
307+
"cell_type": "code",
308+
"source": [
309+
"a=ktrain.get_learner(model=model,train_data=(x_train,y_train),val_data=(x_test,y_test),batch_size=32)"
310+
],
311+
"metadata": {
312+
"id": "ICtxz7LHaB1I",
313+
"colab": {
314+
"base_uri": "https://localhost:8080/"
315+
},
316+
"outputId": "c3b1c676-3fab-4445-e227-975f6a015e16"
317+
},
318+
"execution_count": null,
319+
"outputs": [
320+
{
321+
"output_type": "stream",
322+
"name": "stderr",
323+
"text": [
324+
"/usr/local/lib/python3.10/dist-packages/ktrain/__init__.py:100: UserWarning: For a GPU with 12GB of RAM, the following maxima apply:\n",
325+
" sequence len=64, max_batch_size=64\n",
326+
" sequence len=128, max_batch_size=32\n",
327+
" sequence len=256, max_batch_size=16\n",
328+
" sequence len=320, max_batch_size=14\n",
329+
" sequence len=384, max_batch_size=12\n",
330+
" sequence len=512, max_batch_size=6\n",
331+
"\n",
332+
" You've exceeded these limits.\n",
333+
" If using a GPU with <=12GB of memory, you may run out of memory during training.\n",
334+
" If necessary, adjust sequence length or batch size based on above.\n",
335+
" I.warnings.warn(msg)\n"
336+
]
337+
}
338+
]
339+
},
340+
{
341+
"cell_type": "code",
342+
"source": [
343+
"a.fit_onecycle(lr=2e-5,epochs=1)"
344+
],
345+
"metadata": {
346+
"id": "mAjZxMowbr_R",
347+
"colab": {
348+
"base_uri": "https://localhost:8080/",
349+
"height": 171
350+
},
351+
"outputId": "47cc0abe-4083-4cd5-cc8d-6d1ee3e5cb31"
352+
},
353+
"execution_count": null,
354+
"outputs": [
355+
{
356+
"output_type": "error",
357+
"ename": "NameError",
358+
"evalue": "ignored",
359+
"traceback": [
360+
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
361+
"\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
362+
"\u001b[0;32m<ipython-input-1-3c959640d8b7>\u001b[0m in \u001b[0;36m<cell line: 1>\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0ma\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfit_onecycle\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlr\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m2e-5\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mepochs\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
363+
"\u001b[0;31mNameError\u001b[0m: name 'a' is not defined"
364+
]
365+
}
366+
]
367+
}
368+
]
369+
}
Binary file not shown.

0 commit comments

Comments
 (0)