
Commit 3c22e2f

Module 4: logistic regression + SVM with SKLearn
1 parent 63461cf commit 3c22e2f

2 files changed: +331 −1 lines changed


PyDM.Module4/project2.ipynb

Lines changed: 330 additions & 0 deletions
@@ -0,0 +1,330 @@
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**Note:** I tried several times to run the `Logistic Regression` procedure with PySpark, but it repeatedly stalled during processing, so I researched how to do the same thing with SKLearn instead. Fortunately, SKLearn gave better results than the PySpark framework for this purpose."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "from pandas.core.frame import DataFrame\n",
    "from pandas.core.series import Series\n",
    "\n",
    "import string\n",
    "\n",
    "import nltk\n",
    "from nltk.corpus import stopwords\n",
    "\n",
    "from numpy import ndarray\n",
    "\n",
    "from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "from sklearn.metrics import accuracy_score\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.pipeline import Pipeline\n",
    "from sklearn.utils import shuffle"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "nltk.download('stopwords', quiet=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Number of entries per dataframe\n",
    "NUMBER_ENTRIES_PER_DF = 100"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_subsets(df: DataFrame, subset: str) -> list:\n",
    "    # Unique values of the given column, as a list\n",
    "    return df.drop_duplicates(subset=[subset])[subset].to_list()\n",
    "\n",
    "def get_accuracy(s: Series, arr: ndarray) -> float:\n",
    "    # Accuracy as a percentage, rounded to two decimal places\n",
    "    return round(accuracy_score(s, arr) * 100, 2)\n",
    "\n",
    "def get_random_df(df: DataFrame) -> DataFrame:\n",
    "    # Shuffle the dataframe and keep the first NUMBER_ENTRIES_PER_DF rows\n",
    "    n_df = shuffle(df.reset_index(drop=True))\\\n",
    "        .head(NUMBER_ENTRIES_PER_DF)\\\n",
    "        .reset_index(drop=True)\n",
    "    n_df.info()\n",
    "    return n_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "t_df = pd.read_csv('../Datasets/True.csv')\n",
    "f_df = pd.read_csv('../Datasets/Fake.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "t_df['label'] = 'Real News'\n",
    "f_df['label'] = 'Fake News'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Real News - DF info:\n",
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 100 entries, 0 to 99\n",
      "Data columns (total 5 columns):\n",
      " #   Column   Non-Null Count  Dtype \n",
      "---  ------   --------------  ----- \n",
      " 0   title    100 non-null    object\n",
      " 1   text     100 non-null    object\n",
      " 2   subject  100 non-null    object\n",
      " 3   date     100 non-null    object\n",
      " 4   label    100 non-null    object\n",
      "dtypes: object(5)\n",
      "memory usage: 4.0+ KB\n"
     ]
    }
   ],
   "source": [
    "print(\"Real News - DF info:\")\n",
    "t_df = get_random_df(t_df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Fake News - DF info:\n",
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 100 entries, 0 to 99\n",
      "Data columns (total 5 columns):\n",
      " #   Column   Non-Null Count  Dtype \n",
      "---  ------   --------------  ----- \n",
      " 0   title    100 non-null    object\n",
      " 1   text     100 non-null    object\n",
      " 2   subject  100 non-null    object\n",
      " 3   date     100 non-null    object\n",
      " 4   label    100 non-null    object\n",
      "dtypes: object(5)\n",
      "memory usage: 4.0+ KB\n"
     ]
    }
   ],
   "source": [
    "print(\"Fake News - DF info:\")\n",
    "f_df = get_random_df(f_df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 200 entries, 0 to 199\n",
      "Data columns (total 4 columns):\n",
      " #   Column   Non-Null Count  Dtype \n",
      "---  ------   --------------  ----- \n",
      " 0   title    200 non-null    object\n",
      " 1   text     200 non-null    object\n",
      " 2   subject  200 non-null    object\n",
      " 3   label    200 non-null    object\n",
      "dtypes: object(4)\n",
      "memory usage: 6.4+ KB\n"
     ]
    }
   ],
   "source": [
    "df = shuffle(pd.concat([t_df, f_df]).reset_index(drop=True))\n",
    "df = df.reset_index(drop=True)\n",
    "df.drop(['date'], axis=1, inplace=True)\n",
    "df.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Lower-case the text, strip punctuation, and drop English stop words\n",
    "df['text'] = df['text'].map(\n",
    "    lambda x: ' '.join(\n",
    "        word for word in x.lower()\n",
    "            .translate(str.maketrans('', '', string.punctuation))\n",
    "            .split()\n",
    "        if word not in stopwords.words('english')\n",
    "    )\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Subjects: ['politics', 'News', 'politicsNews', 'worldnews', 'left-news', 'US_News', 'Government News', 'Middle-east']\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "subject\n",
       "Government News     5\n",
       "Middle-east         2\n",
       "News               39\n",
       "US_News             2\n",
       "left-news          16\n",
       "politics           36\n",
       "politicsNews       46\n",
       "worldnews          54\n",
       "Name: label, dtype: int64"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "print(\"Subjects: {}\".format(get_subsets(df, 'subject')))\n",
    "df.groupby(['subject'])['label'].count()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "X_training, X_testing, y_training, y_testing = train_test_split(\n",
    "    df['text'],\n",
    "    df['label'],\n",
    "    test_size=0.3\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "ml_pipeline = Pipeline([\n",
    "    ('vect', CountVectorizer()),\n",
    "    ('tfidf', TfidfTransformer()),\n",
    "    ('model', LogisticRegression())\n",
    "])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "ml_model = ml_pipeline.fit(X_training, y_training)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "ml_preds = ml_model.predict(X_testing)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Prediction accuracy: 91.67%\n"
     ]
    }
   ],
   "source": [
    "print(\"Prediction accuracy: {}%\".format(get_accuracy(y_testing, ml_preds)))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
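
Aside: the commit message mentions SVM alongside logistic regression, but the notebook above only fits a `LogisticRegression` model. Below is a minimal sketch of how an SVM could be dropped into the same pipeline; this is an illustration using sklearn's `LinearSVC`, not code from the commit, and it assumes the notebook's `X_training`, `X_testing`, `y_training`, `y_testing`, and `get_accuracy` are already defined.

# Hypothetical SVM counterpart to the logistic-regression pipeline above.
# Assumes X_training, X_testing, y_training, y_testing, and get_accuracy
# exist as defined in the notebook; LinearSVC is one possible SVM choice.
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

svm_pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('model', LinearSVC())  # swap a linear SVM in for LogisticRegression
])

svm_model = svm_pipeline.fit(X_training, y_training)
svm_preds = svm_model.predict(X_testing)
print("SVM prediction accuracy: {}%".format(get_accuracy(y_testing, svm_preds)))

`LinearSVC` has no `predict_proba`, but it trains quickly on sparse TF-IDF features, which makes it a common drop-in for text classification.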

README.md

Lines changed: 1 addition & 1 deletion
@@ -5,7 +5,7 @@ This repository contains all practices from [Pirple's "Data Mining With Python"
 - **Module 1** - [`Data Wrangling`][module-1]
 - **Module 2** - [`Data Mining Fundamentals`][module-2]
 - **Module 3** - [`Frameworks Explained`][module-3]
-- **Module 4** - [`Mining and Storing Data`][module-4] :hammer:
+- **Module 4** - [`Mining and Storing Data`][module-4]
 
 ## Languages
 ![python-language-badge] ![jupyter-language-badge]
