This repository was archived by the owner on Nov 28, 2020. It is now read-only.

Commit 655fa04

Module 5: SKLearn TF-IDF, LR, CV + fake news prediction with a RESTful API
1 parent 8ea6f04 commit 655fa04
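
In outline, the committed notebook builds a scikit-learn Pipeline of CountVectorizer → TfidfTransformer → LogisticRegression, trains it on labelled news text, and reports accuracy. The snippet below is a minimal, self-contained sketch of that flow; the four-row toy corpus and its labels are placeholders (the notebook itself trains on ../Datasets/True.csv plus articles fetched live from newsapi.org).

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

# Placeholder corpus and labels -- the notebook uses real articles instead.
texts = [
    "senate passes budget bill after long debate",
    "president signs trade agreement with allies",
    "shocking miracle cure hidden from the public",
    "celebrity reveals secret plot behind elections",
] * 10
labels = ["Real News", "Real News", "News", "News"] * 10

X_train, X_test, y_train, y_test = train_test_split(texts, labels, random_state=0)

pipeline = Pipeline([
    ("vect", CountVectorizer()),      # raw token counts
    ("tfidf", TfidfTransformer()),    # TF-IDF re-weighting of the counts
    ("model", LogisticRegression()),  # linear classifier on TF-IDF features
])

model = pipeline.fit(X_train, y_train)
preds = model.predict(X_test)
print("Accuracy: {:.2f}%".format(accuracy_score(y_test, preds) * 100))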

File tree

2 files changed: 381 additions & 1 deletion


PyDM.Module5/project3.ipynb

Lines changed: 380 additions & 0 deletions
@@ -0,0 +1,380 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "from pandas.core.frame import DataFrame\n",
    "from pandas.core.series import Series\n",
    "\n",
    "import string\n",
    "\n",
    "import nltk\n",
    "from nltk.corpus import stopwords\n",
    "\n",
    "from numpy import ndarray\n",
    "\n",
    "import requests as rq\n",
    "\n",
    "from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "from sklearn.metrics import accuracy_score\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.pipeline import Pipeline\n",
    "from sklearn.utils import shuffle"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "nltk.download('stopwords', quiet=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Number of entries per dataframe\n",
    "NUMBER_ENTRIES_PER_DF = 100\n",
    "\n",
    "# API key for the News API\n",
    "API_KEY = \"c1e1bd32547f418190f6bd1a5fa4748d\"\n",
    "\n",
    "# API URI\n",
    "URI = \"https://newsapi.org/v2/everything?apiKey={apiKey}&q=politics&sortBy=popularity\".format(apiKey=API_KEY)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_subsets(df: DataFrame, subset: str) -> list:\n",
    "    return df.drop_duplicates(subset=[subset])[subset].to_list()\n",
    "\n",
    "def get_accuracy(s: Series, arr: ndarray) -> float:\n",
    "    return round(accuracy_score(s, arr) * 100, 2)\n",
    "\n",
    "def get_random_df(df: DataFrame) -> DataFrame:\n",
    "    n_df = shuffle(df.reset_index(drop=True))\\\n",
    "        .head(NUMBER_ENTRIES_PER_DF)\\\n",
    "        .reset_index(drop=True)\n",
    "    n_df.info()\n",
    "    return n_df\n",
    "\n",
    "def get_remote_news() -> dict:\n",
    "    response = rq.get(URI)\n",
    "    return response.json()\n",
    "\n",
    "def get_dataframe_from_api_response(data: dict,\n",
    "                                    cols_ref: list) -> DataFrame:\n",
    "    news = []\n",
    "    for article in data['articles']:\n",
    "        news.append([\n",
    "            article['title'],        # title\n",
    "            article['description'],  # text\n",
    "            'politicsNews',          # subject\n",
    "            article['publishedAt'],  # date\n",
    "            'News'                   # label\n",
    "        ])\n",
    "    return DataFrame(news, columns=cols_ref)\n",
    "\n",
    "def categorize_news(accuracy: float) -> str:\n",
    "    return \"likely \" + (\"true\" if accuracy >= 50\n",
    "                        else \"false\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "t_df = pd.read_csv('../Datasets/True.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "t_df['label'] = 'Real News'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Real News - DF info:\n",
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 100 entries, 0 to 99\n",
      "Data columns (total 5 columns):\n",
      " #   Column   Non-Null Count  Dtype \n",
      "---  ------   --------------  ----- \n",
      " 0   title    100 non-null    object\n",
      " 1   text     100 non-null    object\n",
      " 2   subject  100 non-null    object\n",
      " 3   date     100 non-null    object\n",
      " 4   label    100 non-null    object\n",
      "dtypes: object(5)\n",
      "memory usage: 4.0+ KB\n"
     ]
    }
   ],
   "source": [
    "print(\"Real News - DF info:\")\n",
    "t_df = get_random_df(t_df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "April 11, 2017 \n",
      "September 9, 2017 \n"
     ]
    }
   ],
   "source": [
    "print(t_df['date'].min())\n",
    "print(t_df['date'].max())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "n_data = get_remote_news()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "News - DF info:\n",
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 20 entries, 0 to 19\n",
      "Data columns (total 5 columns):\n",
      " #   Column   Non-Null Count  Dtype \n",
      "---  ------   --------------  ----- \n",
      " 0   title    20 non-null     object\n",
      " 1   text     20 non-null     object\n",
      " 2   subject  20 non-null     object\n",
      " 3   date     20 non-null     object\n",
      " 4   label    20 non-null     object\n",
      "dtypes: object(5)\n",
      "memory usage: 928.0+ bytes\n"
     ]
    }
   ],
   "source": [
    "print(\"News - DF info:\")\n",
    "n_df = get_dataframe_from_api_response(n_data, t_df.columns.to_list())\n",
    "n_df = get_random_df(n_df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 120 entries, 0 to 119\n",
      "Data columns (total 4 columns):\n",
      " #   Column   Non-Null Count  Dtype \n",
      "---  ------   --------------  ----- \n",
      " 0   title    120 non-null    object\n",
      " 1   text     120 non-null    object\n",
      " 2   subject  120 non-null    object\n",
      " 3   label    120 non-null    object\n",
      "dtypes: object(4)\n",
      "memory usage: 3.9+ KB\n"
     ]
    }
   ],
   "source": [
    "df = shuffle(pd\\\n",
    "    .concat([t_df, n_df])\\\n",
    "    .reset_index(drop=True))\n",
    "df = df.reset_index(drop=True)\n",
    "df.drop(['date'], axis=1, inplace=True)\n",
    "df.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "df['text'] = df['text']\\\n",
    "    .map(lambda x: ' '.join(\n",
    "        word for word in x.lower()\n",
    "            .translate(str.maketrans('', '', string.punctuation))\n",
    "            .split() if word not in stopwords.words('english')))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Subjects: ['worldnews', 'politicsNews']\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "subject\n",
       "politicsNews    67\n",
       "worldnews       53\n",
       "Name: label, dtype: int64"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "print(\"Subjects: {}\".format(get_subsets(df, 'subject')))\n",
    "df.groupby(['subject'])['label'].count()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "X_training, X_testing, y_training, y_testing = train_test_split(\n",
    "    df['text'],\n",
    "    df['label']\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "ml_pipeline = Pipeline([\n",
    "    ('vect', CountVectorizer()),\n",
    "    ('tfidf', TfidfTransformer()),\n",
    "    ('model', LogisticRegression())\n",
    "])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "ml_model = ml_pipeline.fit(X_training, y_training)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "ml_preds = ml_model.predict(X_testing)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Prediction accuracy: 76.67% (truthfulness rating: likely true)\n"
     ]
    }
   ],
   "source": [
    "acc = get_accuracy(y_testing, ml_preds)\n",
    "print(\"Prediction accuracy: {}% (truthfulness rating: {})\".format(acc, categorize_news(acc)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}

README.md

Lines changed: 1 addition & 1 deletion
@@ -6,7 +6,7 @@ This repository contains all practices from [Pirple's "Data Mining With Python"
 - **Module 2** - [`Data Mining Fundamentals`][module-2]
 - **Module 3** - [`Frameworks Explained`][module-3]
 - **Module 4** - [`Mining and Storing Data`][module-4]
-- **Module 5** - [`Natural Language Processing`][module-5] :hammer:
+- **Module 5** - [`Natural Language Processing`][module-5]
 
 ## Languages
 ![python-language-badge] ![jupyter-language-badge]
