This repository was archived by the owner on Nov 28, 2020. It is now read-only.

Commit 655fa04

Module 5: SKLearn TF-IDF, LR, CV + fake news prediction with a RESTful API
1 parent 8ea6f04 commit 655fa04
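
In outline, the committed notebook builds a scikit-learn Pipeline of CountVectorizer → TfidfTransformer → LogisticRegression, trains it on labelled news text, and reports accuracy. The snippet below is a minimal, self-contained sketch of that flow; the four-row toy corpus and its labels are placeholders (the notebook itself trains on ../Datasets/True.csv plus articles fetched live from newsapi.org).

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

# Placeholder corpus and labels -- the notebook uses real articles instead.
texts = [
    "senate passes budget bill after long debate",
    "president signs trade agreement with allies",
    "shocking miracle cure hidden from the public",
    "celebrity reveals secret plot behind elections",
] * 10
labels = ["Real News", "Real News", "News", "News"] * 10

X_train, X_test, y_train, y_test = train_test_split(texts, labels, random_state=0)

pipeline = Pipeline([
    ("vect", CountVectorizer()),      # raw token counts
    ("tfidf", TfidfTransformer()),    # TF-IDF re-weighting of the counts
    ("model", LogisticRegression()),  # linear classifier on TF-IDF features
])

model = pipeline.fit(X_train, y_train)
preds = model.predict(X_test)
print("Accuracy: {:.2f}%".format(accuracy_score(y_test, preds) * 100))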

File tree

2 files changed: 381 additions & 1 deletion


PyDM.Module5/project3.ipynb

Lines changed: 380 additions & 0 deletions
@@ -0,0 +1,380 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "from pandas.core.frame import DataFrame\n",
    "from pandas.core.series import Series\n",
    "\n",
    "import string\n",
    "\n",
    "import nltk\n",
    "from nltk.corpus import stopwords\n",
    "\n",
    "from numpy import ndarray\n",
    "\n",
    "import requests as rq\n",
    "\n",
    "from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "from sklearn.metrics import accuracy_score\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.pipeline import Pipeline\n",
    "from sklearn.utils import shuffle"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "nltk.download('stopwords', quiet=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Number of entries per dataframe\n",
    "NUMBER_ENTRIES_PER_DF = 100\n",
    "\n",
    "# API key for the News API\n",
    "API_KEY = \"c1e1bd32547f418190f6bd1a5fa4748d\"\n",
    "\n",
    "# API URI\n",
    "URI = \"https://newsapi.org/v2/everything?apiKey={apiKey}&q=politics&sortBy=popularity\".format(apiKey=API_KEY)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_subsets(df: DataFrame, subset: str) -> list:\n",
    "    return df.drop_duplicates(subset=[subset])[subset].to_list()\n",
    "\n",
    "def get_accuracy(s: Series, arr: ndarray) -> float:\n",
    "    return round(accuracy_score(s, arr) * 100, 2)\n",
    "\n",
    "def get_random_df(df: DataFrame) -> DataFrame:\n",
    "    n_df = shuffle(df.reset_index(drop=True))\\\n",
    "        .head(NUMBER_ENTRIES_PER_DF)\\\n",
    "        .reset_index(drop=True)\n",
    "    n_df.info()\n",
    "    return n_df\n",
    "\n",
    "def get_remote_news() -> dict:\n",
    "    response = rq.get(URI)\n",
    "    return response.json()\n",
    "\n",
    "def get_dataframe_from_api_response(data: dict,\n",
    "                                    cols_ref: list) -> DataFrame:\n",
    "    news = []\n",
    "    for article in data['articles']:\n",
    "        news.append([\n",
    "            article['title'],        # title\n",
    "            article['description'],  # text\n",
    "            'politicsNews',          # subject\n",
    "            article['publishedAt'],  # date\n",
    "            'News'                   # label\n",
    "        ])\n",
    "    return DataFrame(news, columns=cols_ref)\n",
    "\n",
    "def categorize_news(accuracy: float) -> str:\n",
    "    return \"likely \" + (\"true\" if accuracy >= 50\n",
    "                        else \"false\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "t_df = pd.read_csv('../Datasets/True.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "t_df['label'] = 'Real News'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Real News - DF info:\n",
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 100 entries, 0 to 99\n",
      "Data columns (total 5 columns):\n",
      " #   Column   Non-Null Count  Dtype \n",
      "---  ------   --------------  ----- \n",
      " 0   title    100 non-null    object\n",
      " 1   text     100 non-null    object\n",
      " 2   subject  100 non-null    object\n",
      " 3   date     100 non-null    object\n",
      " 4   label    100 non-null    object\n",
      "dtypes: object(5)\n",
      "memory usage: 4.0+ KB\n"
     ]
    }
   ],
   "source": [
    "print(\"Real News - DF info:\")\n",
    "t_df = get_random_df(t_df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "April 11, 2017 \n",
      "September 9, 2017 \n"
     ]
    }
   ],
   "source": [
    "print(t_df['date'].min())\n",
    "print(t_df['date'].max())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "n_data = get_remote_news()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "News - DF info:\n",
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 20 entries, 0 to 19\n",
      "Data columns (total 5 columns):\n",
      " #   Column   Non-Null Count  Dtype \n",
      "---  ------   --------------  ----- \n",
      " 0   title    20 non-null     object\n",
      " 1   text     20 non-null     object\n",
      " 2   subject  20 non-null     object\n",
      " 3   date     20 non-null     object\n",
      " 4   label    20 non-null     object\n",
      "dtypes: object(5)\n",
      "memory usage: 928.0+ bytes\n"
     ]
    }
   ],
   "source": [
    "print(\"News - DF info:\")\n",
    "n_df = get_dataframe_from_api_response(n_data, t_df.columns.to_list())\n",
    "n_df = get_random_df(n_df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 120 entries, 0 to 119\n",
      "Data columns (total 4 columns):\n",
      " #   Column   Non-Null Count  Dtype \n",
      "---  ------   --------------  ----- \n",
      " 0   title    120 non-null    object\n",
      " 1   text     120 non-null    object\n",
      " 2   subject  120 non-null    object\n",
      " 3   label    120 non-null    object\n",
      "dtypes: object(4)\n",
      "memory usage: 3.9+ KB\n"
     ]
    }
   ],
   "source": [
    "df = shuffle(pd\\\n",
    "    .concat([t_df, n_df])\\\n",
    "    .reset_index(drop=True))\n",
    "df = df.reset_index(drop=True)\n",
    "df.drop(['date'], axis=1, inplace=True)\n",
    "df.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "df['text'] = df['text']\\\n",
    "    .map(lambda x: ' '.join(\n",
    "        word for word in x.lower()\n",
    "            .translate(str.maketrans('', '', string.punctuation))\n",
    "            .split() if word not in stopwords.words('english')))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Subjects: ['worldnews', 'politicsNews']\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "subject\n",
       "politicsNews    67\n",
       "worldnews       53\n",
       "Name: label, dtype: int64"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "print(\"Subjects: {}\".format(get_subsets(df, 'subject')))\n",
    "df.groupby(['subject'])['label'].count()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "X_training, X_testing, y_training, y_testing = train_test_split(\n",
    "    df['text'],\n",
    "    df['label']\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "ml_pipeline = Pipeline([\n",
    "    ('vect', CountVectorizer()),\n",
    "    ('tfidf', TfidfTransformer()),\n",
    "    ('model', LogisticRegression())\n",
    "])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "ml_model = ml_pipeline.fit(X_training, y_training)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "ml_preds = ml_model.predict(X_testing)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Prediction accuracy: 76.67% (truthfulness rating: likely true)\n"
     ]
    }
   ],
   "source": [
    "acc = get_accuracy(y_testing, ml_preds)\n",
    "print(\"Prediction accuracy: {}% (truthfulness rating: {})\".format(acc, categorize_news(acc)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}

README.md

Lines changed: 1 addition & 1 deletion
@@ -6,7 +6,7 @@ This repository contains all practices from [Pirple's "Data Mining With Python"
 - **Module 2** - [`Data Mining Fundamentals`][module-2]
 - **Module 3** - [`Frameworks Explained`][module-3]
 - **Module 4** - [`Mining and Storing Data`][module-4]
-- **Module 5** - [`Natural Language Processing`][module-5] :hammer:
+- **Module 5** - [`Natural Language Processing`][module-5]
 
 ## Languages
 ![python-language-badge] ![jupyter-language-badge]
