
Commit 3c22e2f

Module 4: logistic regression + SVM with SKLearn
1 parent 63461cf commit 3c22e2f

2 files changed: +331 −1 lines changed


PyDM.Module4/project2.ipynb

Lines changed: 330 additions & 0 deletions
@@ -0,0 +1,330 @@
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**Note:** I tried several times to run the `Logistic Regression` procedure with PySpark, but it repeatedly stalled during processing, so I researched how to do the same thing with SKLearn instead. Fortunately, SKLearn gave better results than the PySpark framework for this purpose."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "from pandas.core.frame import DataFrame\n",
    "from pandas.core.series import Series\n",
    "\n",
    "import string\n",
    "\n",
    "import nltk\n",
    "from nltk.corpus import stopwords\n",
    "\n",
    "from numpy import ndarray\n",
    "\n",
    "from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "from sklearn.metrics import accuracy_score\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.pipeline import Pipeline\n",
    "from sklearn.utils import shuffle"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "nltk.download('stopwords', quiet=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Number of entries per dataframe\n",
    "NUMBER_ENTRIES_PER_DF = 100"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_subsets(df: DataFrame, subset: str) -> list:\n",
    "    # Unique values of the given column, as a list\n",
    "    return df.drop_duplicates(subset=[subset])[subset].to_list()\n",
    "\n",
    "def get_accuracy(s: Series, arr: ndarray) -> float:\n",
    "    # Accuracy as a percentage, rounded to two decimal places\n",
    "    return round(accuracy_score(s, arr) * 100, 2)\n",
    "\n",
    "def get_random_df(df: DataFrame) -> DataFrame:\n",
    "    # Shuffle the dataframe and keep the first NUMBER_ENTRIES_PER_DF rows\n",
    "    n_df = shuffle(df.reset_index(drop=True))\\\n",
    "        .head(NUMBER_ENTRIES_PER_DF)\\\n",
    "        .reset_index(drop=True)\n",
    "    n_df.info()\n",
    "    return n_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "t_df = pd.read_csv('../Datasets/True.csv')\n",
    "f_df = pd.read_csv('../Datasets/Fake.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "t_df['label'] = 'Real News'\n",
    "f_df['label'] = 'Fake News'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Real News - DF info:\n",
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 100 entries, 0 to 99\n",
      "Data columns (total 5 columns):\n",
      " #   Column   Non-Null Count  Dtype \n",
      "---  ------   --------------  ----- \n",
      " 0   title    100 non-null    object\n",
      " 1   text     100 non-null    object\n",
      " 2   subject  100 non-null    object\n",
      " 3   date     100 non-null    object\n",
      " 4   label    100 non-null    object\n",
      "dtypes: object(5)\n",
      "memory usage: 4.0+ KB\n"
     ]
    }
   ],
   "source": [
    "print(\"Real News - DF info:\")\n",
    "t_df = get_random_df(t_df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Fake News - DF info:\n",
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 100 entries, 0 to 99\n",
      "Data columns (total 5 columns):\n",
      " #   Column   Non-Null Count  Dtype \n",
      "---  ------   --------------  ----- \n",
      " 0   title    100 non-null    object\n",
      " 1   text     100 non-null    object\n",
      " 2   subject  100 non-null    object\n",
      " 3   date     100 non-null    object\n",
      " 4   label    100 non-null    object\n",
      "dtypes: object(5)\n",
      "memory usage: 4.0+ KB\n"
     ]
    }
   ],
   "source": [
    "print(\"Fake News - DF info:\")\n",
    "f_df = get_random_df(f_df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 200 entries, 0 to 199\n",
      "Data columns (total 4 columns):\n",
      " #   Column   Non-Null Count  Dtype \n",
      "---  ------   --------------  ----- \n",
      " 0   title    200 non-null    object\n",
      " 1   text     200 non-null    object\n",
      " 2   subject  200 non-null    object\n",
      " 3   label    200 non-null    object\n",
      "dtypes: object(4)\n",
      "memory usage: 6.4+ KB\n"
     ]
    }
   ],
   "source": [
    "df = shuffle(pd.concat([t_df, f_df]).reset_index(drop=True))\n",
    "df = df.reset_index(drop=True)\n",
    "df.drop(['date'], axis=1, inplace=True)\n",
    "df.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Lower-case the text, strip punctuation, and drop English stop words\n",
    "df['text'] = df['text'].map(\n",
    "    lambda x: ' '.join(\n",
    "        word for word in x.lower()\n",
    "            .translate(str.maketrans('', '', string.punctuation))\n",
    "            .split()\n",
    "        if word not in stopwords.words('english')\n",
    "    )\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Subjects: ['politics', 'News', 'politicsNews', 'worldnews', 'left-news', 'US_News', 'Government News', 'Middle-east']\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "subject\n",
       "Government News     5\n",
       "Middle-east         2\n",
       "News               39\n",
       "US_News             2\n",
       "left-news          16\n",
       "politics           36\n",
       "politicsNews       46\n",
       "worldnews          54\n",
       "Name: label, dtype: int64"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "print(\"Subjects: {}\".format(get_subsets(df, 'subject')))\n",
    "df.groupby(['subject'])['label'].count()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "X_training, X_testing, y_training, y_testing = train_test_split(\n",
    "    df['text'],\n",
    "    df['label'],\n",
    "    test_size=0.3\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "ml_pipeline = Pipeline([\n",
    "    ('vect', CountVectorizer()),\n",
    "    ('tfidf', TfidfTransformer()),\n",
    "    ('model', LogisticRegression())\n",
    "])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "ml_model = ml_pipeline.fit(X_training, y_training)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "ml_preds = ml_model.predict(X_testing)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Prediction accuracy: 91.67%\n"
     ]
    }
   ],
   "source": [
    "print(\"Prediction accuracy: {}%\".format(get_accuracy(y_testing, ml_preds)))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
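
Aside: the commit message mentions SVM alongside logistic regression, but the notebook above only fits a `LogisticRegression` model. Below is a minimal sketch of how an SVM could be dropped into the same pipeline; this is an illustration using sklearn's `LinearSVC`, not code from the commit, and it assumes the notebook's `X_training`, `X_testing`, `y_training`, `y_testing`, and `get_accuracy` are already defined.

# Hypothetical SVM counterpart to the logistic-regression pipeline above.
# Assumes X_training, X_testing, y_training, y_testing, and get_accuracy
# exist as defined in the notebook; LinearSVC is one possible SVM choice.
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

svm_pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('model', LinearSVC())  # swap a linear SVM in for LogisticRegression
])

svm_model = svm_pipeline.fit(X_training, y_training)
svm_preds = svm_model.predict(X_testing)
print("SVM prediction accuracy: {}%".format(get_accuracy(y_testing, svm_preds)))

`LinearSVC` has no `predict_proba`, but it trains quickly on sparse TF-IDF features, which makes it a common drop-in for text classification.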

README.md

Lines changed: 1 addition & 1 deletion
@@ -5,7 +5,7 @@ This repository contains all practices from [Pirple's "Data Mining With Python"
 - **Module 1** - [`Data Wrangling`][module-1]
 - **Module 2** - [`Data Mining Fundamentals`][module-2]
 - **Module 3** - [`Frameworks Explained`][module-3]
-- **Module 4** - [`Mining and Storing Data`][module-4] :hammer:
+- **Module 4** - [`Mining and Storing Data`][module-4]
 
 ## Languages
 ![python-language-badge] ![jupyter-language-badge]
