Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
55 commits
Select commit Hold shift + click to select a range
e02c08a
Setting up GitHub Classroom Feedback
github-classroom[bot] Jul 15, 2022
11d4c7b
Rodrigo (#3)
Rodrigox30 Jul 19, 2022
a2d58e3
Update README.md
kimimedina Jul 20, 2022
c3bd8c4
Update README.md
kimimedina Jul 20, 2022
b21fba5
DesignDoc
resoj Jul 21, 2022
1fc2cdb
Delete .DS_Store
Rodrigox30 Jul 25, 2022
c2a33b0
Rodrigo (#5)
Rodrigox30 Jul 25, 2022
2566d68
Update README.md
resoj Jul 25, 2022
f1d91a6
GB and Kmeans Classifiers
LukeTaylor1 Jul 25, 2022
49865d7
Merge branch 'main' of https://github.com/Applied-Machine-Learning-20…
LukeTaylor1 Jul 25, 2022
e763e23
Update README.md
kimimedina Jul 25, 2022
2765cee
Update README.md
kimimedina Jul 25, 2022
3e10532
Obsidian_Model (#6)
resoj Jul 26, 2022
0db5ea0
Changes made to read me
Rodrigox30 Jul 27, 2022
de4bedf
Update README.md
Rodrigox30 Jul 27, 2022
5166ac3
Replaced tensorflow model with cleaner version
Rodrigox30 Jul 27, 2022
ea503d1
Comments (#7)
resoj Jul 27, 2022
0480463
KneighborsClassifier updated
Rodrigox30 Jul 27, 2022
033dd8b
Merge branch 'main' of https://github.com/Applied-Machine-Learning-20…
Rodrigox30 Jul 27, 2022
119aed8
Luke (#9)
LukeTaylor1 Jul 28, 2022
121aee3
remove repeated files
LukeTaylor1 Jul 28, 2022
1fc461e
READ ME updates
Rodrigox30 Jul 29, 2022
58de9f1
Merge branch 'main' of https://github.com/Applied-Machine-Learning-20…
Rodrigox30 Jul 29, 2022
7fa04c6
Update to KNeighborsClassifier
Rodrigox30 Jul 29, 2022
3a0963a
Chert Model (#11)
kimimedina Jul 29, 2022
bbd838b
README by Kimi (#12)
kimimedina Jul 29, 2022
0adc7ec
Update README.md
kimimedina Jul 29, 2022
52d693f
Presentation
LukeTaylor1 Jul 29, 2022
9791786
added link to power point
LukeTaylor1 Jul 29, 2022
a61e594
Luke's ReadMe
LukeTaylor1 Jul 29, 2022
bb308de
Update README.md
LukeTaylor1 Jul 29, 2022
c7afba1
Add files via upload
resoj Jul 29, 2022
eb96164
Add files via upload
resoj Jul 29, 2022
787d56a
Update Obsidian Decision Tree Model.md
resoj Jul 29, 2022
3a8b8a7
Update README.md
resoj Jul 29, 2022
2cfbaee
Updated Link
Rodrigox30 Jul 29, 2022
842af9b
Update README.md
Rodrigox30 Jul 29, 2022
14be5fa
Merged all data into one dataframe and cleaned it (#2)
kimimedina Jul 29, 2022
3df1641
Delete EXP-00001-Master.xlsx
LukeTaylor1 Jul 30, 2022
083cdf0
Delete EXP-00002-Master.xlsx
LukeTaylor1 Jul 30, 2022
32e252f
Delete EXP-00003-Master.xlsx
LukeTaylor1 Jul 30, 2022
7d5f82e
Delete EXP-00004-Master.xlsx
LukeTaylor1 Jul 30, 2022
3e6dc6a
Delete EXP-00005-Master.xlsx
LukeTaylor1 Jul 30, 2022
6d67075
Delete application.ipynb
LukeTaylor1 Aug 1, 2022
8f697a4
Update README.md
kimimedina Aug 1, 2022
ad4689c
Update README.md
kimimedina Aug 1, 2022
09c670e
Update README.md
kimimedina Aug 1, 2022
491a5fb
Update README.md
kimimedina Aug 1, 2022
d3f6ce6
Update README.md
kimimedina Aug 1, 2022
5aae47e
Update README.md
Rodrigox30 Aug 2, 2022
d2da82a
Read me updated
Rodrigox30 Aug 2, 2022
e76b19b
11/11/22 Model (#15)
resoj Mar 30, 2023
1500787
Update Obsidian Decision Tree Model.md
resoj Mar 30, 2023
73e6c9c
Update README-KNeighborsClassifier.md
LukeTaylor1 Dec 30, 2023
6115380
Update Obsidian Decision Tree Model.md
resoj Jan 31, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file added .DS_Store
Binary file not shown.
778 changes: 778 additions & 0 deletions Chert_Model__Decision_Trees.ipynb

Large diffs are not rendered by default.

1,443 changes: 1,443 additions & 0 deletions Decision_Tree_Random_Forest_FinalProject.ipynb

Large diffs are not rendered by default.

Binary file added DesignDoc.docx
Binary file not shown.
Binary file added EXP-00004-Master.xlsx
Binary file not shown.
Binary file added EXP-00005-Master.xlsx
Binary file not shown.
Binary file added EXP-00006-Master.xlsx
Binary file not shown.
1,167 changes: 1,167 additions & 0 deletions FinalProject_DecisionTree.ipynb

Large diffs are not rendered by default.

166 changes: 166 additions & 0 deletions GradientBoostingClassifier.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,166 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"import tensorflow as tf\n",
"import pickle\n",
"\n",
"exp_1 = pd.read_excel(\"EXP-00001-Master.xlsx\")\n",
"exp_2 = pd.read_excel('EXP-00002-Master.xlsx')\n",
"exp_3 = pd.read_excel('EXP-00003-Master.xlsx')\n",
"exp_4 = pd.read_excel('EXP-00004-Master.xlsx')\n",
"exp_5 = pd.read_excel('EXP-00005-Master.xlsx')\n",
"\n",
"\n",
"exp_1.drop(index=0, inplace=True)\n",
"exp_2.drop(index=0, inplace=True)\n",
"exp_3.drop(index=0, inplace=True)\n",
"exp_4.drop(index = 0, inplace = True)\n",
"exp_5.drop(index = 0, inplace= True)\n",
"\n",
"exp_1.reset_index(drop=True, inplace=True)\n",
"exp_2.reset_index(drop=True, inplace=True)\n",
"exp_3.reset_index(drop=True, inplace=True)\n",
"exp_4.reset_index(drop=True, inplace=True)\n",
"exp_5.reset_index(drop=True, inplace=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Columns excluded from the feature set (ids, filter flags, and hand-labeled attributes)\n",
"not_included = ['Id', 'Filter0','Filter1', 'Filter2','Filter3', 'Filter4', 'Filter5', 'Filter6', 'hash', 'Img Id', 'Curvature', 'Transparency', 'Angularity']\n",
"filtered = [x for x in exp_1.columns if x not in not_included]\n",
"\n",
"# .copy() so the label assignments below write to real frames instead of views\n",
"# (avoids SettingWithCopyWarning / silently lost writes).\n",
"exp_1_filtered = exp_1[filtered].copy()\n",
"exp_2_filtered = exp_2[filtered].copy()\n",
"exp_3_filtered = exp_3[filtered].copy()\n",
"exp_4_filtered = exp_4[filtered].copy()\n",
"exp_5_filtered = exp_5[filtered].copy()\n",
"\n",
"# Production stage label: 0-4, one per experiment file\n",
"exp_1_filtered['Production Stage'] = 0\n",
"exp_2_filtered['Production Stage'] = 1\n",
"exp_3_filtered['Production Stage'] = 2\n",
"exp_4_filtered['Production Stage'] = 3\n",
"exp_5_filtered['Production Stage'] = 4\n",
"\n",
"# Combine all stages into the single dataframe the later cells rely on.\n",
"# Without this, `data` was undefined on a fresh kernel (Restart & Run All failed).\n",
"data = pd.concat([exp_1_filtered, exp_2_filtered, exp_3_filtered, exp_4_filtered, exp_5_filtered], ignore_index=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.model_selection import train_test_split\n",
"\n",
"X_train, X_test, y_train, y_test = train_test_split(\n",
" data[filtered],\n",
" data['Production Stage'],\n",
" test_size=0.1,\n",
" stratify= data['Production Stage'],\n",
" random_state=44)\n",
"\n",
"y_train.groupby(y_train).count()\n",
"y_test.groupby(y_test).count()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Gradient Boosting classifier on the production-stage labels\n",
"from sklearn.ensemble import GradientBoostingClassifier\n",
"from sklearn.metrics import accuracy_score\n",
"from sklearn.metrics import precision_score\n",
"\n",
"# criterion='mse' and loss='deviance' were deprecated and then removed in\n",
"# modern scikit-learn; the valid equivalents are 'friedman_mse' and 'log_loss'.\n",
"# max_features='auto' is likewise deprecated for GradientBoostingClassifier,\n",
"# so we let it default (use all features).\n",
"GBModel = GradientBoostingClassifier(\n",
"    n_estimators=100,\n",
"    learning_rate=0.30,\n",
"    max_depth=3,\n",
"    criterion='friedman_mse',\n",
"    loss='log_loss'\n",
"    )\n",
"\n",
"# Tree ensembles are scale-invariant, so fit on the raw features. This also\n",
"# fixes the original inconsistency of fitting on scale(...) while predicting\n",
"# on unscaled features below (and `scale` was never imported).\n",
"GBModel.fit(data[filtered].values, data[\"Production Stage\"])\n",
"\n",
"# Evaluate on a shuffled 10k sample (seeded for reproducibility).\n",
"shuffled = data.sample(frac=1, random_state=44)\n",
"ten_thousand = shuffled[:10000]\n",
"\n",
"predictionsGB = GBModel.predict(ten_thousand[filtered])\n",
"\n",
"# NOTE(review): this sample overlaps the training data, so the score is\n",
"# optimistic; evaluate on the held-out split for an honest number.\n",
"print('Accuracy score:', accuracy_score(ten_thousand['Production Stage'], predictionsGB))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Hyperparameter search over the gradient boosting model\n",
"from sklearn.model_selection import GridSearchCV  # was used below but never imported\n",
"\n",
"# 'deviance' and 'mse' were removed in modern scikit-learn, and 'exponential'\n",
"# loss only supports binary classification (this is a 5-class problem), so the\n",
"# grid keeps only options that are valid for this task.\n",
"params = [{'max_depth': [1, 2, 3, 4, 5],\n",
"           'learning_rate': [.01, .10, .20, .30, .40, .50],\n",
"           'n_estimators': [50, 150, 250, 350, 450],\n",
"           'loss': ['log_loss'],\n",
"           'criterion': ['friedman_mse', 'squared_error']\n",
"           }]\n",
"\n",
"# A fresh estimator: GridSearchCV clones it per candidate anyway.\n",
"search = GridSearchCV(GradientBoostingClassifier(),\n",
"                      param_grid=params,\n",
"                      scoring='accuracy',\n",
"                      cv=5)\n",
"\n",
"search.fit(X_train, y_train)\n",
"print(search.best_params_)\n",
"\n",
"# NOTE(review): scoring on the training split overstates performance;\n",
"# also report search.score(X_test, y_test).\n",
"print(\"score: {}\".format(search.score(X_train, y_train)))\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3.9.12 ('base')",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.12"
},
"orig_nbformat": 4,
"vscode": {
"interpreter": {
"hash": "57279b1f8ab34c5e05b79187a57f554f106338f30a65896aafc1120729eab81b"
}
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Binary file added Heatmap.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file not shown.
126 changes: 126 additions & 0 deletions KMeansClassifier.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"import tensorflow as tf\n",
"import pickle\n",
"\n",
"exp_1 = pd.read_excel(\"EXP-00001-Master.xlsx\")\n",
"exp_2 = pd.read_excel('EXP-00002-Master.xlsx')\n",
"exp_3 = pd.read_excel('EXP-00003-Master.xlsx')\n",
"exp_4 = pd.read_excel('EXP-00004-Master.xlsx')\n",
"exp_5 = pd.read_excel('EXP-00005-Master.xlsx')\n",
"\n",
"\n",
"exp_1.drop(index=0, inplace=True)\n",
"exp_2.drop(index=0, inplace=True)\n",
"exp_3.drop(index=0, inplace=True)\n",
"exp_4.drop(index = 0, inplace = True)\n",
"exp_5.drop(index = 0, inplace= True)\n",
"\n",
"exp_1.reset_index(drop=True, inplace=True)\n",
"exp_2.reset_index(drop=True, inplace=True)\n",
"exp_3.reset_index(drop=True, inplace=True)\n",
"exp_4.reset_index(drop=True, inplace=True)\n",
"exp_5.reset_index(drop=True, inplace=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Columns excluded from the feature set (ids, filter flags, and hand-labeled attributes)\n",
"not_included = ['Id', 'Filter0','Filter1', 'Filter2','Filter3', 'Filter4', 'Filter5', 'Filter6', 'hash', 'Img Id', 'Curvature', 'Transparency', 'Angularity']\n",
"filtered = [x for x in exp_1.columns if x not in not_included]\n",
"\n",
"# .copy() so the label assignments below write to real frames instead of views\n",
"# (avoids SettingWithCopyWarning / silently lost writes).\n",
"exp_1_filtered = exp_1[filtered].copy()\n",
"exp_2_filtered = exp_2[filtered].copy()\n",
"exp_3_filtered = exp_3[filtered].copy()\n",
"exp_4_filtered = exp_4[filtered].copy()\n",
"exp_5_filtered = exp_5[filtered].copy()\n",
"\n",
"# Production stage label: 0-4, one per experiment file\n",
"exp_1_filtered['Production Stage'] = 0\n",
"exp_2_filtered['Production Stage'] = 1\n",
"exp_3_filtered['Production Stage'] = 2\n",
"exp_4_filtered['Production Stage'] = 3\n",
"exp_5_filtered['Production Stage'] = 4\n",
"\n",
"# Combine all stages into the single dataframe the later cells rely on.\n",
"# Without this, `data` was undefined on a fresh kernel (Restart & Run All failed).\n",
"data = pd.concat([exp_1_filtered, exp_2_filtered, exp_3_filtered, exp_4_filtered, exp_5_filtered], ignore_index=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.model_selection import train_test_split\n",
"\n",
"X_train, X_test, y_train, y_test = train_test_split(\n",
" data[filtered],\n",
" data['Production Stage'],\n",
" test_size=0.1,\n",
" stratify= data['Production Stage'],\n",
" random_state=44)\n",
"\n",
"y_train.groupby(y_train).count()\n",
"y_test.groupby(y_test).count()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.cluster import KMeans\n",
"from sklearn.preprocessing import scale  # was used below but never imported\n",
"\n",
"model = KMeans(n_clusters=5, random_state=0, max_iter=45)\n",
"\n",
"# Standardize once and reuse, so fit and predict see the same representation\n",
"# (the original fit on scaled features but predicted on raw features).\n",
"scaled_features = pd.DataFrame(scale(data[filtered]), index=data.index, columns=filtered)\n",
"model.fit(scaled_features)\n",
"\n",
"cluster = model.predict([scaled_features.iloc[0]])[0]\n",
"\n",
"# `random` was never defined in this notebook on a fresh kernel; shuffle\n",
"# explicitly (seeded) here. The name two_hundred is kept because the next\n",
"# cell references it, though it actually holds 1000 rows.\n",
"random = data.sample(frac=1, random_state=0)\n",
"two_hundred = random[:1000]\n",
"\n",
"# Predict on the same standardized representation the model was fit on,\n",
"# selecting the sampled rows by index from the already-scaled frame.\n",
"predictions = model.predict(scaled_features.loc[two_hundred.index])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.metrics import accuracy_score, precision_score  # not imported anywhere earlier in this notebook\n",
"\n",
"# NOTE(review): KMeans cluster ids are arbitrary and are not aligned with the\n",
"# Production Stage labels, so raw accuracy/precision here is not meaningful\n",
"# without first mapping clusters to labels -- confirm intent.\n",
"print('Accuracy score:', accuracy_score(two_hundred['Production Stage'], predictions))\n",
"print(precision_score(two_hundred['Production Stage'], predictions, average='macro'))\n",
"\n",
"predictions"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3.9.13 64-bit (windows store)",
"language": "python",
"name": "python3"
},
"language_info": {
"name": "python",
"version": "3.9.13"
},
"orig_nbformat": 4,
"vscode": {
"interpreter": {
"hash": "79ff1e5d1c988803ca272e854ac0ee62f88ccf4aa17fd038c7e703dc5466d506"
}
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Loading