diff --git a/Automated Essay Grading/Model/Automated_Essay_Grading (2).ipynb b/Automated Essay Grading/Model/Automated_Essay_Grading (2).ipynb new file mode 100644 index 0000000000..20b01d626f --- /dev/null +++ b/Automated Essay Grading/Model/Automated_Essay_Grading (2).ipynb @@ -0,0 +1,1466 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "name": "Automated_Essay_Grading.ipynb", + "provenance": [], + "collapsed_sections": [] + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.3" + } + }, + "cells": [ + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "pGeJ9Tlf5gRq", + "outputId": "366bd917-cfda-4464-eebb-5407a3eb10ec" + }, + "source": [ + "import numpy as np \n", + "import pandas as pd \n", + "\n", + "#for pre-processing\n", + "import nltk\n", + "import re\n", + "from nltk.corpus import stopwords\n", + "from gensim.models import Word2Vec\n", + "\n", + "nltk.download('stopwords')\n", + "nltk.download('punkt')\n", + "\n", + "#for model training\n", + "\n", + "from keras.layers import Embedding, LSTM, Dense, Dropout, Lambda, Flatten\n", + "from keras.models import Sequential, load_model, model_from_config\n", + "import keras.backend as K\n", + "from sklearn.model_selection import KFold\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.svm import SVR\n", + "from sklearn.linear_model import LogisticRegression\n", + "from sklearn.neighbors import KNeighborsClassifier\n", + "from sklearn.ensemble import RandomForestClassifier\n", + "from sklearn.tree import DecisionTreeClassifier\n", + "from sklearn.metrics import mean_squared_error\n", + "from sklearn.metrics 
import explained_variance_score\n", + "from sklearn import ensemble\n", + "from sklearn.linear_model import LinearRegression\n", + "from sklearn.metrics import cohen_kappa_score" + ], + "execution_count": 1, + "outputs": [ + { + "output_type": "stream", + "text": [ + "[nltk_data] Downloading package stopwords to /root/nltk_data...\n", + "[nltk_data] Unzipping corpora/stopwords.zip.\n", + "[nltk_data] Downloading package punkt to /root/nltk_data...\n", + "[nltk_data] Unzipping tokenizers/punkt.zip.\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "xpYqK4V55taJ" + }, + "source": [ + "test_data = pd.read_csv(\"test_set.tsv\",sep='\\t', encoding='ISO-8859-1')" + ], + "execution_count": 9, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "EkUUzJU1GZgk" + }, + "source": [ + "training_data = pd.read_csv(\"training_set_rel3.tsv\",sep='\\t', encoding='ISO-8859-1',\n", + " usecols = ['essay_id', 'essay_set', 'essay','domain1_score']).dropna(axis=1)" + ], + "execution_count": 16, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "Zuqui4TA-msu" + }, + "source": [ + "scores = training_data['domain1_score']" + ], + "execution_count": 17, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 204 + }, + "id": "UNYVwze36yUS", + "outputId": "cff2ed4f-a510-4159-b200-46af5ce91112" + }, + "source": [ + "test_data.head()" + ], + "execution_count": 18, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
essay_idessay_setessaydomain1_predictioniddomain2_predictionid
023831I believe that computers have a positive effec...2383NaN
123841Dear @CAPS1, I know some problems have came up...2384NaN
223851Dear to whom it @MONTH1 concern, Computers are...2385NaN
323861Dear @CAPS1 @CAPS2, @CAPS3 has come to my atte...2386NaN
423871Dear Local newspaper, I think that people have...2387NaN
\n", + "
" + ], + "text/plain": [ + " essay_id essay_set ... domain1_predictionid domain2_predictionid\n", + "0 2383 1 ... 2383 NaN\n", + "1 2384 1 ... 2384 NaN\n", + "2 2385 1 ... 2385 NaN\n", + "3 2386 1 ... 2386 NaN\n", + "4 2387 1 ... 2387 NaN\n", + "\n", + "[5 rows x 5 columns]" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 18 + } + ] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "k-Y0cEu8650v", + "outputId": "3a208166-8286-4f23-fe61-6840da1b3816" + }, + "source": [ + "test_data.shape" + ], + "execution_count": 19, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "(4254, 5)" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 19 + } + ] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 204 + }, + "id": "UyhgfBgk60Vh", + "outputId": "53805d71-89e5-438d-a55c-f2f2048db8c4" + }, + "source": [ + "training_data.head()" + ], + "execution_count": 20, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
essay_idessay_setessaydomain1_score
011Dear local newspaper, I think effects computer...8
121Dear @CAPS1 @CAPS2, I believe that using compu...9
231Dear, @CAPS1 @CAPS2 @CAPS3 More and more peopl...7
341Dear Local Newspaper, @CAPS1 I have found that...10
451Dear @LOCATION1, I know having computers has a...8
\n", + "
" + ], + "text/plain": [ + " essay_id ... domain1_score\n", + "0 1 ... 8\n", + "1 2 ... 9\n", + "2 3 ... 7\n", + "3 4 ... 10\n", + "4 5 ... 8\n", + "\n", + "[5 rows x 4 columns]" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 20 + } + ] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "lIo08YSV64G_", + "outputId": "70658373-5015-4ea6-cd22-274224309911" + }, + "source": [ + "training_data.shape" + ], + "execution_count": 21, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "(12976, 4)" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 21 + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "fmt-Jt9u6qqY" + }, + "source": [ + "test_data.dropna(axis=1,inplace=True)" + ], + "execution_count": 22, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "v9Cvw9XM6wR0" + }, + "source": [ + "\n", + "y = training_data['domain1_score']\n", + "X = training_data.copy()\n" + ], + "execution_count": 23, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "YLIUc7pe7TnY" + }, + "source": [ + "def essay_to_wordlist(essay_v, remove_stopwords):\n", + " #Remove the tagged labels and word tokenize the sentence.\n", + " essay_v = re.sub(\"[^a-zA-Z]\", \" \", essay_v)\n", + " words = essay_v.lower().split()\n", + " if remove_stopwords:\n", + " stops = set(stopwords.words(\"english\"))\n", + " words = [w for w in words if not w in stops]\n", + " return (words)\n", + "\n", + "def essay_to_sentences(essay_v, remove_stopwords):\n", + " \"\"\"Sentence tokenize the essay and call essay_to_wordlist() for word tokenization.\"\"\"\n", + " tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')\n", + " raw_sentences = tokenizer.tokenize(essay_v.strip())\n", + " sentences = []\n", + " for raw_sentence in raw_sentences:\n", + " if len(raw_sentence) > 0:\n", + " sentences.append(essay_to_wordlist(raw_sentence, 
remove_stopwords))\n", + " return sentences\n", + "\n", + "def makeFeatureVec(words, model, num_features):\n", + " \"\"\"Average the word2vec vectors of an essay's in-vocabulary words into one feature vector.\"\"\"\n", + " featureVec = np.zeros((num_features,),dtype=\"float32\")\n", + " num_words = 0.\n", + " index2word_set = set(model.wv.index2word)\n", + " for word in words:\n", + " if word in index2word_set:\n", + " num_words += 1\n", + " featureVec = np.add(featureVec,model[word]) \n", + " featureVec = np.divide(featureVec,num_words)\n", + " return featureVec\n", + "\n", + "def getAvgFeatureVecs(essays, model, num_features):\n", + " \"\"\"Main function to generate the word vectors for word2vec model.\"\"\"\n", + " counter = 0\n", + " essayFeatureVecs = np.zeros((len(essays),num_features),dtype=\"float32\")\n", + " for essay in essays:\n", + " essayFeatureVecs[counter] = makeFeatureVec(essay, model, num_features)\n", + " counter = counter + 1\n", + " return essayFeatureVecs" + ], + "execution_count": 24, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "poBfr2pm7b1c" + }, + "source": [ + "from keras.layers import Embedding, LSTM, Dense, Dropout, Lambda, Flatten\n", + "from keras.models import Sequential, load_model, model_from_config\n", + "import keras.backend as K\n", + "\n", + "def get_model():\n", + " \"\"\"Define the model.\"\"\"\n", + " model = Sequential()\n", + " model.add(LSTM(300, dropout=0.4, recurrent_dropout=0.4, input_shape=[1, 300], return_sequences=True))\n", + " model.add(LSTM(64, recurrent_dropout=0.4))\n", + " model.add(Dropout(0.5))\n", + " model.add(Dense(1, activation='relu'))\n", + "\n", + " model.compile(loss='mean_squared_error', optimizer='rmsprop', metrics=['mae'])\n", + " model.summary()\n", + "\n", + " return model" + ], + "execution_count": 25, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "j3T0MKXk7gti", + "outputId": "c6022e85-0b13-4c42-f133-3449ee989b5c" + }, + "source": 
[ + "cv = KFold(n_splits = 8, shuffle = True)\n", + "results = []\n", + "y_pred_list = []\n", + "\n", + "count = 1\n", + "for traincv, testcv in cv.split(X):\n", + " print(\"\\n--------Fold {}--------\\n\".format(count))\n", + " X_test, X_train, y_test, y_train = X.iloc[testcv], X.iloc[traincv], y.iloc[testcv], y.iloc[traincv]\n", + " \n", + " train_essays = X_train['essay']\n", + " test_essays = X_test['essay']\n", + " \n", + " sentences = []\n", + " \n", + " for essay in train_essays:\n", + " # Obtaining all sentences from the training essays.\n", + " sentences += essay_to_sentences(essay, remove_stopwords = True)\n", + " \n", + " # Initializing variables for word2vec model.\n", + " num_features = 300\n", + " min_word_count = 40\n", + " num_workers = 8\n", + " context = 10\n", + " downsampling = 1e-3\n", + "\n", + " \n", + " print(\"Training Word2Vec Model...\")\n", + " model = Word2Vec(sentences, workers=num_workers, size=num_features, min_count = min_word_count, window = context, sample = downsampling)\n", + "\n", + " model.init_sims(replace=True)\n", + " model.wv.save_word2vec_format('word2vecmodel.bin', binary=True)\n", + "\n", + " clean_train_essays = []\n", + " \n", + " # Generate training and testing data word vectors.\n", + " for essay_v in train_essays:\n", + " clean_train_essays.append(essay_to_wordlist(essay_v, remove_stopwords=True))\n", + " trainDataVecs = getAvgFeatureVecs(clean_train_essays, model, num_features)\n", + " \n", + " clean_test_essays = []\n", + " for essay_v in test_essays:\n", + " clean_test_essays.append(essay_to_wordlist( essay_v, remove_stopwords=True ))\n", + " testDataVecs = getAvgFeatureVecs( clean_test_essays, model, num_features )\n", + " \n", + " trainDataVecs = np.array(trainDataVecs)\n", + " testDataVecs = np.array(testDataVecs)\n", + " # Reshaping train and test vectors to 3 dimensions. 
(1 represents one timestep)\n", + " trainDataVecs = np.reshape(trainDataVecs, (trainDataVecs.shape[0], 1, trainDataVecs.shape[1]))\n", + " testDataVecs = np.reshape(testDataVecs, (testDataVecs.shape[0], 1, testDataVecs.shape[1]))\n", + " \n", + " lstm_model = get_model()\n", + " lstm_model.fit(trainDataVecs, y_train, batch_size=64, epochs=2)\n", + " #lstm_model.load_weights('./model_weights/final_lstm.h5')\n", + " y_pred = lstm_model.predict(testDataVecs)\n", + " \n", + " # Save the weights of one fold's model (the fifth fold).\n", + " if count == 5:\n", + " lstm_model.save_weights('final_lstm.h5')\n", + " \n", + " # Round y_pred to the nearest integer.\n", + " y_pred = np.around(y_pred)\n", + " \n", + " # Evaluate the model on the evaluation metric. \"Quadratic mean averaged Kappa\"\n", + " result = cohen_kappa_score(y_test.values,y_pred,weights='quadratic')\n", + " print(\"Kappa Score: {}\".format(result))\n", + " results.append(result)\n", + "\n", + " count += 1 " + ], + "execution_count": 26, + "outputs": [ + { + "output_type": "stream", + "text": [ + "\n", + "--------Fold 1--------\n", + "\n", + "Training Word2Vec Model...\n" + ], + "name": "stdout" + }, + { + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.7/dist-packages/ipykernel_launcher.py:28: DeprecationWarning: Call to deprecated `__getitem__` (Method will be removed in 4.0.0, use self.wv.__getitem__() instead).\n" + ], + "name": "stderr" + }, + { + "output_type": "stream", + "text": [ + "Model: \"sequential\"\n", + "_________________________________________________________________\n", + "Layer (type) Output Shape Param # \n", + "=================================================================\n", + "lstm (LSTM) (None, 1, 300) 721200 \n", + "_________________________________________________________________\n", + "lstm_1 (LSTM) (None, 64) 93440 \n", + "_________________________________________________________________\n", + "dropout (Dropout) (None, 64) 0 \n", + 
"_________________________________________________________________\n", + "dense (Dense) (None, 1) 65 \n", + "=================================================================\n", + "Total params: 814,705\n", + "Trainable params: 814,705\n", + "Non-trainable params: 0\n", + "_________________________________________________________________\n", + "Epoch 1/2\n", + "178/178 [==============================] - 34s 26ms/step - loss: 80.0289 - mae: 5.0009\n", + "Epoch 2/2\n", + "178/178 [==============================] - 5s 26ms/step - loss: 42.6830 - mae: 3.6584\n", + "Kappa Score: 0.7903575237442636\n", + "\n", + "--------Fold 2--------\n", + "\n", + "Training Word2Vec Model...\n", + "Model: \"sequential_1\"\n", + "_________________________________________________________________\n", + "Layer (type) Output Shape Param # \n", + "=================================================================\n", + "lstm_2 (LSTM) (None, 1, 300) 721200 \n", + "_________________________________________________________________\n", + "lstm_3 (LSTM) (None, 64) 93440 \n", + "_________________________________________________________________\n", + "dropout_1 (Dropout) (None, 64) 0 \n", + "_________________________________________________________________\n", + "dense_1 (Dense) (None, 1) 65 \n", + "=================================================================\n", + "Total params: 814,705\n", + "Trainable params: 814,705\n", + "Non-trainable params: 0\n", + "_________________________________________________________________\n", + "Epoch 1/2\n", + "178/178 [==============================] - 13s 26ms/step - loss: 80.4811 - mae: 4.9985\n", + "Epoch 2/2\n", + "178/178 [==============================] - 5s 26ms/step - loss: 41.0250 - mae: 3.5642\n", + "Kappa Score: 0.7576200645615898\n", + "\n", + "--------Fold 3--------\n", + "\n", + "Training Word2Vec Model...\n", + "Model: \"sequential_2\"\n", + "_________________________________________________________________\n", + "Layer (type) Output Shape 
Param # \n", + "=================================================================\n", + "lstm_4 (LSTM) (None, 1, 300) 721200 \n", + "_________________________________________________________________\n", + "lstm_5 (LSTM) (None, 64) 93440 \n", + "_________________________________________________________________\n", + "dropout_2 (Dropout) (None, 64) 0 \n", + "_________________________________________________________________\n", + "dense_2 (Dense) (None, 1) 65 \n", + "=================================================================\n", + "Total params: 814,705\n", + "Trainable params: 814,705\n", + "Non-trainable params: 0\n", + "_________________________________________________________________\n", + "Epoch 1/2\n", + "178/178 [==============================] - 12s 26ms/step - loss: 86.8164 - mae: 5.1346\n", + "Epoch 2/2\n", + "178/178 [==============================] - 5s 25ms/step - loss: 42.9669 - mae: 3.6482\n", + "Kappa Score: 0.7750784192913241\n", + "\n", + "--------Fold 4--------\n", + "\n", + "Training Word2Vec Model...\n", + "Model: \"sequential_3\"\n", + "_________________________________________________________________\n", + "Layer (type) Output Shape Param # \n", + "=================================================================\n", + "lstm_6 (LSTM) (None, 1, 300) 721200 \n", + "_________________________________________________________________\n", + "lstm_7 (LSTM) (None, 64) 93440 \n", + "_________________________________________________________________\n", + "dropout_3 (Dropout) (None, 64) 0 \n", + "_________________________________________________________________\n", + "dense_3 (Dense) (None, 1) 65 \n", + "=================================================================\n", + "Total params: 814,705\n", + "Trainable params: 814,705\n", + "Non-trainable params: 0\n", + "_________________________________________________________________\n", + "Epoch 1/2\n", + "178/178 [==============================] - 12s 26ms/step - loss: 84.9276 - mae: 5.1270\n", + 
"Epoch 2/2\n", + "178/178 [==============================] - 5s 26ms/step - loss: 41.1146 - mae: 3.5898\n", + "Kappa Score: 0.7666838457375764\n", + "\n", + "--------Fold 5--------\n", + "\n", + "Training Word2Vec Model...\n", + "Model: \"sequential_4\"\n", + "_________________________________________________________________\n", + "Layer (type) Output Shape Param # \n", + "=================================================================\n", + "lstm_8 (LSTM) (None, 1, 300) 721200 \n", + "_________________________________________________________________\n", + "lstm_9 (LSTM) (None, 64) 93440 \n", + "_________________________________________________________________\n", + "dropout_4 (Dropout) (None, 64) 0 \n", + "_________________________________________________________________\n", + "dense_4 (Dense) (None, 1) 65 \n", + "=================================================================\n", + "Total params: 814,705\n", + "Trainable params: 814,705\n", + "Non-trainable params: 0\n", + "_________________________________________________________________\n", + "Epoch 1/2\n", + "178/178 [==============================] - 13s 27ms/step - loss: 82.9046 - mae: 5.0942\n", + "Epoch 2/2\n", + "178/178 [==============================] - 5s 27ms/step - loss: 44.5824 - mae: 3.7365\n", + "Kappa Score: 0.7679175647531018\n", + "\n", + "--------Fold 6--------\n", + "\n", + "Training Word2Vec Model...\n", + "Model: \"sequential_5\"\n", + "_________________________________________________________________\n", + "Layer (type) Output Shape Param # \n", + "=================================================================\n", + "lstm_10 (LSTM) (None, 1, 300) 721200 \n", + "_________________________________________________________________\n", + "lstm_11 (LSTM) (None, 64) 93440 \n", + "_________________________________________________________________\n", + "dropout_5 (Dropout) (None, 64) 0 \n", + "_________________________________________________________________\n", + "dense_5 (Dense) (None, 1) 
65 \n", + "=================================================================\n", + "Total params: 814,705\n", + "Trainable params: 814,705\n", + "Non-trainable params: 0\n", + "_________________________________________________________________\n", + "Epoch 1/2\n", + "178/178 [==============================] - 12s 26ms/step - loss: 82.4743 - mae: 5.0568\n", + "Epoch 2/2\n", + "178/178 [==============================] - 5s 26ms/step - loss: 41.3642 - mae: 3.6533\n", + "Kappa Score: 0.7668898538505264\n", + "\n", + "--------Fold 7--------\n", + "\n", + "Training Word2Vec Model...\n", + "Model: \"sequential_6\"\n", + "_________________________________________________________________\n", + "Layer (type) Output Shape Param # \n", + "=================================================================\n", + "lstm_12 (LSTM) (None, 1, 300) 721200 \n", + "_________________________________________________________________\n", + "lstm_13 (LSTM) (None, 64) 93440 \n", + "_________________________________________________________________\n", + "dropout_6 (Dropout) (None, 64) 0 \n", + "_________________________________________________________________\n", + "dense_6 (Dense) (None, 1) 65 \n", + "=================================================================\n", + "Total params: 814,705\n", + "Trainable params: 814,705\n", + "Non-trainable params: 0\n", + "_________________________________________________________________\n", + "Epoch 1/2\n", + "178/178 [==============================] - 12s 26ms/step - loss: 77.7445 - mae: 4.8427\n", + "Epoch 2/2\n", + "178/178 [==============================] - 5s 26ms/step - loss: 41.1949 - mae: 3.6313\n", + "Kappa Score: 0.7526492043229287\n", + "\n", + "--------Fold 8--------\n", + "\n", + "Training Word2Vec Model...\n", + "Model: \"sequential_7\"\n", + "_________________________________________________________________\n", + "Layer (type) Output Shape Param # \n", + "=================================================================\n", + "lstm_14 
(LSTM) (None, 1, 300) 721200 \n", + "_________________________________________________________________\n", + "lstm_15 (LSTM) (None, 64) 93440 \n", + "_________________________________________________________________\n", + "dropout_7 (Dropout) (None, 64) 0 \n", + "_________________________________________________________________\n", + "dense_7 (Dense) (None, 1) 65 \n", + "=================================================================\n", + "Total params: 814,705\n", + "Trainable params: 814,705\n", + "Non-trainable params: 0\n", + "_________________________________________________________________\n", + "Epoch 1/2\n", + "178/178 [==============================] - 13s 27ms/step - loss: 85.8951 - mae: 5.1660\n", + "Epoch 2/2\n", + "178/178 [==============================] - 5s 26ms/step - loss: 41.5336 - mae: 3.6033\n", + "Kappa Score: 0.789614734201678\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "1uVCRjAG-CMF", + "outputId": "f3edc244-db77-4e9d-c2cc-f61d0db6e099" + }, + "source": [ + "print(\"Average Kappa score after a 5-fold cross validation: \",np.around(np.array(results).mean(),decimals=4))" + ], + "execution_count": 27, + "outputs": [ + { + "output_type": "stream", + "text": [ + "Average Kappa score after a 5-fold cross validation: 0.7709\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "5IVKdy1d7j2q", + "outputId": "ef6d01d4-64f6-4bcd-8cd6-ee4803fb1ac8" + }, + "source": [ + "# Splitting dataset into training and test set and generating word embeddings for other models other than\n", + "# neural networks\n", + "\n", + "indep_train, indep_test, dep_train, dep_test = train_test_split(X, scores, test_size = 0.25)\n", + "\n", + "train_essays2 = indep_train['essay']\n", + "test_essays2 = indep_test['essay']\n", + " \n", + "sentences2 = []\n", + "\n", + "\n", + 
"for essay2 in train_essays2:\n", + " # Obtaining all sentences from the training set of essays.\n", + " sentences2 += essay_to_sentences(essay2,remove_stopwords = True)\n", + " \n", + "# Initializing variables for word2vec model.\n", + "num_features = 300 \n", + "min_word_count = 40\n", + "num_workers = 4\n", + "context = 10\n", + "downsampling = 1e-3\n", + "\n", + "print(\"Training Word2Vec Model...\")\n", + "model = Word2Vec(sentences, workers=num_workers, size=num_features, min_count = min_word_count, window = context, sample = downsampling)\n", + "\n", + "model.init_sims(replace=True)\n", + "model.wv.save_word2vec_format('word2vecmodel.bin', binary=True)\n", + "\n", + "clean_train_essays2 = []\n", + " \n", + "# Generate training and testing data word vectors.\n", + "for essay_text2 in train_essays2:\n", + " clean_train_essays2.append(essay_to_wordlist(essay_text2,remove_stopwords = True))\n", + "trainDataVecs2 = getAvgFeatureVecs(clean_train_essays2, model, num_features)\n", + " \n", + "clean_test_essays2 = []\n", + "for essay_text2 in test_essays2:\n", + " clean_test_essays2.append(essay_to_wordlist(essay_text2,remove_stopwords = True))\n", + "testDataVecs2 = getAvgFeatureVecs(clean_test_essays2, model, num_features)\n", + " \n", + "trainDataVecs2 = np.array(trainDataVecs2)\n", + "testDataVecs2 = np.array(testDataVecs2)" + ], + "execution_count": 28, + "outputs": [ + { + "output_type": "stream", + "text": [ + "Training Word2Vec Model...\n" + ], + "name": "stdout" + }, + { + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.7/dist-packages/ipykernel_launcher.py:28: DeprecationWarning: Call to deprecated `__getitem__` (Method will be removed in 4.0.0, use self.wv.__getitem__() instead).\n" + ], + "name": "stderr" + } + ] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "cCDipn8i9zee", + "outputId": "7a19bca9-8523-4f51-8a9c-4edf9838e2a1" + }, + "source": [ + "# Generating scores using 
Linear Regression Model\n", + "\n", + "linear_regressor = LinearRegression()\n", + "\n", + "linear_regressor.fit(trainDataVecs2, dep_train)\n", + "\n", + "dep_pred = linear_regressor.predict(testDataVecs2)\n", + "\n", + "# The mean squared error\n", + "print(\"Mean squared error: %.2f\" % mean_squared_error(dep_test, dep_pred))\n", + "\n", + "# Explained variance score: 1 is perfect prediction\n", + "print('Variance score: %.2f' % explained_variance_score(dep_test,dep_pred))\n", + "\n", + "#print('Cohen\\'s kappa score: %.2f' % cohen_kappa_score(dep_pred, dep_test))\n", + "print(\"Kappa Score: {0:.2f}\".format(cohen_kappa_score(dep_test.values,np.around(dep_pred),weights='quadratic')))" + ], + "execution_count": 29, + "outputs": [ + { + "output_type": "stream", + "text": [ + "Mean squared error: 20.85\n", + "Variance score: 0.72\n", + "Kappa Score: 0.85\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "VSsXfIkpUmTe" + }, + "source": [ + "ksr = (\"Kappa Score: {0:.2f}\".format(cohen_kappa_score(dep_test.values,np.around(dep_pred),weights='quadratic')))" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "RD9X4cplAJm3", + "outputId": "6ece1d9b-d7f2-45c1-a465-380e410c2b6f" + }, + "source": [ + "#Generating scores using Gradient Boosting regressor\n", + "\n", + "'''from sklearn.model_selection import GridSearchCV\n", + "params = {'n_estimators':[100, 1000], 'max_depth':[2], 'min_samples_split': [2],\n", + " 'learning_rate':[3, 1, 0.1, 0.3], 'loss': ['ls']}\n", + "\n", + "gbr = ensemble.GradientBoostingRegressor()\n", + "\n", + "grid = GridSearchCV(gbr, params)\n", + "grid.fit(trainDataVecs2, dep_train)\n", + "\n", + "y_pred = grid.predict(testDataVecs2)\n", + "\n", + "# summarize the results of the grid search\n", + "print(grid.best_score_)\n", + "print(grid.best_estimator_)'''\n", + "\n", + "#USING THE PARAMS FOUND 
OUT USING GRID SEARCH CV\n", + "gbr = ensemble.GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,\n", + " learning_rate=0.1, loss='ls', max_depth=2, max_features=None,\n", + " max_leaf_nodes=None, min_impurity_decrease=0.0,\n", + " min_impurity_split=None, min_samples_leaf=1,\n", + " min_samples_split=2, min_weight_fraction_leaf=0.0,\n", + " n_estimators=1000, presort='auto', random_state=None,\n", + " subsample=1.0, verbose=0, warm_start=False)\n", + "gbr.fit(trainDataVecs2, dep_train)\n", + "dep_pred = gbr.predict(testDataVecs2)\n", + "\n", + "# The mean squared error\n", + "print(\"Mean squared error: %.2f\" % mean_squared_error(dep_test, dep_pred))\n", + "\n", + "# Explained variance score: 1 is perfect prediction\n", + "print('Variance score: %.2f' % explained_variance_score(dep_test,dep_pred))\n", + "\n", + "print(\"Kappa Score: {0:.2f}\".format(cohen_kappa_score(dep_test.values,np.around(dep_pred),weights='quadratic')))" + ], + "execution_count": 30, + "outputs": [ + { + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.7/dist-packages/sklearn/ensemble/_gb.py:1342: FutureWarning: The parameter 'presort' is deprecated and has no effect. It will be removed in v0.24. You can suppress this warning by not passing any value to the 'presort' parameter. 
We also recommend using HistGradientBoosting models instead.\n", + " FutureWarning)\n" + ], + "name": "stderr" + }, + { + "output_type": "stream", + "text": [ + "Mean squared error: 9.14\n", + "Variance score: 0.88\n", + "Kappa Score: 0.94\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "_K6BGP2OAraH", + "outputId": "5585d00c-8c5c-4ead-97c2-9468ab167714" + }, + "source": [ + "# Generating scores using Support Vector Regression\n", + "svr = SVR()\n", + "\n", + "'''parameters = {'kernel':['linear', 'rbf'], 'C':[1, 100], 'gamma':[0.1, 0.001]}\n", + "\n", + "grid = GridSearchCV(svr, parameters)\n", + "grid.fit(trainDataVecs2, dep_train)\n", + "\n", + "y_pred = grid.predict(testDataVecs2)\n", + "\n", + "# summarize the results of the grid search\n", + "print(grid.best_score_)\n", + "print(grid.best_estimator_)'''\n", + "\n", + "#USING THE PARAMS FOUND OUT USING GRID SEARCH CV\n", + "svr = SVR(C=100, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma=0.1,kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)\n", + "svr.fit(trainDataVecs2, dep_train)\n", + "dep_pred = svr.predict(testDataVecs2)\n", + "\n", + "\n", + "# The mean squared error\n", + "print(\"Mean squared error: %.2f\" % mean_squared_error(dep_test, dep_pred))\n", + "\n", + "# Explained variance score: 1 is perfect prediction\n", + "print('Variance score: %.2f' % explained_variance_score(dep_test,dep_pred))\n", + "\n", + "#Cohen's Kappa score\n", + "print(\"Kappa Score: {0:.2f}\".format(cohen_kappa_score(dep_test.values,np.around(dep_pred),weights='quadratic')))" + ], + "execution_count": 31, + "outputs": [ + { + "output_type": "stream", + "text": [ + "Mean squared error: 34.61\n", + "Variance score: 0.55\n", + "Kappa Score: 0.65\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": 
"pzwT3hDvBIxU", + "outputId": "f031d8ff-8a08-4d29-a62c-0cdc69a59352" + }, + "source": [ + "# Generating scores using XGBClassifier\n", + "from xgboost import XGBClassifier\n", + "xgb=XGBClassifier(colsample_bytree=0.4,gamma=0,learning_rate=0.01,max_depth=4,min_child_weight=0.5,n_estimators=100, reg_alpha=0.75,reg_lambda=0.45,\n", + " subsample=0.6,seed=42)\n", + "xgb.fit(trainDataVecs2, dep_train)\n", + "dep_pred = xgb.predict(testDataVecs2)\n", + "\n", + "# The mean squared error\n", + "print(\"Mean squared error: %.2f\" % mean_squared_error(dep_test, dep_pred))\n", + "\n", + "# Explained variance score: 1 is perfect prediction\n", + "print('Variance score: %.2f' % explained_variance_score(dep_test,dep_pred))\n", + "\n", + "#print('Cohen\\'s kappa score: %.2f' % cohen_kappa_score(dep_pred, dep_test))\n", + "print(\"Kappa Score: {0:.2f}\".format(cohen_kappa_score(dep_test.values,np.around(dep_pred),weights='quadratic')))" + ], + "execution_count": 32, + "outputs": [ + { + "output_type": "stream", + "text": [ + "Mean squared error: 17.12\n", + "Variance score: 0.77\n", + "Kappa Score: 0.88\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "auYVWNYdDkTS", + "outputId": "2bc380e5-3760-4abd-cde5-72a21e9811ff" + }, + "source": [ + "# Generating scores using Logistic Regression\n", + "lr = LogisticRegression(random_state=1, max_iter=1000)\n", + "lr.fit(trainDataVecs2, dep_train)\n", + "dep_pred = lr.predict(testDataVecs2)\n", + "\n", + "# The mean squared error\n", + "print(\"Mean squared error: %.2f\" % mean_squared_error(dep_test, dep_pred))\n", + "\n", + "# Explained variance score: 1 is perfect prediction\n", + "print('Variance score: %.2f' % explained_variance_score(dep_test,dep_pred))\n", + "\n", + "#print('Cohen\\'s kappa score: %.2f' % cohen_kappa_score(dep_pred, dep_test))\n", + "print(\"Kappa Score: 
{0:.2f}\".format(cohen_kappa_score(dep_test.values,np.around(dep_pred),weights='quadratic')))" + ], + "execution_count": 33, + "outputs": [ + { + "output_type": "stream", + "text": [ + "Mean squared error: 40.63\n", + "Variance score: 0.48\n", + "Kappa Score: 0.66\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "WcV-obvrH8ES", + "outputId": "974ce7e2-4b24-41bb-fe1d-3fac293548e0" + }, + "source": [ + "# Generating scores using K-Nearest Neighbor\n", + "knn = KNeighborsClassifier(n_neighbors=1)\n", + "knn.fit(trainDataVecs2, dep_train)\n", + "dep_pred = knn.predict(testDataVecs2)\n", + "\n", + "# The mean squared error\n", + "print(\"Mean squared error: %.2f\" % mean_squared_error(dep_test, dep_pred))\n", + "\n", + "# Explained variance score: 1 is perfect prediction\n", + "print('Variance score: %.2f' % explained_variance_score(dep_test,dep_pred))\n", + "\n", + "#print('Cohen\\'s kappa score: %.2f' % cohen_kappa_score(dep_pred, dep_test))\n", + "print(\"Kappa Score: {0:.2f}\".format(cohen_kappa_score(dep_test.values,np.around(dep_pred),weights='quadratic')))\n" + ], + "execution_count": 34, + "outputs": [ + { + "output_type": "stream", + "text": [ + "Mean squared error: 15.10\n", + "Variance score: 0.81\n", + "Kappa Score: 0.91\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "R1w0mbbAIOEw", + "outputId": "5ab17902-79b9-4597-f3f5-59bd54d8ec30" + }, + "source": [ + "# Generating scores using Decision Tree Classifier\n", + "dt = DecisionTreeClassifier(criterion = 'entropy',random_state=0,max_depth = 30)\n", + "dt.fit(trainDataVecs2, dep_train)\n", + "dep_pred = dt.predict(testDataVecs2)\n", + "\n", + "# The mean squared error\n", + "print(\"Mean squared error: %.2f\" % mean_squared_error(dep_test, dep_pred))\n", + "\n", + "# Explained variance score: 1 is perfect 
prediction\n", + "print('Variance score: %.2f' % explained_variance_score(dep_test,dep_pred))\n", + "\n", + "#print('Cohen\\'s kappa score: %.2f' % cohen_kappa_score(dep_pred, dep_test))\n", + "print(\"Kappa Score: {0:.2f}\".format(cohen_kappa_score(dep_test.values,np.around(dep_pred),weights='quadratic')))" + ], + "execution_count": 35, + "outputs": [ + { + "output_type": "stream", + "text": [ + "Mean squared error: 21.12\n", + "Variance score: 0.72\n", + "Kappa Score: 0.86\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "8MW8TGHBIbbx", + "outputId": "0b33a055-d8e1-4e9e-aea9-f2aa6b178b0f" + }, + "source": [ + "# Generating scores using Random Forest Classifier\n", + "rf = RandomForestClassifier(n_estimators=200, random_state=0,max_depth=12)\n", + "rf.fit(trainDataVecs2, dep_train)\n", + "dep_pred = rf.predict(testDataVecs2)\n", + "\n", + "# The mean squared error\n", + "print(\"Mean squared error: %.2f\" % mean_squared_error(dep_test, dep_pred))\n", + "\n", + "# Explained variance score: 1 is perfect prediction\n", + "print('Variance score: %.2f' % explained_variance_score(dep_test,dep_pred))\n", + "\n", + "#print('Cohen\\'s kappa score: %.2f' % cohen_kappa_score(dep_pred, dep_test))\n", + "print(\"Kappa Score: {0:.2f}\".format(cohen_kappa_score(dep_test.values,np.around(dep_pred),weights='quadratic')))" + ], + "execution_count": 36, + "outputs": [ + { + "output_type": "stream", + "text": [ + "Mean squared error: 10.44\n", + "Variance score: 0.86\n", + "Kappa Score: 0.93\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "cxJwnPuvterS" + }, + "source": [ + "**Comparison of Kappa Score of different algorithms**" + ] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 350 + }, + "id": "SbIeCEVHYkg8", + "outputId": "afafb246-f0c2-4540-bc52-3bf7139918ef" + }, + 
"source": [ + "\n", + "\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "\n", + "data = {'LR': 85, 'GBR': 94, 'SVR': 65, 'XGB':88, 'LR':66, 'KNN':91,'Decision Tree':86, 'RF':93 }\n", + "courses = list(data.keys())\n", + "values = list(data.values())\n", + " \n", + "ks =[ks_lr, ks_gb]\n", + " \n", + "fig = plt.figure(figsize = (10, 5))\n", + " \n", + "# creating the bar plot\n", + "plt.bar(courses, values, color ='teal',width = 0.4)\n", + " \n", + "plt.xlabel(\"Algorithm\")\n", + "plt.ylabel(\"Kappa Score\")\n", + "plt.title(\"Coparison of Kappa Score for different algoriths\")\n", + "plt.show()" + ], + "execution_count": 104, + "outputs": [ + { + "output_type": "display_data", + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "tags": [], + "needs_background": "light" + } + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "LfIe5758PeG_" + }, + "source": [ + "* Gradient Boosting Regressor gives the highest Cohen's kappa score (0.94) \n", + "* We made our final prediction with Gradient Boosting Regressor Model" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "TkAilMekTwOi" + }, + "source": [ + "## Prediction by best Model" + ] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "b0b7vb8FTqD4", + "outputId": "c083c493-c6ad-4966-c88c-b9bb8f2342e7" + }, + "source": [ + "gbr = ensemble.GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,\n", + " learning_rate=0.1, loss='ls', max_depth=2, max_features=None,\n", + " max_leaf_nodes=None, min_impurity_decrease=0.0,\n", + " min_impurity_split=None, min_samples_leaf=1,\n", + " min_samples_split=2, min_weight_fraction_leaf=0.0,\n", + " n_estimators=1000, presort='auto', random_state=None,\n", + " subsample=1.0, verbose=0, warm_start=False)\n", + "gbr.fit(trainDataVecs2, dep_train)\n", + "predicted_scores = gbr.predict(testDataVecs2)" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.7/dist-packages/sklearn/ensemble/_gb.py:1342: FutureWarning: The parameter 'presort' is deprecated and has no effect. It will be removed in v0.24. You can suppress this warning by not passing any value to the 'presort' parameter. 
We also recommend using HistGradientBoosting models instead.\n", + " FutureWarning)\n" + ], + "name": "stderr" + } + ] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 202 + }, + "id": "S6_0WOLVMLwL", + "outputId": "a172a169-14b6-4c32-b703-7b64a28be425" + }, + "source": [ + "predicted_score = predicted_scores\n", + "# predicted_score = pd.Series([score for sublist in predicted_scores for score in sublist])\n", + "predicted_score=pd.DataFrame(predicted_score,columns=['Predicted_Score']).round()\n", + "predicted_score.head()" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Predicted_Score
03.0
12.0
23.0
32.0
416.0
\n", + "
" + ], + "text/plain": [ + " Predicted_Score\n", + "0 3.0\n", + "1 2.0\n", + "2 3.0\n", + "3 2.0\n", + "4 16.0" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 26 + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "bZgNaf5PMOEu", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 164 + }, + "outputId": "9ca8f68c-fe9a-430c-fe1d-58cd8d7d3bef" + }, + "source": [ + "final_scores = pd.concat([test_data, predicted_score], axis = 1).rename(columns = {0:\"predicted_score\"})\n" + ], + "execution_count": 37, + "outputs": [ + { + "output_type": "error", + "ename": "NameError", + "evalue": "ignored", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mfinal_scores\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mconcat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mtest_data\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mpredicted_score\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maxis\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrename\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcolumns\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m{\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\"predicted_score\"\u001b[0m\u001b[0;34m}\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[0;31mNameError\u001b[0m: name 'predicted_score' is not defined" + ] + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "WnaKTQw2MOGW" + }, + "source": [ + "final_scores.head()" + ], + "execution_count": null, + "outputs": [] + } + ] +} \ No newline at end of file