diff --git a/09_SkimLit_nlp_milestone_project_2.ipynb b/09_SkimLit_nlp_milestone_project_2.ipynb
index c6c45962..a29be75d 100644
--- a/09_SkimLit_nlp_milestone_project_2.ipynb
+++ b/09_SkimLit_nlp_milestone_project_2.ipynb
@@ -1,28 +1,10 @@
{
- "nbformat": 4,
- "nbformat_minor": 0,
- "metadata": {
- "accelerator": "GPU",
- "colab": {
- "name": "09_SkimLit_nlp_milestone_project_2.ipynb",
- "provenance": [],
- "collapsed_sections": [],
- "toc_visible": true,
- "mount_file_id": "1_yq3R-ThKP78_byQV9OovntEcXpQcBK2",
- "authorship_tag": "ABX9TyM5wpCFGOHhp2o71grvS7VP",
- "include_colab_link": true
- },
- "kernelspec": {
- "display_name": "Python 3",
- "name": "python3"
- }
- },
"cells": [
{
"cell_type": "markdown",
"metadata": {
- "id": "view-in-github",
- "colab_type": "text"
+ "colab_type": "text",
+ "id": "view-in-github"
},
"source": [
""
@@ -141,26 +123,26 @@
},
{
"cell_type": "code",
+ "execution_count": null,
"metadata": {
- "id": "dsuQCg5Uaw1w",
"colab": {
"base_uri": "https://localhost:8080/"
},
+ "id": "dsuQCg5Uaw1w",
"outputId": "f1681b9e-a9e6-4049-9591-7300b376cf46"
},
- "source": [
- "# Check for GPU\n",
- "!nvidia-smi -L"
- ],
- "execution_count": null,
"outputs": [
{
+ "name": "stdout",
"output_type": "stream",
"text": [
"GPU 0: Tesla T4 (UUID: GPU-90b6bfd2-2dbc-6214-b3b0-835ecd7fd102)\n"
- ],
- "name": "stdout"
+ ]
}
+ ],
+ "source": [
+ "# Check for GPU\n",
+ "!nvidia-smi -L"
]
},
{
@@ -180,20 +162,17 @@
},
{
"cell_type": "code",
+ "execution_count": null,
"metadata": {
- "id": "c0qt0M55a98x",
"colab": {
"base_uri": "https://localhost:8080/"
},
+ "id": "c0qt0M55a98x",
"outputId": "4f4db3ae-e84b-432d-a807-7900ef3e1461"
},
- "source": [
- "!git clone https://github.com/Franck-Dernoncourt/pubmed-rct.git\n",
- "!ls pubmed-rct"
- ],
- "execution_count": null,
"outputs": [
{
+ "name": "stdout",
"output_type": "stream",
"text": [
"Cloning into 'pubmed-rct'...\n",
@@ -207,9 +186,12 @@
"PubMed_20k_RCT\n",
"PubMed_20k_RCT_numbers_replaced_with_at_sign\n",
"README.md\n"
- ],
- "name": "stdout"
+ ]
}
+ ],
+ "source": [
+ "!git clone https://github.com/Franck-Dernoncourt/pubmed-rct.git\n",
+ "!ls pubmed-rct"
]
},
{
@@ -238,26 +220,26 @@
},
{
"cell_type": "code",
+ "execution_count": null,
"metadata": {
- "id": "crmxKEJ69bNW",
"colab": {
"base_uri": "https://localhost:8080/"
},
+ "id": "crmxKEJ69bNW",
"outputId": "87514fc4-594d-4572-d5ca-0125b51e073e"
},
- "source": [
- "# Check what files are in the PubMed_20K dataset \n",
- "!ls pubmed-rct/PubMed_20k_RCT_numbers_replaced_with_at_sign"
- ],
- "execution_count": null,
"outputs": [
{
+ "name": "stdout",
"output_type": "stream",
"text": [
"dev.txt test.txt train.txt\n"
- ],
- "name": "stdout"
+ ]
}
+ ],
+ "source": [
+ "# Check what files are in the PubMed_20K dataset \n",
+ "!ls pubmed-rct/PubMed_20k_RCT_numbers_replaced_with_at_sign"
]
},
{
@@ -276,35 +258,28 @@
},
{
"cell_type": "code",
+ "execution_count": null,
"metadata": {
"id": "C1Zp21fGbBUJ"
},
+ "outputs": [],
"source": [
"# Start by using the 20k dataset\n",
"data_dir = \"pubmed-rct/PubMed_20k_RCT_numbers_replaced_with_at_sign/\""
- ],
- "execution_count": null,
- "outputs": []
+ ]
},
{
"cell_type": "code",
+ "execution_count": null,
"metadata": {
- "id": "CWqMrjLCbFTr",
"colab": {
"base_uri": "https://localhost:8080/"
},
+ "id": "CWqMrjLCbFTr",
"outputId": "872976d8-e0c4-4cae-fd6a-db538926fa46"
},
- "source": [
- "# Check all of the filenames in the target directory\n",
- "import os\n",
- "filenames = [data_dir + filename for filename in os.listdir(data_dir)]\n",
- "filenames"
- ],
- "execution_count": null,
"outputs": [
{
- "output_type": "execute_result",
"data": {
"text/plain": [
"['pubmed-rct/PubMed_20k_RCT_numbers_replaced_with_at_sign/train.txt',\n",
@@ -312,9 +287,16 @@
" 'pubmed-rct/PubMed_20k_RCT_numbers_replaced_with_at_sign/dev.txt']"
]
},
+ "execution_count": 5,
"metadata": {},
- "execution_count": 5
+ "output_type": "execute_result"
}
+ ],
+ "source": [
+ "# Check all of the filenames in the target directory\n",
+ "import os\n",
+ "filenames = [data_dir + filename for filename in os.listdir(data_dir)]\n",
+ "filenames"
]
},
{
@@ -352,9 +334,11 @@
},
{
"cell_type": "code",
+ "execution_count": null,
"metadata": {
"id": "2yjdhJxbbIhX"
},
+ "outputs": [],
"source": [
"# Create function to read the lines of a document\n",
"def get_lines(filename):\n",
@@ -373,9 +357,7 @@
" \"\"\"\n",
" with open(filename, \"r\") as f:\n",
" return f.readlines()"
- ],
- "execution_count": null,
- "outputs": []
+ ]
},
{
"cell_type": "markdown",
@@ -390,21 +372,16 @@
},
{
"cell_type": "code",
+ "execution_count": null,
"metadata": {
- "id": "IT7RMQsEbI0I",
"colab": {
"base_uri": "https://localhost:8080/"
},
+ "id": "IT7RMQsEbI0I",
"outputId": "5c6b3b6d-393c-42d2-bf0d-009f288d61cf"
},
- "source": [
- "train_lines = get_lines(data_dir+\"train.txt\")\n",
- "train_lines[:20] # the whole first example of an abstract + a little more of the next one"
- ],
- "execution_count": null,
"outputs": [
{
- "output_type": "execute_result",
"data": {
"text/plain": [
"['###24293578\\n',\n",
@@ -429,9 +406,14 @@
" 'METHODS\\tParticipants ( N = @ ) were randomly assigned to one of the two experimental mood induction conditions ( sad/neutral ) .\\n']"
]
},
+ "execution_count": 7,
"metadata": {},
- "execution_count": 7
+ "output_type": "execute_result"
}
+ ],
+ "source": [
+ "train_lines = get_lines(data_dir+\"train.txt\")\n",
+ "train_lines[:20] # the whole first example of an abstract + a little more of the next one"
]
},
{
@@ -478,9 +460,11 @@
},
{
"cell_type": "code",
+ "execution_count": null,
"metadata": {
"id": "B65Ffn9abJKH"
},
+ "outputs": [],
"source": [
"def preprocess_text_with_line_numbers(filename):\n",
" \"\"\"Returns a list of dictionaries of abstract line data.\n",
@@ -530,9 +514,7 @@
" abstract_lines += line\n",
" \n",
" return abstract_samples"
- ],
- "execution_count": null,
- "outputs": []
+ ]
},
{
"cell_type": "markdown",
@@ -545,31 +527,31 @@
},
{
"cell_type": "code",
+ "execution_count": null,
"metadata": {
- "id": "yDd28-PfgoUP",
"colab": {
"base_uri": "https://localhost:8080/"
},
+ "id": "yDd28-PfgoUP",
"outputId": "9a1f574a-f050-41a7-83e4-6deb7f37c225"
},
- "source": [
- "# Get data from file and preprocess it\n",
- "%%time\n",
- "train_samples = preprocess_text_with_line_numbers(data_dir + \"train.txt\")\n",
- "val_samples = preprocess_text_with_line_numbers(data_dir + \"dev.txt\") # dev is another name for validation set\n",
- "test_samples = preprocess_text_with_line_numbers(data_dir + \"test.txt\")\n",
- "len(train_samples), len(val_samples), len(test_samples)"
- ],
- "execution_count": null,
"outputs": [
{
+ "name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 450 ms, sys: 89.4 ms, total: 540 ms\n",
"Wall time: 540 ms\n"
- ],
- "name": "stdout"
+ ]
}
+ ],
+ "source": [
+ "%%time\n",
+ "# Get data from file and preprocess it\n",
+ "train_samples = preprocess_text_with_line_numbers(data_dir + \"train.txt\")\n",
+ "val_samples = preprocess_text_with_line_numbers(data_dir + \"dev.txt\") # dev is another name for validation set\n",
+ "test_samples = preprocess_text_with_line_numbers(data_dir + \"test.txt\")\n",
+ "len(train_samples), len(val_samples), len(test_samples)"
]
},
{
@@ -583,21 +565,16 @@
},
{
"cell_type": "code",
+ "execution_count": null,
"metadata": {
- "id": "FcYkHrnnh0lf",
"colab": {
"base_uri": "https://localhost:8080/"
},
+ "id": "FcYkHrnnh0lf",
"outputId": "b86a45de-d1f7-4172-8c63-e892c679e827"
},
- "source": [
- "# Check the first abstract of our training data\n",
- "train_samples[:14]"
- ],
- "execution_count": null,
"outputs": [
{
- "output_type": "execute_result",
"data": {
"text/plain": [
"[{'line_number': 0,\n",
@@ -658,9 +635,14 @@
" 'total_lines': 10}]"
]
},
+ "execution_count": 10,
"metadata": {},
- "execution_count": 10
+ "output_type": "execute_result"
}
+ ],
+ "source": [
+ "# Check the first abstract of our training data\n",
+ "train_samples[:14]"
]
},
{
@@ -676,25 +658,17 @@
},
{
"cell_type": "code",
+ "execution_count": null,
"metadata": {
- "id": "RRSTUXuth9jJ",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 480
},
+ "id": "RRSTUXuth9jJ",
"outputId": "f23dff4e-7eb8-43fb-b374-b32984eeae1a"
},
- "source": [
- "import pandas as pd\n",
- "train_df = pd.DataFrame(train_samples)\n",
- "val_df = pd.DataFrame(val_samples)\n",
- "test_df = pd.DataFrame(test_samples)\n",
- "train_df.head(14)"
- ],
- "execution_count": null,
"outputs": [
{
- "output_type": "execute_result",
"data": {
"text/html": [
"