diff --git a/year_prediction/ENV.yml b/year_prediction/ENV.yml new file mode 100644 index 0000000..66f3a50 --- /dev/null +++ b/year_prediction/ENV.yml @@ -0,0 +1,15 @@ +name: year_prediction +channels: + - defaults +dependencies: + - python=3.8 + - jupyterlab[version='>=3.0.0,<4.0.0a0'] + - jupyterlab-lsp + - pandas + - datasets + - ca-certificates + - certifi + - openssl + - scikit-learn + - ipywidgets +prefix: /Users/dvanstrien/miniconda3/envs/year_prediction diff --git a/year_prediction/T0_history_blog_nb.ipynb b/year_prediction/T0_history_blog_nb.ipynb new file mode 100644 index 0000000..5a0579d --- /dev/null +++ b/year_prediction/T0_history_blog_nb.ipynb @@ -0,0 +1,358 @@ +{ + "cells": [ + { + "cell_type": "code", + "source": [ + "TOKEN = \"\"" + ], + "metadata": { + "id": "2Nxaf2vKlj5W" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "lIYdn1woOS1n" + }, + "outputs": [], + "source": [ + "import requests\n", + "from functools import lru_cache\n", + "headers = {\"Authorization\": f\"Bearer {TOKEN}\"}\n" + ] + }, + { + "cell_type": "code", + "source": [ + "api_urls = {\"bert-base-historic-english-cased\":\"https://api-inference.huggingface.co/models/dbmdz/bert-base-historic-english-cased\", \n", + " \"T0pp\": \"https://api-inference.huggingface.co/models/bigscience/T0pp\"}" + ], + "metadata": { + "id": "5xIGUhQ_TnMp" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "def query(payload, model=\"T0pp\"):\n", + "\tresponse = requests.post(api_urls[model], headers=headers, json=payload)\n", + "\treturn response.json()" + ], + "metadata": { + "id": "6bg22dO5NYhF" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "@lru_cache(maxsize=None)\n", + "def query_api(inputs, model=\"T0pp\"):\n", + " output = query({\n", + " \"inputs\": f\"{inputs}\",\n", + " },model)\n", + " return output\n" + ], + "metadata": { + "id": "lg1ZJuj8Ub79" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "def query_time(text, time=\"year\"):\n", + " input = f\"\"\"During which {time} was the following text likely to have been published?\n", + "Text: \"{text}\"\"\"\n", + " print(input)\n", + " return input, query_api(input)" + ], + "metadata": { + "id": "1FYcwRU2axWI" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "HAVE_A_DREAM = \"\"\"So even though we face the difficulties of today and tomorrow, I still have a dream. It is a dream deeply rooted in the American dream. I have a dream that one day this nation will rise up and live out the true meaning of its creed: We hold these truths to be self-evident, that all men are created equal.\"\"\"" + ], + "metadata": { + "id": "PYB-7taKDj_Q" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "query_time(HAVE_A_DREAM)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "eM44W59IDrDo", + "outputId": "9a909c0f-cfcd-4dae-cef1-d690e3a9408e" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "During which year was the following text likely to have been published?\n", + "Text: \"So even though we face the difficulties of today and tomorrow, I still have a dream. It is a dream deeply rooted in the American dream. 
I have a dream that one day this nation will rise up and live out the true meaning of its creed: We hold these truths to be self-evident, that all men are created equal.\n" + ] + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "('During which year was the following text likely to have been published?\\nText: \"So even though we face the difficulties of today and tomorrow, I still have a dream. It is a dream deeply rooted in the American dream. I have a dream that one day this nation will rise up and live out the true meaning of its creed: We hold these truths to be self-evident, that all men are created equal.',\n", + " [{'generated_text': '1963'}])" + ] + }, + "metadata": {}, + "execution_count": 7 + } + ] + }, + { + "cell_type": "code", + "source": [ + "query_time(HAVE_A_DREAM, time='decade')" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "GdRHHaPZEASb", + "outputId": "34985f7e-ea2e-48d7-8178-30f6486348f3" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "During which decade was the following text likely to have been published?\n", + "Text: \"So even though we face the difficulties of today and tomorrow, I still have a dream. It is a dream deeply rooted in the American dream. I have a dream that one day this nation will rise up and live out the true meaning of its creed: We hold these truths to be self-evident, that all men are created equal.\n" + ] + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "('During which decade was the following text likely to have been published?\\nText: \"So even though we face the difficulties of today and tomorrow, I still have a dream. It is a dream deeply rooted in the American dream. I have a dream that one day this nation will rise up and live out the true meaning of its creed: We hold these truths to be self-evident, that all men are created equal.',\n", + " [{'generated_text': '1960s'}])" + ] + }, + "metadata": {}, + "execution_count": 8 + } + ] + }, + { + "cell_type": "code", + "source": [ + "Frankenstein = \"How slowly the time passes here, encompassed as I am by frost and snow! Yet a second step is taken towards my enterprise. I have hired a vessel and am occupied in collecting my sailors; those whom I have already engaged appear to be men on whom I can depend and are certainly possessed of dauntless courage. \"" + ], + "metadata": { + "id": "wPBXIrzRKL7H" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "query_time(Frankenstein)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "CabPR1o5KQyd", + "outputId": "7bfb3ad1-1490-4abe-d799-c1291917f6d6" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "During which year was the following text likely to have been published?\n", + "Text: \"How slowly the time passes here, encompassed as I am by frost and snow! Yet a second step is taken towards my enterprise. I have hired a vessel and am occupied in collecting my sailors; those whom I have already engaged appear to be men on whom I can depend and are certainly possessed of dauntless courage. \n" + ] + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "('During which year was the following text likely to have been published?\\nText: \"How slowly the time passes here, encompassed as I am by frost and snow! 
Yet a second step is taken towards my enterprise. I have hired a vessel and am occupied in collecting my sailors; those whom I have already engaged appear to be men on whom I can depend and are certainly possessed of dauntless courage. ',\n", + " [{'generated_text': '1797'}])" + ] + }, + "metadata": {}, + "execution_count": 13 + } + ] + }, + { + "cell_type": "code", + "source": [ + "query_time(Frankenstein, time=\"decade\")" + ], + "metadata": { + "id": "sw8ZlGL6KiZQ", + "outputId": "27314ea1-74a6-4a04-e194-61130430de33", + "colab": { + "base_uri": "https://localhost:8080/" + } + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "During which decade was the following text likely to have been published?\n", + "Text: \"How slowly the time passes here, encompassed as I am by frost and snow! Yet a second step is taken towards my enterprise. I have hired a vessel and am occupied in collecting my sailors; those whom I have already engaged appear to be men on whom I can depend and are certainly possessed of dauntless courage. \n" + ] + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "('During which decade was the following text likely to have been published?\\nText: \"How slowly the time passes here, encompassed as I am by frost and snow! Yet a second step is taken towards my enterprise. I have hired a vessel and am occupied in collecting my sailors; those whom I have already engaged appear to be men on whom I can depend and are certainly possessed of dauntless courage. ',\n", + " [{'generated_text': '18th century'}])" + ] + }, + "metadata": {}, + "execution_count": 14 + } + ] + }, + { + "cell_type": "code", + "source": [ + "SPANISH_TRAGEDIE_1587 = \"\"\"Then rest we heere a-while in our vnrest;\n", + " And feede our sorrowes with inward sighes,\n", + " For deepest cares break neuer into teares.\n", + " But wherefore sit I in a regall throne?\n", + " This better fits a wretches endles moane.\n", + " Yet this is higher then my fortunes reach,\n", + " And therefore better then my state deserues.\"\"\"" + ], + "metadata": { + "id": "YBHMw2FciXwq" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "query_time(SPANISH_TRAGEDIE_1587)" + ], + "metadata": { + "id": "bKZ5FkseU4ZD", + "outputId": "82b959d9-93f1-4432-eaa6-3a6b0574ed76", + "colab": { + "base_uri": "https://localhost:8080/" + } + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "During which year was the following text likely to have been published?\n", + "Text: \"Then rest we heere a-while in our vnrest;\n", + " And feede our sorrowes with inward sighes,\n", + " For deepest cares break neuer into teares.\n", + " But wherefore sit I in a regall throne?\n", + " This better fits a wretches endles moane.\n", + " Yet this is higher then my fortunes reach,\n", + " And therefore better then my state deserues.\n" + ] + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "('During which year was the following text likely to have been published?\\nText: \"Then rest we heere a-while in our vnrest;\\n And feede our sorrowes with inward sighes,\\n For deepest cares break neuer into teares.\\n But wherefore sit I in a regall throne?\\n This better fits a wretches endles moane.\\n Yet this is higher then my fortunes reach,\\n And therefore better then my state deserues.',\n", + " [{'generated_text': '1602'}])" + ] + }, + "metadata": {}, + 
"execution_count": 10 + } + ] + }, + { + "cell_type": "code", + "source": [ + "query_time(SPANISH_TRAGEDIE_1587, time=\"century\")" + ], + "metadata": { + "id": "KrLhigIwbtJL", + "outputId": "1a8a024e-293d-4f6b-c155-1d98b5a091bb", + "colab": { + "base_uri": "https://localhost:8080/" + } + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "During which century was the following text likely to have been published?\n", + "Text: \"Then rest we heere a-while in our vnrest;\n", + " And feede our sorrowes with inward sighes,\n", + " For deepest cares break neuer into teares.\n", + " But wherefore sit I in a regall throne?\n", + " This better fits a wretches endles moane.\n", + " Yet this is higher then my fortunes reach,\n", + " And therefore better then my state deserues.\n" + ] + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "('During which century was the following text likely to have been published?\\nText: \"Then rest we heere a-while in our vnrest;\\n And feede our sorrowes with inward sighes,\\n For deepest cares break neuer into teares.\\n But wherefore sit I in a regall throne?\\n This better fits a wretches endles moane.\\n Yet this is higher then my fortunes reach,\\n And therefore better then my state deserues.',\n", + " [{'generated_text': '16th'}])" + ] + }, + "metadata": {}, + "execution_count": 11 + } + ] + } + ], + "metadata": { + "colab": { + "name": "T0_history_blog_nb", + "provenance": [], + "collapsed_sections": [] + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} \ No newline at end of file diff --git a/year_prediction/date_prediction.ipynb b/year_prediction/date_prediction.ipynb new file mode 100644 index 0000000..882427b --- /dev/null +++ b/year_prediction/date_prediction.ipynb @@ -0,0 +1,1455 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "b0b2cb97-d68e-489d-97ff-0ee88455d50f", + "metadata": {}, + "source": [ + "*Notebook is a WIP* \n", + "\n", + "Since T0 variants are big, this notebook uses the HF inference API. You will need to pass an API token to use this. You get a generous free allowance, but you could still burn through this quite quickly if you repeat things many times. `requests_cache` is used to cache requests which means that the same request isn't passed to the API multiple times. 
\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "151b5b9e-0609-4206-ac4f-3c32a6b981d5", + "metadata": {}, + "outputs": [], + "source": [ + "from datasets import load_dataset\n", + "from datasets import concatenate_datasets" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "id": "db2d0055-a5f1-4cd0-9fb5-42002f831517", + "metadata": {}, + "outputs": [], + "source": [ + "def load_datasets():\n", + " en_dataset = load_dataset(\"bigscience-historical-texts/HIPE2020_sent-split\",'en')\n", + " en_dataset = en_dataset.map(lambda x: {\"language\": \"en\"})\n", + " fr_dataset = load_dataset(\"bigscience-historical-texts/HIPE2020_sent-split\",'fr')\n", + " fr_dataset = fr_dataset.map(lambda x: {\"language\": \"fr\"})\n", + " de_dataset = load_dataset(\"bigscience-historical-texts/HIPE2020_sent-split\",'de')\n", + " de_dataset = de_dataset.map(lambda x: {\"language\": \"de\"})\n", + " dataset = concatenate_datasets([fr_dataset['train'], fr_dataset['validation'], en_dataset['validation'],de_dataset['validation'],de_dataset['train']])\n", + " dataset = dataset.remove_columns([column for column in dataset.features.keys() if column not in ['tokens','id', 'date', \"language\"]])\n", + " return dataset" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "id": "17c253bf-9599-4688-a88c-3546b3493a21", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "5d3e5e86652d4c31a476acf498e3bb22", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Downloading: 0%| | 0.00/19.6k [00:00, ?B/s]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Downloading and preparing dataset hipe2020/en to /Users/dvanstrien/.cache/huggingface/datasets/bigscience-historical-texts___hipe2020/en/1.0.0/f1def91947260b00bcde9dc80b6d2fd0867449ecbf2f7e1024bbfd1b56f43118...\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "d6a80d5449ba4b748898e116d75bcafd", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/1 [00:00, ?it/s]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "7665db0f4c064cf0b13d883b0bc7cee4", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Downloading: 0%| | 0.00/99.4k [00:00, ?B/s]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "94b34e60b162470196873b9cf8695bd2", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/1 [00:00, ?it/s]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "0 examples [00:00, ? examples/s]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Dataset hipe2020 downloaded and prepared to /Users/dvanstrien/.cache/huggingface/datasets/bigscience-historical-texts___hipe2020/en/1.0.0/f1def91947260b00bcde9dc80b6d2fd0867449ecbf2f7e1024bbfd1b56f43118. 
Subsequent calls will reuse this data.\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "ace4c055df5e418192168e1b687b153e", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/1 [00:00, ?it/s]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "e256b06b20144d22907021562b0c671c", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "0ex [00:00, ?ex/s]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Downloading and preparing dataset hipe2020/fr to /Users/dvanstrien/.cache/huggingface/datasets/bigscience-historical-texts___hipe2020/fr/1.0.0/f1def91947260b00bcde9dc80b6d2fd0867449ecbf2f7e1024bbfd1b56f43118...\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "10c2ed7940ef40bbad87c4e2497beb8e", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/2 [00:00, ?it/s]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "70a1dfb853c947adb5cf0912c8fdbf2b", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/2 [00:00, ?it/s]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "0 examples [00:00, ? examples/s]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "0 examples [00:00, ? examples/s]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Dataset hipe2020 downloaded and prepared to /Users/dvanstrien/.cache/huggingface/datasets/bigscience-historical-texts___hipe2020/fr/1.0.0/f1def91947260b00bcde9dc80b6d2fd0867449ecbf2f7e1024bbfd1b56f43118. 
Subsequent calls will reuse this data.\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "7639d8353e0c4175bbbde3ac0cf8514f", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/2 [00:00, ?it/s]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "4eff024d78134d339a17d84f08c384f1", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "0ex [00:00, ?ex/s]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "b33d8609f3114ee19316c2bb4c0344be", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "0ex [00:00, ?ex/s]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Downloading and preparing dataset hipe2020/de to /Users/dvanstrien/.cache/huggingface/datasets/bigscience-historical-texts___hipe2020/de/1.0.0/f1def91947260b00bcde9dc80b6d2fd0867449ecbf2f7e1024bbfd1b56f43118...\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "b68bf443d9cf49f7abd19c172965179d", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/2 [00:00, ?it/s]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "b43381bdd3004a7596bd8d1d8b7bbb48", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/2 [00:00, ?it/s]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "0 examples [00:00, ? examples/s]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "0 examples [00:00, ? examples/s]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Dataset hipe2020 downloaded and prepared to /Users/dvanstrien/.cache/huggingface/datasets/bigscience-historical-texts___hipe2020/de/1.0.0/f1def91947260b00bcde9dc80b6d2fd0867449ecbf2f7e1024bbfd1b56f43118. 
Subsequent calls will reuse this data.\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "5e8895a0d08248c28eeb86c6568895d8", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/2 [00:00, ?it/s]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "8586bb20d18942de868b6164ecf64528", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "0ex [00:00, ?ex/s]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "425bb41b50344186b7e4f28b0d649dc8", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "0ex [00:00, ?ex/s]" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "dataset = load_datasets()" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "id": "cd1a0689-bd1a-4aca-b008-2b186c8296d5", + "metadata": {}, + "outputs": [], + "source": [ + "import requests_cache\n", + "\n", + "session = requests_cache.CachedSession('HF_API_CACHE',allowable_methods=('GET', 'POST'))" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "947cc4f8-171b-41ce-8a83-f567ed5a377e", + "metadata": {}, + "outputs": [], + "source": [ + "TOKEN = \"\"" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "id": "95cfa6fe-2758-416c-a56e-ab155b4988b5", + "metadata": {}, + "outputs": [], + "source": [ + "API_URL = \"https://api-inference.huggingface.co/models/bigscience/T0\"\n", + "headers = {\"Authorization\": f\"Bearer {TOKEN}\"}\n", + "\n", + "def query(payload):\n", + " response = session.post(API_URL, headers=headers, json=payload)\n", + " return response\n" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "id": "9f576807-2a8f-49ef-a068-ba6e0b9efa11", + "metadata": {}, + "outputs": [], + "source": [ + "text = \"It was the best of times, it was the worst of times\"" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "id": "a24c2f0b-c0e4-4f64-9bdf-33ed651a3ab5", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[{'generated_text': '1812'}]" + ] + }, + "execution_count": 35, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "query({\"inputs\": f\"In which year is the following text likely to have been published: text:{text}\"}).json()" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "id": "d4ee9670-9791-47e9-a876-fe6dddfa9a43", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'1880'" + ] + }, + "execution_count": 36, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import re\n", + "year_regex = re.compile(r\"(\\d{4})\")\n", + "match = year_regex.search(\"1880s\")\n", + "match.group()" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "id": "5c0b1ced-c8f9-40d3-9512-6f11474f6760", + "metadata": {}, + "outputs": [], + "source": [ + "def query_year(example):\n", + " text = \" \".join(example['tokens'])\n", + " output = query({\"inputs\": f\"In which year is the following text likely to have been published: text:{text}\"})\n", + " try:\n", + " generated_text = output.json()[0][\"generated_text\"]\n", + " year = year_regex.search(generated_text)\n", + " if year is not None:\n", + " return year.group()\n", + " else:\n", + " return generated_text\n", + " except KeyError:\n", + " return str(output)" + ] + }, + { + "cell_type": "code", + 
"execution_count": 38, + "id": "48b43f9c-6314-4af7-b9bb-4ad83eb8adb1", + "metadata": {}, + "outputs": [], + "source": [ + "dataset = dataset.shuffle()" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "id": "dca26ee3-7b05-4488-a616-d73b3a63577c", + "metadata": {}, + "outputs": [], + "source": [ + "sample_size = 5000" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "id": "e85b2ebf-e4ce-4cb7-af38-ceee2619a876", + "metadata": {}, + "outputs": [], + "source": [ + "dataset = dataset.select([i for i in range(sample_size)])" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "id": "ec27a8af-3759-4684-8227-532be8609d2a", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "c29958d8bbba474db8df0bcc4f14a7bf", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "0ex [00:00, ?ex/s]" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "dataset = dataset.map(lambda example: {\"year_pred\": query_year(example)})" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "id": "9604d07d-d281-48e5-8262-3ce8419f4879", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "f22a0c970229400cbab746941ea819f7", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/5 [00:00, ?ba/s]" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "dataset = dataset.filter(lambda x: len(x['year_pred'])==4)" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "id": "dfc025c8-75d8-4011-99f8-c87be2a491c7", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Dataset({\n", + " features: ['id', 'tokens', 'date', 'language', 'year_pred'],\n", + " num_rows: 4986\n", + "})" + ] + }, + "execution_count": 46, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dataset" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "id": "f0478a35-ad31-42ed-8ac6-76fa8651c068", + "metadata": {}, + "outputs": [], + "source": [ + "df = dataset.to_pandas()" + ] + }, + { + "cell_type": "code", + "execution_count": 106, + "id": "47776c4c-f5fa-4f19-b411-54a5c3d873ef", + "metadata": {}, + "outputs": [], + "source": [ + "#df.to_json('results.json')" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "id": "3377a986-e420-46b7-8ead-9fd8f51b343c", + "metadata": {}, + "outputs": [], + "source": [ + "df['true_year'] = df['date'].dt.year" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "id": "ea9188d7-b4ec-4566-a163-9a8cdd34355f", + "metadata": {}, + "outputs": [], + "source": [ + "true_y = df['true_year'].astype(int)\n", + "pred_y = df['year_pred'].astype(int)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "afb6ce67-4613-4e11-af50-ee529530724a", + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.metrics import mean_absolute_error, median_absolute_error, max_error" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "id": "d66a83ad-6452-4b84-9ad0-9e3ffd5b943a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "48.418572001604495" + ] + }, + "execution_count": 51, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "mean_absolute_error(true_y, pred_y)" + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "id": "b830aaf4-dddb-4f11-a8ed-1aeff67b1e35", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": 
[ + "38.0" + ] + }, + "execution_count": 52, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "median_absolute_error(true_y, pred_y)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "a505ed4c-a742-4715-bba7-d6be6323bf3b", + "metadata": {}, + "outputs": [], + "source": [ + "from collections import defaultdict" + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "id": "4ddca20f-fc59-4461-8317-960dc37054f9", + "metadata": {}, + "outputs": [], + "source": [ + "metrics = defaultdict(list)\n", + "for lang in ['en','de','fr']:\n", + " metrics['language'].append(lang)\n", + " sub_df = df[df['language']==lang]\n", + " true_y = sub_df['true_year'].astype(int)\n", + " pred_y = sub_df['year_pred'].astype(int)\n", + " metrics['MAE'].append(mean_absolute_error(true_y, pred_y))\n", + " metrics['Median Absolute Error'].append(median_absolute_error(true_y, pred_y))\n", + " metrics['Max Error'].append(max_error(true_y, pred_y))" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "d91b463d-4a12-4e65-8cb1-56e8ab7a62ed", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd" + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "id": "6dc75b5d-3bb7-4dbc-aec4-01eaa853ec18", + "metadata": {}, + "outputs": [], + "source": [ + "metrics = pd.DataFrame.from_dict(metrics)" + ] + }, + { + "cell_type": "code", + "execution_count": 138, + "id": "783a92c2-3c2f-45d8-88be-651fa324525d", + "metadata": {}, + "outputs": [], + "source": [ + "metrics['MAE'] = metrics['MAE'].round(3)" + ] + }, + { + "cell_type": "code", + "execution_count": 139, + "id": "c6b433ac-e3e8-4ff3-b44b-d71652170c7c", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\\begin{tabular}{lrrr}\n", + "\\toprule\n", + "language & MAE & Median Absolute Error & Max Error \\\\\n", + "\\midrule\n", + " en & 40.476 & 30.0 & 789 \\\\\n", + " de & 40.112 & 32.0 & 1206 \\\\\n", + " fr & 55.251 & 48.0 & 2982 \\\\\n", + "\\bottomrule\n", + "\\end{tabular}\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/var/folders/tj/54sfzlyj6_573fn82y996grc0000gr/T/ipykernel_12721/3362837598.py:1: FutureWarning: In future versions `DataFrame.to_latex` is expected to utilise the base implementation of `Styler.to_latex` for formatting and rendering. The arguments signature may therefore change. It is recommended instead to use `DataFrame.style.to_latex` which also contains additional functionality.\n", + " print(metrics.to_latex(index=False))\n" + ] + } + ], + "source": [ + "print(metrics.to_latex(index=False))" + ] + }, + { + "cell_type": "markdown", + "id": "f2e85f33-3c48-4fca-935a-aab773c3213c", + "metadata": {}, + "source": [ + "WIP/🗑" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "18008bea-4ca9-4be2-8029-036f09774a8c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
| \n", + " | id | \n", + "tokens | \n", + "date | \n", + "language | \n", + "year_pred | \n", + "true_year | \n", + "error | \n", + "
|---|---|---|---|---|---|---|---|
| 0 | \n", + "1540 | \n", + "[Son, remplaçant, sera, probablement, Frank, C... | \n", + "1978-09-27 | \n", + "fr | \n", + "2009 | \n", + "1978 | \n", + "31 | \n", + "
| 1 | \n", + "5431 | \n", + "[B, .] | \n", + "1978-10-11 | \n", + "fr | \n", + "1865 | \n", + "1978 | \n", + "-113 | \n", + "
| 2 | \n", + "141 | \n", + "[General, St, ., Julien, ,, besides, the, Rati... | \n", + "1800-10-21 | \n", + "en | \n", + "1745 | \n", + "1800 | \n", + "-55 | \n", + "
| 3 | \n", + "3493 | \n", + "[Le, GrasUdanin, donne, les, détails, suivants... | \n", + "1888-11-01 | \n", + "fr | \n", + "1912 | \n", + "1888 | \n", + "24 | \n", + "
| 4 | \n", + "1659 | \n", + "[Es, handelt, sich, um, folgendes, .] | \n", + "1898-11-07 | \n", + "de | \n", + "1890 | \n", + "1898 | \n", + "-8 | \n", + "
| ... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "
| 4981 | \n", + "3880 | \n", + "[M, ., Pichon, ,, ministre, des, affaires, étr... | \n", + "1908-01-07 | \n", + "fr | \n", + "1848 | \n", + "1908 | \n", + "-60 | \n", + "
| 4982 | \n", + "1400 | \n", + "[Das, alte, Haus, hatte, sich, heute, in, den,... | \n", + "1888-10-15 | \n", + "de | \n", + "1890 | \n", + "1888 | \n", + "2 | \n", + "
| 4983 | \n", + "902 | \n", + "[Thait, Is, what, I, bave, to, say, about, Alf... | \n", + "1920-07-08 | \n", + "en | \n", + "1880 | \n", + "1920 | \n", + "-40 | \n", + "
| 4984 | \n", + "2564 | \n", + "[7, ., Hendrickx, (, Belgien, ), 4, P, .] | \n", + "1938-08-09 | \n", + "de | \n", + "1912 | \n", + "1938 | \n", + "-26 | \n", + "
| 4985 | \n", + "1053 | \n", + "[] | \n", + "1950-11-25 | \n", + "en | \n", + "1865 | \n", + "1950 | \n", + "-85 | \n", + "
4986 rows × 7 columns
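The source of the cell that produced this preview is mangled in the diff above; the following is a minimal, hypothetical reconstruction, assuming `df` is the pandas DataFrame built earlier in the notebook, with the `year_pred` and `true_year` columns already present:

```python
# Hypothetical reconstruction of the cell behind the preview above:
# signed prediction error, i.e. predicted year minus true publication year.
df["error"] = df["year_pred"].astype(int) - df["true_year"].astype(int)

# Displaying the frame gives the head/tail preview reproduced above.
df
```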
\n", + "| \n", + " | id | \n", + "tokens | \n", + "date | \n", + "language | \n", + "year_pred | \n", + "true_year | \n", + "error | \n", + "year | \n", + "decade | \n", + "
|---|---|---|---|---|---|---|---|---|---|
| 0 | \n", + "1540 | \n", + "[Son, remplaçant, sera, probablement, Frank, C... | \n", + "275702400000 | \n", + "fr | \n", + "2009 | \n", + "1978 | \n", + "31 | \n", + "1978 | \n", + "1970 | \n", + "
| 1 | \n", + "5431 | \n", + "[B, .] | \n", + "276912000000 | \n", + "fr | \n", + "1865 | \n", + "1978 | \n", + "-113 | \n", + "1978 | \n", + "1970 | \n", + "
| 2 | \n", + "141 | \n", + "[General, St, ., Julien, ,, besides, the, Rati... | \n", + "-5339347200000 | \n", + "en | \n", + "1745 | \n", + "1800 | \n", + "-55 | \n", + "1800 | \n", + "1800 | \n", + "
| 3 | \n", + "3493 | \n", + "[Le, GrasUdanin, donne, les, détails, suivants... | \n", + "-2561328000000 | \n", + "fr | \n", + "1912 | \n", + "1888 | \n", + "24 | \n", + "1888 | \n", + "1880 | \n", + "
| 4 | \n", + "1659 | \n", + "[Es, handelt, sich, um, folgendes, .] | \n", + "-2245276800000 | \n", + "de | \n", + "1890 | \n", + "1898 | \n", + "-8 | \n", + "1898 | \n", + "1890 | \n", + "
| ... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "
| 4981 | \n", + "3880 | \n", + "[M, ., Pichon, ,, ministre, des, affaires, étr... | \n", + "-1956096000000 | \n", + "fr | \n", + "1848 | \n", + "1908 | \n", + "-60 | \n", + "1908 | \n", + "1900 | \n", + "
| 4982 | \n", + "1400 | \n", + "[Das, alte, Haus, hatte, sich, heute, in, den,... | \n", + "-2562796800000 | \n", + "de | \n", + "1890 | \n", + "1888 | \n", + "2 | \n", + "1888 | \n", + "1880 | \n", + "
| 4983 | \n", + "902 | \n", + "[Thait, Is, what, I, bave, to, say, about, Alf... | \n", + "-1561593600000 | \n", + "en | \n", + "1880 | \n", + "1920 | \n", + "-40 | \n", + "1920 | \n", + "1920 | \n", + "
| 4984 | \n", + "2564 | \n", + "[7, ., Hendrickx, (, Belgien, ), 4, P, .] | \n", + "-990835200000 | \n", + "de | \n", + "1912 | \n", + "1938 | \n", + "-26 | \n", + "1938 | \n", + "1930 | \n", + "
| 4985 | \n", + "1053 | \n", + "[] | \n", + "-602812800000 | \n", + "en | \n", + "1865 | \n", + "1950 | \n", + "-85 | \n", + "1950 | \n", + "1950 | \n", + "
4986 rows × 9 columns
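The cell that added these columns is likewise not recoverable from the diff. A plausible sketch follows, on two assumptions: `year` mirrors `true_year` and `decade` floors it to the start of its decade, and the integer `date` values come from a JSON round-trip (e.g. via the commented-out `df.to_json('results.json')`), which serialises timestamps as epoch milliseconds:

```python
# Hypothetical reconstruction of the year/decade columns shown above.
df["year"] = df["true_year"].astype(int)
df["decade"] = (df["year"] // 10) * 10   # e.g. 1898 -> 1890, 1920 -> 1920
```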
\n", + "| \n", + " | language | \n", + "MAE | \n", + "Median Absolute Error | \n", + "
|---|---|---|---|
| 0 | \n", + "en | \n", + "40.48 | \n", + "30.0 | \n", + "
| 1 | \n", + "de | \n", + "40.11 | \n", + "32.0 | \n", + "
| 2 | \n", + "fr | \n", + "55.25 | \n", + "48.0 | \n", + "
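The code behind this last table is truncated in the diff; a compact groupby-based sketch that would produce an equivalent per-language summary (hypothetical, and assuming `df` still carries the `language`, `year_pred`, and `true_year` columns used earlier) is:

```python
# Hypothetical: per-language absolute-error summary, equivalent to the table above.
abs_err = (df["year_pred"].astype(int) - df["true_year"].astype(int)).abs()
summary = (
    df.assign(abs_err=abs_err)
      .groupby("language")["abs_err"]
      .agg(["mean", "median"])
      .rename(columns={"mean": "MAE", "median": "Median Absolute Error"})
      .round(2)
      .reset_index()
)
summary
```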