diff --git a/year_prediction/ENV.yml b/year_prediction/ENV.yml new file mode 100644 index 0000000..66f3a50 --- /dev/null +++ b/year_prediction/ENV.yml @@ -0,0 +1,15 @@ +name: year_prediction +channels: + - defaults +dependencies: + - python=3.8 + - jupyterlab[version='>=3.0.0,<4.0.0a0'] + - jupyterlab-lsp + - pandas + - datasets + - ca-certificates + - certifi + - openssl + - scikit-learn + - ipywidgets +prefix: /Users/dvanstrien/miniconda3/envs/year_prediction diff --git a/year_prediction/T0_history_blog_nb.ipynb b/year_prediction/T0_history_blog_nb.ipynb new file mode 100644 index 0000000..5a0579d --- /dev/null +++ b/year_prediction/T0_history_blog_nb.ipynb @@ -0,0 +1,358 @@ +{ + "cells": [ + { + "cell_type": "code", + "source": [ + "TOKEN = \"\"" + ], + "metadata": { + "id": "2Nxaf2vKlj5W" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "lIYdn1woOS1n" + }, + "outputs": [], + "source": [ + "import requests\n", + "from functools import lru_cache\n", + "headers = {\"Authorization\": f\"Bearer {TOKEN}\"}\n" + ] + }, + { + "cell_type": "code", + "source": [ + "api_urls = {\"bert-base-historic-english-cased\":\"https://api-inference.huggingface.co/models/dbmdz/bert-base-historic-english-cased\", \n", + " \"T0pp\": \"https://api-inference.huggingface.co/models/bigscience/T0pp\"}" + ], + "metadata": { + "id": "5xIGUhQ_TnMp" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "def query(payload, model=\"T0pp\"):\n", + "\tresponse = requests.post(api_urls[model], headers=headers, json=payload)\n", + "\treturn response.json()" + ], + "metadata": { + "id": "6bg22dO5NYhF" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "@lru_cache(maxsize=None)\n", + "def query_api(inputs, model=\"T0pp\"):\n", + " output = query({\n", + " \"inputs\": f\"{inputs}\",\n", + " },model)\n", + " return output\n" + ], + 
"metadata": { + "id": "lg1ZJuj8Ub79" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "def query_time(text, time=\"year\"):\n", + " input = f\"\"\"During which {time} was the following text likely to have been published?\n", + "Text: \"{text}\"\"\"\n", + " print(input)\n", + " return input, query_api(input)" + ], + "metadata": { + "id": "1FYcwRU2axWI" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "HAVE_A_DREAM = \"\"\"So even though we face the difficulties of today and tomorrow, I still have a dream. It is a dream deeply rooted in the American dream. I have a dream that one day this nation will rise up and live out the true meaning of its creed: We hold these truths to be self-evident, that all men are created equal.\"\"\"" + ], + "metadata": { + "id": "PYB-7taKDj_Q" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "query_time(HAVE_A_DREAM)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "eM44W59IDrDo", + "outputId": "9a909c0f-cfcd-4dae-cef1-d690e3a9408e" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "During which year was the following text likely to have been published?\n", + "Text: \"So even though we face the difficulties of today and tomorrow, I still have a dream. It is a dream deeply rooted in the American dream. I have a dream that one day this nation will rise up and live out the true meaning of its creed: We hold these truths to be self-evident, that all men are created equal.\n" + ] + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "('During which year was the following text likely to have been published?\\nText: \"So even though we face the difficulties of today and tomorrow, I still have a dream. It is a dream deeply rooted in the American dream. 
I have a dream that one day this nation will rise up and live out the true meaning of its creed: We hold these truths to be self-evident, that all men are created equal.',\n", + " [{'generated_text': '1963'}])" + ] + }, + "metadata": {}, + "execution_count": 7 + } + ] + }, + { + "cell_type": "code", + "source": [ + "query_time(HAVE_A_DREAM, time='decade')" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "GdRHHaPZEASb", + "outputId": "34985f7e-ea2e-48d7-8178-30f6486348f3" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "During which decade was the following text likely to have been published?\n", + "Text: \"So even though we face the difficulties of today and tomorrow, I still have a dream. It is a dream deeply rooted in the American dream. I have a dream that one day this nation will rise up and live out the true meaning of its creed: We hold these truths to be self-evident, that all men are created equal.\n" + ] + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "('During which decade was the following text likely to have been published?\\nText: \"So even though we face the difficulties of today and tomorrow, I still have a dream. It is a dream deeply rooted in the American dream. I have a dream that one day this nation will rise up and live out the true meaning of its creed: We hold these truths to be self-evident, that all men are created equal.',\n", + " [{'generated_text': '1960s'}])" + ] + }, + "metadata": {}, + "execution_count": 8 + } + ] + }, + { + "cell_type": "code", + "source": [ + "Frankenstein = \"How slowly the time passes here, encompassed as I am by frost and snow! Yet a second step is taken towards my enterprise. I have hired a vessel and am occupied in collecting my sailors; those whom I have already engaged appear to be men on whom I can depend and are certainly possessed of dauntless courage. 
\"" + ], + "metadata": { + "id": "wPBXIrzRKL7H" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "query_time(Frankenstein)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "CabPR1o5KQyd", + "outputId": "7bfb3ad1-1490-4abe-d799-c1291917f6d6" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "During which year was the following text likely to have been published?\n", + "Text: \"How slowly the time passes here, encompassed as I am by frost and snow! Yet a second step is taken towards my enterprise. I have hired a vessel and am occupied in collecting my sailors; those whom I have already engaged appear to be men on whom I can depend and are certainly possessed of dauntless courage. \n" + ] + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "('During which year was the following text likely to have been published?\\nText: \"How slowly the time passes here, encompassed as I am by frost and snow! Yet a second step is taken towards my enterprise. I have hired a vessel and am occupied in collecting my sailors; those whom I have already engaged appear to be men on whom I can depend and are certainly possessed of dauntless courage. ',\n", + " [{'generated_text': '1797'}])" + ] + }, + "metadata": {}, + "execution_count": 13 + } + ] + }, + { + "cell_type": "code", + "source": [ + "query_time(Frankenstein, time=\"decade\")" + ], + "metadata": { + "id": "sw8ZlGL6KiZQ", + "outputId": "27314ea1-74a6-4a04-e194-61130430de33", + "colab": { + "base_uri": "https://localhost:8080/" + } + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "During which decade was the following text likely to have been published?\n", + "Text: \"How slowly the time passes here, encompassed as I am by frost and snow! Yet a second step is taken towards my enterprise. 
I have hired a vessel and am occupied in collecting my sailors; those whom I have already engaged appear to be men on whom I can depend and are certainly possessed of dauntless courage. \n" + ] + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "('During which decade was the following text likely to have been published?\\nText: \"How slowly the time passes here, encompassed as I am by frost and snow! Yet a second step is taken towards my enterprise. I have hired a vessel and am occupied in collecting my sailors; those whom I have already engaged appear to be men on whom I can depend and are certainly possessed of dauntless courage. ',\n", + " [{'generated_text': '18th century'}])" + ] + }, + "metadata": {}, + "execution_count": 14 + } + ] + }, + { + "cell_type": "code", + "source": [ + "SPANISH_TRAGEDIE_1587 = \"\"\"Then rest we heere a-while in our vnrest;\n", + " And feede our sorrowes with inward sighes,\n", + " For deepest cares break neuer into teares.\n", + " But wherefore sit I in a regall throne?\n", + " This better fits a wretches endles moane.\n", + " Yet this is higher then my fortunes reach,\n", + " And therefore better then my state deserues.\"\"\"" + ], + "metadata": { + "id": "YBHMw2FciXwq" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "query_time(SPANISH_TRAGEDIE_1587)" + ], + "metadata": { + "id": "bKZ5FkseU4ZD", + "outputId": "82b959d9-93f1-4432-eaa6-3a6b0574ed76", + "colab": { + "base_uri": "https://localhost:8080/" + } + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "During which year was the following text likely to have been published?\n", + "Text: \"Then rest we heere a-while in our vnrest;\n", + " And feede our sorrowes with inward sighes,\n", + " For deepest cares break neuer into teares.\n", + " But wherefore sit I in a regall throne?\n", + " This better fits a wretches endles moane.\n", + " Yet this is 
higher then my fortunes reach,\n", + " And therefore better then my state deserues.\n" + ] + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "('During which year was the following text likely to have been published?\\nText: \"Then rest we heere a-while in our vnrest;\\n And feede our sorrowes with inward sighes,\\n For deepest cares break neuer into teares.\\n But wherefore sit I in a regall throne?\\n This better fits a wretches endles moane.\\n Yet this is higher then my fortunes reach,\\n And therefore better then my state deserues.',\n", + " [{'generated_text': '1602'}])" + ] + }, + "metadata": {}, + "execution_count": 10 + } + ] + }, + { + "cell_type": "code", + "source": [ + "query_time(SPANISH_TRAGEDIE_1587, time=\"century\")" + ], + "metadata": { + "id": "KrLhigIwbtJL", + "outputId": "1a8a024e-293d-4f6b-c155-1d98b5a091bb", + "colab": { + "base_uri": "https://localhost:8080/" + } + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "During which century was the following text likely to have been published?\n", + "Text: \"Then rest we heere a-while in our vnrest;\n", + " And feede our sorrowes with inward sighes,\n", + " For deepest cares break neuer into teares.\n", + " But wherefore sit I in a regall throne?\n", + " This better fits a wretches endles moane.\n", + " Yet this is higher then my fortunes reach,\n", + " And therefore better then my state deserues.\n" + ] + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "('During which century was the following text likely to have been published?\\nText: \"Then rest we heere a-while in our vnrest;\\n And feede our sorrowes with inward sighes,\\n For deepest cares break neuer into teares.\\n But wherefore sit I in a regall throne?\\n This better fits a wretches endles moane.\\n Yet this is higher then my fortunes reach,\\n And therefore better then my state deserues.',\n", + " [{'generated_text': 
'16th'}])" + ] + }, + "metadata": {}, + "execution_count": 11 + } + ] + } + ], + "metadata": { + "colab": { + "name": "T0_history_blog_nb", + "provenance": [], + "collapsed_sections": [] + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} \ No newline at end of file diff --git a/year_prediction/date_prediction.ipynb b/year_prediction/date_prediction.ipynb new file mode 100644 index 0000000..882427b --- /dev/null +++ b/year_prediction/date_prediction.ipynb @@ -0,0 +1,1455 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "b0b2cb97-d68e-489d-97ff-0ee88455d50f", + "metadata": {}, + "source": [ + "*This notebook is a work in progress (WIP).* \n", + "\n", + "Since the T0 variants are big, this notebook uses the Hugging Face (HF) Inference API. You will need to supply an API token to use it. The free allowance is generous, but you can still burn through it quite quickly if you repeat requests many times. `requests_cache` is used to cache responses, which means that an identical request is only sent to the API once. 
\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "151b5b9e-0609-4206-ac4f-3c32a6b981d5", + "metadata": {}, + "outputs": [], + "source": [ + "from datasets import load_dataset\n", + "from datasets import concatenate_datasets" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "id": "db2d0055-a5f1-4cd0-9fb5-42002f831517", + "metadata": {}, + "outputs": [], + "source": [ + "def load_datasets():\n", + " en_dataset = load_dataset(\"bigscience-historical-texts/HIPE2020_sent-split\",'en')\n", + " en_dataset = en_dataset.map(lambda x: {\"language\": \"en\"})\n", + " fr_dataset = load_dataset(\"bigscience-historical-texts/HIPE2020_sent-split\",'fr')\n", + " fr_dataset = fr_dataset.map(lambda x: {\"language\": \"fr\"})\n", + " de_dataset = load_dataset(\"bigscience-historical-texts/HIPE2020_sent-split\",'de')\n", + " de_dataset = de_dataset.map(lambda x: {\"language\": \"de\"})\n", + " dataset = concatenate_datasets([fr_dataset['train'], fr_dataset['validation'], en_dataset['validation'],de_dataset['validation'],de_dataset['train']])\n", + " dataset = dataset.remove_columns([column for column in dataset.features.keys() if column not in ['tokens','id', 'date', \"language\"]])\n", + " return dataset" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "id": "17c253bf-9599-4688-a88c-3546b3493a21", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "5d3e5e86652d4c31a476acf498e3bb22", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Downloading: 0%| | 0.00/19.6k [00:00\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idtokensdatelanguageyear_predtrue_yearerror
01540[Son, remplaçant, sera, probablement, Frank, C...1978-09-27fr2009197831
15431[B, .]1978-10-11fr18651978-113
2141[General, St, ., Julien, ,, besides, the, Rati...1800-10-21en17451800-55
33493[Le, GrasUdanin, donne, les, détails, suivants...1888-11-01fr1912188824
41659[Es, handelt, sich, um, folgendes, .]1898-11-07de18901898-8
........................
49813880[M, ., Pichon, ,, ministre, des, affaires, étr...1908-01-07fr18481908-60
49821400[Das, alte, Haus, hatte, sich, heute, in, den,...1888-10-15de189018882
4983902[Thait, Is, what, I, bave, to, say, about, Alf...1920-07-08en18801920-40
49842564[7, ., Hendrickx, (, Belgien, ), 4, P, .]1938-08-09de19121938-26
49851053[]1950-11-25en18651950-85
\n", + "

4986 rows × 7 columns

\n", + "" + ], + "text/plain": [ + " id tokens date \\\n", + "0 1540 [Son, remplaçant, sera, probablement, Frank, C... 1978-09-27 \n", + "1 5431 [B, .] 1978-10-11 \n", + "2 141 [General, St, ., Julien, ,, besides, the, Rati... 1800-10-21 \n", + "3 3493 [Le, GrasUdanin, donne, les, détails, suivants... 1888-11-01 \n", + "4 1659 [Es, handelt, sich, um, folgendes, .] 1898-11-07 \n", + "... ... ... ... \n", + "4981 3880 [M, ., Pichon, ,, ministre, des, affaires, étr... 1908-01-07 \n", + "4982 1400 [Das, alte, Haus, hatte, sich, heute, in, den,... 1888-10-15 \n", + "4983 902 [Thait, Is, what, I, bave, to, say, about, Alf... 1920-07-08 \n", + "4984 2564 [7, ., Hendrickx, (, Belgien, ), 4, P, .] 1938-08-09 \n", + "4985 1053 [] 1950-11-25 \n", + "\n", + " language year_pred true_year error \n", + "0 fr 2009 1978 31 \n", + "1 fr 1865 1978 -113 \n", + "2 en 1745 1800 -55 \n", + "3 fr 1912 1888 24 \n", + "4 de 1890 1898 -8 \n", + "... ... ... ... ... \n", + "4981 fr 1848 1908 -60 \n", + "4982 de 1890 1888 2 \n", + "4983 en 1880 1920 -40 \n", + "4984 de 1912 1938 -26 \n", + "4985 en 1865 1950 -85 \n", + "\n", + "[4986 rows x 7 columns]" + ] + }, + "execution_count": 78, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9cd4a743-b04f-41ff-b148-ee4f688fc95e", + "metadata": {}, + "outputs": [], + "source": [ + "df['error'] = df['year_pred'].astype(int) - df['true_year'].astype(int)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6f949640-6aab-4583-9e38-b385198f79df", + "metadata": {}, + "outputs": [], + "source": [ + "df['year'] = df.date.dt.year" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "34b5d526-98df-4b72-a5de-e7994eb2051e", + "metadata": {}, + "outputs": [], + "source": [ + "df['decade'] = df.year.astype(str).str[:3]+\"0\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": 
"65c0f67a-bae6-4d7b-90bf-ef8fb2f880f6", + "metadata": {}, + "outputs": [], + "source": [ + "df['century'] = df.year.astype(str).str[:2]+\"00\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f900e2aa-3602-4135-848a-835a91cf0da9", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 132, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "df.groupby('decade')['error'].mean().plot(kind='barh', figsize=(20,5))" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "e956d68a-3252-4a66-a031-bc6b51497406", + "metadata": {}, + "outputs": [], + "source": [ + "df = pd.read_json('results.json')" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "f6e35be0-07b4-4119-9202-48664b96a482", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idtokensdatelanguageyear_predtrue_yearerroryeardecade
01540[Son, remplaçant, sera, probablement, Frank, C...275702400000fr200919783119781970
15431[B, .]276912000000fr18651978-11319781970
2141[General, St, ., Julien, ,, besides, the, Rati...-5339347200000en17451800-5518001800
33493[Le, GrasUdanin, donne, les, détails, suivants...-2561328000000fr191218882418881880
41659[Es, handelt, sich, um, folgendes, .]-2245276800000de18901898-818981890
..............................
49813880[M, ., Pichon, ,, ministre, des, affaires, étr...-1956096000000fr18481908-6019081900
49821400[Das, alte, Haus, hatte, sich, heute, in, den,...-2562796800000de18901888218881880
4983902[Thait, Is, what, I, bave, to, say, about, Alf...-1561593600000en18801920-4019201920
49842564[7, ., Hendrickx, (, Belgien, ), 4, P, .]-990835200000de19121938-2619381930
49851053[]-602812800000en18651950-8519501950
\n", + "

4986 rows × 9 columns

\n", + "
" + ], + "text/plain": [ + " id tokens date \\\n", + "0 1540 [Son, remplaçant, sera, probablement, Frank, C... 275702400000 \n", + "1 5431 [B, .] 276912000000 \n", + "2 141 [General, St, ., Julien, ,, besides, the, Rati... -5339347200000 \n", + "3 3493 [Le, GrasUdanin, donne, les, détails, suivants... -2561328000000 \n", + "4 1659 [Es, handelt, sich, um, folgendes, .] -2245276800000 \n", + "... ... ... ... \n", + "4981 3880 [M, ., Pichon, ,, ministre, des, affaires, étr... -1956096000000 \n", + "4982 1400 [Das, alte, Haus, hatte, sich, heute, in, den,... -2562796800000 \n", + "4983 902 [Thait, Is, what, I, bave, to, say, about, Alf... -1561593600000 \n", + "4984 2564 [7, ., Hendrickx, (, Belgien, ), 4, P, .] -990835200000 \n", + "4985 1053 [] -602812800000 \n", + "\n", + " language year_pred true_year error year decade \n", + "0 fr 2009 1978 31 1978 1970 \n", + "1 fr 1865 1978 -113 1978 1970 \n", + "2 en 1745 1800 -55 1800 1800 \n", + "3 fr 1912 1888 24 1888 1880 \n", + "4 de 1890 1898 -8 1898 1890 \n", + "... ... ... ... ... ... ... 
\n", + "4981 fr 1848 1908 -60 1908 1900 \n", + "4982 de 1890 1888 2 1888 1880 \n", + "4983 en 1880 1920 -40 1920 1920 \n", + "4984 de 1912 1938 -26 1938 1930 \n", + "4985 en 1865 1950 -85 1950 1950 \n", + "\n", + "[4986 rows x 9 columns]" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "id": "4f5e839d-bbe5-46c3-bf94-66118b7d36ce", + "metadata": {}, + "outputs": [], + "source": [ + "metrics = defaultdict(list)\n", + "for lang in ['en','de','fr']:\n", + " metrics['language'].append(lang)\n", + " sub_df = df[df['language']==lang]\n", + " true_y = sub_df['true_year'].astype(int)\n", + " pred_y = sub_df['year_pred'].astype(int)\n", + " metrics['MAE'].append(mean_absolute_error(true_y, pred_y).round(2))\n", + " metrics['Median Absolute Error'].append(median_absolute_error(true_y, pred_y))\n", + " #metrics['Max Error'].append(max_error(true_y, pred_y))" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "id": "2b37d9b0-2d49-4f29-84ef-df61ca43e92c", + "metadata": {}, + "outputs": [], + "source": [ + "metrics = pd.DataFrame.from_dict(metrics)" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "id": "182d1803-da13-4056-8ac9-faf45d3f3a16", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
languageMAEMedian Absolute Error
0en40.4830.0
1de40.1132.0
2fr55.2548.0
\n", + "
" + ], + "text/plain": [ + " language MAE Median Absolute Error\n", + "0 en 40.48 30.0\n", + "1 de 40.11 32.0\n", + "2 fr 55.25 48.0" + ] + }, + "execution_count": 41, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "metrics" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4d6e6749-aff3-4efc-b671-3c2ac90c32b5", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 43, + "id": "ee31fd5b-273d-4598-a7b8-569109654ae7", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\\begin{tabular}{llrr}\n", + "\\toprule\n", + "{} & language & MAE & Median Absolute Error \\\\\n", + "\\midrule\n", + "0 & en & 40.48 & 30.0 \\\\\n", + "1 & de & 40.11 & 32.0 \\\\\n", + "2 & fr & 55.25 & 48.0 \\\\\n", + "\\bottomrule\n", + "\\end{tabular}\n", + "\n" + ] + } + ], + "source": [ + "print(metrics.to_latex())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "901fd108-72b6-4625-a381-c4711bcd99c8", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}