Add notebook to extract weights and masses

jl-wynen · jl-wynen · commit 259e5ed9dcae · 2025-03-24T10:02:37.000+01:00
diff --git a/tools/atomic_weights.ipynb b/tools/atomic_weights.ipynb
@@ -0,0 +1,238 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "0",
+   "metadata": {},
+   "source": [
+    "# Extract atomic weights and masses\n",
+    "\n",
+    "This notebook extracts atomic weights and masses from the CIAAW website.\n",
+    "See below for links."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import html.parser\n",
+    "\n",
+    "import pandas as pd"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "2",
+   "metadata": {},
+   "source": [
+    "## Atomic Weights\n",
+    "\n",
+    "Extract the standard atomic weights from an HTML file."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "3",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Simple state machine to parse out the table from HTML.\n",
+    "# The data must be in a table with id \"mytable\".\n",
+    "class WeightsParser(html.parser.HTMLParser):\n",
+    "    def __init__(self):\n",
+    "        super().__init__()\n",
+    "        self._in_table = False\n",
+    "        self._in_table_body = False\n",
+    "        self._in_td = False\n",
+    "        self.rows = []\n",
+    "        self._row = []\n",
+    "\n",
+    "    def handle_starttag(self, tag, attrs):\n",
+    "        if self._in_table:\n",
+    "            self._handle_starttag_in_table(tag, attrs)\n",
+    "        else:\n",
+    "            self._handle_starttag_outside_table(tag, attrs)\n",
+    "\n",
+    "    def _handle_starttag_in_table(self, tag, attrs):\n",
+    "        if tag == \"table\":\n",
+    "            raise NotImplementedError(\"Nested table\")\n",
+    "        if tag == \"tbody\":\n",
+    "            self._in_table_body = True\n",
+    "\n",
+    "        if not self._in_table_body:\n",
+    "            return\n",
+    "\n",
+    "        if tag == \"tr\":\n",
+    "            self._start_row()\n",
+    "        elif tag == \"td\":\n",
+    "            self._in_td = True\n",
+    "\n",
+    "    def _handle_starttag_outside_table(self, tag, attrs):\n",
+    "        if tag == \"table\" and ('id', 'mytable') in attrs:\n",
+    "            self._in_table = True\n",
+    "\n",
+    "    def handle_endtag(self, tag):\n",
+    "        if self._in_table:\n",
+    "            self._handle_endtag_in_table(tag)\n",
+    "\n",
+    "    def _handle_endtag_in_table(self, tag):\n",
+    "        if tag == \"table\":\n",
+    "            self._in_table = False\n",
+    "        elif tag == \"tbody\":\n",
+    "            self._in_table_body = False\n",
+    "        elif tag == \"tr\":\n",
+    "            self._end_row()\n",
+    "        elif tag == \"td\":\n",
+    "            self._in_td = False\n",
+    "\n",
+    "    def handle_data(self, data):\n",
+    "        if self._in_td:\n",
+    "            self._row.append(data.strip())\n",
+    "\n",
+    "    def _start_row(self):\n",
+    "        self._row = []\n",
+    "\n",
+    "    def _end_row(self):\n",
+    "        self.rows.append(self._row)\n",
+    "        self._row = []\n",
+    "\n",
+    "\n",
+    "def parse_weight(s):\n",
+    "    if s == '—':  # This is UTF-8 char \\xe2\\x80\\x94\n",
+    "        return None, None\n",
+    "    value, error = s.split('±')\n",
+    "    return float(value), float(error)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "4",
+   "metadata": {},
+   "source": [
+    "Set the correct filename here.\n",
+    "This webpage should have been downloaded from the \"Abridged Standard Atomic Weights\" page at\n",
+    "https://www.ciaaw.org/abridged-atomic-weights.htm"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "5",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "with open(\"IUPAC_abridged_weights.html\") as f:\n",
+    "    raw_html = f.read()\n",
+    "\n",
+    "parser = WeightsParser()\n",
+    "parser.feed(raw_html)\n",
+    "atoms = [\n",
+    "    (row[1], int(row[0]), *parse_weight(row[3]))\n",
+    "    for row in parser.rows\n",
+    "    if row\n",
+    "]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "6",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "with open('atomic_weights.csv', 'w') as f:\n",
+    "    f.write(\"Element,Z,Atomic Weight [Da],Uncertainty [Da]\\n\")\n",
+    "    for (symbol, z, weight, error) in atoms:\n",
+    "        if weight is None:\n",
+    "            weight = ''\n",
+    "        if error is None:\n",
+    "            error = ''\n",
+    "        f.write(f'{symbol},{z},{weight},{error}\\n')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "7",
+   "metadata": {},
+   "source": [
+    "## Atomic Masses\n",
+    "\n",
+    "Set the correct filename here.\n",
+    "The file should have been downloaded from the \"Atomic Masses\" page at\n",
+    "https://www.ciaaw.org/atomic-masses.htm\n",
+    "That page offers a link to download a CSV file."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "8",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df = pd.read_csv(\"IUPAC_atomic_masses.csv\", header=2)\n",
+    "df['year'] = df.pop('Year/link').str.extract(r'>(\\d+)</a>')\n",
+    "df"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "9",
+   "metadata": {},
+   "source": [
+    "Select the most recent entry for each isotope:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "10",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "latest = df.groupby('nuclide').apply(lambda x: x[x['year'] == x['year'].max()], include_groups=False)\n",
+    "latest.index = latest.index.map(lambda x: x[0])\n",
+    "assert latest.index.is_unique\n",
+    "latest"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "11",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "latest.to_csv('atomic_masses.csv',\n",
+    "              columns=['mass', 'uncertainty'],\n",
+    "              index_label=\"Isotope\",\n",
+    "              header=[\"Atomic Mass [Da]\", \"Uncertainty [Da]\"])"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 2
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython2",
+   "version": "2.7.6"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}