Skip to content

Commit 259e5ed

Browse files
committed
Add notebook to extract weights and masses
1 parent 90b3d14 commit 259e5ed

File tree

1 file changed

+238
-0
lines changed

1 file changed

+238
-0
lines changed

tools/atomic_weights.ipynb

Lines changed: 238 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,238 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "markdown",
5+
"id": "0",
6+
"metadata": {},
7+
"source": [
8+
"# Extract atomic weights and masses\n",
9+
"\n",
10+
"This notebook extracts atomic weights and masses from files downloaded from the CIAAW website.\n",
11+
"See below for links."
12+
]
13+
},
14+
{
15+
"cell_type": "code",
16+
"execution_count": null,
17+
"id": "1",
18+
"metadata": {},
19+
"outputs": [],
20+
"source": [
21+
"import html.parser\n",
22+
"\n",
23+
"import pandas as pd"
24+
]
25+
},
26+
{
27+
"cell_type": "markdown",
28+
"id": "2",
29+
"metadata": {},
30+
"source": [
31+
"## Atomic Weights\n",
32+
"\n",
33+
"Extract the standard atomic weights from an HTML file."
34+
]
35+
},
36+
{
37+
"cell_type": "code",
38+
"execution_count": null,
39+
"id": "3",
40+
"metadata": {},
41+
"outputs": [],
42+
"source": [
"# Simple state machine to parse out the table from HTML.\n",
"# The data must be in a table with id \"mytable\".\n",
"class WeightsParser(html.parser.HTMLParser):\n",
"    \"\"\"Collect the body rows of the HTML table whose id is ``mytable``.\n",
"\n",
"    After ``feed()``, ``rows`` holds one list per ``<tr>`` found inside the\n",
"    table's ``<tbody>``, and each list holds one stripped string per ``<td>``.\n",
"    Text split across nested tags is joined into a single cell and an empty\n",
"    ``<td>`` yields ``''``, so column positions stay aligned.\n",
"\n",
"    Raises:\n",
"        NotImplementedError: if another ``<table>`` opens inside the table.\n",
"    \"\"\"\n",
"\n",
"    def __init__(self):\n",
"        super().__init__()\n",
"        self._in_table = False\n",
"        self._in_table_body = False\n",
"        self._in_td = False\n",
"        self.rows = []\n",
"        self._row = []\n",
"        # Text fragments of the <td> currently being read.\n",
"        self._cell = []\n",
"\n",
"    def handle_starttag(self, tag, attrs):\n",
"        if self._in_table:\n",
"            self._handle_starttag_in_table(tag, attrs)\n",
"        else:\n",
"            self._handle_starttag_outside_table(tag, attrs)\n",
"\n",
"    def _handle_starttag_in_table(self, tag, attrs):\n",
"        if tag == \"table\":\n",
"            raise NotImplementedError(\"Nested table\")\n",
"        if tag == \"tbody\":\n",
"            self._in_table_body = True\n",
"\n",
"        if not self._in_table_body:\n",
"            return\n",
"\n",
"        if tag == \"tr\":\n",
"            self._start_row()\n",
"        elif tag == \"td\":\n",
"            # HTML allows omitting </td>; close a still-open cell first.\n",
"            self._end_cell()\n",
"            self._in_td = True\n",
"\n",
"    def _handle_starttag_outside_table(self, tag, attrs):\n",
"        if tag == \"table\" and ('id', 'mytable') in attrs:\n",
"            self._in_table = True\n",
"\n",
"    def handle_endtag(self, tag):\n",
"        if self._in_table:\n",
"            self._handle_endtag_in_table(tag)\n",
"\n",
"    def _handle_endtag_in_table(self, tag):\n",
"        if tag == \"table\":\n",
"            self._in_table = False\n",
"            self._in_table_body = False\n",
"            self._in_td = False\n",
"        elif tag == \"tbody\":\n",
"            self._in_table_body = False\n",
"        elif tag == \"tr\":\n",
"            # Rows are only started inside <tbody>, so only end them there;\n",
"            # otherwise every header row would add an empty entry to rows.\n",
"            if self._in_table_body:\n",
"                self._end_row()\n",
"        elif tag == \"td\":\n",
"            self._end_cell()\n",
"\n",
"    def handle_data(self, data):\n",
"        if self._in_td:\n",
"            self._cell.append(data)\n",
"\n",
"    def _end_cell(self):\n",
"        \"\"\"Store the text gathered for the open cell as one row entry.\"\"\"\n",
"        if not self._in_td:\n",
"            return\n",
"        self._row.append(\"\".join(self._cell).strip())\n",
"        self._cell = []\n",
"        self._in_td = False\n",
"\n",
"    def _start_row(self):\n",
"        self._row = []\n",
"        self._cell = []\n",
"        self._in_td = False\n",
"\n",
"    def _end_row(self):\n",
"        # Flush a cell whose </td> was omitted before closing the row.\n",
"        self._end_cell()\n",
"        self.rows.append(self._row)\n",
"        self._row = []\n",
102+
"\n",
103+
"\n",
"def parse_weight(s):\n",
"    \"\"\"Split a ``value ± uncertainty`` table cell into two floats.\n",
"\n",
"    Args:\n",
"        s: Cell text, e.g. ``'1.0080 ± 0.0002'``.\n",
"\n",
"    Returns:\n",
"        ``(value, uncertainty)`` as floats, or ``(None, None)`` when the cell\n",
"        is empty or holds only an em dash.\n",
"\n",
"    Raises:\n",
"        ValueError: if the text has no ``±`` or a part is not a number.\n",
"    \"\"\"\n",
"    text = s.strip()\n",
"    if text in ('', '—'):  # '—' is the em dash, UTF-8 bytes \\xe2\\x80\\x94\n",
"        return None, None\n",
"    value, separator, error = text.partition('±')\n",
"    if not separator:\n",
"        raise ValueError(f\"Expected '<value> ± <uncertainty>', got {s!r}\")\n",
"    return float(value), float(error)"
109+
]
110+
},
111+
{
112+
"cell_type": "markdown",
113+
"id": "4",
114+
"metadata": {},
115+
"source": [
116+
"Set the correct filename here.\n",
117+
"This webpage should have been downloaded from the \"Abridged Standard Atomic Weights\" page at\n",
118+
"https://www.ciaaw.org/abridged-atomic-weights.htm"
119+
]
120+
},
121+
{
122+
"cell_type": "code",
123+
"execution_count": null,
124+
"id": "5",
125+
"metadata": {},
126+
"outputs": [],
127+
"source": [
"# The saved page is expected to be UTF-8 (it contains an em dash and '±'),\n",
"# so do not rely on the platform's default text encoding.\n",
"with open(\"IUPAC_abridged_weights.html\", encoding=\"utf-8\") as f:\n",
"    raw_html = f.read()\n",
"\n",
"parser = WeightsParser()\n",
"parser.feed(raw_html)\n",
"# Columns used: 0 -> Z, 1 -> symbol, 3 -> weight with uncertainty.\n",
"# Empty rows (no <td> cells) are skipped.\n",
"atoms = [\n",
"    (row[1], int(row[0]), *parse_weight(row[3]))\n",
"    for row in parser.rows\n",
"    if row\n",
"]"
138+
]
139+
},
140+
{
141+
"cell_type": "code",
142+
"execution_count": null,
143+
"id": "6",
144+
"metadata": {},
145+
"outputs": [],
146+
"source": [
"with open('atomic_weights.csv', 'w') as f:\n",
"    f.write(\"Element,Z,Atomic Weight [Da],Uncertainty [Da]\\n\")\n",
"    for symbol, z, weight, error in atoms:\n",
"        # None values are written as empty CSV fields.\n",
"        weight_field = '' if weight is None else weight\n",
"        error_field = '' if error is None else error\n",
"        f.write(f'{symbol},{z},{weight_field},{error_field}\\n')"
155+
]
156+
},
157+
{
158+
"cell_type": "markdown",
159+
"id": "7",
160+
"metadata": {},
161+
"source": [
162+
"## Atomic Masses\n",
163+
"\n",
164+
"Set the correct filename here.\n",
165+
"The file should have been downloaded from the \"Atomic Masses\" page at\n",
166+
"https://www.ciaaw.org/atomic-masses.htm\n",
167+
"That page offers a link to download a CSV file."
168+
]
169+
},
170+
{
171+
"cell_type": "code",
172+
"execution_count": null,
173+
"id": "8",
174+
"metadata": {},
175+
"outputs": [],
176+
"source": [
"# header=2: the column names are taken from the third line of the file.\n",
"df = pd.read_csv(\"IUPAC_atomic_masses.csv\", header=2)\n",
"# Replace the 'Year/link' column by the digits found right before a closing </a>.\n",
"year_links = df.pop('Year/link')\n",
"df['year'] = year_links.str.extract(r'>(\\d+)</a>')\n",
"df"
180+
]
181+
},
182+
{
183+
"cell_type": "markdown",
184+
"id": "9",
185+
"metadata": {},
186+
"source": [
187+
"Select the most recent entry for each isotope:"
188+
]
189+
},
190+
{
191+
"cell_type": "code",
192+
"execution_count": null,
193+
"id": "10",
194+
"metadata": {},
195+
"outputs": [],
196+
"source": [
"# For every nuclide keep only the row(s) carrying its largest 'year' value.\n",
"rows_by_nuclide = df.groupby('nuclide')\n",
"latest = rows_by_nuclide.apply(\n",
"    lambda group: group[group['year'] == group['year'].max()],\n",
"    include_groups=False,\n",
")\n",
"# The index entries are tuples here; keep only their first element.\n",
"latest.index = latest.index.map(lambda key: key[0])\n",
"# Fails if any index value still occurs more than once.\n",
"assert latest.index.is_unique\n",
"latest"
201+
]
202+
},
203+
{
204+
"cell_type": "code",
205+
"execution_count": null,
206+
"id": "11",
207+
"metadata": {},
208+
"outputs": [],
209+
"source": [
"# Export only the mass and uncertainty columns, with unit-bearing headers.\n",
"latest.to_csv(\n",
"    'atomic_masses.csv',\n",
"    index_label=\"Isotope\",\n",
"    columns=['mass', 'uncertainty'],\n",
"    header=[\"Atomic Mass [Da]\", \"Uncertainty [Da]\"],\n",
")"
214+
]
215+
}
216+
],
217+
"metadata": {
218+
"kernelspec": {
219+
"display_name": "Python 3",
220+
"language": "python",
221+
"name": "python3"
222+
},
223+
"language_info": {
224+
"codemirror_mode": {
225+
"name": "ipython",
226+
"version": 2
227+
},
228+
"file_extension": ".py",
229+
"mimetype": "text/x-python",
230+
"name": "python",
231+
"nbconvert_exporter": "python",
232+
"pygments_lexer": "ipython2",
233+
"version": "2.7.6"
234+
}
235+
},
236+
"nbformat": 4,
237+
"nbformat_minor": 5
238+
}

0 commit comments

Comments
 (0)