Skip to content

Commit 2590188

Browse files
committed
- added documentation to methods
- added chemimg method - added cas validation with check digit verification - added inchikey validation - improved query to check for '*' and added exact option - improved key2cas to get casrn of smallest molecule (assuming other hits are polymers)
1 parent 645b711 commit 2590188

File tree

3 files changed

+135
-21
lines changed

3 files changed

+135
-21
lines changed

comchem/__init__.py

Lines changed: 126 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,7 @@
11
import json
2+
import re
3+
import cairosvg
4+
import os
25
from bs4 import BeautifulSoup
36
from urllib.request import urlopen
47

@@ -7,13 +10,19 @@
710
'properties', 'synonyms', 'replaceRns', 'hasMolefile']
811

912

10-
# get data from a page
1113
def detail(casrn, field="all"):
12-
""" access the common chemistry detail api """
14+
"""
15+
Access the Common Chemistry detail API at
16+
https://commonchemistry.cas.org/api/detail?cas_rn=<casrn>
17+
:param casrn: CAS Registry Number
18+
:param field: field to return or all fields (default)
19+
:return mixed
20+
"""
21+
if not _validcas(casrn):
22+
return '' # false
1323
url = ccpath + 'detail?cas_rn=' + casrn
1424
respnse = urlopen(url)
1525
jsn = json.loads(respnse.read())
16-
1726
if field == "all":
1827
return jsn
1928
elif field in fields:
@@ -25,23 +34,125 @@ def detail(casrn, field="all"):
2534
else:
2635
return jsn[field]
2736
else:
28-
return "Field not available..."
37+
return '' # false
2938

3039

31-
# run a search
32-
def query(term='', exact=False):
    """
    Search the CommonChemistry database API at
    https://commonchemistry.cas.org/api/search?q=<term>
    Unless an exact match is requested, a trailing '*' wildcard is added to
    the term (only if the term does not already end with one).
    :param term: string to be searched
    :param exact: boolean to indicate an exact match (no wildcard is added)
    :return: list of dicts with the keys 'textname', 'htmlname' and 'rn',
             one per hit; an empty list when nothing was found
    """
    # Add the wildcard only when it is wanted and not already present, so
    # 'trifluoro*' is not turned into 'trifluoro**'.
    if not exact and not term.endswith('*'):
        term += '*'
    # NOTE(review): term is inserted into the URL as-is (not URL-encoded);
    # confirm whether callers are expected to encode spaces themselves.
    url = ccpath + 'search?q=' + term
    # the context manager closes the HTTP response even if parsing fails
    with urlopen(url) as respnse:
        jsn = json.loads(respnse.read())
    out = []
    for hit in jsn.get('results') or []:
        # the API name may contain HTML markup; keep a plain-text version too
        textname = BeautifulSoup(hit["name"], "lxml").text
        out.append({"textname": textname, "htmlname": hit["name"].lower(), "rn": hit["rn"]})
    return out
4161

4262

43-
# search for a compound using an InChIKey
4463
def key2cas(key):
    """
    Find the CAS Registry Number of a chemical substance using an IUPAC InChIKey
    If the search gives more than one hit, the hit with the smallest molecular
    mass is returned (the other hits are assumed to be polymers).
    :param key: a valid InChIKey
    :return: the CAS Registry Number, or '' if the key is not valid, nothing
             was found, or no hit has a usable molecular mass
    """
    if not _validkey(key):
        return ''
    hits = query('InChIKey=' + key, True)
    if not hits:
        return ''
    if len(hits) == 1:
        return hits[0]['rn']
    # check hits for smallest molar mass compound, i.e., not polymer
    minmm = float('inf')  # no artificial ceiling on the mass of a hit
    minrn = ''
    for hit in hits:
        mm = detail(hit['rn'], 'molecularMass')
        try:
            mass = float(mm)
        except (TypeError, ValueError):
            # detail() gave '' or some other non-numeric value: skip this hit
            continue
        if mass < minmm:
            minmm = mass
            minrn = hit['rn']
    return minrn
88+
89+
90+
def _validkey(key):
91+
"""
92+
Validate and IUPAC InChIKey
93+
:param key: a string to be validated as an IUPAC InChIKey
94+
:return: bool
95+
"""
96+
test = re.search(r'^[A-Z]{14}-[A-Z]{8}[SN][A]-[A-Z]$', key)
97+
if test is None:
98+
return False
99+
return True
100+
101+
102+
def _validcas(cas):
103+
"""
104+
Validate a CAS Registry Number
105+
See: https://en.wikipedia.org/wiki/CAS_Registry_Number#Format
106+
:param cas: a string to be validated as a CAS Registry Number
107+
:return: bool
108+
"""
109+
test = re.search(r'^\d{2,8}-\d{2}-\d$', cas)
110+
# if format of string does not match then it's not CAS RN
111+
if test is None:
112+
return False
113+
# verify check digit
114+
reverse = cas[::-1] # reverse the CAS Registry Number (needed for checksum math and split out checksum)
115+
digits = reverse.replace('-', '') # remove the dashes
116+
nochk = digits[1:] # all but first digit
117+
chksum = int(digits[:1]) # first digit
118+
total = 0
119+
for i, digit in enumerate(nochk):
120+
total += (i + 1) * int(digit) # index of chars starts at 0
121+
newsum = total % 10
122+
if newsum == chksum:
123+
return True
124+
else:
125+
return False
126+
127+
128+
def chemimg(chemid='', imgtype='svg'):
    """
    Get an image for a compound from either a CAS Registry Number, InChIKey, SMILES, or name
    The image is saved in the current working directory as
    <CAS Registry Number>.<imgtype>.
    :param chemid: the CAS Registry Number, InChIKey, SMILES, or name
    :param imgtype: the type of image file to produce - svg, png, or ps
    :return: True if the image file was written; False if chemid is empty,
             imgtype is not supported, the compound could not be found, or
             no image data was returned for it
    """
    if not chemid:
        return False
    imgtype = str(imgtype).lower()
    if imgtype not in ('svg', 'png', 'ps'):
        # do not silently write an svg when another format was asked for
        return False
    # check identifier for type so checking can be done
    if _validkey(chemid):
        casrn = key2cas(chemid)
    elif _validcas(chemid):
        casrn = chemid
    else:
        # query() returns a list of hits, not a CAS RN: use the first hit
        hits = query(chemid, True)
        casrn = hits[0]['rn'] if hits else ''
    if not casrn:
        return False
    # get svg data and save
    svg = detail(casrn, "image")
    if not svg:
        return False  # detail() returns '' when it has nothing to give
    if imgtype == 'svg':
        with open(casrn + ".svg", "w", encoding="utf-8") as f:
            f.write(svg)
    elif imgtype == 'png':
        # convert from memory so an existing <casrn>.svg file is not
        # overwritten and then deleted as a side effect
        cairosvg.svg2png(bytestring=svg.encode("utf-8"), write_to=casrn + ".png")
    else:
        cairosvg.svg2ps(bytestring=svg.encode("utf-8"), write_to=casrn + ".ps")
    return True

example.py

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,12 @@
11
from comchem import *
22

3-
name = detail("9003-07-0", "properties")
4-
print(name)
3+
# name = detail("9003-07-0", "properties")
4+
# print(name)
5+
#
6+
# find = query("trifluoro*")
7+
# print(find)
58

6-
find = query("trifluoro*")
7-
print(find)
9+
# find = key2cas('UHOVQNZJYSORNB-UHFFFAOYSA-N')
10+
# print(find)
811

9-
find = key2cas('UHOVQNZJYSORNB-UHFFFAOYSA-N')
10-
print(find)
12+
chemimg("71-43-2", 'png')

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ packages = [
1919
[tool.poetry.dependencies]
2020
python = "^3.7"
2121
bs4 = "0.0.1"
22+
cairosvg = "^2.5.2"
2223

2324
[tool.poetry.dev-dependencies]
2425
pytest = "^5.2"

0 commit comments

Comments
 (0)