Skip to content

Commit 17930ad

Browse files
hhaenselhhaensel
andauthored
implement generic inchitomol() with stereo support (#138)
* implement generic inchitomol() with stereo support * fix stereo handling in inchitomol for double bonds + code styling * add/modify tests * more code cleaning * inchitosdf: convert via MolGraph in order to support stereo --------- Co-authored-by: hhaensel <[email protected]>
1 parent 1c44983 commit 17930ad

File tree

2 files changed

+290
-57
lines changed

2 files changed

+290
-57
lines changed

src/inchi.jl

Lines changed: 276 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,38 @@
22
# This file is a part of MolecularGraph.jl
33
# Licensed under the MIT License http://opensource.org/licenses/MIT
44
#
5+
const INCHI_StereoType_None = 0
6+
const INCHI_StereoType_DoubleBond = 1
7+
const INCHI_StereoType_Tetrahedral= 2
8+
const INCHI_StereoType_Allene = 3
9+
10+
const INCHI_PARITY_NONE = 0
11+
const INCHI_PARITY_ODD = 1 # 'o'
12+
const INCHI_PARITY_EVEN = 2 # 'e'
13+
const INCHI_PARITY_UNKNOWN = 3 # 'u'
14+
const INCHI_PARITY_UNDEFINED = 4 # '?', rarely used
15+
16+
const NO_ATOM = -1 # from header: NO_ATOM == -1
17+
18+
const MAXVAL = 20
19+
const ATOM_EL_LEN = 6
20+
const NUM_H_ISOTOPES = 3
21+
const AT_NUM = Int16
22+
const S_CHAR = Int8
23+
struct inchi_Atom
24+
x::Cdouble
25+
y::Cdouble
26+
z::Cdouble
27+
neighbor::NTuple{MAXVAL,AT_NUM}
28+
bond_type::NTuple{MAXVAL,S_CHAR}
29+
bond_stereo::NTuple{MAXVAL,S_CHAR}
30+
elname::NTuple{ATOM_EL_LEN,Cchar}
31+
num_bonds::AT_NUM
32+
num_iso_H::NTuple{NUM_H_ISOTOPES+1,S_CHAR}
33+
isotopic_mass::AT_NUM
34+
radical::S_CHAR
35+
charge::S_CHAR
36+
end
537

638
mutable struct inchi_Output
739
szInChI::Cstring
@@ -18,10 +50,17 @@ mutable struct inchi_InputINCHI
1850
# '/' or '-' depending on OS and compiler
1951
end
2052

53+
struct inchi_Stereo0D
54+
neighbor::NTuple{4, AT_NUM} # always 4 atoms
55+
central_atom::AT_NUM # central atom index (or NO_ATOM)
56+
type::S_CHAR # inchi_StereoType0D
57+
parity::S_CHAR # inchi_StereoParity0D (possibly combined)
58+
end
59+
2160
mutable struct inchi_Input
2261
# the caller is responsible for the data allocation and deallocation
23-
atom::Ptr{Cvoid} # array of num_atoms elements
24-
stereo0D::Ptr{Cvoid} # array of num_stereo0D 0D stereo elements or NULL
62+
atom::Ptr{inchi_Atom} # array of num_atoms elements
63+
stereo0D::Ptr{inchi_Stereo0D} # array of num_stereo0D 0D stereo elements or NULL
2564
szOptions::Cstring # InChI options: space-delimited; each is preceded by
2665
# '/' or '-' depending on OS and compiler
2766
num_atoms::Cshort # number of atoms in the structure < MAX_ATOMS
@@ -30,20 +69,20 @@ end
3069

3170
mutable struct inchi_InputEx
3271
# the caller is responsible for the data allocation and deallocation
33-
atom::Ptr{Cvoid} # array of num_atoms elements
34-
stereo0D::Ptr{Cvoid} # array of num_stereo0D 0D stereo elements or NULL
35-
szOptions::Cstring # InChI options: space-delimited; each is preceded by
36-
# '/' or '-' depending on OS and compiler
37-
num_atoms::Cshort # number of atoms in the structure < MAX_ATOMS
38-
num_stereo0D::Cshort # number of 0D stereo elements
72+
atom::Ptr{inchi_Atom} # array of num_atoms elements
73+
stereo0D::Ptr{inchi_Stereo0D} # array of num_stereo0D 0D stereo elements or NULL
74+
szOptions::Cstring # InChI options: space-delimited; each is preceded by
75+
# '/' or '-' depending on OS and compiler
76+
num_atoms::Cshort # number of atoms in the structure < MAX_ATOMS
77+
num_stereo0D::Cshort # number of 0D stereo elements
3978
polymer::Ptr{Cvoid}
4079
v3000::Ptr{Cvoid}
4180
end
4281

4382
mutable struct inchi_OutputStruct
4483
# the caller is responsible for the data allocation and deallocation
45-
atom::Ptr{Cvoid} # array of num_atoms elements
46-
stereo0D::Ptr{Cvoid} # array of num_stereo0D 0D stereo elements or NULL
84+
atom::Ptr{inchi_Atom} # array of num_atoms elements
85+
stereo0D::Ptr{inchi_Stereo0D} # array of num_stereo0D 0D stereo elements or NULL
4786
num_atoms::Cshort; # number of atoms in the structure < MAX_ATOMS
4887
num_stereo0D::Cshort # number of 0D stereo elements
4988
szMessage::Cstring # Error/warning ASCIIZ message
@@ -62,13 +101,13 @@ end
62101

63102
mutable struct inchi_OutputStructEx
64103
# the caller is responsible for the data allocation and deallocation
65-
atom::Ptr{Cvoid} # array of num_atoms elements
66-
stereo0D::Ptr{Cvoid} # array of num_stereo0D 0D stereo elements or NULL
67-
num_atoms::Cshort; # number of atoms in the structure < MAX_ATOMS
68-
num_stereo0D::Cshort # number of 0D stereo elements
69-
szMessage::Cstring # Error/warning ASCIIZ message
70-
szLog::Cstring # log-file ASCIIZ string, contains a human-readable list
71-
# of recognized options and possibly an Error/warn message
104+
atom::Ptr{inchi_Atom} # array of num_atoms elements
105+
stereo0D::Ptr{inchi_Stereo0D} # array of num_stereo0D 0D stereo elements or NULL
106+
num_atoms::Cshort; # number of atoms in the structure < MAX_ATOMS
107+
num_stereo0D::Cshort # number of 0D stereo elements
108+
szMessage::Cstring # Error/warning ASCIIZ message
109+
szLog::Cstring # log-file ASCIIZ string, contains a human-readable list
110+
# of recognized options and possibly an Error/warn message
72111
warningflags::NTuple{2, NTuple{2, Clong}} # warnings, see INCHIDIFF in inchicmp.h */
73112
# [x][y]:
74113
# x=0 => Reconnected if present in InChI
@@ -160,52 +199,235 @@ end
160199
inchikey(mol::SimpleMolGraph) = inchikey(inchi(mol))
161200

162201
"""
163-
inchitosdf(inchi::String; options::String = "") -> Union{String,Nothing}
202+
inchitosdf(inchi::AbstractString; options::String = "", config::Union{Nothing,Dict{Symbol,Any}})
164203
165-
Generate sdf string from inchi string, `options` are specified in https://github.com/mojaie/libinchi/blob/master/INCHI_BASE/src/inchi_api.h
204+
Generate sdf string from inchi string.
205+
This new version goes via parsing to a MolGraph and exporting as sdf.
206+
Parsing `options` are specified in https://github.com/mojaie/libinchi/blob/master/INCHI_BASE/src/inchi_api.h
207+
Coordinates are generated via coordgen (Schrödinger coordgenlibs).
166208
"""
167-
function inchitosdf(inchi::String; options::String = "", verbose::Bool = false)
168-
# support the correct options format depending on OS
169-
opts = opt_array(options)
170-
# add a timeout of 60s per molecule
171-
any(occursin.(r"^Wm?\d+$", opts)) || push!(opts, "W60")
172-
# switch output to sdf format
173-
"OutputSDF" opts || push!(opts, "OutputSDF")
174-
options = opt_string(opts)
175-
176-
structure = inchi_OutputStructEx()
177-
inchi_input = inchi_InputINCHI(Base.unsafe_convert(Cstring, inchi), Base.unsafe_convert(Cstring, options))
178-
@ccall libinchi.GetStructFromINCHIEx(
179-
inchi_input::Ref{inchi_InputINCHI}, structure::Ref{inchi_OutputStructEx})::Int32
180-
181-
input = inchi_InputEx(
182-
structure.atom,
183-
structure.stereo0D,
184-
Base.unsafe_convert(Cstring, options),
185-
structure.num_atoms,
186-
structure.num_stereo0D,
187-
structure.polymer,
188-
structure.v3000
189-
)
190-
output = inchi_Output()
191-
@ccall libinchi.GetINCHIEx(
192-
input::Ref{inchi_InputEx}, output::Ref{inchi_Output})::Cint
193-
report_output(output, verbose)
209+
function inchitosdf(inchi::AbstractString; options::String = "", config::Union{Nothing,Dict{Symbol,Any}} = nothing)
210+
printv2mol(inchitomol(inchi; options, config))
211+
end
194212

195-
res = output.szInChI == C_NULL ? nothing : unsafe_string(output.szInChI)
213+
# decode parity byte (connected in low 3 bits, disconnected in bits 3..5)
214+
# returns the parity value to use (0..4)
215+
decode_parity(p::Integer) = begin
216+
p_raw = UInt8(p) # make sure unsigned before bit ops
217+
p_conn = Int(p_raw & 0x07) # low 3 bits
218+
p_disc = Int((p_raw >> 3) & 0x07) # next 3 bits
219+
p_conn != 0 ? p_conn : p_disc # prefer connected parity if present
220+
end
196221

197-
# Free buffers allocated by GetStructFromINCHIEx and GetINCHI
198-
@ccall libinchi.FreeStructFromINCHIEx(structure::Ref{inchi_OutputStructEx})::Cvoid
199-
@ccall libinchi.FreeINCHI(output::Ref{inchi_Output})::Cvoid
222+
function process_inchi_stereo!(g::T, structure::inchi_OutputStructEx) where T <: MolGraph
223+
ET = edgetype(T)
224+
225+
stereocenters = Dict{Int64, Tuple{Int64,Int64,Int64,Bool}}() # center => (looking_from,v1,v2,is_clockwise)
226+
stereobonds = Dict{Edge{Int64}, Tuple{Int64,Int64,Bool}}() # edge => (a1,a2,is_cis)
227+
228+
atoms = unsafe_wrap(Vector{inchi_Atom}, structure.atom, structure.num_atoms)
229+
stereos = unsafe_wrap(Vector{inchi_Stereo0D}, structure.stereo0D, structure.num_stereo0D)
230+
n_heavy = length(atoms) # number of heavy atoms (without isotopic H/D/T)
231+
is_heavy(idx) = 0 < idx <= n_heavy # for 1-based indices
232+
233+
for s in stereos
234+
parity = decode_parity(s.parity)
235+
parity == INCHI_PARITY_NONE || parity == INCHI_PARITY_UNKNOWN && continue # nothing to set
236+
parity == INCHI_PARITY_UNDEFINED && continue # explicit undefined -> skip
237+
238+
# convert to 1-based indexing
239+
nbrs = s.neighbor .+ 1
240+
center = s.central_atom + 1
241+
242+
if s.type == INCHI_StereoType_Tetrahedral
243+
center == 0 && continue # sanity check
244+
245+
# InChI neighbors order corresponds to W,X,Y,Z (see InChI comments).
246+
w, x, y, z = nbrs # these are already expanded indices (or NO_ATOM)
247+
is_clockwise = (parity == INCHI_PARITY_EVEN) # 'e' == clockwise per docs
248+
stereocenters[center] = (w, x, y, is_clockwise)
249+
elseif s.type == INCHI_StereoType_DoubleBond
250+
all(is_heavy.(nbrs)) || continue # skip disconnected or missing substituents
251+
252+
# the double bond is between the middle two atoms
253+
# by InChI convention, neighbor[1,2] on one end, neighbor[3,4] on the other
254+
# determine the actual bond by looking up the shared bond in the graph
255+
bond_candidates = [u_edge(g, nbrs[i], nbrs[j]) for i in 1:2, j in 3:4]
256+
# pick the one that actually exists in your bond list
257+
edges = collect(g.eprops) # materialize iterator
258+
bond_key_index = findfirst(e -> e[1] in bond_candidates, edges)
259+
bond_key_index === nothing && continue
260+
bond_key = edges[bond_key_index][1]
261+
262+
is_trans = if parity == INCHI_PARITY_ODD
263+
true # InChI uses ODD -> trans
264+
elseif parity == INCHI_PARITY_EVEN
265+
false # EVEN -> cis
266+
else
267+
# we currently don't get here, because we filtered out NONE/UNKNOWN/UNDEFINED above
268+
# once we can handle undetermined stereo, we need to implement that logic here
269+
continue # unknown
270+
end
271+
272+
# store stereobond
273+
stereobonds[bond_key] = (nbrs[1], nbrs[3], is_trans)
274+
elseif s.type == INCHI_StereoType_Allene
275+
@info "Allene stereo, not yet tested"
276+
# Tried testing with `mol = smilestomol("C[C@]=C=C(C)F")`,
277+
# but the InChI generated from it does not contain stereo information.
278+
# Allene (axial) handling
279+
# InChI gives neighbors as W,X,Y,Z where W/X are substituents on one end
280+
# and Y/Z on the other. central_atom is the central cumulated C.
281+
# We need to pick a viewing atom and two reference atoms to encode axial chirality.
282+
center == 0 && continue
283+
284+
# For an allene to be chiral, both ends must have two substituents (i.e. W/X and Y/Z present)
285+
all(is_heavy.(nbrs)) || continue
286+
w, x, y, z = nbrs
287+
288+
is_clockwise = (parity == INCHI_PARITY_EVEN)
289+
290+
# store into same stereocenters dict so coordgen!/rendering code can use it
291+
stereocenters[center] = (w, x, y, is_clockwise)
292+
else
293+
@warn "Unknown stereo type $(s.type), skipping"
294+
end
295+
end
200296

201-
return res
297+
# merge into mol.gprops the same way you already did:
298+
merge!(g.gprops.stereocenter, stereocenters)
299+
merge!(g.gprops.stereobond, stereobonds)
300+
return g
202301
end
203302

204303
"""
205-
function inchitomol(inchi::String; options = "", verbose = false)
304+
function inchitomol(inchi::String; options = "", config::Union{Nothing,Dict{Symbol,Any}} = nothing)
206305
207306
Generate molecule from inchi string, `options` are specified in https://github.com/mojaie/libinchi/blob/master/INCHI_BASE/src/inchi_api.h
307+
`config` is for internal or advanced use only. Maybe removed in a future release.
208308
"""
209-
function inchitomol(inchi::String; options = "", verbose = false)
210-
inchitosdf(inchi; options, verbose) |> IOBuffer |> sdftomol
309+
function inchitomol(::Type{T}, inchi::AbstractString;
310+
options::String = "",
311+
config::Union{Nothing,Dict{Symbol,Any}} = nothing
312+
) where T <: MolGraph
313+
inchi isa String || (inchi = "$inchi")
314+
315+
structure = inchi_OutputStructEx()
316+
inchi_input = inchi_InputINCHI(Base.unsafe_convert(Cstring, inchi),
317+
Base.unsafe_convert(Cstring, options))
318+
319+
ret = @ccall libinchi.GetStructFromINCHIEx(
320+
inchi_input::Ref{inchi_InputINCHI},
321+
structure::Ref{inchi_OutputStructEx}
322+
)::Int32
323+
324+
if ret < 0
325+
error("libinchi failed with code $ret")
326+
end
327+
328+
config = if config === nothing && T === SMILESMolGraph
329+
Dict{Symbol, Any}(:on_init => smiles_on_init!, :on_update => smiles_on_update!)
330+
elseif config === nothing && T === SDFMolGraph
331+
Dict{Symbol, Any}(:on_init => sdf_on_init!, :on_update => sdf_on_update!)
332+
else
333+
config
334+
end
335+
336+
# determine the MolecularGraph atom/bond types and constructors
337+
V = vproptype(T) === AbstractAtom ? SDFAtom : vproptype(T)
338+
E = eproptype(T) === AbstractBond ? SDFBond : eproptype(T)
339+
ET = edgetype(T)
340+
341+
atoms = unsafe_wrap(Vector{inchi_Atom}, structure.atom, structure.num_atoms)
342+
343+
# heavy atoms
344+
expanded_syms = String[
345+
String(reinterpret(UInt8, collect(Iterators.takewhile(!=(0x00), a.elname))))
346+
for a in atoms]
347+
348+
# isotopic H/D/T
349+
for a in atoms
350+
# a.num_iso_H is a NTuple{4,S_CHAR}, where index 1 = non-specified, 2 = H, 3 = D, 4 = T
351+
# let's add symbols only for explicit isotopes [1H], [D] and [T]
352+
# as NUM_H_ISOTOPES = 3, we only need to check indices 2, 3, and 4
353+
for iso_index in 1:NUM_H_ISOTOPES
354+
n = a.num_iso_H[iso_index + 1]
355+
iso_sym = ["H", "D", "T"][iso_index]
356+
for _ in 1:n
357+
push!(expanded_syms, iso_sym)
358+
end
359+
end
360+
end
361+
362+
# build bonds to isotopic H/D/T
363+
bonds = Tuple{Int,Int,Int}[]
364+
first_isotope_index = length(atoms) + 1
365+
next_iso_idx = first_isotope_index
366+
for (parent_idx, a) in enumerate(atoms)
367+
for iso_index in 1:NUM_H_ISOTOPES
368+
n = a.num_iso_H[iso_index + 1]
369+
for _ in 1:n
370+
push!(bonds, (parent_idx, next_iso_idx, 1))
371+
next_iso_idx += 1
372+
end
373+
end
374+
end
375+
376+
# heavy atom bonds
377+
for (i, a) in enumerate(atoms)
378+
for k in 1:a.num_bonds
379+
j = a.neighbor[k] + 1
380+
j > i && push!(bonds, (i, j, Int(a.bond_type[k])))
381+
end
382+
end
383+
384+
N = length(expanded_syms)
385+
vprops = V[]
386+
edges = ET[]
387+
eprops = E[]
388+
389+
for idx in 1:N
390+
if idx <= length(atoms)
391+
a = atoms[idx]
392+
symbol = expanded_syms[idx]
393+
d = Dict{String,Any}(
394+
"symbol" => symbol,
395+
"charge" => Int(a.charge),
396+
"multiplicity" => (Int(a.radical) == 0 ? 1 : Int(a.radical)),
397+
"isotope" => Int(a.isotopic_mass),
398+
)
399+
:coords in fieldnames(V) && push!(d, "coords" => [Float64(a.x), Float64(a.y), Float64(a.z)])
400+
push!(vprops, V(d))
401+
else
402+
symbol = expanded_syms[idx]
403+
d = Dict(
404+
"symbol" => symbol,
405+
"charge" => 0,
406+
"multiplicity" => 1,
407+
"isotope" => 0
408+
)
409+
:coords in fieldnames(V) && push!(d, "coords" => [0.0, 0.0, 0.0])
410+
push!(vprops, V(d))
411+
end
412+
end
413+
414+
for (i, j, order) in bonds
415+
ekey = i < j ? ET(i, j) : ET(j, i)
416+
push!(edges, ekey)
417+
push!(eprops, E(; order))
418+
end
419+
420+
g = T(edges, vprops, eprops; NamedTuple((k, v) for (k, v) in config)...)
421+
422+
process_inchi_stereo!(g, structure)
423+
424+
@ccall libinchi.FreeStructFromINCHIEx(structure::Ref{inchi_OutputStructEx})::Cvoid
425+
426+
coordgen!(g)
427+
remove_hydrogens!(g)
428+
return g
429+
end
430+
431+
function inchitomol(inchi::String; options::String = "", config::Union{Nothing,Dict{Symbol,Any}} = nothing)
432+
inchitomol(SDFMolGraph, inchi; options, config)
211433
end

0 commit comments

Comments
 (0)