Skip to content

Commit 2bc4006

Browse files
committed
Update _filter_xyz to ignore lines after cartesian coordinates in standard xyz files
1 parent d04a380 commit 2bc4006

File tree

2 files changed

+223
-1
lines changed

2 files changed

+223
-1
lines changed

qcelemental/molparse/from_string.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -671,7 +671,7 @@ def process_variable(matchobj):
671671
xyz1strict = re.compile(r"\A" + r"(?P<nat>\d+)" + r"\Z")
672672
SIMPLENUCLEUS = r"""((?P<E>[A-Z]{1,3})|(?P<Z>\d{1,3}))"""
673673
atom_cartesian_strict = re.compile(
674-
r"\A" + r"(?P<nucleus>" + SIMPLENUCLEUS + r")" + SEP + CARTXYZ + r"\Z", re.IGNORECASE | re.VERBOSE
674+
r"\A" + r"(?P<nucleus>" + SIMPLENUCLEUS + r")" + SEP + CARTXYZ + ".*" + r"\Z", re.IGNORECASE | re.VERBOSE
675675
)
676676

677677
xyz1 = re.compile(r"\A" + r"(?P<nat>\d+)" + r"[\s,]*" + r"((?P<ubohr>(bohr|au))|(?P<uang>ang))?" + r"\Z", re.IGNORECASE)
@@ -737,9 +737,17 @@ def process_atom_cartesian(matchobj):
737737
for iln, line in enumerate(string.split("\n")):
738738
line = line.strip()
739739
if iln == 0:
740+
try:
741+
num_atoms = int(line)
742+
except ValueError:
743+
# First line is not a bare integer atom count, so this is not a standard xyz file; continue with regular processing
744+
num_atoms = None
740745
line = re.sub(xyz1strict, "", line)
741746
elif iln == 1:
742747
continue
748+
elif num_atoms and iln > num_atoms + 1: # if standard xyz, stop parsing after cartesian coords
749+
# Lines beyond the num_atoms coordinate rows are trailing data, not atoms; ignore them
750+
break
743751
else:
744752
line = re.sub(atom_cartesian_strict, process_atom_cartesian, line)
745753
if line:

qcelemental/tests/test_molparse_from_string.py

Lines changed: 214 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55

66
import qcelemental
77
from qcelemental.models import Molecule
8+
from qcelemental.molparse.from_string import _filter_xyz
89
from qcelemental.testing import compare, compare_molrecs, compare_recursive, compare_values, tnm
910

1011
_arrays_prov_stamp = {"creator": "QCElemental", "version": "1.0", "routine": "qcelemental.molparse.from_arrays"}
@@ -921,6 +922,219 @@ def test_xyzp_qm_7e():
921922
assert compare_molrecs(fullans, final["qm"], tnm() + ": full qm")
922923

923924

925+
@pytest.mark.parametrize(
926+
"string,elbl,geom",
927+
(
928+
(
929+
"""5
930+
gdb 1 157.7118 157.70997 157.70699 0. 13.21 -0.3877 0.1171 0.5048 35.3641 0.044749 -40.47893 -40.476062 -40.475117 -40.498597 6.469
931+
C -0.0126981359 1.0858041578 0.0080009958 -0.535689
932+
H 0.002150416 -0.0060313176 0.0019761204 0.133921
933+
H 1.0117308433 1.4637511618 0.0002765748 0.133922
934+
H -0.540815069 1.4475266138 -0.8766437152 0.133923
935+
H -0.5238136345 1.4379326443 0.9063972942 0.133923
936+
1341.307 1341.3284 1341.365 1562.6731 1562.7453 3038.3205 3151.6034 3151.6788 3151.7078
937+
C C
938+
InChI=1S/CH4/h1H4 InChI=1S/CH4/h1H4""",
939+
["C", "H", "H", "H", "H"],
940+
[
941+
-0.0126981359,
942+
1.0858041578,
943+
0.0080009958,
944+
0.002150416,
945+
-0.0060313176,
946+
0.0019761204,
947+
1.0117308433,
948+
1.4637511618,
949+
0.0002765748,
950+
-0.540815069,
951+
1.4475266138,
952+
-0.8766437152,
953+
-0.5238136345,
954+
1.4379326443,
955+
0.9063972942,
956+
],
957+
),
958+
(
959+
"""20
960+
gdb 52625 3.48434 0.81389 0.77349 4.0931 85.49 -0.2471 0.0275 0.2746 1578.0163 0.16756 -403.158572 -403.147447 -403.146503 -403.196607 38.82
961+
C 0.0219866132 1.4617007325 0.0778162941 -0.5003
962+
C 0.0172170008 0.0062570163 0.0278221402 0.060798
963+
C 0.0082754818 -1.1968179556 -0.0219490036 0.058561
964+
C -0.0048543985 -2.6632285567 -0.0864436684 0.086487
965+
C -1.4625346892 -3.1775528493 -0.0739515886 -0.401401
966+
C 0.724612823 -3.1404242978 -1.3630324305 -0.401392
967+
N 0.7132087245 -3.1481618818 1.1179206119 -0.295483
968+
C 0.9247284349 -4.4479813471 1.4397739679 0.162556
969+
O 0.5626675182 -5.4147891002 0.7982523365 -0.336127
970+
H 1.0431199948 1.8581192318 0.0749725881 0.150933
971+
H -0.4997253695 1.8847111976 -0.7874860262 0.154682
972+
H -0.4780438734 1.8320081388 0.979321682 0.150927
973+
H -1.9940013133 -2.8054928472 -0.9536444371 0.11928
974+
H -1.4636215253 -4.2685570577 -0.0833553058 0.154594
975+
H -1.981857154 -2.824763824 0.8204749775 0.116725
976+
H 1.749444309 -2.7614376976 -1.3787041159 0.11672
977+
H 0.7443784569 -4.2310758294 -1.3847506905 0.154572
978+
H 0.2030559276 -2.7681902401 -2.2485640369 0.11929
979+
H 1.0701803562 -2.4436691628 1.7431541049 0.251085
980+
H 1.4874481325 -4.5389119798 2.3916335805 0.077494
981+
9.6558 74.9555 86.4308 130.4187 209.6956 213.4664 245.0078 277.3227 292.8775 293.3837 344.8375 364.9338 411.1024 508.5101 542.9334 570.3714 618.7374 820.0082 829.2715 936.286 986.4059 1028.4996 1033.1508 1051.8084 1054.6924 1076.9172 1172.1103 1179.8358 1222.2743 1289.1699 1387.6509 1408.9379 1414.4085 1431.5059 1476.1153 1477.0789 1478.6118 1481.2338 1494.3428 1511.202 1519.6039 1792.8486 2370.6229 2920.0767 3029.2922 3050.3711 3054.2985 3090.8073 3092.1476 3124.8526 3128.7105 3148.8786 3152.1387 3639.6323
982+
CC#CC(C)(C)NC=O CC#CC(C)(C)NC=O
983+
InChI=1S/C7H11NO/c1-4-5-7(2,3)8-6-9/h6H,1-3H3,(H,8,9) InChI=1S/C7H11NO/c1-4-5-7(2,3)8-6-9/h6H,1-3H3,(H,8,9)
984+
""",
985+
["C", "C", "C", "C", "C", "C", "N", "C", "O", "H", "H", "H", "H", "H", "H", "H", "H", "H", "H", "H"],
986+
[
987+
0.0219866132,
988+
1.4617007325,
989+
0.0778162941,
990+
0.0172170008,
991+
0.0062570163,
992+
0.0278221402,
993+
0.0082754818,
994+
-1.1968179556,
995+
-0.0219490036,
996+
-0.0048543985,
997+
-2.6632285567,
998+
-0.0864436684,
999+
-1.4625346892,
1000+
-3.1775528493,
1001+
-0.0739515886,
1002+
0.724612823,
1003+
-3.1404242978,
1004+
-1.3630324305,
1005+
0.7132087245,
1006+
-3.1481618818,
1007+
1.1179206119,
1008+
0.9247284349,
1009+
-4.4479813471,
1010+
1.4397739679,
1011+
0.5626675182,
1012+
-5.4147891002,
1013+
0.7982523365,
1014+
1.0431199948,
1015+
1.8581192318,
1016+
0.0749725881,
1017+
-0.4997253695,
1018+
1.8847111976,
1019+
-0.7874860262,
1020+
-0.4780438734,
1021+
1.8320081388,
1022+
0.979321682,
1023+
-1.9940013133,
1024+
-2.8054928472,
1025+
-0.9536444371,
1026+
-1.4636215253,
1027+
-4.2685570577,
1028+
-0.0833553058,
1029+
-1.981857154,
1030+
-2.824763824,
1031+
0.8204749775,
1032+
1.749444309,
1033+
-2.7614376976,
1034+
-1.3787041159,
1035+
0.7443784569,
1036+
-4.2310758294,
1037+
-1.3847506905,
1038+
0.2030559276,
1039+
-2.7681902401,
1040+
-2.2485640369,
1041+
1.0701803562,
1042+
-2.4436691628,
1043+
1.7431541049,
1044+
1.4874481325,
1045+
-4.5389119798,
1046+
2.3916335805,
1047+
],
1048+
),
1049+
(
1050+
"""17
1051+
gdb 107395 3.22974 1.27619 1.028 3.0629 69.23 -0.225 0.0454 0.2704 1155.6964 0.136978 -455.1439 -455.135732 -455.134788 -455.176691 31.549
1052+
N 0.0166534686 1.2958609713 -0.1502735485 -0.580455
1053+
C -0.0682337319 -0.0600800983 -0.0096861238 0.457461
1054+
N -1.1305816717 -0.7597824714 0.1390863864 -0.408126
1055+
C -0.7009986293 -2.1439574926 0.1108213578 0.311595
1056+
C -1.4984924036 -3.0589993793 1.002599169 -0.138474
1057+
O -2.8585678608 -3.0949107355 0.6105794873 -0.417767
1058+
C -0.0617590849 -2.6053899658 -1.1855277252 -0.354308
1059+
C 0.8014538633 -2.0927793706 -0.0638522254 0.002142
1060+
O 1.1296621 -0.7079465651 -0.1012706145 -0.24761
1061+
H 0.8924648972 1.705797935 0.136582908 0.26608
1062+
H -0.8059467993 1.7929265086 0.153937696 0.270518
1063+
H -1.1201801933 -4.0847553601 0.9397586373 0.099132
1064+
H -1.3951151651 -2.7241068393 2.0475874391 0.085667
1065+
H -3.1252104475 -2.1733539114 0.5051511429 0.295168
1066+
H -0.0217120799 -3.6759992339 -1.3586546368 0.121153
1067+
H -0.1831965908 -1.9943417386 -2.0738763382 0.124725
1068+
H 1.5549266689 -2.687136563 0.4367361986 0.113098
1069+
70.6081 153.9333 180.5159 278.5491 345.0174 381.8663 413.9812 430.6681 448.7324 482.9536 553.6703 659.1349 739.6911 741.9888 783.4372 872.8339 876.0968 943.3731 959.9083 980.6717 997.5511 1046.3298 1069.8255 1090.0307 1128.0369 1148.246 1187.611 1267.6525 1296.0396 1370.3317 1379.5593 1424.1337 1443.8219 1468.4333 1503.1705 1614.9407 1733.6758 2971.4118 3070.4849 3127.1322 3195.3399 3220.8339 3586.8344 3701.0432 3786.2797
1070+
NC1=NC2(CO)CC2O1 NC1=N[C@@]2(CO)C[C@H]2O1
1071+
InChI=1S/C5H8N2O2/c6-4-7-5(2-8)1-3(5)9-4/h3,8H,1-2H2,(H2,6,7) InChI=1S/C5H8N2O2/c6-4-7-5(2-8)1-3(5)9-4/h3,8H,1-2H2,(H2,6,7)/t3-,5-/m1/s1
1072+
""",
1073+
["N", "C", "N", "C", "C", "O", "C", "C", "O", "H", "H", "H", "H", "H", "H", "H", "H"],
1074+
[
1075+
0.0166534686,
1076+
1.2958609713,
1077+
-0.1502735485,
1078+
-0.0682337319,
1079+
-0.0600800983,
1080+
-0.0096861238,
1081+
-1.1305816717,
1082+
-0.7597824714,
1083+
0.1390863864,
1084+
-0.7009986293,
1085+
-2.1439574926,
1086+
0.1108213578,
1087+
-1.4984924036,
1088+
-3.0589993793,
1089+
1.002599169,
1090+
-2.8585678608,
1091+
-3.0949107355,
1092+
0.6105794873,
1093+
-0.0617590849,
1094+
-2.6053899658,
1095+
-1.1855277252,
1096+
0.8014538633,
1097+
-2.0927793706,
1098+
-0.0638522254,
1099+
1.1296621,
1100+
-0.7079465651,
1101+
-0.1012706145,
1102+
0.8924648972,
1103+
1.705797935,
1104+
0.136582908,
1105+
-0.8059467993,
1106+
1.7929265086,
1107+
0.153937696,
1108+
-1.1201801933,
1109+
-4.0847553601,
1110+
0.9397586373,
1111+
-1.3951151651,
1112+
-2.7241068393,
1113+
2.0475874391,
1114+
-3.1252104475,
1115+
-2.1733539114,
1116+
0.5051511429,
1117+
-0.0217120799,
1118+
-3.6759992339,
1119+
-1.3586546368,
1120+
-0.1831965908,
1121+
-1.9943417386,
1122+
-2.0738763382,
1123+
1.5549266689,
1124+
-2.687136563,
1125+
0.4367361986,
1126+
],
1127+
),
1128+
),
1129+
)
1130+
def test_xyz_gdb_format(string, elbl, geom):
1131+
unprocessed, processed = _filter_xyz(string, strict=True)
1132+
1133+
assert not unprocessed
1134+
assert processed["elbl"] == elbl
1135+
assert processed["geom"] == geom
1136+
1137+
9241138
subject8 = """\
9251139
3
9261140
stuffs

0 commit comments

Comments
 (0)