From d86822af28443e66e41e5565351c1764895f81ac Mon Sep 17 00:00:00 2001
From: Valerij Talagayev <82884038+talagayev@users.noreply.github.com>
Date: Tue, 3 Dec 2024 01:50:36 +0100
Subject: [PATCH 1/2] Addition of CMS Topology Parser

---
 package/MDAnalysis/topology/CMSParser.py | 192 +++++++++++++++++++++++
 1 file changed, 192 insertions(+)
 create mode 100644 package/MDAnalysis/topology/CMSParser.py

diff --git a/package/MDAnalysis/topology/CMSParser.py b/package/MDAnalysis/topology/CMSParser.py
new file mode 100644
index 00000000000..a778cdcc3ad
--- /dev/null
+++ b/package/MDAnalysis/topology/CMSParser.py
@@ -0,0 +1,192 @@
+"""
+CMS topology parser
+===================
+
+Reads the atom records of a CMS file and builds a minimal topology from
+them.
+
+Classes
+-------
+
+.. autoclass:: CMSParser
+   :members:
+   :inherited-members:
+
+"""
+import logging
+import re
+
+import numpy as np
+
+from ..lib.util import openany
+from ..core.topologyattrs import (
+    Atomnames,
+    Atomids,
+    Resids,
+    Resnames,
+    Resnums,
+    Segids,
+    Resindices,
+)
+from ..core.topology import Topology
+from .base import TopologyReaderBase, change_squash
+
+logger = logging.getLogger("MDAnalysis.topology.CMSParser")
+
+# Header of an atom block, e.g. ``m_atom[100] {``; captures the atom count.
+_ATOM_BLOCK = re.compile(r'm_atom\[(\d+)\]')
+
+# Marker line that separates a block's field list from its records and
+# closes the records again.
+_SEPARATOR = ":::"
+
+# Whitespace-separated tokens; a double-quoted string counts as one token.
+_TOKEN = re.compile(r'".*?"|\S+')
+
+# Positions of the fields used from a tokenised atom record.
+# NOTE(review): the positions are hard-coded rather than looked up from the
+# field names in the block header -- confirm they hold for every CMS file
+# that has to be supported.
+_COL_ATOM_ID = 0
+_COL_RESID = 5
+_COL_SEGID = 7
+_COL_RESNAME = 11
+_COL_ATOMNAME = 12
+_COL_ATOMNAME_FALLBACK = 14  # used when the atom name column is blank
+
+
+def _parse_atom_record(line):
+    """Return ``(atom_id, atomname, resid, resname, segid)`` for one record.
+
+    Raises :exc:`ValueError` or :exc:`IndexError` if `line` does not have
+    the expected layout.
+    """
+    tokens = [token.strip('"') for token in _TOKEN.findall(line)]
+    atom_id = int(tokens[_COL_ATOM_ID])
+    resid = int(tokens[_COL_RESID])
+    segid = tokens[_COL_SEGID].strip()
+    resname = tokens[_COL_RESNAME].strip()
+    atomname = tokens[_COL_ATOMNAME].strip()
+    if not atomname:
+        atomname = tokens[_COL_ATOMNAME_FALLBACK].strip()
+    return atom_id, atomname, resid, resname, segid
+
+
+def _read_atom_block(lines):
+    """Return ``(n_atoms, records)`` for the first ``m_atom`` block.
+
+    `n_atoms` is the count declared in the block header (``0`` if no header
+    was found) and `records` the list of tuples produced by
+    :func:`_parse_atom_record` for the lines between the block's two
+    ``:::`` markers.  Lines that cannot be parsed are logged and skipped.
+    """
+    n_atoms = 0
+    header_seen = False
+    in_records = False
+    records = []
+
+    for raw_line in lines:
+        line = raw_line.strip()
+
+        if not header_seen:
+            match = _ATOM_BLOCK.match(line)
+            if match:
+                header_seen = True
+                n_atoms = int(match.group(1))
+            continue
+
+        if _SEPARATOR in line:
+            if in_records:
+                break  # closing marker: all records have been read
+            in_records = True
+            continue
+
+        if not in_records:
+            continue  # still inside the field list of the header
+        if line.startswith(("}", "{")):
+            break
+        if not line:
+            continue
+
+        try:
+            records.append(_parse_atom_record(line))
+        except (ValueError, IndexError):
+            logger.warning("Skipping invalid line: %s", line)
+
+    return n_atoms, records
+
+
+class CMSParser(TopologyReaderBase):
+    """Parse the atom records of a CMS file into a minimal topology.
+
+    Creates the following attributes:
+     - Atomnames
+     - Atomids
+     - Resids
+     - Resnums
+     - Resnames
+     - Segids
+
+    Only the first ``m_atom[N]`` block of the file is read.  Consecutive
+    atoms that share resid, resname and segid form one residue, and
+    consecutive residues that share a segid form one segment.
+    """
+    format = 'CMS'
+
+    def parse(self, **kwargs):
+        """Parse the CMS file into a Topology.
+
+        Returns
+        -------
+        MDAnalysis Topology object
+
+        Raises
+        ------
+        ValueError
+            If no ``m_atom[N]`` header with a positive atom count is found,
+            or if the number of atom records read differs from that count.
+        """
+        with openany(self.filename) as inf:
+            n_atoms, records = _read_atom_block(inf)
+
+        if n_atoms == 0:
+            raise ValueError(
+                "Number of atoms (n_atoms) could not be found in the file."
+            )
+        if len(records) != n_atoms:
+            raise ValueError(
+                f"Expected {n_atoms} atom records in {self.filename} "
+                f"but read {len(records)}."
+            )
+
+        ids, names, resids, resnames, segids = zip(*records)
+        atom_ids = np.array(ids, dtype=np.int32)
+        atomnames = np.array(names, dtype=object)
+        resids = np.array(resids, dtype=np.int32)
+        resnames = np.array(resnames, dtype=object)
+        segids = np.array(segids, dtype=object)
+
+        # Collapse the per-atom columns to one entry per residue, then the
+        # per-residue segids to one entry per segment.
+        residx, (resids, resnames, segids) = change_squash(
+            (resids, resnames, segids), (resids, resnames, segids)
+        )
+        segidx, (segids,) = change_squash((segids,), (segids,))
+
+        attrs = [
+            Atomnames(atomnames),
+            Atomids(atom_ids),
+            Resids(resids),
+            Resnums(resids.copy()),
+            Resnames(resnames),
+            Segids(segids),
+        ]
+
+        return Topology(
+            n_atoms,
+            len(resids),
+            len(segids),
+            attrs=attrs,
+            atom_resindex=residx,
+            residue_segindex=segidx,
+        )

From 3602eecfcdc4996f0ebf166c276981e73b0fd541 Mon Sep 17 00:00:00 2001
From: Valerij Talagayev <82884038+talagayev@users.noreply.github.com>
Date: Tue, 3 Dec 2024 01:52:47 +0100
Subject: [PATCH 2/2] Update __init__.py

Addition of CMSParser
---
 package/MDAnalysis/topology/__init__.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/package/MDAnalysis/topology/__init__.py b/package/MDAnalysis/topology/__init__.py
index 951567ec615..faa6a1ab2fb 100644
--- a/package/MDAnalysis/topology/__init__.py
+++ b/package/MDAnalysis/topology/__init__.py
@@ -307,7 +307,7 @@
 __all__ = ['core', 'PSFParser', 'PDBParser', 'PQRParser', 'GROParser',
            'CRDParser', 'TOPParser', 'PDBQTParser', 'TPRParser',
            'LAMMPSParser', 'XYZParser', 'GMSParser', 'DLPolyParser',
-           'HoomdXMLParser','GSDParser', 'ITPParser']
+           'HoomdXMLParser','GSDParser', 'ITPParser', 'CMSParser']
 
 from . import core
 from . import PSFParser
@@ -332,3 +332,4 @@
 from . import MinimalParser
 from . import ITPParser
 from . import FHIAIMSParser
+from . import CMSParser