|
| 1 | +"""Utils for deepmd/hdf5 format.""" |
| 2 | +import h5py |
| 3 | +import numpy as np |
| 4 | + |
| 5 | +from wcmatch.glob import globfilter |
| 6 | + |
| 7 | + |
| 8 | +__all__ = ['to_system_data', 'dump'] |
| 9 | + |
def to_system_data(f: h5py.File,
                   folder: str,
                   type_map: list = None,
                   labels: bool = True):
    """Load system data from an open HDF5 file.

    Parameters
    ----------
    f : h5py.File
        open HDF5 file object
    folder : str
        path of the system group inside the HDF5 file; a falsy value
        means the file root
    type_map : list, optional
        atom names indexed by atom type; only consulted when the file
        carries no ``type_map.raw`` entry
    labels : bool
        accepted for interface compatibility; currently unused — labeled
        properties (energy/force/virial) are loaded whenever they are
        present in the file

    Returns
    -------
    dict
        system data with ``atom_types``, ``atom_numbs``, ``atom_names``,
        ``orig``, optionally ``nopbc``, plus whichever per-frame arrays
        (``cells``/``coords``/``energies``/``forces``/``virials``) exist
        in the file's ``set.*`` groups

    Raises
    ------
    RuntimeError
        if a required per-frame array is missing from a set, or the
        resolved type map has fewer names than there are atom types
    """
    g = f[folder] if folder else f

    data = {}
    data['atom_types'] = g['type.raw'][:]
    ntypes = np.max(data['atom_types']) + 1
    natoms = data['atom_types'].size
    # number of atoms of each type
    data['atom_numbs'] = [
        np.count_nonzero(data['atom_types'] == ii) for ii in range(ntypes)
    ]
    # resolve atom names: the file's own map wins, then the caller's map,
    # and finally artificial placeholder names
    if 'type_map.raw' in g:
        my_type_map = list(np.char.decode(g['type_map.raw'][:]))
    elif type_map is not None:
        my_type_map = type_map
    else:
        my_type_map = ['Type_%d' % ii for ii in range(ntypes)]
    # explicit error instead of a bare assert (asserts vanish under -O),
    # consistent with the RuntimeError raised for missing datasets below
    if len(my_type_map) < len(data['atom_numbs']):
        raise RuntimeError("type map has %d names but %d atom types are present"
                           % (len(my_type_map), len(data['atom_numbs'])))
    data['atom_names'] = list(my_type_map[:len(data['atom_numbs'])])

    data['orig'] = np.zeros([3])
    # the mere presence of a 'nopbc' dataset marks a non-periodic system
    if 'nopbc' in g:
        data['nopbc'] = True
    sets = globfilter(g.keys(), 'set.*')

    # per property: HDF5 dataset basename, whether it is a label,
    # the expected per-frame shape, and whether every set must have it
    data_types = {
        'cells': {'fn': 'box', 'labeled': False, 'shape': (3, 3), 'required': 'nopbc' not in data},
        'coords': {'fn': 'coord', 'labeled': False, 'shape': (natoms, 3), 'required': True},
        'energies': {'fn': 'energy', 'labeled': True, 'shape': tuple(), 'required': False},
        'forces': {'fn': 'force', 'labeled': True, 'shape': (natoms, 3), 'required': False},
        'virials': {'fn': 'virial', 'labeled': True, 'shape': (3, 3), 'required': False},
    }

    for dt, prop in data_types.items():
        all_data = []
        for ii in sets:
            cur_set = g[ii]  # renamed: do not shadow the builtin `set`
            fn = '%s.npy' % prop['fn']
            if fn in cur_set:
                dd = cur_set[fn][:]
                nframes = dd.shape[0]
                all_data.append(np.reshape(dd, (nframes, *prop['shape'])))
            elif prop['required']:
                raise RuntimeError("%s/%s/%s not found" % (folder, ii, fn))
        if len(all_data) > 0:
            data[dt] = np.concatenate(all_data, axis=0)
    return data
| 81 | + |
def dump(f: h5py.File,
         folder: str,
         data: dict,
         set_size = 5000,
         comp_prec = np.float32,
         ) -> None:
    """Dump system data to an open HDF5 file.

    Parameters
    ----------
    f : h5py.File
        HDF5 file object
    folder : str
        path in the HDF5 file; a falsy value means the file root.
        An existing group of the same name is deleted first.
    data : dict
        System or LabeledSystem data
    set_size : int, default: 5000
        maximum number of frames per ``set.*`` group
    comp_prec : np.dtype, default: np.float32
        floating-point precision the frame data is stored with
    """
    # if folder is falsy, dump into the root of the file
    if folder:
        # overwrite a previously dumped system of the same name
        if folder in f:
            del f[folder]
        g = f.create_group(folder)
    else:
        g = f
    # per-system ("raw") properties
    g.create_dataset('type.raw', data=data['atom_types'])
    g.create_dataset('type_map.raw', data=np.array(data['atom_names'], dtype='S'))
    # BondOrder System extras
    if "bonds" in data:
        g.create_dataset("bonds.raw", data=data['bonds'])
    if "formal_charges" in data:
        g.create_dataset("formal_charges.raw", data=data['formal_charges'])
    # NOTE(review): frame count is taken from 'cells'; assumes the caller
    # always supplies cells (dpdata keeps dummy cells even for nopbc)
    nframes = data['cells'].shape[0]

    nopbc = data.get("nopbc", False)

    # per property: HDF5 dataset basename, flattened per-file shape,
    # and whether it should be written at all
    data_types = {
        'cells': {'fn': 'box', 'shape': (nframes, 9), 'dump': not nopbc},
        'coords': {'fn': 'coord', 'shape': (nframes, -1), 'dump': True},
        'energies': {'fn': 'energy', 'shape': (nframes,), 'dump': True},
        'forces': {'fn': 'force', 'shape': (nframes, -1), 'dump': True},
        'virials': {'fn': 'virial', 'shape': (nframes, 9), 'dump': True},
    }
    # reshape frame properties and convert precision
    reshaped_data = {
        dt: np.reshape(data[dt], prop['shape']).astype(comp_prec)
        for dt, prop in data_types.items()
        if dt in data and prop['dump']
    }

    # dump frame properties split into sets of at most set_size frames
    nsets = (nframes + set_size - 1) // set_size  # ceil division
    for ii in range(nsets):
        set_stt = ii * set_size
        set_end = (ii + 1) * set_size
        set_folder = g.create_group('set.%03d' % ii)
        for dt, prop in data_types.items():
            if dt in reshaped_data:
                set_folder.create_dataset('%s.npy' % prop['fn'],
                                          data=reshaped_data[dt][set_stt:set_end])

    if nopbc:
        # bug fix: `create_dataset("nopbc", True)` passed True positionally
        # as the *shape* argument; the marker must be written via data=.
        # Only the dataset's presence is checked on load.
        g.create_dataset("nopbc", data=True)
0 commit comments