Fix opening QM9 files

Johannes Gasteiger · The TensorFlow Datasets Authors · commit b52885bb90d5 · 2024-03-26T08:44:13.000-07:00
PiperOrigin-RevId: 619201213
diff --git a/tensorflow_datasets/datasets/qm9/qm9_dataset_builder.py b/tensorflow_datasets/datasets/qm9/qm9_dataset_builder.py
@@ -22,81 +22,106 @@
 To build the dataset, run the following from directory containing this file:
 $ tfds build.
 """
+
 import re
+from typing import Any, Iterable
 
-from typing import Any, Dict, Iterable, Tuple
+from etils import epath
 import numpy as np
 import tensorflow_datasets.public_api as tfds
 
+
 pd = tfds.core.lazy_imports.pandas
 
 _HOMEPAGE = 'https://doi.org/10.6084/m9.figshare.c.978904.v5'
 
 _ATOMREF_URL = 'https://figshare.com/ndownloader/files/3195395'
-_UNCHARACTERIZED_URL = 'https://springernature.figshare.com/ndownloader/files/3195404'
+_UNCHARACTERIZED_URL = (
+    'https://springernature.figshare.com/ndownloader/files/3195404'
+)
 _MOLECULES_URL = 'https://springernature.figshare.com/ndownloader/files/3195389'
 
 _SIZE = 133_885
 _CHARACTERIZED_SIZE = 130_831
 
 _MAX_ATOMS = 29
 _CHARGES = {'H': 1, 'C': 6, 'N': 7, 'O': 8, 'F': 9}
-_LABELS = ['tag', 'index', 'A', 'B', 'C', 'mu', 'alpha', 'homo', 'lumo', 'gap',
-           'r2', 'zpve', 'U0', 'U', 'H', 'G', 'Cv']
+_LABELS = [
+    'tag',
+    'index',
+    'A',
+    'B',
+    'C',
+    'mu',
+    'alpha',
+    'homo',
+    'lumo',
+    'gap',
+    'r2',
+    'zpve',
+    'U0',
+    'U',
+    'H',
+    'G',
+    'Cv',
+]
 # For each of these targets, we will add a second target with an
 # _atomization suffix that has the thermo term subtracted.
 _ATOMIZATION_TARGETS = ['U0', 'U', 'H', 'G']
 
 
-def _process_molecule(atomref, fname):
+def _process_molecule(
+    atomref: dict[str, Any], fname: epath.PathLike
+) -> dict[str, Any]:
   """Read molecule data from file."""
-  with open(fname, 'r') as f:
+  with epath.Path(fname).open() as f:
     lines = f.readlines()
   num_atoms = int(lines[0].rstrip())
   frequencies = re.split(r'\s+', lines[num_atoms + 2].rstrip())
   smiles = re.split(r'\s+', lines[num_atoms + 3].rstrip())
   inchi = re.split(r'\s+', lines[num_atoms + 4].rstrip())
 
-  labels = pd.read_table(fname,
-                         skiprows=1,
-                         nrows=1,
-                         sep=r'\s+',
-                         names=_LABELS)
+  labels = pd.read_table(fname, skiprows=1, nrows=1, sep=r'\s+', names=_LABELS)
 
-  atoms = pd.read_table(fname,
-                        skiprows=2,
-                        nrows=num_atoms,
-                        sep=r'\s+',
-                        names=['Z', 'x', 'y', 'z', 'Mulliken_charge'])
+  atoms = pd.read_table(
+      fname,
+      skiprows=2,
+      nrows=num_atoms,
+      sep=r'\s+',
+      names=['Z', 'x', 'y', 'z', 'Mulliken_charge'],
+  )
 
   # Correct exponential notation (6.8*^-6 -> 6.8e-6).
   for key in ['x', 'y', 'z', 'Mulliken_charge']:
     if atoms[key].values.dtype == 'object':
       # there are unrecognized numbers.
-      atoms[key].values[:] = np.array([
-          float(x.replace('*^', 'e'))
-          for i, x in enumerate(atoms[key].values)])
-
-  charges = np.pad([_CHARGES[v] for v in atoms['Z'].values],
-                   (0, _MAX_ATOMS - num_atoms))
-  positions = np.stack([atoms['x'].values,
-                        atoms['y'].values,
-                        atoms['z'].values], axis=-1).astype(np.float32)
+      atoms[key].values[:] = np.array(
+          [float(x.replace('*^', 'e')) for i, x in enumerate(atoms[key].values)]
+      )
+
+  charges = np.pad(
+      [_CHARGES[v] for v in atoms['Z'].values], (0, _MAX_ATOMS - num_atoms)
+  )
+  positions = np.stack(
+      [atoms['x'].values, atoms['y'].values, atoms['z'].values], axis=-1
+  ).astype(np.float32)
   positions = np.pad(positions, ((0, _MAX_ATOMS - num_atoms), (0, 0)))
 
   mulliken_charges = atoms['Mulliken_charge'].values.astype(np.float32)
   mulliken_charges = np.pad(mulliken_charges, ((0, _MAX_ATOMS - num_atoms)))
 
-  example = {'num_atoms': num_atoms,
-             'charges': charges,
-             'Mulliken_charges': mulliken_charges,
-             'positions': positions.astype(np.float32),
-             'frequencies': frequencies,
-             'SMILES': smiles[0],
-             'SMILES_relaxed': smiles[1],
-             'InChI': inchi[0],
-             'InChI_relaxed': inchi[1],
-             **{k: labels[k].values[0] for k in _LABELS}}
+  example = {
+      'num_atoms': num_atoms,
+      'charges': charges,
+      'Mulliken_charges': mulliken_charges,
+      'positions': positions.astype(np.float32),
+      'frequencies': frequencies,
+      'SMILES': smiles[0],
+      'SMILES_relaxed': smiles[1],
+      'InChI': inchi[0],
+      'InChI_relaxed': inchi[1],
+      **{k: labels[k].values[0] for k in _LABELS},
+  }
 
   # Create atomization targets by subtracting thermochemical energy of
   # each atom.
@@ -113,8 +138,9 @@ def _process_molecule(atomref, fname):
 def _get_valid_ids(uncharacterized):
   """Get valid ids."""
   # Original data files are  1-indexed.
-  characterized_ids = np.array(sorted(set(range(1, _SIZE + 1)) -
-                                      set(uncharacterized)))
+  characterized_ids = np.array(
+      sorted(set(range(1, _SIZE + 1)) - set(uncharacterized))
+  )
   assert len(characterized_ids) == _CHARACTERIZED_SIZE
   return characterized_ids
 
@@ -173,27 +199,32 @@ def _info(self) -> tfds.core.DatasetInfo:
     )
 
   def _split_generators(
-      self, dl_manager: tfds.download.DownloadManager) -> Dict[str, Any]:
+      self, dl_manager: tfds.download.DownloadManager
+  ) -> dict[str, Any]:
     """Returns SplitGenerators. See superclass method for details."""
     atomref = pd.read_table(
         dl_manager.download({'atomref': _ATOMREF_URL})['atomref'],
         skiprows=5,
         index_col='Z',
         skipfooter=1,
         sep=r'\s+',
-        names=['Z', 'zpve', 'U0', 'U', 'H', 'G', 'Cv']).to_dict()
+        names=['Z', 'zpve', 'U0', 'U', 'H', 'G', 'Cv'],
+    ).to_dict()
 
     uncharacterized = pd.read_table(
-        dl_manager.download(
-            {'uncharacterized': _UNCHARACTERIZED_URL})['uncharacterized'],
+        dl_manager.download({'uncharacterized': _UNCHARACTERIZED_URL})[
+            'uncharacterized'
+        ],
         skiprows=9,
         skipfooter=1,
         sep=r'\s+',
         usecols=[0],
-        names=['index']).values[:, 0]
+        names=['index'],
+    ).values[:, 0]
 
     molecules_dir = dl_manager.download_and_extract(
-        {'dsgdb9nsd': _MOLECULES_URL})['dsgdb9nsd']
+        {'dsgdb9nsd': _MOLECULES_URL}
+    )['dsgdb9nsd']
 
     valid_ids = _get_valid_ids(uncharacterized)
 
@@ -202,11 +233,13 @@ def _split_generators(
   def _generate_examples(
       self,
       split: np.ndarray,
-      atomref: Dict[str, Any],
-      molecules_dir: Any) -> Iterable[Tuple[int, Dict[str, Any]]]:
+      atomref: dict[str, Any],
+      molecules_dir: epath.Path,
+  ) -> Iterable[tuple[int, dict[str, Any]]]:
     """Dataset generator. See superclass method for details."""
 
     for i in split:
       entry = _process_molecule(
-          atomref, molecules_dir / f'dsgdb9nsd_{i:06d}.xyz')
+          atomref, molecules_dir / f'dsgdb9nsd_{i:06d}.xyz'
+      )
       yield int(i), entry