Skip to content
Closed
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
307 changes: 162 additions & 145 deletions src/openfermion/chem/molecular_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
"""Class and functions to store quantum chemistry data."""

import os
import uuid
import tempfile
import shutil
import numpy
import h5py
Expand Down Expand Up @@ -699,157 +699,174 @@ def ccsd_double_amps(self):
def ccsd_double_amps(self, value):
    """Setter: store the CCSD double-excitation amplitudes on the instance."""
    self._ccsd_double_amps = value

def save(self):
"""Method to save the class under a systematic name."""
# Create a temporary file and swap it to the original name in case
# data needs to be loaded while saving
tmp_name = uuid.uuid4()
with h5py.File("{}.hdf5".format(tmp_name), "w") as f:
# Save geometry (atoms and positions need to be separate):
d_geom = f.create_group("geometry")
if not isinstance(self.geometry, basestring):
atoms = [numpy.bytes_(item[0]) for item in self.geometry]
positions = numpy.array([list(item[1]) for item in self.geometry])
else:
atoms = numpy.bytes_(self.geometry)
positions = None
d_geom.create_dataset("atoms", data=(atoms if atoms is not None else False))
d_geom.create_dataset("positions", data=(positions if positions is not None else False))
# Save basis:
f.create_dataset("basis", data=numpy.bytes_(self.basis))
# Save multiplicity:
f.create_dataset("multiplicity", data=self.multiplicity)
# Save charge:
f.create_dataset("charge", data=self.charge)
# Save description:
f.create_dataset("description", data=numpy.bytes_(self.description))
# Save name:
f.create_dataset("name", data=numpy.bytes_(self.name))
# Save n_atoms:
f.create_dataset("n_atoms", data=self.n_atoms)
# Save atoms:
f.create_dataset("atoms", data=numpy.bytes_(self.atoms))
# Save protons:
f.create_dataset("protons", data=self.protons)
# Save n_electrons:
f.create_dataset("n_electrons", data=self.n_electrons)
# Save generic attributes from calculations:
f.create_dataset(
"n_orbitals", data=(self.n_orbitals if self.n_orbitals is not None else False)
)
f.create_dataset(
"n_qubits", data=(self.n_qubits if self.n_qubits is not None else False)
)
f.create_dataset(
"nuclear_repulsion",
data=(self.nuclear_repulsion if self.nuclear_repulsion is not None else False),
)
# Save attributes generated from SCF calculation.
f.create_dataset(
"hf_energy", data=(self.hf_energy if self.hf_energy is not None else False)
)
f.create_dataset(
"canonical_orbitals",
data=(self.canonical_orbitals if self.canonical_orbitals is not None else False),
compression=("gzip" if self.canonical_orbitals is not None else None),
)
f.create_dataset(
"overlap_integrals",
data=(self.overlap_integrals if self.overlap_integrals is not None else False),
compression=("gzip" if self.overlap_integrals is not None else None),
)
f.create_dataset(
"orbital_energies",
data=(self.orbital_energies if self.orbital_energies is not None else False),
)
# Save attributes generated from integrals.
f.create_dataset(
"one_body_integrals",
data=(self.one_body_integrals if self.one_body_integrals is not None else False),
compression=("gzip" if self.one_body_integrals is not None else None),
)
f.create_dataset(
"two_body_integrals",
data=(self.two_body_integrals if self.two_body_integrals is not None else False),
compression=("gzip" if self.two_body_integrals is not None else None),
)
# Save attributes generated from MP2 calculation.
f.create_dataset(
"mp2_energy", data=(self.mp2_energy if self.mp2_energy is not None else False)
)
# Save attributes generated from CISD calculation.
f.create_dataset(
"cisd_energy", data=(self.cisd_energy if self.cisd_energy is not None else False)
)
f.create_dataset(
"cisd_one_rdm",
data=(self.cisd_one_rdm if self.cisd_one_rdm is not None else False),
compression=("gzip" if self.cisd_one_rdm is not None else None),
)
f.create_dataset(
"cisd_two_rdm",
data=(self.cisd_two_rdm if self.cisd_two_rdm is not None else False),
compression=("gzip" if self.cisd_two_rdm is not None else None),
)
# Save attributes generated from exact diagonalization.
f.create_dataset(
"fci_energy", data=(self.fci_energy if self.fci_energy is not None else False)
)
f.create_dataset(
"fci_one_rdm",
data=(self.fci_one_rdm if self.fci_one_rdm is not None else False),
compression=("gzip" if self.fci_one_rdm is not None else None),
)
f.create_dataset(
"fci_two_rdm",
data=(self.fci_two_rdm if self.fci_two_rdm is not None else False),
compression=("gzip" if self.fci_two_rdm is not None else None),
)
# Save attributes generated from CCSD calculation.
f.create_dataset(
"ccsd_energy", data=(self.ccsd_energy if self.ccsd_energy is not None else False)
)
f.create_dataset(
"ccsd_single_amps",
data=(self.ccsd_single_amps if self.ccsd_single_amps is not None else False),
compression=("gzip" if self.ccsd_single_amps is not None else None),
)
f.create_dataset(
"ccsd_double_amps",
data=(self.ccsd_double_amps if self.ccsd_double_amps is not None else False),
compression=("gzip" if self.ccsd_double_amps is not None else None),
)
def _write_hdf5_data(self, hdf5_file):
    """Method to write the class data to an HDF5 file.

    The hdf5_file parameter is expected to be an already-open h5py.File
    handle. Attributes that have not been computed (value None) are stored
    as the scalar False so that load() can tell "absent" from real data.
    """
    # Save geometry (atoms and positions need to be separate):
    d_geom = hdf5_file.create_group("geometry")
    # NOTE(review): the original checked `basestring`, a Python-2-only name
    # that raises NameError on Python 3; `str` is the correct check here.
    if not isinstance(self.geometry, str):
        atoms = [numpy.bytes_(item[0]) for item in self.geometry]
        positions = numpy.array([list(item[1]) for item in self.geometry])
    else:
        atoms = numpy.bytes_(self.geometry)
        positions = None
    d_geom.create_dataset("atoms", data=(atoms if atoms is not None else False))
    d_geom.create_dataset("positions", data=(positions if positions is not None else False))
    # Save basis:
    hdf5_file.create_dataset("basis", data=numpy.bytes_(self.basis))
    # Save multiplicity:
    hdf5_file.create_dataset("multiplicity", data=self.multiplicity)
    # Save charge:
    hdf5_file.create_dataset("charge", data=self.charge)
    # Save description:
    hdf5_file.create_dataset("description", data=numpy.bytes_(self.description))
    # Save name:
    hdf5_file.create_dataset("name", data=numpy.bytes_(self.name))
    # Save n_atoms:
    hdf5_file.create_dataset("n_atoms", data=self.n_atoms)
    # Save atoms:
    hdf5_file.create_dataset("atoms", data=numpy.bytes_(self.atoms))
    # Save protons:
    hdf5_file.create_dataset("protons", data=self.protons)
    # Save n_electrons:
    hdf5_file.create_dataset("n_electrons", data=self.n_electrons)
    # Save generic attributes from calculations:
    hdf5_file.create_dataset(
        "n_orbitals", data=(self.n_orbitals if self.n_orbitals is not None else False)
    )
    hdf5_file.create_dataset(
        "n_qubits", data=(self.n_qubits if self.n_qubits is not None else False)
    )
    hdf5_file.create_dataset(
        "nuclear_repulsion",
        data=(self.nuclear_repulsion if self.nuclear_repulsion is not None else False),
    )
    # Save attributes generated from SCF calculation.
    hdf5_file.create_dataset(
        "hf_energy", data=(self.hf_energy if self.hf_energy is not None else False)
    )
    # Large array-valued attributes are gzip-compressed when present.
    hdf5_file.create_dataset(
        "canonical_orbitals",
        data=(self.canonical_orbitals if self.canonical_orbitals is not None else False),
        compression=("gzip" if self.canonical_orbitals is not None else None),
    )
    hdf5_file.create_dataset(
        "overlap_integrals",
        data=(self.overlap_integrals if self.overlap_integrals is not None else False),
        compression=("gzip" if self.overlap_integrals is not None else None),
    )
    hdf5_file.create_dataset(
        "orbital_energies",
        data=(self.orbital_energies if self.orbital_energies is not None else False),
    )
    # Save attributes generated from integrals.
    hdf5_file.create_dataset(
        "one_body_integrals",
        data=(self.one_body_integrals if self.one_body_integrals is not None else False),
        compression=("gzip" if self.one_body_integrals is not None else None),
    )
    hdf5_file.create_dataset(
        "two_body_integrals",
        data=(self.two_body_integrals if self.two_body_integrals is not None else False),
        compression=("gzip" if self.two_body_integrals is not None else None),
    )
    # Save attributes generated from MP2 calculation.
    hdf5_file.create_dataset(
        "mp2_energy", data=(self.mp2_energy if self.mp2_energy is not None else False)
    )
    # Save attributes generated from CISD calculation.
    hdf5_file.create_dataset(
        "cisd_energy", data=(self.cisd_energy if self.cisd_energy is not None else False)
    )
    hdf5_file.create_dataset(
        "cisd_one_rdm",
        data=(self.cisd_one_rdm if self.cisd_one_rdm is not None else False),
        compression=("gzip" if self.cisd_one_rdm is not None else None),
    )
    hdf5_file.create_dataset(
        "cisd_two_rdm",
        data=(self.cisd_two_rdm if self.cisd_two_rdm is not None else False),
        compression=("gzip" if self.cisd_two_rdm is not None else None),
    )
    # Save attributes generated from exact diagonalization.
    hdf5_file.create_dataset(
        "fci_energy", data=(self.fci_energy if self.fci_energy is not None else False)
    )
    hdf5_file.create_dataset(
        "fci_one_rdm",
        data=(self.fci_one_rdm if self.fci_one_rdm is not None else False),
        compression=("gzip" if self.fci_one_rdm is not None else None),
    )
    hdf5_file.create_dataset(
        "fci_two_rdm",
        data=(self.fci_two_rdm if self.fci_two_rdm is not None else False),
        compression=("gzip" if self.fci_two_rdm is not None else None),
    )
    # Save attributes generated from CCSD calculation.
    hdf5_file.create_dataset(
        "ccsd_energy", data=(self.ccsd_energy if self.ccsd_energy is not None else False)
    )
    hdf5_file.create_dataset(
        "ccsd_single_amps",
        data=(self.ccsd_single_amps if self.ccsd_single_amps is not None else False),
        compression=("gzip" if self.ccsd_single_amps is not None else None),
    )
    hdf5_file.create_dataset(
        "ccsd_double_amps",
        data=(self.ccsd_double_amps if self.ccsd_double_amps is not None else False),
        compression=("gzip" if self.ccsd_double_amps is not None else None),
    )
    # Save general calculation data; keys and values go into parallel
    # datasets so arbitrary user-defined scalars round-trip through load().
    key_list = list(self.general_calculations.keys())
    hdf5_file.create_dataset(
        "general_calculations_keys",
        data=([numpy.bytes_(key) for key in key_list] if len(key_list) > 0 else False),
    )
    hdf5_file.create_dataset(
        "general_calculations_values",
        data=(
            [self.general_calculations[key] for key in key_list] if len(key_list) > 0 else False
        ),
    )

shutil.move("{}.hdf5".format(tmp_name), "{}.hdf5".format(self.filename))
def save(self):
    """Method to save the class under a systematic name.

    Writes all molecular data to "{self.filename}.hdf5". The data is first
    written to a temporary file in the same directory and then moved into
    place with os.replace(), so that a concurrent load() never observes a
    partially written file.
    """
    output_dir = os.path.dirname(os.path.abspath(self.filename))
    final_filename = f"{self.filename}.hdf5"
    tmp_name = None
    try:
        # delete=False so the path outlives the context manager and can be
        # reopened by h5py. The file is closed before h5py opens it, which
        # is required on Windows, where an open NamedTemporaryFile cannot
        # be opened a second time by path.
        with tempfile.NamedTemporaryFile(
            delete=False, suffix='.hdf5', dir=output_dir
        ) as tmp_file:
            tmp_name = tmp_file.name
        with h5py.File(tmp_name, "w") as hdf5_file:
            self._write_hdf5_data(hdf5_file)

        # NamedTemporaryFile creates files with owner-only permissions;
        # restore the permissions open(name, "w") would have produced by
        # applying the process umask. Reading the umask requires setting
        # it, so set an arbitrary value and immediately restore it.
        current_umask = os.umask(0o600)
        os.umask(current_umask)
        os.chmod(tmp_name, 0o666 & ~current_umask)

        # os.replace is atomic and works across platforms, so readers see
        # either the old file or the new one, never a partial write.
        os.replace(tmp_name, final_filename)
        tmp_name = None  # Move succeeded; nothing left to clean up.
    finally:
        # Clean up the temporary file if something went wrong.
        if tmp_name and os.path.exists(tmp_name):
            os.remove(tmp_name)  # pragma: no cover

def load(self):
geometry = []

with h5py.File("{}.hdf5".format(self.filename), "r") as f:
with h5py.File(f"{self.filename}.hdf5", "r") as f:
Comment thread
mhucka marked this conversation as resolved.
# Load geometry:
data = f["geometry/atoms"]
if data.shape != (()):
Expand Down
Loading