-
Notifications
You must be signed in to change notification settings - Fork 415
Replace insecure temporary file creation in chem/molecular_data.py #1243
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
b4832a0
652e356
81a0c7b
379cc75
26632bf
c65c54f
4d50153
c842259
3765b9e
3adf6bc
8dcf2ba
3f68bd8
2c1dd1b
1aa8ba7
9cb4a71
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -12,7 +12,7 @@ | |
| """Class and functions to store quantum chemistry data.""" | ||
|
|
||
| import os | ||
| import uuid | ||
| import tempfile | ||
| import shutil | ||
| import numpy | ||
| import h5py | ||
|
|
@@ -699,157 +699,174 @@ def ccsd_double_amps(self): | |
| def ccsd_double_amps(self, value): | ||
| self._ccsd_double_amps = value | ||
|
|
||
| def save(self): | ||
| """Method to save the class under a systematic name.""" | ||
| # Create a temporary file and swap it to the original name in case | ||
| # data needs to be loaded while saving | ||
| tmp_name = uuid.uuid4() | ||
| with h5py.File("{}.hdf5".format(tmp_name), "w") as f: | ||
| # Save geometry (atoms and positions need to be separate): | ||
| d_geom = f.create_group("geometry") | ||
| if not isinstance(self.geometry, basestring): | ||
| atoms = [numpy.bytes_(item[0]) for item in self.geometry] | ||
| positions = numpy.array([list(item[1]) for item in self.geometry]) | ||
| else: | ||
| atoms = numpy.bytes_(self.geometry) | ||
| positions = None | ||
| d_geom.create_dataset("atoms", data=(atoms if atoms is not None else False)) | ||
| d_geom.create_dataset("positions", data=(positions if positions is not None else False)) | ||
| # Save basis: | ||
| f.create_dataset("basis", data=numpy.bytes_(self.basis)) | ||
| # Save multiplicity: | ||
| f.create_dataset("multiplicity", data=self.multiplicity) | ||
| # Save charge: | ||
| f.create_dataset("charge", data=self.charge) | ||
| # Save description: | ||
| f.create_dataset("description", data=numpy.bytes_(self.description)) | ||
| # Save name: | ||
| f.create_dataset("name", data=numpy.bytes_(self.name)) | ||
| # Save n_atoms: | ||
| f.create_dataset("n_atoms", data=self.n_atoms) | ||
| # Save atoms: | ||
| f.create_dataset("atoms", data=numpy.bytes_(self.atoms)) | ||
| # Save protons: | ||
| f.create_dataset("protons", data=self.protons) | ||
| # Save n_electrons: | ||
| f.create_dataset("n_electrons", data=self.n_electrons) | ||
| # Save generic attributes from calculations: | ||
| f.create_dataset( | ||
| "n_orbitals", data=(self.n_orbitals if self.n_orbitals is not None else False) | ||
| ) | ||
| f.create_dataset( | ||
| "n_qubits", data=(self.n_qubits if self.n_qubits is not None else False) | ||
| ) | ||
| f.create_dataset( | ||
| "nuclear_repulsion", | ||
| data=(self.nuclear_repulsion if self.nuclear_repulsion is not None else False), | ||
| ) | ||
| # Save attributes generated from SCF calculation. | ||
| f.create_dataset( | ||
| "hf_energy", data=(self.hf_energy if self.hf_energy is not None else False) | ||
| ) | ||
| f.create_dataset( | ||
| "canonical_orbitals", | ||
| data=(self.canonical_orbitals if self.canonical_orbitals is not None else False), | ||
| compression=("gzip" if self.canonical_orbitals is not None else None), | ||
| ) | ||
| f.create_dataset( | ||
| "overlap_integrals", | ||
| data=(self.overlap_integrals if self.overlap_integrals is not None else False), | ||
| compression=("gzip" if self.overlap_integrals is not None else None), | ||
| ) | ||
| f.create_dataset( | ||
| "orbital_energies", | ||
| data=(self.orbital_energies if self.orbital_energies is not None else False), | ||
| ) | ||
| # Save attributes generated from integrals. | ||
| f.create_dataset( | ||
| "one_body_integrals", | ||
| data=(self.one_body_integrals if self.one_body_integrals is not None else False), | ||
| compression=("gzip" if self.one_body_integrals is not None else None), | ||
| ) | ||
| f.create_dataset( | ||
| "two_body_integrals", | ||
| data=(self.two_body_integrals if self.two_body_integrals is not None else False), | ||
| compression=("gzip" if self.two_body_integrals is not None else None), | ||
| ) | ||
| # Save attributes generated from MP2 calculation. | ||
| f.create_dataset( | ||
| "mp2_energy", data=(self.mp2_energy if self.mp2_energy is not None else False) | ||
| ) | ||
| # Save attributes generated from CISD calculation. | ||
| f.create_dataset( | ||
| "cisd_energy", data=(self.cisd_energy if self.cisd_energy is not None else False) | ||
| ) | ||
| f.create_dataset( | ||
| "cisd_one_rdm", | ||
| data=(self.cisd_one_rdm if self.cisd_one_rdm is not None else False), | ||
| compression=("gzip" if self.cisd_one_rdm is not None else None), | ||
| ) | ||
| f.create_dataset( | ||
| "cisd_two_rdm", | ||
| data=(self.cisd_two_rdm if self.cisd_two_rdm is not None else False), | ||
| compression=("gzip" if self.cisd_two_rdm is not None else None), | ||
| ) | ||
| # Save attributes generated from exact diagonalization. | ||
| f.create_dataset( | ||
| "fci_energy", data=(self.fci_energy if self.fci_energy is not None else False) | ||
| ) | ||
| f.create_dataset( | ||
| "fci_one_rdm", | ||
| data=(self.fci_one_rdm if self.fci_one_rdm is not None else False), | ||
| compression=("gzip" if self.fci_one_rdm is not None else None), | ||
| ) | ||
| f.create_dataset( | ||
| "fci_two_rdm", | ||
| data=(self.fci_two_rdm if self.fci_two_rdm is not None else False), | ||
| compression=("gzip" if self.fci_two_rdm is not None else None), | ||
| ) | ||
| # Save attributes generated from CCSD calculation. | ||
| f.create_dataset( | ||
| "ccsd_energy", data=(self.ccsd_energy if self.ccsd_energy is not None else False) | ||
| ) | ||
| f.create_dataset( | ||
| "ccsd_single_amps", | ||
| data=(self.ccsd_single_amps if self.ccsd_single_amps is not None else False), | ||
| compression=("gzip" if self.ccsd_single_amps is not None else None), | ||
| ) | ||
| f.create_dataset( | ||
| "ccsd_double_amps", | ||
| data=(self.ccsd_double_amps if self.ccsd_double_amps is not None else False), | ||
| compression=("gzip" if self.ccsd_double_amps is not None else None), | ||
| ) | ||
| def _write_hdf5_data(self, hdf5_file): | ||
| """Method to write the class data to an HDF5 file. | ||
|
|
||
| # Save general calculation data | ||
| key_list = list(self.general_calculations.keys()) | ||
| f.create_dataset( | ||
| "general_calculations_keys", | ||
| data=([numpy.bytes_(key) for key in key_list] if len(key_list) > 0 else False), | ||
| ) | ||
| f.create_dataset( | ||
| "general_calculations_values", | ||
| data=( | ||
| [self.general_calculations[key] for key in key_list] | ||
| if len(key_list) > 0 | ||
| else False | ||
| ), | ||
| ) | ||
| The hdf5_file parameter is expected to be an already-open h5py.File handle. | ||
| """ | ||
| # Save geometry (atoms and positions need to be separate): | ||
| d_geom = hdf5_file.create_group("geometry") | ||
| if not isinstance(self.geometry, basestring): | ||
| atoms = [numpy.bytes_(item[0]) for item in self.geometry] | ||
| positions = numpy.array([list(item[1]) for item in self.geometry]) | ||
| else: | ||
| atoms = numpy.bytes_(self.geometry) | ||
| positions = None | ||
| d_geom.create_dataset("atoms", data=(atoms if atoms is not None else False)) | ||
| d_geom.create_dataset("positions", data=(positions if positions is not None else False)) | ||
| # Save basis: | ||
| hdf5_file.create_dataset("basis", data=numpy.bytes_(self.basis)) | ||
| # Save multiplicity: | ||
| hdf5_file.create_dataset("multiplicity", data=self.multiplicity) | ||
| # Save charge: | ||
| hdf5_file.create_dataset("charge", data=self.charge) | ||
| # Save description: | ||
| hdf5_file.create_dataset("description", data=numpy.bytes_(self.description)) | ||
| # Save name: | ||
| hdf5_file.create_dataset("name", data=numpy.bytes_(self.name)) | ||
| # Save n_atoms: | ||
| hdf5_file.create_dataset("n_atoms", data=self.n_atoms) | ||
| # Save atoms: | ||
| hdf5_file.create_dataset("atoms", data=numpy.bytes_(self.atoms)) | ||
| # Save protons: | ||
| hdf5_file.create_dataset("protons", data=self.protons) | ||
| # Save n_electrons: | ||
| hdf5_file.create_dataset("n_electrons", data=self.n_electrons) | ||
| # Save generic attributes from calculations: | ||
| hdf5_file.create_dataset( | ||
| "n_orbitals", data=(self.n_orbitals if self.n_orbitals is not None else False) | ||
| ) | ||
| hdf5_file.create_dataset( | ||
| "n_qubits", data=(self.n_qubits if self.n_qubits is not None else False) | ||
| ) | ||
| hdf5_file.create_dataset( | ||
| "nuclear_repulsion", | ||
| data=(self.nuclear_repulsion if self.nuclear_repulsion is not None else False), | ||
| ) | ||
| # Save attributes generated from SCF calculation. | ||
| hdf5_file.create_dataset( | ||
| "hf_energy", data=(self.hf_energy if self.hf_energy is not None else False) | ||
| ) | ||
| hdf5_file.create_dataset( | ||
| "canonical_orbitals", | ||
| data=(self.canonical_orbitals if self.canonical_orbitals is not None else False), | ||
| compression=("gzip" if self.canonical_orbitals is not None else None), | ||
| ) | ||
| hdf5_file.create_dataset( | ||
| "overlap_integrals", | ||
| data=(self.overlap_integrals if self.overlap_integrals is not None else False), | ||
| compression=("gzip" if self.overlap_integrals is not None else None), | ||
| ) | ||
| hdf5_file.create_dataset( | ||
| "orbital_energies", | ||
| data=(self.orbital_energies if self.orbital_energies is not None else False), | ||
| ) | ||
| # Save attributes generated from integrals. | ||
| hdf5_file.create_dataset( | ||
| "one_body_integrals", | ||
| data=(self.one_body_integrals if self.one_body_integrals is not None else False), | ||
| compression=("gzip" if self.one_body_integrals is not None else None), | ||
| ) | ||
| hdf5_file.create_dataset( | ||
| "two_body_integrals", | ||
| data=(self.two_body_integrals if self.two_body_integrals is not None else False), | ||
| compression=("gzip" if self.two_body_integrals is not None else None), | ||
| ) | ||
| # Save attributes generated from MP2 calculation. | ||
| hdf5_file.create_dataset( | ||
| "mp2_energy", data=(self.mp2_energy if self.mp2_energy is not None else False) | ||
| ) | ||
| # Save attributes generated from CISD calculation. | ||
| hdf5_file.create_dataset( | ||
| "cisd_energy", data=(self.cisd_energy if self.cisd_energy is not None else False) | ||
| ) | ||
| hdf5_file.create_dataset( | ||
| "cisd_one_rdm", | ||
| data=(self.cisd_one_rdm if self.cisd_one_rdm is not None else False), | ||
| compression=("gzip" if self.cisd_one_rdm is not None else None), | ||
| ) | ||
| hdf5_file.create_dataset( | ||
| "cisd_two_rdm", | ||
| data=(self.cisd_two_rdm if self.cisd_two_rdm is not None else False), | ||
| compression=("gzip" if self.cisd_two_rdm is not None else None), | ||
| ) | ||
| # Save attributes generated from exact diagonalization. | ||
| hdf5_file.create_dataset( | ||
| "fci_energy", data=(self.fci_energy if self.fci_energy is not None else False) | ||
| ) | ||
| hdf5_file.create_dataset( | ||
| "fci_one_rdm", | ||
| data=(self.fci_one_rdm if self.fci_one_rdm is not None else False), | ||
| compression=("gzip" if self.fci_one_rdm is not None else None), | ||
| ) | ||
| hdf5_file.create_dataset( | ||
| "fci_two_rdm", | ||
| data=(self.fci_two_rdm if self.fci_two_rdm is not None else False), | ||
| compression=("gzip" if self.fci_two_rdm is not None else None), | ||
| ) | ||
| # Save attributes generated from CCSD calculation. | ||
| hdf5_file.create_dataset( | ||
| "ccsd_energy", data=(self.ccsd_energy if self.ccsd_energy is not None else False) | ||
| ) | ||
| hdf5_file.create_dataset( | ||
| "ccsd_single_amps", | ||
| data=(self.ccsd_single_amps if self.ccsd_single_amps is not None else False), | ||
| compression=("gzip" if self.ccsd_single_amps is not None else None), | ||
| ) | ||
| hdf5_file.create_dataset( | ||
| "ccsd_double_amps", | ||
| data=(self.ccsd_double_amps if self.ccsd_double_amps is not None else False), | ||
| compression=("gzip" if self.ccsd_double_amps is not None else None), | ||
| ) | ||
|
|
||
| # Remove old file first for compatibility with systems that don't allow | ||
| # rename replacement. Catching OSError for when file does not exist | ||
| # yet | ||
| try: | ||
| os.remove("{}.hdf5".format(self.filename)) | ||
| except OSError: | ||
| # Nothing to do but carry on. | ||
| pass | ||
| # Save general calculation data | ||
| key_list = list(self.general_calculations.keys()) | ||
| hdf5_file.create_dataset( | ||
| "general_calculations_keys", | ||
| data=([numpy.bytes_(key) for key in key_list] if len(key_list) > 0 else False), | ||
| ) | ||
| hdf5_file.create_dataset( | ||
| "general_calculations_values", | ||
| data=( | ||
| [self.general_calculations[key] for key in key_list] if len(key_list) > 0 else False | ||
| ), | ||
| ) | ||
|
|
||
| shutil.move("{}.hdf5".format(tmp_name), "{}.hdf5".format(self.filename)) | ||
| def save(self): | ||
| """Method to save the class under a systematic name.""" | ||
| # Create a temporary file in the same directory as self.filename, write data to it, then | ||
| # finally replace self.filename with the temp file. | ||
| output_dir = os.path.dirname(os.path.abspath(self.filename)) | ||
| final_filename = f"{self.filename}.hdf5" | ||
| tmp_name = None | ||
| try: | ||
| # Use delete=False for Windows compatibility (allows h5py to open the path). | ||
| with tempfile.NamedTemporaryFile( | ||
| delete=False, suffix='.hdf5', dir=output_dir | ||
| ) as tmp_file: | ||
| tmp_name = tmp_file.name | ||
| with h5py.File(tmp_file, "w") as hdf5_file: | ||
| self._write_hdf5_data(hdf5_file) | ||
|
|
||
| # Determine the umask by setting it to an arbitrary value & immediately restoring it. | ||
| # Note: trying to set umask(0) can be problematic, so we use something else. | ||
| current_umask = os.umask(0o600) | ||
| os.umask(current_umask) | ||
| # Now apply the umask to the file and move it. | ||
| os.chmod(tmp_name, 0o666 & ~current_umask) | ||
|
|
||
| # os.replace is atomic and works across platforms. | ||
| os.replace(tmp_name, final_filename) | ||
|
mhucka marked this conversation as resolved.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This will save HDF5 files with user-only access permissions. I'd suggest adjusting the initial code to write the file with permissions that respect the umask.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I'm not sure I understand. The code already writes to a temporary file:

with tempfile.NamedTemporaryFile(
    delete=False, suffix='.hdf5', dir=output_dir
) as tmp_file:
    tmp_name = tmp_file.name

Can you clarify?
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The old code was respecting the umask, which is not the case with the new code. Another option is to keep it as is, but adjust the new file permissions to reflect the umask.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The latest commit changes the way the file writing is done and also reads the umask and chmods the file accordingly. Is this what you have in mind?
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Yes, but the stanza to determine the umask should be executed just once (and the umask cached) rather than being redone for every save. Was this PR initiated because Gemini found a uuid4() call and suggested a security improvement? @mpharrigan — is there a concern that this should support high-throughput writes (e.g., save 10_000 MolecularData objects as quickly as possible)? If not, we could just use a standard self-deleting TemporaryFile and, after a successful write, copy it over to the final filename.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I can't remember what flagged it (it might have been GitHub's scanners). IIRC it was about security and possible race conditions. I also think this matter is no longer worth the effort, so let's close this PR.
||
| tmp_name = None | ||
| finally: | ||
| # Clean up if something went wrong. | ||
| if tmp_name and os.path.exists(tmp_name): | ||
| os.remove(tmp_name) # pragma: no cover | ||
|
|
||
| def load(self): | ||
| geometry = [] | ||
|
|
||
| with h5py.File("{}.hdf5".format(self.filename), "r") as f: | ||
| with h5py.File(f"{self.filename}.hdf5", "r") as f: | ||
|
mhucka marked this conversation as resolved.
|
||
| # Load geometry: | ||
| data = f["geometry/atoms"] | ||
| if data.shape != (()): | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.