Hello,
I am using the code below (very similar to the one in the Google Colab tutorial) on a dataset of ~3,000 structures with 150-190 atoms each. I have access to one cluster with 128 GB of RAM and one with 384 GB. On both, the script uses up all of the available memory and crashes, typically while setting up the training data.
from ase.io import read
import numpy as np
import flare_pp._C_flare as flare_pp
from flare_pp.sparse_gp import SGP_Wrapper
from flare_pp.sparse_gp_calculator import SGP_Calculator
dataset = read('processed_dataset.xyz', index=":", format='extxyz')
n_strucs = len(dataset)
forces = [x.get_forces() for x in dataset]
positions = [x.get_positions() for x in dataset]
cell = np.array(dataset[0].cell.tolist())
species = dataset[0].get_atomic_numbers()
species_code = {k:n for n,k in enumerate(set(dataset[0].get_atomic_numbers()))}
coded_species = []
for spec in species:
    coded_species.append(species_code[spec])
# Choose training and validation structures.
training_size = int(n_strucs*.8)
validation_size = n_strucs - training_size
shuffled_frames = list(range(n_strucs))
np.random.shuffle(shuffled_frames)
training_pts = shuffled_frames[0:training_size]
validation_pts = shuffled_frames[training_size:training_size + validation_size]
parameters = {}
# Define many-body descriptor.
cutoff = parameters.get("cutoff",7)
n_species = len(set(species))
N = parameters.get("n_max",15) # Number of radial basis functions
lmax = parameters.get("l_max",3) # Largest L included in spherical harmonics
radial_basis = parameters.get("radial_basis_function","chebyshev") # Radial basis set
cutoff_name = parameters.get("cutoff_function","quadratic") # Cutoff function
radial_hyps = [0, cutoff]
cutoff_hyps = []
descriptor_settings = [n_species, N, lmax]
# Define a B2 object.
B2 = flare_pp.B2(radial_basis, cutoff_name, radial_hyps, cutoff_hyps,
                 descriptor_settings)
# The GP class can take a list of descriptors as input, but here
# we'll use a single descriptor.
descriptors = [B2]
# Define kernel function.
sigma = parameters.get("sigma",2.0)
power = parameters.get("power",2)
dot_product_kernel = flare_pp.NormalizedDotProduct(sigma, power)
# Define a list of kernels.
# There needs to be one kernel for each descriptor.
kernels = [dot_product_kernel]
# Define sparse GP.
noa = len(dataset[0])  # number of atoms, taken from the first structure
sigma_e = parameters.get("sigma_e", 0.005 * noa)  # Energy noise (about 5 meV/atom if energies are in eV)
sigma_f = parameters.get("sigma_f", 0.005)  # Force noise (about 5 meV/A if forces are in eV/A)
sigma_s = parameters.get("sigma_s", 0.0007)  # Stress noise (about 0.1 GPa if stresses are in eV/A^3)
gp_model = flare_pp.SparseGP(kernels, sigma_e, sigma_f, sigma_s)
# Calculate descriptors of the validation and training structures.
print("Computing descriptors of validation points...")
validation_strucs = []
validation_forces = []
for n, snapshot in enumerate(validation_pts):
    pos = positions[snapshot]
    frcs = forces[snapshot]
    # Create structure object, which computes and stores descriptors.
    struc = flare_pp.Structure(cell, coded_species, pos, cutoff, descriptors)
    validation_strucs.append(struc)
    validation_forces.append(frcs)
print("Done.")
print("Computing descriptors of training points...")
training_strucs = []
training_forces = []
## CODE TYPICALLY CRASHES IN THE LOOP BELOW, usually somewhere between iteration ~20 and ~160 depending on which cluster is used:
for n, snapshot in enumerate(training_pts):
    pos = positions[snapshot]
    frcs = forces[snapshot]
    # Create structure object, which computes and stores descriptors.
    struc = flare_pp.Structure(cell, coded_species, pos, cutoff, descriptors)
    # Assign force labels to the training structure.
    struc.forces = frcs.reshape(-1)
    training_strucs.append(struc)
    training_forces.append(frcs)
print("Done.")
# Train the model.
print("Training the GP...")
batch_size = 50  # monitor the validation MAE after adding this many frames
n_batches = training_size // batch_size
batch_counts = np.zeros(n_batches)
mb_maes = np.zeros(n_batches)
for m in range(training_size):
    train_struc = training_strucs[m]
    # Add training structure and sparse environments.
    gp_model.add_training_structure(train_struc)
    gp_model.add_all_environments(train_struc)
    if (m + 1) % batch_size == 0:
        # Update the sparse GP training coefficients.
        gp_model.update_matrices_QR()
        # Predict on the validation set.
        pred_forces = []
        for n, test_struc in enumerate(validation_strucs):
            gp_model.predict_SOR(test_struc)
            c_noa = test_struc.noa
            # mean_efs stores [energy, forces (3 * noa), stress (6)].
            pred_vals = test_struc.mean_efs[1:-6].reshape(c_noa, 3)
            pred_forces.append(pred_vals)
        # Calculate and store the MAE over all validation force components.
        batch_no = (m + 1) // batch_size
        v_f = np.concatenate([f.reshape(-1) for f in validation_forces])
        p_f = np.concatenate([f.reshape(-1) for f in pred_forces])
        mae = np.mean(np.abs(v_f - p_f))
        batch_counts[batch_no - 1] = batch_size * batch_no
        mb_maes[batch_no - 1] = mae
        print("Batch %i MAE: %.2f eV/A" % (batch_no, mae))
# Write lammps potential file.
file_name = "trained_sparse_gaussian_model.txt"
contributor = parameters.get("model_description", "contributor")  # name recorded in the potential file
# The "kernel index" indicates which kernel to map for multi-descriptor models.
# For single-descriptor models like this one, just set it to 0.
kernel_index = 0
gp_model.write_mapping_coefficients(file_name, contributor, kernel_index)
Is there a more efficient way to load the data as needed so that the system memory is not completely used up?
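For example, I was imagining something like the rough sketch below (not tested), which streams the frames with ase.io.iread and adds each structure to the sparse GP right away, instead of keeping every flare_pp.Structure and its descriptors in memory at once. It assumes gp_model, cell, coded_species, cutoff, descriptors, batch_size, and training_pts are defined as in the script above.

from ase.io import iread

# Sketch only: stream frames one at a time instead of materializing all
# flare_pp.Structure objects (and their descriptors) up front.
training_set = set(training_pts)
m_train = 0
for m, atoms in enumerate(iread('processed_dataset.xyz', index=":", format='extxyz')):
    if m not in training_set:
        continue
    struc = flare_pp.Structure(cell, coded_species, atoms.get_positions(),
                               cutoff, descriptors)
    struc.forces = atoms.get_forces().reshape(-1)
    gp_model.add_training_structure(struc)
    gp_model.add_all_environments(struc)
    m_train += 1
    if m_train % batch_size == 0:
        gp_model.update_matrices_QR()
    # struc goes out of scope at the end of each iteration, so Python can
    # free it -- assuming flare_pp does not keep its own copy internally.

Would something like this actually reduce the memory footprint, or does the SparseGP keep a copy of every training structure internally so the memory would grow regardless?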