
Help for processing large dataset #44

@mhsiron

Description

Hello,

I am using the following code (very similar to the one in the Google Colab tutorial) on a dataset of ~3,000 structures with 150-190 atoms each. I have one cluster with 128 GB of memory and another with 384 GB. On both, the code uses up all of the memory and crashes, typically while setting up the training data.

from ase.io import read
import numpy as np
import flare_pp._C_flare as flare_pp
from flare_pp.sparse_gp import SGP_Wrapper
from flare_pp.sparse_gp_calculator import SGP_Calculator

dataset = read('processed_dataset.xyz', index=":", format='extxyz')

n_strucs = len(dataset)
forces = [x.get_forces() for x in dataset]
positions = [x.get_positions() for x in dataset]

# The frames have 150-190 atoms each, so the cell and species list are taken
# per frame rather than from dataset[0] only.
cells = [np.array(x.cell.tolist()) for x in dataset]
species_lists = [x.get_atomic_numbers() for x in dataset]

# Map atomic numbers to consecutive integer codes.
species_code = {k: n for n, k in enumerate(sorted(set(np.concatenate(species_lists))))}
coded_species = [np.array([species_code[z] for z in nums]) for nums in species_lists]

# Choose training and validation structures.
training_size = int(n_strucs*.8)
validation_size = n_strucs-int(n_strucs*.8)
shuffled_frames = [int(n) for n in range(n_strucs)]
np.random.shuffle(shuffled_frames)

training_pts = shuffled_frames[0:training_size]
validation_pts = shuffled_frames[training_size:training_size + validation_size]


parameters = {}
# Define many-body descriptor.
cutoff = parameters.get("cutoff",7)
n_species = len(species_code)
N = parameters.get("n_max",15)  # Number of radial basis functions
lmax = parameters.get("l_max",3)  # Largest L included in spherical harmonics
radial_basis = parameters.get("radial_basis_function","chebyshev")  # Radial basis set
cutoff_name = parameters.get("cutoff_function","quadratic")  # Cutoff function

radial_hyps = [0, cutoff]
cutoff_hyps = []
descriptor_settings = [n_species, N, lmax]

# Define a B2 object.
B2 = flare_pp.B2(radial_basis, cutoff_name, radial_hyps, cutoff_hyps,
                 descriptor_settings)

# The GP class can take a list of descriptors as input, but here
# we'll use a single descriptor.
descriptors = [B2]

# Define kernel function.
sigma = parameters.get("sigma",2.0)
power = parameters.get("power",2)
dot_product_kernel = flare_pp.NormalizedDotProduct(sigma, power)

# Define a list of kernels.
# There needs to be one kernel for each descriptor.
kernels = [dot_product_kernel]

# Define sparse GP.
noa = len(dataset[0])  # atoms per frame (frames have 150-190 atoms, so this is approximate)
sigma_e = parameters.get("sigma_e", 0.005 * noa)  # Energy noise (in eV, so about 5 meV/atom)
sigma_f = parameters.get("sigma_f", 0.005)  # Force noise (in eV/A, so about 5 meV/A)
sigma_s = parameters.get("sigma_s", 0.0007)  # Stress noise (in eV/A^3, so about 0.1 GPa)
gp_model = flare_pp.SparseGP(kernels, sigma_e, sigma_f, sigma_s)

# Calculate descriptors of the validation and training structures.
print("Computing descriptors of validation points...")
validation_strucs = []
validation_forces = [] 
for n, snapshot in enumerate(validation_pts):
    pos = positions[snapshot]
    frcs = forces[snapshot]

    # Create structure object, which computes and stores descriptors.
    struc = flare_pp.Structure(cells[snapshot], coded_species[snapshot],
                               pos, cutoff, descriptors)
    validation_strucs.append(struc)
    validation_forces.append(frcs)
print("Done.")

print("Computing descriptors of training points...")
training_strucs = []
training_forces = [] 

## The code typically crashes in the loop below, usually anywhere from ~20 to ~160 iterations in, depending on which cluster is used:
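# (My guess is that each flare_pp.Structure keeps its descriptor values and
# gradients in memory, so holding a couple of thousand of them at once is what
# exhausts the RAM, though I may be wrong about that.)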

for n, snapshot in enumerate(training_pts):
    pos = positions[snapshot]
    frcs = forces[snapshot]
    # Create structure object, which computes and stores descriptors.
    struc = flare_pp.Structure(cells[snapshot], coded_species[snapshot],
                               pos, cutoff, descriptors)

    # Assign force labels to the training structure.
    struc.forces = frcs.reshape(-1)

    training_strucs.append(struc)
    training_forces.append(frcs)
print("Done.")

# Train the model.
print("Training the GP...")
batch_size = 50  # monitor the validation MAE after adding this many frames
n_batches = training_size // batch_size
batch_counts = np.zeros(n_batches)
mb_maes = np.zeros(n_batches)
for m in range(training_size):
    train_struc = training_strucs[m]
    # Add training structure and sparse environments.
    gp_model.add_training_structure(train_struc)
    gp_model.add_all_environments(train_struc)
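    # (As I understand it, add_all_environments adds every atom in the frame to
    # the sparse set, so the sparse GP grows by ~150-190 points per structure.)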

    if (m + 1) % batch_size == 0:
        # Update the sparse GP training coefficients.
        gp_model.update_matrices_QR()

        # Predict on the validation set.
        pred_forces = [] #np.zeros((validation_size, noa, 3))
        for n, test_struc in enumerate(validation_strucs):
            gp_model.predict_SOR(test_struc)
            c_noa = test_struc.noa
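            # mean_efs holds [energy, 3*noa force components, 6 stress
            # components], so [1:-6] picks out the predicted forces.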
            pred_vals = test_struc.mean_efs[1:-6].reshape(c_noa, 3)
            pred_forces.append(pred_vals)

        # Calculate and store the MAE.
        batch_no = int((m + 1) / batch_size)
        v_f = np.concatenate([f.reshape(-1) for f in validation_forces])
        p_f = np.concatenate([f.reshape(-1) for f in pred_forces])
        mae = np.mean(np.abs(v_f - p_f))
        batch_counts[batch_no - 1] = batch_size * batch_no
        mb_maes[batch_no - 1] = mae
        print("Batch %i MAE: %.2f eV/A" % (batch_no, mae))
# Write the LAMMPS potential file.
file_name = "trained_sparse_gaussian_model.txt"
contributor = "contributor_name"  # placeholder string written into the potential file header

# The "kernel index" indicates which kernel to map for multi-descriptor models.
# For single-descriptor models like this one, just set it to 0.
kernel_index = 0

gp_model.write_mapping_coefficients(file_name, contributor, kernel_index)

Is there a more efficient way to load the data as needed so that the system memory is not completely used up?
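For reference, this is roughly the kind of streaming approach I had in mind (a rough, untested sketch: ase.io.iread yields one frame at a time instead of loading the whole trajectory, nothing from a frame is kept in a Python list, so its arrays can be garbage-collected before the next iteration, and the train/validation split is left out for brevity; gp_model, descriptors, cutoff, and species_code would be set up as in the script above):

from ase.io import iread
import numpy as np
import flare_pp._C_flare as flare_pp

for atoms in iread('processed_dataset.xyz', format='extxyz'):
    cell = np.array(atoms.cell.tolist())
    coded = np.array([species_code[z] for z in atoms.get_atomic_numbers()])
    pos = atoms.get_positions()
    frcs = atoms.get_forces()

    # Build the structure (descriptors are computed here) and hand it to the GP.
    struc = flare_pp.Structure(cell, coded, pos, cutoff, descriptors)
    struc.forces = frcs.reshape(-1)
    gp_model.add_training_structure(struc)
    gp_model.add_all_environments(struc)

Would something along these lines keep the peak memory down, or does the sparse GP end up storing all of the descriptors internally anyway?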
