Update aspirin rMD17 example: use rmd17 data paths, pass read_element to get_confs, rename SME to SWS in plots

emmanuellujan · emmanuellujan · commit db3fd069c5d2 · 2025-10-30T20:10:20.000-04:00
diff --git a/examples/atomistic/srs-vs-sme-aspirin-rmd17.jl b/examples/atomistic/srs-vs-sme-aspirin-rmd17.jl
@@ -6,9 +6,9 @@ using StreamingSampling
 include("utils/utils.jl")
 
 # Define paths and create experiment folder
-train_path = ["data/md17/aspirin-train.xyz"]
-test_path = ["data/md17/aspirin-test.xyz"]
-res_path  = "results-aspirin-md17/"
+train_path = ["data/rmd17/aspirin-train.xyz"]
+test_path = ["data/rmd17/aspirin-test.xyz"]
+res_path  = "results-aspirin-rmd17/"
 run(`mkdir -p $res_path`)
 
 # Initialize streaming sampling ################################################
@@ -101,33 +101,39 @@ for j in 1:n_experiments
                            chunksize=m,
                            buffersize=1,
                            randomized=true)
-    _, test_inds = take!(ch)
+    cs, test_inds = take!(ch)
     close(ch)
-    test_inds = sort(test_inds)
-    test_confs = get_confs(test_path, test_inds)
-    test_ds = calc_descr(test_confs, basis_fitting)
+    test_confs = []
+    for c in cs
+        system, energy, forces = c
+        conf = Configuration(system, Energy(energy),
+                             Forces([Force(f) for f in forces]))
+        push!(test_confs, conf)
+    end
+    ds_test = DataSet(test_confs)
+    ds_test = calc_descr!(ds_test, basis_fitting)
     open("test-ds-aspirin-rmd17.jls", "w") do io
-     serialize(io, test_ds)
-     flush(io)
+        serialize(io, ds_test)
+        flush(io)
     end
-    #test_ds = deserialize("test-ds-aspirin-rmd17.jls")
+    #ds_test = deserialize("test-ds-aspirin-rmd17.jls")
     
     for n in sample_sizes
         # Sample training dataset using streaming weighted sampling ############
         train_inds = StatsBase.sample(1:length(ws), Weights(ws), n;
-                     replace=false, ordered=true))
+                     replace=false, ordered=true)
         #Load atomistic configurations
-        train_confs = get_confs(train_path, train_inds)
+        ds_train = get_confs(train_path, read_element, train_inds)
         #Adjust reference energies (permanent change)
-        adjust_energies(train_confs, vref_dict)
+        adjust_energies!(ds_train, vref_dict)
         # Compute dataset with energy and force descriptors
-        train_ds = calc_descr(train_confs, basis_fitting)
+        ds_train = calc_descr!(ds_train, basis_fitting)
         # Create result folder
         curr_sampler = "sws"
         exp_path = "$res_path/$j-$curr_sampler-n$n/"
         run(`mkdir -p $exp_path`)
         # Fit and save results
-        metrics_j = fit(exp_path, train_ds, test_ds, basis_fitting; vref_dict=vref_dict)
+        metrics_j = fit(exp_path, ds_train, ds_test, basis_fitting; vref_dict=vref_dict)
         metrics_j = merge(OrderedDict("exp_number" => j,
                                       "method" => "$curr_sampler",
                                       "batch_size_prop" => n/N,
@@ -142,17 +148,17 @@ for j in 1:n_experiments
         train_inds = randperm(N)[1:n]
         
         #Load atomistic configurations
-        train_confs = get_confs(train_path, train_inds)
+        ds_train = get_confs(train_path, read_element, train_inds)
         #Adjust reference energies (permanent change)
-        adjust_energies(train_confs, vref_dict)
+        adjust_energies!(ds_train, vref_dict)
         # Compute dataset with energy and force descriptors
-        train_ds = calc_descr(train_confs, basis_fitting)
+        ds_train = calc_descr!(ds_train, basis_fitting)
         # Create result folder
         curr_sampler = "srs"
         exp_path = "$res_path/$j-$curr_sampler-n$n/"
         run(`mkdir -p $exp_path`)
         # Fit and save results
-        metrics_j = fit(exp_path, train_ds, test_ds, basis_fitting; vref_dict=vref_dict)
+        metrics_j = fit(exp_path, ds_train, ds_test, basis_fitting; vref_dict=vref_dict)
         metrics_j = merge(OrderedDict("exp_number" => j,
                                       "method" => "$curr_sampler",
                                       "batch_size_prop" => n/N,
diff --git a/examples/atomistic/utils/fitting-utils.jl b/examples/atomistic/utils/fitting-utils.jl
@@ -1,6 +1,9 @@
-function get_confs(path, inds)
+function get_confs(path, read_element, inds)
     confs = []
-    ch, N = chunk_iterator(train_path; chunksize=1000, randomized=false)
+    ch, N = chunk_iterator(path;
+                           read_element=read_element,
+                           chunksize=1000,
+                           randomized=false)
     k = 1
     for (c, ci) in ch
         j = 1
diff --git a/examples/atomistic/utils/plot-err-per-sample.jl b/examples/atomistic/utils/plot-err-per-sample.jl
@@ -4,7 +4,7 @@ function plot_err_per_sample(res_path, metrics_filename)
     sort!(df, [:batch_size])
 
     srs = filter(:method => ==("srs"),   df)
-    sme = filter(:method => ==("sme"), df)
+    ss = filter(:method => ==("sws"), df)
 
     # ---------------- Percent formatting (round UP, fixed) ----------------
     # ≥ 1%  -> ceil to integer (no decimals)
@@ -24,9 +24,9 @@ function plot_err_per_sample(res_path, metrics_filename)
     end
 
     # ---------------- X tick labels ----------------
-    xs = sme.batch_size
+    xs = ss.batch_size
     xtick_labels = [string(bs, "\n", format_percent_roundup(prop))
-                    for (bs, prop) in zip(sme.batch_size, sme.batch_size_prop)]
+                    for (bs, prop) in zip(ss.batch_size, ss.batch_size_prop)]
 
     # ---------------- Colors ----------------
     black = RGB(0,0,0)
@@ -69,14 +69,14 @@ function plot_err_per_sample(res_path, metrics_filename)
     )
 
     pE_bottom = plot(
-        sme.batch_size, sme.e_test_mae;
+        ss.batch_size, ss.e_test_mae;
         color = red, lw = 5.5, marker = :utriangle,
         xlabel = "Training Dataset Size (Sample Size)",
         ylabel = "E MAE | eV/atom",
-        label = "SME",
+        label = "SWS",
         xticks = (xs, xtick_labels),
         legend = :topright,
-        ylims = padlims(sme.e_test_mae),
+        ylims = padlims(ss.e_test_mae),
     )
 
     energy_plot = plot(pE_top, pE_bottom; layout=(2,1), size=(1100,1100))
@@ -93,22 +93,18 @@ function plot_err_per_sample(res_path, metrics_filename)
     )
 
     pF_bottom = plot(
-        sme.batch_size, sme.f_test_mae;
+        ss.batch_size, ss.f_test_mae;
         color = red, lw = 5.5, marker = :utriangle,
         xlabel = "Training Dataset Size (Sample Size)",
         ylabel = "F MAE | eV/Å",
-        label = "SME",
+        label = "SWS",
         xticks = (xs, xtick_labels),
         legend = :topright,
-        ylims = padlims(SME.f_test_mae),
+        ylims = padlims(ss.f_test_mae),
     )
 
     force_plot = plot(pF_top, pF_bottom; layout=(2,1), size=(1100,1100))
     savefig(force_plot, "$res_path/f_test_mae_by_sample.pdf")
-
-    println("✅ Saved:")
-    println(" - e_test_mae_by_sample.pdf")
-    println(" - f_test_mae_by_sample.pdf")
 end
 
 function plot_err_per_sample_2(res_path, metrics_filename)
diff --git a/examples/atomistic/utils/subtract-peratom-e.jl b/examples/atomistic/utils/subtract-peratom-e.jl
@@ -10,7 +10,7 @@ function subtract_peratom_e(config::Configuration, vref_dict)
     Energy(new_e,e_unit)
 end
 
-function adjust_energies(ds, vref_dict)
+function adjust_energies!(ds, vref_dict)
     for config in ds
         new_energy = subtract_peratom_e(config,vref_dict)
         config.data[Energy] = new_energy
diff --git a/examples/atomistic/utils/utils.jl b/examples/atomistic/utils/utils.jl
@@ -5,6 +5,7 @@ using DataFrames
 using DelimitedFiles
 using Determinantal
 using InteratomicPotentials
+using LinearAlgebra
 using LowRankApprox
 using Measures
 using OrderedCollections
diff --git a/src/Weights.jl b/src/Weights.jl
@@ -171,7 +171,7 @@ function compute_chunk_weights(features::Matrix{Float64})
     # Form an L-ensemble based on the kernel matrix K
     dpp = EllEnsemble(K)
-    # Scale so that the expected size is 1
+    # Scale so that the expected size is N ÷ 2
-    rescale!(dpp, 1)
+    rescale!(dpp, N ÷ 2)
     # Compute inclusion probabilities.
     inclusion_probs = Determinantal.inclusion_prob(dpp)
     return inclusion_probs