diff --git a/CHANGELOG.md b/CHANGELOG.md index 54877af..61fb5d6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,7 @@ ## v0.6.1 +- Add pyFANS support for large strain and fix pyFANS MPI [#124](https://github.com/DataAnalyticsEngineering/FANS/pull/124) - Adds support to write integration point data to disk [#117](https://github.com/DataAnalyticsEngineering/FANS/pull/117) ## v0.6.0 diff --git a/CMakeLists.txt b/CMakeLists.txt index 85a7736..91523d6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -112,6 +112,9 @@ find_package(nlohmann_json REQUIRED) # TARGETS # ############################################################################## +set(FANS_VERBOSITY 5 CACHE STRING "Set verbosity level: 0 none, 5 all") +add_definitions(-DVERBOSITY=${FANS_VERBOSITY}) + option(FANS_BUILD_STATIC "Build static library" OFF) if (FANS_BUILD_STATIC) add_library(FANS_FANS STATIC) @@ -150,6 +153,7 @@ target_include_directories(FANS_FANS PUBLIC $ Matmodel *createMatmodel(const Reader &reader); @@ -25,7 +27,7 @@ struct MaterialInfo { * */ template -class MaterialManager { +class MaterialManager : public Serializable { private: MaterialInfo *phase_to_info{nullptr}; // [n_phases] - HOT DATA int n_phases; @@ -110,51 +112,47 @@ class MaterialManager { compute_reference_stiffness(reader); // Print detailed information about material configuration for logging - if (reader.world_rank == 0) { - printf("\n# MaterialManager initialized:\n"); - printf("# Number of material models: %zu\n", models.size()); - printf("# Number of phases: %d\n#\n", n_phases); - - for (size_t i = 0; i < mats.size(); ++i) { - const auto &mg = mats[i]; - printf("# Material Model %zu: %s\n", i + 1, mg["matmodel"].get().c_str()); - - // Print phases - auto phases = mg["phases"].get>(); - printf("# Phases: ["); - for (size_t j = 0; j < phases.size(); ++j) { - printf("%d", phases[j]); - if (j < phases.size() - 1) - printf(", "); - } - printf("]\n"); - - // Print material properties - printf("# Material properties:\n"); - const auto &props = mg["material_properties"]; - for (auto it = props.begin(); it != props.end(); ++it) { - printf("# %s: ", it.key().c_str()); - if (it.value().is_array()) { - printf("["); - for (size_t k = 0; k < it.value().size(); ++k) { - if (it.value()[k].is_number()) { - printf("%.5g", it.value()[k].get()); - } else if (it.value()[k].is_string()) { - printf("\"%s\"", it.value()[k].get().c_str()); - } - if (k < it.value().size() - 1) - printf(", "); + Log::io->info() << "\n# MaterialManager initialized:\n"; + Log::io->info() << "# Number of material models: " << models.size() << "\n"; + Log::io->info() << "# Number of phases: " << n_phases << "\n#\n"; + + for (size_t i = 0; i < mats.size(); ++i) { + const auto &mg = mats[i]; + Log::io->info() << "# Material Model " << i + 1 << ": " << mg["matmodel"].get() << "\n"; + + // Print phases + auto phases = mg["phases"].get>(); + Log::io->info() << "# Phases: ["; + for (size_t j = 0; j < phases.size(); ++j) { + Log::io->info(true) << phases[j]; + if (j < phases.size() - 1) Log::io->info(true) << ", "; + } + Log::io->info(true) << "]\n"; + + // Print material properties + Log::io->info() << "# Material properties:\n"; + const auto &props = mg["material_properties"]; + for (auto it = props.begin(); it != props.end(); ++it) { + Log::io->info() << "# " << it.key() << ": "; + if (it.value().is_array()) { + Log::io->info(true) << "["; + for (size_t k = 0; k < it.value().size(); ++k) { + if (it.value()[k].is_number()) { + Log::io->info(true) << std::setprecision(5) << 
it.value()[k].get() << std::defaultfloat; + } else if (it.value()[k].is_string()) { + Log::io->info(true) << "\"" << it.value()[k].get() << "\""; } - printf("]"); - } else if (it.value().is_number()) { - printf("%.5g", it.value().get()); - } else if (it.value().is_string()) { - printf("\"%s\"", it.value().get().c_str()); + if (k < it.value().size() - 1) Log::io->info(true) << ", "; } - printf("\n"); + Log::io->info(true) << "]"; + } else if (it.value().is_number()) { + Log::io->info(true) << std::setprecision(5) << it.value().get() << std::defaultfloat; + } else if (it.value().is_string()) { + Log::io->info(true) << "\"" << it.value().get() << "\""; } - printf("#\n"); + Log::io->info(true) << "\n"; } + Log::io->info() << "#\n"; } } @@ -204,6 +202,20 @@ class MaterialManager { } } + void register_serialization(registry_t &r) override + { + for (auto *model : models) { + model->register_serialization(r); + } + } + + void init_deserialization() override + { + for (auto *model : models) { + model->init_deserialization(); + } + } + // Update internal variables after converged time step void update_internal_variables() { @@ -213,7 +225,7 @@ class MaterialManager { } // Set macroscale loading gradient for all models - void set_gradient(vector g0) + void set_gradient(const vector& g0) { for (auto *model : models) { model->setGradient(g0); diff --git a/include/general.h b/include/general.h index 1128e8a..1949615 100644 --- a/include/general.h +++ b/include/general.h @@ -58,6 +58,4 @@ inline void FANS_free(V *p) } #endif // FANS_MALLOC_H -#define VERBOSITY 0 - // #define EIGEN_RUNTIME_NO_MALLOC diff --git a/include/logging.h b/include/logging.h new file mode 100755 index 0000000..734a669 --- /dev/null +++ b/include/logging.h @@ -0,0 +1,106 @@ +#ifndef LOGGING_H +#define LOGGING_H + +#include +#include +#include +#include +#include +#include "mpi.h" + +namespace Log { + +enum Level { + Error, + Info, + Warn, + Debug, + All +}; + +[[maybe_unused]] const std::string level_to_string(Level level); + +extern Level active_level; + +class Logger; +class MPI_TraceSync { + public: + explicit MPI_TraceSync(Logger &log, bool append); + ~MPI_TraceSync(); + + template + MPI_TraceSync &operator<<(const T &v); + MPI_TraceSync &operator<<(std::ostream &(*m)(std::ostream &) ); + + private: + Logger &_log; + bool _append; + std::ostringstream _buffer; +}; + +template +MPI_TraceSync &MPI_TraceSync::operator<<(const T &v) +{ + _buffer << v; + return *this; +} + +class Logger { + public: + friend class MPI_TraceSync; + explicit Logger(std::string prefix, int comm_rank, int comm_size, const MPI_Comm& comm); + + /// error > info > warn > debug > trace + std::ostream& error(bool append=false); + /// error > info > warn > debug > trace + std::ostream& info(bool append=false); + /// error > info > warn > debug > trace + std::ostream& warn(bool append=false); + /// error > info > warn > debug > trace + std::ostream& debug(bool append=false); + /// error > info > warn > debug > trace + MPI_TraceSync trace(bool append=false); + /// progress bar + void progress(const std::string& prefix, int step, int max); + + private: + std::ostream& trace_impl(bool append=false); + /// starting time + std::chrono::steady_clock::time_point _start_time; + /// what the logger should always print first + const std::string _prefix; + /// empty stream to write nothing + std::ofstream _nullstr; + /// MPI comm rank + int _comm_rank; + /// MPI comm size + int _comm_size; + /// communicator + MPI_Comm _comm; +}; + +/** + * Creates all loggers and sets 
the level + * */ +void init(int comm_rank, int comm_size, const MPI_Comm& comm); + +/** + * Frees all memory + * */ +void finalize(); + +/** + * Set activate rank + */ +[[maybe_unused]] void setActiveRank(int rank); + +/// logger with prefix [GENERAL] +extern std::unique_ptr general; +/// logger with prefix [SOLVER] +extern std::unique_ptr solver; +/// logger with prefix [IO] +extern std::unique_ptr io; + +}; // namespace Log + +#endif \ No newline at end of file diff --git a/include/material_models/J2Plasticity.h b/include/material_models/J2Plasticity.h index 33f2a69..e657a20 100644 --- a/include/material_models/J2Plasticity.h +++ b/include/material_models/J2Plasticity.h @@ -55,6 +55,17 @@ class J2Plasticity : public SmallStrainMechModel { psi_bar_t.resize(num_elements, Matrix::Zero(6, num_gauss_points)); } + void register_serialization(registry_t &r) override + { + SmallStrainMechModel::register_serialization(r); + for (auto &mat : plasticStrain_t) r.emplace_back(std::data(mat), (mat.size()) * sizeof(double), true); + for (auto &mat : plasticStrain) r.emplace_back(std::data(mat), (mat.size()) * sizeof(double), true); + for (auto &mat : psi_bar_t) r.emplace_back(std::data(mat), (mat.size()) * sizeof(double), true); + for (auto &mat : psi_bar) r.emplace_back(std::data(mat), (mat.size()) * sizeof(double), true); + for (auto &vec : psi_t) r.emplace_back(std::data(vec), (vec.size()) * sizeof(double), true); + for (auto &vec : psi) r.emplace_back(std::data(vec), (vec.size()) * sizeof(double), true); + } + virtual void updateInternalVariables() override { plasticStrain_t = plasticStrain; diff --git a/include/material_models/J2PlasticityNew.h b/include/material_models/J2PlasticityNew.h index e62996e..3f0e26c 100644 --- a/include/material_models/J2PlasticityNew.h +++ b/include/material_models/J2PlasticityNew.h @@ -34,6 +34,15 @@ class J2PlasticityNew : public SmallStrainMechModel { q_t.resize(num_elements, VectorXd::Zero(num_gauss_points)); } + void register_serialization(registry_t &r) override + { + SmallStrainMechModel::register_serialization(r); + for (auto &mat : plasticStrain_t) r.emplace_back(std::data(mat), (mat.size()) * sizeof(double), true); + for (auto &mat : plasticStrain) r.emplace_back(std::data(mat), (mat.size()) * sizeof(double), true); + for (auto &vec : q_t) r.emplace_back(std::data(vec), (vec.size()) * sizeof(double), true); + for (auto &vec : q) r.emplace_back(std::data(vec), (vec.size()) * sizeof(double), true); + } + virtual void updateInternalVariables() override { plasticStrain_t = plasticStrain; diff --git a/include/matmodel.h b/include/matmodel.h index bd3a82d..ffdb1bc 100644 --- a/include/matmodel.h +++ b/include/matmodel.h @@ -3,12 +3,13 @@ #define MATMODEL_H #include "general.h" +#include "serialization.h" template class Solver; template -class Matmodel { +class Matmodel : public Serializable { public: EIGEN_MAKE_ALIGNED_OPERATOR_NEW // see http://eigen.tuxfamily.org/dox-devel/group__TopicStructHavingEigenMembers.html @@ -27,7 +28,7 @@ class Matmodel { Matrix Compute_Reference_ElementStiffness(const Matrix &kapparef_mat); Matrix &element_residual(Matrix &ue, int mat_index, ptrdiff_t element_idx); void getStrainStress(double *strain, double *stress, Matrix &ue, int mat_index, ptrdiff_t element_idx); - void setGradient(vector _g0); + void setGradient(const vector &_g0); // Accessors for internal Gauss point data (populated after getStrainStress call) inline const double *get_eps_data() const @@ -53,6 +54,9 @@ class Matmodel { virtual ~Matmodel() = default; + void 
register_serialization(registry_t &r) override; + void init_deserialization() override; + protected: double l_e_x; double l_e_y; @@ -101,6 +105,37 @@ Matmodel::Matmodel(const Reader &reader) sigma.resize(n_str * n_gp); } +template +void Matmodel::register_serialization(registry_t &r) +{ + //r.emplace_back(&verbosity, sizeof(int), false); + //r.emplace_back(&n_mat, sizeof(int), false); + //r.emplace_back(&n_gp, sizeof(int), false); + + //r.emplace_back(std::data(FE_type), (FE_type.size() + 1) * sizeof(char), true); + //r.emplace_back(std::data(macroscale_loading), macroscale_loading.size() * sizeof(double), true); + + //r.emplace_back(&l_e_x, sizeof(double), false); + //r.emplace_back(&l_e_y, sizeof(double), false); + //r.emplace_back(&l_e_z, sizeof(double), false); + //r.emplace_back(&v_e, sizeof(double), false); + + //for (auto& mat : B_int) r.emplace_back(std::data(mat), mat.size() * sizeof(double), true); + + r.emplace_back(std::data(B), B.size() * sizeof(double), true); + r.emplace_back(std::data(eps), eps.size() * sizeof(double), true); + r.emplace_back(std::data(g0), g0.size() * sizeof(double), true); + r.emplace_back(std::data(sigma), sigma.size() * sizeof(double), true); + r.emplace_back(std::data(res_e), res_e.size() * sizeof(double), true); +} + +template +void Matmodel::init_deserialization() +{ + // need to allocate enough memory first + //FE_type = std::string("\0\0\0\0\0\0"); +} + template void Matmodel::Construct_B() { @@ -225,11 +260,13 @@ void Matmodel::getStrainStress(double *strain, double *stress, M } template -void Matmodel::setGradient(vector _g0) +void Matmodel::setGradient(const vector &_g0) { + // _g0 is smaller than g0, can stay in cache + // if j loop inner, then not using cache-lines well macroscale_loading = _g0; - for (int i = 0; i < n_str; i++) { - for (int j = 0; j < n_gp; ++j) { + for (int j = 0; j < n_gp; ++j) { + for (int i = 0; i < n_str; i++) { g0(n_str * j + i, 0) = _g0[i]; } } diff --git a/include/reader.h b/include/reader.h index 1e62249..e720704 100644 --- a/include/reader.h +++ b/include/reader.h @@ -11,16 +11,29 @@ #include "H5FDmpio.h" #include "mpi.h" +#include "serialization.h" + using namespace std; -class Reader { +class Reader : public Serializable { public: - // Default constructor - Reader() = default; + // Explicit default constructor + Reader(): + force_single_rank(false), communicator(nullptr), n_mat(0), + TOL(0), n_it(0), world_rank(0), world_size(0), alloc_local(0), + local_n0(0), local_0_start(0), local_n1(0), local_1_start(0) + { + manual_serialize = true; + manual_deserialize = true; + } // Destructor to free allocated memory ~Reader(); + // MPI controlls + bool force_single_rank; + MPI_Comm communicator; + // contents of input file: char ms_filename[4096]{}; // Name of Micro-structure hdf5 file char ms_datasetname[4096]{}; // Absolute path of Micro-structure in hdf5 file @@ -61,8 +74,11 @@ class Reader { // void Setup(ptrdiff_t howmany); void ReadInputFile(char input_fn[]); + void ReadJson(json j); void ReadMS(int hm); - void ComputeVolumeFractions(); + void ComputeVolumeFractions(); + length_t serialize_override(buffer_t &buffer, length_t offset) override; + length_t deserialize_override(buffer_t &buffer, length_t offset) override; // void ReadHDF5(char file_name[], char dset_name[]); void safe_create_group(hid_t file, const char *const name); void OpenResultsFile(const char *output_fn); // Open results file once diff --git a/include/serialization.h b/include/serialization.h new file mode 100644 index 0000000..4771241 --- 
/dev/null +++ b/include/serialization.h @@ -0,0 +1,164 @@ +// +// Created by Alex Hocks on 09/01/26. +// + +#ifndef SERIALIZATION_H +#define SERIALIZATION_H +#include "logging.h" + +#include +#include +#include + +template, typename r = std::conditional_t> +class SerializationBuffer : public Base { + // Buffer always has size multiple of 4 byte +public: + using Base::Base; + using size_type = typename Base::size_type; + void resize(size_type new_size) + { + const auto num_bytes = new_size * data_size; + const auto delta_bytes = (4 - (num_bytes % 4)) % 4; + new_size += delta_bytes / data_size; + Base::resize(new_size); + } + + void resize(size_type new_size, const T& value) + { + const auto num_bytes = new_size * data_size; + const auto delta_bytes = (4 - (num_bytes % 4)) % 4; + new_size += delta_bytes / data_size; + Base::resize(new_size, value); + } + + void add_size(size_type delta) { resize(Base::size() + delta); } + + std::vector& as_int_vector() { return *reinterpret_cast*>(this); } + + static SerializationBuffer& from_int_vector(std::vector& vec) { return *reinterpret_cast(&vec); } + +private: + static constexpr auto data_size = sizeof(T); +}; + +class Serializable { + /** + * Serializes any object that implements this. + * Buffer elements will be bytes, but total count will always be padded to be multiple of 4. + * Data is run length encoded. + */ +public: + // run length type + using length_t = uint64_t; + // run length block size + static constexpr auto header_len = sizeof(length_t); + // byte data type + using data_t = uint8_t; + // buffer type + using buffer_t = SerializationBuffer; + // registry type: pointer to data, data size, true: run length encoded, false: direct read write + using reg_entry_t = std::tuple; + using registry_t = std::vector; + + virtual ~Serializable() = default; + + /** + * Serializes this object + * @return serial buffer + */ + buffer_t serialize_full() + { + if (manual_deserialize) { + buffer_t buffer; + serialize_override(buffer, 0); + return buffer; + } + + Log::io->trace() << "Serialization::serialize_full() - register_serialization\n"; + registry_t registry; + register_serialization(registry); + + Log::io->trace() << "Serialization::serialize_full() - accumulate\n"; + const length_t total_size = std::accumulate(registry.begin(), registry.end(), 0UL, + [](const length_t acc, const reg_entry_t &e) { + const bool use_run_length = std::get<2>(e); + length_t size = std::get<1>(e); + if (use_run_length) size += header_len; + return acc + size; + }); + + buffer_t buffer; + buffer.resize(total_size); + Log::io->trace() << "Serialization::serialize_full() - buffer size=" << total_size << " addr=" << static_cast(std::data(buffer)) << "\n"; + + Log::io->trace() << "Serialization::serialize_full() - process entries\n"; + length_t offset = 0UL; + length_t pos = 0UL; + for (const reg_entry_t &e : registry) { + const bool use_run_length = std::get<2>(e); + length_t size = std::get<1>(e); + void* data = std::get<0>(e); + Log::io->trace() << "Serialization::serialize_full() - entry: pos=" << pos << " addr=" << data << " offset=" << offset << " useRL=" << use_run_length << " size=" << size << "\n"; + + if (use_run_length) { + *reinterpret_cast(std::data(buffer) + offset) = size; + offset += header_len; + } + std::memcpy(std::data(buffer) + offset, data, size); + offset += size; + pos++; + } + + Log::io->trace() << "Serialization::serialize_full() - done\n"; + return buffer; + } + + virtual void register_serialization(registry_t &r) {} + + virtual void 
init_deserialization() {} + + virtual registry_t::iterator late_init_deserialization(registry_t &r, registry_t::iterator begin) { return r.begin(); } + + bool manual_deserialize = false; + bool manual_serialize = false; + + /** + * Deserialize this object using provided data + * @param buffer + */ + void deserialize_full(buffer_t &buffer) + { + if (manual_deserialize) { + deserialize_override(buffer, 0); + return; + } + + init_deserialization(); // alloc mem if needed + + // gather serialization description + registry_t registry; + register_serialization(registry); + // place correct lengths into descriptor list, etc + late_init_deserialization(registry, registry.begin()); + + length_t offset = 0UL; + for (reg_entry_t &e : registry) { + const bool use_run_length = std::get<2>(e); + length_t &size = std::get<1>(e); + void* data = std::get<0>(e); + + if (use_run_length) { + size = *reinterpret_cast(std::data(buffer) + offset); + offset += header_len; + } + std::memcpy(data, std::data(buffer) + offset, size); + offset += size; + } + } + + virtual length_t deserialize_override(buffer_t &buffer, length_t offset) { return offset; } + virtual length_t serialize_override(buffer_t &buffer, length_t offset) { return offset; } +}; + +#endif diff --git a/include/solver.h b/include/solver.h index cb05d49..6cd7c37 100644 --- a/include/solver.h +++ b/include/solver.h @@ -3,21 +3,24 @@ #include "matmodel.h" #include "MaterialManager.h" +#include "logging.h" +#include "serialization.h" class J2Plasticity; typedef Map, Unaligned, OuterStride<>> RealArray; template -class Solver : private MixedBCController { +class Solver : MixedBCController, public Serializable { public: - Solver(Reader &reader, MaterialManager *matmanager); + explicit Solver(Reader &reader, MaterialManager *matmanager=nullptr); virtual ~Solver(); Reader &reader; const int world_rank; const int world_size; + MPI_Comm communicator; const ptrdiff_t n_x, n_y, n_z; // NOTE: the order in the declaration is very important because it is the same order in which the later initialization via member initializer lists takes place @@ -43,6 +46,10 @@ class Solver : private MixedBCController { ArrayXd err_all; //!< Absolute error history Matrix fundamentalSolution; + void init_fundamentalSolutionBuffer() + { + fundamentalSolution = Matrix(howmany, (local_n1 * n_x * (n_z / 2 + 1) * (howmany + 1)) / 2); + } template void iterateCubes(F f); @@ -62,6 +69,8 @@ class Solver : private MixedBCController { double compute_error(RealArray &r); void CreateFFTWPlans(double *in, fftw_complex *transformed, double *out); + void register_serialization(registry_t &r) override; + VectorXd homogenized_stress; VectorXd get_homogenized_stress(); @@ -108,6 +117,7 @@ Solver::Solver(Reader &reader, MaterialManager * matmanager(matmgr), world_rank(reader.world_rank), world_size(reader.world_size), + communicator(reader.communicator), n_x(reader.dims[0]), n_y(reader.dims[1]), n_z(reader.dims[2]), @@ -130,23 +140,26 @@ Solver::Solver(Reader &reader, MaterialManager * rhat((std::complex *) v_r, local_n1 * n_x * (n_z / 2 + 1) * howmany), // actual initialization is below buffer_padding(fftw_alloc_real(n_y * (n_z + 2) * howmany)) { + Log::solver->trace() << "Start constructing solver\n"; + v_u_real.setZero(); for (ptrdiff_t i = local_n0 * n_y * n_z * howmany; i < (local_n0 + 1) * n_y * n_z * howmany; i++) { this->v_u[i] = 0; } std::memset(v_u_prev, 0, local_n0 * n_y * n_z * howmany * sizeof(double)); - matmanager->initialize_internal_variables(local_n0 * n_y * n_z, 
matmanager->models[0]->n_gp); - - computeFundamentalSolution(); + if (matmanager != nullptr) { + Log::solver->trace() << "Init internal vars.\n"; + matmanager->initialize_internal_variables(local_n0 * n_y * n_z, matmanager->models[0]->n_gp); + computeFundamentalSolution(); + } } template void Solver::computeFundamentalSolution() { - if (world_rank == 0) { - printf("\n# Start creating Fundamental Solution(s) \n"); - } + Log::solver->info() << "\n# Start creating Fundamental Solution(s) \n"; + clock_t tot_time = clock(); Matrix Ker0 = matmanager->models[0]->Compute_Reference_ElementStiffness(matmanager->kapparef_mat); @@ -160,7 +173,7 @@ void Solver::computeFundamentalSolution() Matrix, 8, 1> A; Matrix AA; Matrix block; - fundamentalSolution = Matrix(howmany, (local_n1 * n_x * (n_z / 2 + 1) * (howmany + 1)) / 2); + init_fundamentalSolutionBuffer(); fundamentalSolution.setZero(); for (int i_y = 0; i_y < local_n1; ++i_y) { @@ -198,14 +211,13 @@ void Solver::computeFundamentalSolution() fundamentalSolution /= (double) (n_x * n_y * n_z); tot_time = clock() - tot_time; - if (world_rank == 0) { - printf("# Complete; Time for construction of Fundamental Solution(s): %f seconds\n", double(tot_time) / CLOCKS_PER_SEC); - } + Log::solver->info() << "# Complete; Time for construction of Fundamental Solution(s): " << static_cast(tot_time) / CLOCKS_PER_SEC << " seconds\n"; } template void Solver::CreateFFTWPlans(double *in, fftw_complex *transformed, double *out) { + Log::solver->trace() << "Solver::CreateFFTWPlans() begin \n"; int rank = 3; ptrdiff_t iblock = FFTW_MPI_DEFAULT_BLOCK; ptrdiff_t oblock = FFTW_MPI_DEFAULT_BLOCK; @@ -218,13 +230,28 @@ void Solver::CreateFFTWPlans(double *in, fftw_complex *transform // But, according to https://fftw.org/doc/MPI-Plan-Creation.html the BLOCK sizes must be the same: // "These must be the same block sizes as were passed to the corresponding ‘local_size’ function" const ptrdiff_t n[3] = {n_x, n_y, n_z}; - planfft = fftw_mpi_plan_many_dft_r2c(rank, n, howmany, iblock, oblock, in, transformed, MPI_COMM_WORLD, FFTW_MEASURE | FFTW_MPI_TRANSPOSED_OUT); - planifft = fftw_mpi_plan_many_dft_c2r(rank, n, howmany, iblock, oblock, transformed, out, MPI_COMM_WORLD, FFTW_MEASURE | FFTW_MPI_TRANSPOSED_IN); + planfft = fftw_mpi_plan_many_dft_r2c(rank, n, howmany, iblock, oblock, in, transformed, communicator, FFTW_MEASURE | FFTW_MPI_TRANSPOSED_OUT); + planifft = fftw_mpi_plan_many_dft_c2r(rank, n, howmany, iblock, oblock, transformed, out, communicator, FFTW_MEASURE | FFTW_MPI_TRANSPOSED_IN); // see https://eigen.tuxfamily.org/dox/group__TutorialMapClass.html#title3 new (&rhat) Map((std::complex *) transformed, local_n1 * n_x * (n_z / 2 + 1) * howmany); } +template +void Solver::register_serialization(registry_t &r) +{ + Log::io->trace() << "Solver::register_serialization v_r size=" << std::max(reader.alloc_local * 2, (local_n0 + 1) * n_y * (n_z + 2) * howmany) * sizeof(double) << "\n"; + r.emplace_back(v_r, std::max(reader.alloc_local * 2, (local_n0 + 1) * n_y * (n_z + 2) * howmany) * sizeof(double), true); + Log::io->trace() << "Solver::register_serialization v_u size=" << (local_n0 * n_y * n_z * howmany) * sizeof(double) << "\n"; + r.emplace_back(v_u, (local_n0 * n_y * n_z * howmany) * sizeof(double), true); + Log::io->trace() << "Solver::register_serialization v_u_prev size=" << (local_n0 * n_y * n_z * howmany) * sizeof(double) << "\n"; + r.emplace_back(v_u_prev, (local_n0 * n_y * n_z * howmany) * sizeof(double), true); + Log::io->trace() << "Solver::register_serialization 
rhat size=" << (local_n1 * n_x * (n_z / 2 + 1) * howmany) * sizeof(std::complex) << "\n"; + r.emplace_back(std::data(rhat), (local_n1 * n_x * (n_z / 2 + 1) * howmany) * sizeof(std::complex), true); + Log::io->trace() << "Solver::register_serialization fundamentalSolution size=" << ((local_n1 * n_x * (n_z / 2 + 1) * (howmany + 1)) / 2) * sizeof(double) << "\n"; + r.emplace_back(std::data(fundamentalSolution), ((local_n1 * n_x * (n_z / 2 + 1) * (howmany + 1)) / 2) * sizeof(double), true); +} + // TODO: possibly circumvent the padding problem by accessing r as a matrix? template template @@ -242,7 +269,7 @@ void Solver::compute_residual_basic(RealArray &r_matrix, RealArr // int MPI_Sendrecv(void *sendbuf, int sendcount, MPI_Datatype sendtype, int dest, int sendtag, void *recvbuf, // int recvcount, MPI_Datatype recvtype, int source, int recvtag, MPI_Comm comm, MPI_Status *status) MPI_Sendrecv(u, n_y * n_z * howmany, MPI_DOUBLE, (world_rank + world_size - 1) % world_size, 0, - u + local_n0 * n_y * n_z * howmany, n_y * n_z * howmany, MPI_DOUBLE, (world_rank + 1) % world_size, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE); + u + local_n0 * n_y * n_z * howmany, n_y * n_z * howmany, MPI_DOUBLE, (world_rank + 1) % world_size, 0, communicator, MPI_STATUS_IGNORE); Matrix ue; @@ -262,7 +289,7 @@ void Solver::compute_residual_basic(RealArray &r_matrix, RealArr }); MPI_Sendrecv(r + local_n0 * n_y * (n_z + padding) * howmany, n_y * (n_z + padding) * howmany, MPI_DOUBLE, (world_rank + 1) % world_size, 0, - buffer_padding, n_y * (n_z + padding) * howmany, MPI_DOUBLE, (world_rank + world_size - 1) % world_size, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE); + buffer_padding, n_y * (n_z + padding) * howmany, MPI_DOUBLE, (world_rank + world_size - 1) % world_size, 0, communicator, MPI_STATUS_IGNORE); RealArray b(buffer_padding, n_z * howmany, n_y, OuterStride<>((n_z + padding) * howmany)); // NOTE: for any padding of more than 2, the buffer_padding has to be extended @@ -282,20 +309,18 @@ void Solver::compute_residual(RealArray &r_matrix, RealArray &u_ template void Solver::solve() { - err_all = ArrayXd::Zero(n_it + 1); fft_time = 0.0; clock_t tot_time = clock(); internalSolve(); tot_time = clock() - tot_time; - // if( VERBOSITY > 5 ){ - if (world_rank == 0) { - printf("# FFT Time per iteration ....... %2.6f sec\n", double(fft_time) / CLOCKS_PER_SEC / iter); - printf("# Total FFT Time ............... %2.6f sec\n", double(fft_time) / CLOCKS_PER_SEC); - printf("# Total Time per iteration ..... %2.6f sec\n", double(tot_time) / CLOCKS_PER_SEC / iter); - printf("# Total Time ................... %2.6f sec\n", double(tot_time) / CLOCKS_PER_SEC); - printf("# FFT contribution to total time %2.6f %% \n", 100. * double(fft_time) / double(tot_time)); - } + + Log::solver->info() << "# FFT Time per iteration ....... " << std::setw(2) << std::setprecision(6) << double(fft_time) / CLOCKS_PER_SEC / iter << std::defaultfloat << " sec\n"; + Log::solver->info() << "# Total FFT Time ............... " << std::setw(2) << std::setprecision(6) << double(fft_time) / CLOCKS_PER_SEC << std::defaultfloat << " sec\n"; + Log::solver->info() << "# Total Time per iteration ..... " << std::setw(2) << std::setprecision(6) << double(tot_time) / CLOCKS_PER_SEC / iter << std::defaultfloat << " sec\n"; + Log::solver->info() << "# Total Time ................... 
" << std::setw(2) << std::setprecision(6) << double(tot_time) / CLOCKS_PER_SEC << std::defaultfloat << " sec\n"; + Log::solver->info() << "# FFT contribution to total time " << std::setw(2) << std::setprecision(6) << 100. * double(fft_time) / double(tot_time) << std::defaultfloat << " % \n"; + matmanager->update_internal_variables(); } @@ -427,19 +452,18 @@ double Solver::compute_error(RealArray &r) } double err; - MPI_Allreduce(&err_local, &err, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD); + MPI_Allreduce(&err_local, &err, 1, MPI_DOUBLE, MPI_MAX, communicator); err_all[iter] = err; double err0 = err_all[0]; double err_rel = (iter == 0 ? 100 : err / err0); - if (world_rank == 0) { - if (iter == 0) { - printf("Before 1st iteration: %16.8e\n", err0); - } else { - printf("it %3lu .... err %16.8e / %8.4e, ratio: %4.8e, FFT time: %2.6f sec\n", iter, err, err / err0, (iter == 1 ? 0.0 : err / err_all[iter - 1]), double(buftime) / CLOCKS_PER_SEC); - } - } + if (iter == 0) Log::solver->info() << "Before 1st iteration: " << std::setw(16) << std::setprecision(8) << err0 << std::defaultfloat << "\n"; + else Log::solver->info() << "it " << iter << " .... " + << "err " << std::setw(16) << std::setprecision(8) << err << std::defaultfloat << " / " + << std::setw( 8) << std::setprecision(4) << err / err0 << std::defaultfloat + << ", ratio: " << std::setw(4) << std::setprecision(8) << (iter == 1 ? 0.0 : err / err_all[iter - 1]) << std::defaultfloat + << ", FFT time: " << std::setw(2) << std::setprecision(6) << static_cast(buftime) / CLOCKS_PER_SEC << std::defaultfloat << " sec\n"; const std::string &error_type = reader.errorParameters["type"].get(); if (error_type == "absolute") { @@ -495,7 +519,7 @@ void Solver::postprocess(Reader &reader, int load_idx, int time_ vector phase_counts(n_mat, 0); MPI_Sendrecv(v_u, n_y * n_z * howmany, MPI_DOUBLE, (world_rank + world_size - 1) % world_size, 0, - v_u + local_n0 * n_y * n_z * howmany, n_y * n_z * howmany, MPI_DOUBLE, (world_rank + 1) % world_size, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE); + v_u + local_n0 * n_y * n_z * howmany, n_y * n_z * howmany, MPI_DOUBLE, (world_rank + 1) % world_size, 0, communicator, MPI_STATUS_IGNORE); Matrix ue; int phase_id; @@ -555,16 +579,16 @@ void Solver::postprocess(Reader &reader, int load_idx, int time_ }); } - MPI_Allreduce(MPI_IN_PLACE, stress_average.data(), n_str, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); - MPI_Allreduce(MPI_IN_PLACE, strain_average.data(), n_str, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); + MPI_Allreduce(MPI_IN_PLACE, stress_average.data(), n_str, MPI_DOUBLE, MPI_SUM, communicator); + MPI_Allreduce(MPI_IN_PLACE, strain_average.data(), n_str, MPI_DOUBLE, MPI_SUM, communicator); stress_average /= (n_x * n_y * n_z); strain_average /= (n_x * n_y * n_z); // Reduce per-phase accumulations across all processes for (int mat_index = 0; mat_index < n_mat; ++mat_index) { - MPI_Allreduce(MPI_IN_PLACE, phase_stress_average[mat_index].data(), n_str, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); - MPI_Allreduce(MPI_IN_PLACE, phase_strain_average[mat_index].data(), n_str, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); - MPI_Allreduce(MPI_IN_PLACE, &phase_counts[mat_index], 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD); + MPI_Allreduce(MPI_IN_PLACE, phase_stress_average[mat_index].data(), n_str, MPI_DOUBLE, MPI_SUM, communicator); + MPI_Allreduce(MPI_IN_PLACE, phase_strain_average[mat_index].data(), n_str, MPI_DOUBLE, MPI_SUM, communicator); + MPI_Allreduce(MPI_IN_PLACE, &phase_counts[mat_index], 1, MPI_INT, MPI_SUM, communicator); // Compute average for each phase 
if (phase_counts[mat_index] > 0) { @@ -573,16 +597,12 @@ void Solver::postprocess(Reader &reader, int load_idx, int time_ } } - if (world_rank == 0) { - printf("# Effective Stress .. ("); - for (int i = 0; i < n_str; ++i) - printf("%+.12f ", stress_average[i]); - printf(") \n"); - printf("# Effective Strain .. ("); - for (int i = 0; i < n_str; ++i) - printf("%+.12f ", strain_average[i]); - printf(") \n\n"); - } + Log::solver->info() << "# Effective Stress .. ("; + for (int i = 0; i < n_str; ++i) Log::solver->info(true) << std::showpos << std::fixed << std::setprecision(12) << stress_average[i] << " " << std::noshowpos << std::defaultfloat; + Log::solver->info(true) << ") \n"; + Log::solver->info() << "# Effective Strain .. ("; + for (int i = 0; i < n_str; ++i) Log::solver->info(true) << std::showpos << std::fixed << std::setprecision(12) << strain_average[i] << " " << std::noshowpos << std::defaultfloat; + Log::solver->info(true) << ") \n\n"; /* ====================================================================== * * u_total = g0·X + ũ (vector or scalar, decided at compile time) @@ -695,11 +715,7 @@ void Solver::postprocess(Reader &reader, int load_idx, int time_ if (find(reader.resultsToWrite.begin(), reader.resultsToWrite.end(), "homogenized_tangent") != reader.resultsToWrite.end()) { homogenized_tangent = get_homogenized_tangent(1e-6); hsize_t dims[2] = {static_cast(n_str), static_cast(n_str)}; - if (world_rank == 0) { - cout << "# Homogenized tangent: " << endl - << setprecision(12) << homogenized_tangent << endl - << endl; - } + Log::solver->info() << "# Homogenized tangent: \n" << std::setprecision(12) << homogenized_tangent << std::defaultfloat << "\n\n"; reader.writeData("homogenized_tangent", load_idx, time_idx, homogenized_tangent.data(), dims, 2); } extrapolateDisplacement(); // prepare v_u for next time step @@ -714,7 +730,7 @@ VectorXd Solver::get_homogenized_stress() homogenized_stress = VectorXd::Zero(n_str); MPI_Sendrecv(v_u, n_y * n_z * howmany, MPI_DOUBLE, (world_rank + world_size - 1) % world_size, 0, - v_u + local_n0 * n_y * n_z * howmany, n_y * n_z * howmany, MPI_DOUBLE, (world_rank + 1) % world_size, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE); + v_u + local_n0 * n_y * n_z * howmany, n_y * n_z * howmany, MPI_DOUBLE, (world_rank + 1) % world_size, 0, communicator, MPI_STATUS_IGNORE); Matrix ue; int phase_id; @@ -731,7 +747,7 @@ VectorXd Solver::get_homogenized_stress() homogenized_stress += stress.segment(n_str * idx[0], n_str); }); - MPI_Allreduce(MPI_IN_PLACE, homogenized_stress.data(), n_str, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); + MPI_Allreduce(MPI_IN_PLACE, homogenized_stress.data(), n_str, MPI_DOUBLE, MPI_SUM, communicator); homogenized_stress /= (n_x * n_y * n_z); return homogenized_stress; diff --git a/include/solverCG.h b/include/solverCG.h index b02a428..0fcdf59 100644 --- a/include/solverCG.h +++ b/include/solverCG.h @@ -53,15 +53,14 @@ double SolverCG::dotProduct(RealArray &a, RealArray &b) { double local_value = (a * b).sum(); double result; - MPI_Allreduce(&local_value, &result, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); + MPI_Allreduce(&local_value, &result, 1, MPI_DOUBLE, MPI_SUM, this->communicator); return result; } template void SolverCG::internalSolve() { - if (this->world_rank == 0) - printf("\n# Start FANS - Conjugate Gradient Solver \n"); + Log::solver->info() << "\n# Start FANS - Conjugate Gradient Solver \n"; bool islinear = this->matmanager->all_linear; @@ -110,8 +109,8 @@ void SolverCG::internalSolve() iter++; err_rel = 
this->compute_error(v_r_real); } - if (this->world_rank == 0) - printf("# Complete FANS - Conjugate Gradient Solver \n"); + + Log::solver->info() << "# Complete FANS - Conjugate Gradient Solver \n"; } template @@ -142,8 +141,10 @@ void SolverCG::LineSearchSecant() } v_u_real += d_real * (alpha_new - alpha_old); v_r_real = rnew_real; - if (this->world_rank == 0) - printf("line search iter %i, alpha %f - error %e - ", _iter, alpha_new, err); + + Log::solver->info() << "line search iter " << _iter + << ", alpha " << std::fixed << std::setprecision(6) << alpha_new << std::defaultfloat + << " - error " << std::fixed << std::setprecision(6) << err << std::defaultfloat << " - "; } template diff --git a/include/solverFP.h b/include/solverFP.h index 67b99a4..6a00691 100644 --- a/include/solverFP.h +++ b/include/solverFP.h @@ -32,8 +32,7 @@ SolverFP::SolverFP(Reader &reader, MaterialManager void SolverFP::internalSolve() { - if (this->world_rank == 0) - printf("\n# Start FANS - Fixed Point Solver \n"); + Log::solver->info() << "\n# Start FANS - Fixed Point Solver \n"; this->template compute_residual<2>(v_r_real, v_u_real); @@ -50,7 +49,7 @@ void SolverFP::internalSolve() iter++; err_rel = this->compute_error(v_r_real); } - if (this->world_rank == 0) - printf("# Complete FANS - Fixed Point Solver \n"); + + Log::solver->info() << "# Complete FANS - Fixed Point Solver \n"; } #endif diff --git a/pixi.lock b/pixi.lock index a1b4345..1c1db95 100644 --- a/pixi.lock +++ b/pixi.lock @@ -2253,7 +2253,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/zstandard-0.25.0-py314h2e8dab5_1.conda - conda: https://conda.anaconda.org/conda-forge/linux-aarch64/zstd-1.5.7-hbcf94c1_2.conda - conda: . - subdir: linux-aarch64 + build: he8cfe8b_0 osx-64: - conda: https://conda.anaconda.org/conda-forge/noarch/_python_abi3_support-1.0-hd8ed1ab_2.conda - conda: https://conda.anaconda.org/conda-forge/noarch/aiobotocore-2.25.2-pyhcf101f3_0.conda @@ -2600,7 +2600,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/osx-64/zstandard-0.25.0-py314hd1e8ddb_1.conda - conda: https://conda.anaconda.org/conda-forge/osx-64/zstd-1.5.7-h8210216_2.conda - conda: . - subdir: osx-64 + build: h0dc7051_0 osx-arm64: - conda: https://conda.anaconda.org/conda-forge/noarch/_python_abi3_support-1.0-hd8ed1ab_2.conda - conda: https://conda.anaconda.org/conda-forge/noarch/aiobotocore-2.25.2-pyhcf101f3_0.conda @@ -6125,63 +6125,55 @@ packages: - conda: . name: fans version: 0.6.1 - build: h1235946_0 - subdir: linux-64 + build: h0dc7051_0 + subdir: osx-64 variants: - cxx_compiler: clangxx - target_platform: linux-64 + target_platform: osx-64 depends: - - libstdcxx >=15 - - libgcc >=15 + - libcxx >=21 - hdf5 >=1.14.6,<1.14.7.0a0 mpi_openmpi_* - fftw >=3.3.10,<4.0a0 - fftw * mpi_openmpi_* - input: - hash: 16dd9c1131985cbcdb3bb37c6672fa20ea368188987b8d42da1b2ebd02bd9fbf - globs: [] - conda: . name: fans version: 0.6.1 - build: hbf21a9e_0 - subdir: linux-aarch64 + build: h1235946_0 + subdir: linux-64 + variants: + cxx_compiler: clangxx + target_platform: linux-64 depends: - libstdcxx >=15 - libgcc >=15 - hdf5 >=1.14.6,<1.14.7.0a0 mpi_openmpi_* - fftw >=3.3.10,<4.0a0 - fftw * mpi_openmpi_* - input: - hash: 8ad54598e221efef736cec65db9e327856f556e55f81445cda5620e181ef8813 - globs: [] - conda: . 
name: fans version: 0.6.1 - build: hbf21a9e_0 - subdir: osx-64 + build: hcb8d3e5_0 + subdir: osx-arm64 + variants: + cxx_compiler: clangxx + target_platform: osx-arm64 depends: - libcxx >=21 - hdf5 >=1.14.6,<1.14.7.0a0 mpi_openmpi_* - fftw >=3.3.10,<4.0a0 - fftw * mpi_openmpi_* - input: - hash: 8ad54598e221efef736cec65db9e327856f556e55f81445cda5620e181ef8813 - globs: [] - conda: . name: fans version: 0.6.1 - build: hcb8d3e5_0 - subdir: osx-arm64 + build: he8cfe8b_0 + subdir: linux-aarch64 variants: - cxx_compiler: clangxx - target_platform: osx-arm64 + target_platform: linux-aarch64 depends: - - libcxx >=21 + - libstdcxx >=15 + - libgcc >=15 - hdf5 >=1.14.6,<1.14.7.0a0 mpi_openmpi_* - fftw >=3.3.10,<4.0a0 - fftw * mpi_openmpi_* - input: - hash: 16dd9c1131985cbcdb3bb37c6672fa20ea368188987b8d42da1b2ebd02bd9fbf - globs: [] - conda: https://conda.anaconda.org/conda-forge/linux-64/fftw-3.3.10-mpi_openmpi_h99e62ba_10.conda sha256: 59a1fd0daa71bd5529e19b4da8aae2ded4d4ef05e5e31d80c39cbe2fc7664b6a md5: 3fcbf2ef5f3a8f0ed5683f60fe08293b diff --git a/pyfans/micro.cpp b/pyfans/micro.cpp index b9f3636..4e82f22 100644 --- a/pyfans/micro.cpp +++ b/pyfans/micro.cpp @@ -6,6 +6,7 @@ #include "micro.hpp" #include "setup.h" #include "matmodel.h" +#include "mpi.h" py::array_t merge_arrays(py::array_t array1, py::array_t array2) { @@ -24,21 +25,34 @@ py::array_t merge_arrays(py::array_t array1, py::array_t return result; } -MicroSimulation::MicroSimulation(int sim_id, char *input_file) +MicroSimulation::MicroSimulation(int sim_id, bool late_init, char *input_file) : _sim_id(sim_id) { // initialize fftw mpi fftw_mpi_init(); - // Input file name is hardcoded. TODO: Make it configurable - reader.ReadInputFile(input_file); + if (not late_init || sim_id >= 0) { + // Input file name is hardcoded. TODO: Make it configurable + reader.ReadInputFile(input_file); + reader.ReadMS(3); + + if (reader.strain_type == "small") { + matmanager = createMaterialManager<3, 6>(reader); + solver = createSolver<3, 6>(reader, std::get *>(matmanager)); + } else { + matmanager = createMaterialManager<3, 9>(reader); + solver = createSolver<3, 9>(reader, std::get *>(matmanager)); + } + } +} - reader.ReadMS(3); - matmanager = createMaterialManager<3, 6>(reader); - solver = createSolver<3, 6>(reader, matmanager); +MicroSimulation::~MicroSimulation() +{ + Log::finalize(); } py::dict MicroSimulation::solve(py::dict macro_data, double dt) { + const bool is_small_strain = std::holds_alternative *>(matmanager); // Time step value dt is not used currently, but is available for future use // Create a pybind style Numpy array from macro_write_data["micro_vector_data"], which is a Numpy array @@ -46,44 +60,115 @@ py::dict MicroSimulation::solve(py::dict macro_data, double dt) py::array_t strain2 = macro_data["strains4to6"].cast>(); py::array_t strain = merge_arrays(strain1, strain2); - std::vector g0 = std::vector(strain.data(), strain.data() + strain.size()); // convert numpy array to std::vector. + if (not is_small_strain) { + py::array_t strain3 = macro_data["strains7to9"].cast>(); + strain = merge_arrays(strain, strain3); + } + + std::vector g0 = std::vector(strain.data(), strain.data() + strain.size()); // convert numpy array to std::vector. 
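// --- Hedged reviewer sketch, not part of the original patch ---
// With the finite-strain path enabled, g0 is expected to carry 9 components
// (deformation gradient) rather than the 6 Voigt components of the small-strain
// path. A cheap guard one could add at this point, assuming those counts hold:
//
//     const std::size_t expected_components = is_small_strain ? 6u : 9u;
//     if (g0.size() != expected_components)
//         throw std::runtime_error("pyFANS: unexpected number of macro strain/deformation components");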
VectorXd homogenized_stress; - matmanager->set_gradient(g0); + std::visit([&](auto &mm) { mm->set_gradient(g0); }, matmanager); - solver->solve(); + std::visit([](auto &s) { s->solve(); }, solver); - homogenized_stress = solver->get_homogenized_stress(); + homogenized_stress = std::visit([](auto &s) -> VectorXd { return s->get_homogenized_stress(); }, solver); - auto C = solver->get_homogenized_tangent(pert_param); + MatrixXd C = std::visit([&](auto &s) -> MatrixXd { return s->get_homogenized_tangent(pert_param); }, solver); // Convert data to a py::dict again to send it back to the Micro Manager py::dict micro_write_data; // Add stress and stiffness matrix data to Python dict to be returned - std::vector stress13 = {homogenized_stress[0], homogenized_stress[1], homogenized_stress[2]}; - micro_write_data["stresses1to3"] = stress13; - std::vector stress46 = {homogenized_stress[3], homogenized_stress[4], homogenized_stress[5]}; - micro_write_data["stresses4to6"] = stress46; - std::vector C_1 = {C(0, 0), C(0, 1), C(0, 2)}; - micro_write_data["cmat1"] = C_1; - std::vector C_2 = {C(0, 3), C(0, 4), C(0, 5)}; - micro_write_data["cmat2"] = C_2; - std::vector C_3 = {C(1, 1), C(1, 2), C(1, 3)}; - micro_write_data["cmat3"] = C_3; - std::vector C_4 = {C(1, 4), C(1, 5), C(2, 2)}; - micro_write_data["cmat4"] = C_4; - std::vector C_5 = {C(2, 3), C(2, 4), C(2, 5)}; - micro_write_data["cmat5"] = C_5; - std::vector C_6 = {C(3, 3), C(3, 4), C(3, 5)}; - micro_write_data["cmat6"] = C_6; - std::vector C_7 = {C(4, 4), C(4, 5), C(5, 5)}; - micro_write_data["cmat7"] = C_7; + if (is_small_strain) { + std::vector stress13 = {homogenized_stress[0], homogenized_stress[1], homogenized_stress[2]}; + micro_write_data["stresses1to3"] = stress13; + std::vector stress46 = {homogenized_stress[3], homogenized_stress[4], homogenized_stress[5]}; + micro_write_data["stresses4to6"] = stress46; + std::vector C_1 = {C(0, 0), C(0, 1), C(0, 2)}; + micro_write_data["cmat1"] = C_1; + std::vector C_2 = {C(0, 3), C(0, 4), C(0, 5)}; + micro_write_data["cmat2"] = C_2; + std::vector C_3 = {C(1, 1), C(1, 2), C(1, 3)}; + micro_write_data["cmat3"] = C_3; + std::vector C_4 = {C(1, 4), C(1, 5), C(2, 2)}; + micro_write_data["cmat4"] = C_4; + std::vector C_5 = {C(2, 3), C(2, 4), C(2, 5)}; + micro_write_data["cmat5"] = C_5; + std::vector C_6 = {C(3, 3), C(3, 4), C(3, 5)}; + micro_write_data["cmat6"] = C_6; + std::vector C_7 = {C(4, 4), C(4, 5), C(5, 5)}; + micro_write_data["cmat7"] = C_7; + } else { + std::vector stress13 = {homogenized_stress[0], homogenized_stress[1], homogenized_stress[2]}; + micro_write_data["stresses1to3"] = stress13; + std::vector stress46 = {homogenized_stress[3], homogenized_stress[4], homogenized_stress[5]}; + micro_write_data["stresses4to6"] = stress46; + std::vector stress79 = {homogenized_stress[6], homogenized_stress[7], homogenized_stress[8]}; + micro_write_data["stresses7to9"] = stress79; + } return micro_write_data; } +py::dict MicroSimulation::get_state() +{ + py::dict state; + + Log::io->debug() << "Serializing reader\n"; + Serializable::buffer_t reader_state = reader.serialize_full(); + state["reader_state"] = reader_state.as_int_vector(); + Log::io->debug() << "Serializing solver\n"; + Serializable::buffer_t solver_state = std::visit([](auto &s) { return s->serialize_full(); }, solver); + state["solver_state"] = solver_state.as_int_vector(); + Log::io->debug() << "Serializing material manager\n"; + Serializable::buffer_t matmngr_state = std::visit([](auto &m) { return m->serialize_full(); }, matmanager); + 
state["matmanager_state"] = matmngr_state.as_int_vector(); + Log::io->debug() << "Serializing micro sim done\n"; + + return state; +} + +void MicroSimulation::set_state(py::dict &state) +{ + auto py_reader_state = state["reader_state"].cast>(); + std::vector reader_state_i(py_reader_state.data(), py_reader_state.data() + py_reader_state.size()); + Serializable::buffer_t &reader_state = Serializable::buffer_t::from_int_vector(reader_state_i); + + auto py_solver_state = state["solver_state"].cast>(); + std::vector solver_state_i(py_solver_state.data(), py_solver_state.data() + py_solver_state.size()); + Serializable::buffer_t &solver_state = Serializable::buffer_t::from_int_vector(solver_state_i); + + auto py_matmngr_state = state["matmanager_state"].cast>(); + std::vector matmngr_state_i(py_matmngr_state.data(), py_matmngr_state.data() + py_matmngr_state.size()); + Serializable::buffer_t &matmngr_state = Serializable::buffer_t::from_int_vector(matmngr_state_i); + + reader.deserialize_full(reader_state); + if (reader.strain_type == "small") { + auto *mat_ptr = createMaterialManager<3, 6>(reader); + mat_ptr->deserialize_full(matmngr_state); + matmanager = mat_ptr; + auto *sol_ptr = createSolver<3, 6>(reader, nullptr); + sol_ptr->init_fundamentalSolutionBuffer(); + sol_ptr->matmanager = std::get *>(matmanager); + sol_ptr->deserialize_full(solver_state); + solver = sol_ptr; + + } else { + auto *mat_ptr = createMaterialManager<3, 9>(reader); + mat_ptr->deserialize_full(matmngr_state); + matmanager = mat_ptr; + auto *sol_ptr = createSolver<3, 9>(reader, nullptr); + sol_ptr->init_fundamentalSolutionBuffer(); + sol_ptr->matmanager = std::get *>(matmanager); + sol_ptr->deserialize_full(solver_state); + solver = sol_ptr; + } +} + +int MicroSimulation::get_id() { return _sim_id; } + PYBIND11_MODULE(PyFANS, m) { // optional docstring @@ -91,5 +176,22 @@ PYBIND11_MODULE(PyFANS, m) py::class_(m, "MicroSimulation") .def(py::init()) - .def("solve", &MicroSimulation::solve); + .def("solve", &MicroSimulation::solve) + .def(py::pickle( + [](MicroSimulation& m) { + return py::make_tuple(m.get_id(), m.get_state()); + }, + [](py::tuple t) { + int id = t[0].cast(); + py::dict state = t[1].cast(); + + MicroSimulation* m = new MicroSimulation(id, true); + m->set_state(state); + return m; + } + )) + .def("set_state", &MicroSimulation::set_state) + .def("get_state", &MicroSimulation::get_state) + .def("get_global_id", &MicroSimulation::get_id) + ; } diff --git a/pyfans/micro.hpp b/pyfans/micro.hpp index b6ebc6c..db83a04 100644 --- a/pyfans/micro.hpp +++ b/pyfans/micro.hpp @@ -5,6 +5,7 @@ #pragma once #include #include +#include #include "pybind11/pybind11.h" #include "pybind11/numpy.h" // numpy arrays @@ -14,19 +15,28 @@ #include "matmodel.h" #include "MaterialManager.h" #include "solver.h" +#include "serialization.h" namespace py = pybind11; class MicroSimulation { public: - MicroSimulation(int sim_id, char *input_file = "input.json"); + MicroSimulation(int sim_id, bool late_init = false, char *input_file = "input.json"); + ~MicroSimulation(); py::dict solve(py::dict macro_write_data, double dt); + py::dict get_state(); + void set_state(py::dict &state); + + int get_id(); + private: int _sim_id; Reader reader; // Hardcoding mechanical models because these definitions need information from the input file. 
- MaterialManager<3, 6> *matmanager; - Solver<3, 6> *solver; - double pert_param = 1e-6; // scalar strain perturbation parameter + using matmanager_t = std::variant *, MaterialManager<3, 9> *>; + using solver_t = std::variant *, Solver<3, 9> *>; + matmanager_t matmanager; + solver_t solver; + double pert_param = 1e-6; // scalar strain perturbation parameter }; diff --git a/src/logging.cpp b/src/logging.cpp new file mode 100755 index 0000000..cc36aa6 --- /dev/null +++ b/src/logging.cpp @@ -0,0 +1,174 @@ +#include "logging.h" + +#include + +#include "mpi.h" + +int active_rank = 0; + +Log::MPI_TraceSync::MPI_TraceSync(Logger &log, bool append) : _log(log), _append(append) { } +Log::MPI_TraceSync::~MPI_TraceSync() +{ + if (active_level >= All) { + MPI_Barrier(_log._comm); + const int size = _log._comm_size; + for (int i = 0; i < size; i++) { + Log::setActiveRank(i); + MPI_Barrier(_log._comm); + _log.trace_impl(_append) << _buffer.str() << std::flush; + } + Log::setActiveRank(0); + MPI_Barrier(_log._comm); + } +} + +Log::MPI_TraceSync &Log::MPI_TraceSync::operator<<(std::ostream &(*m)(std::ostream &) ) +{ + _buffer << m; + return *this; +} + +Log::Logger::Logger(std::string prefix, int comm_rank, int comm_size, const MPI_Comm& comm) : _prefix(std::move(prefix)), _nullstr(), _comm_rank(comm_rank), _comm_size(comm_size), _comm(comm) { + _start_time = std::chrono::steady_clock::now(); + _nullstr.setstate(std::ios_base::badbit); +} + +std::ostream &Log::Logger::error(bool append) { + if (active_level >= Error && _comm_rank == active_rank) { + if (append) return std::cout; + std::cout << _prefix; + auto now = std::chrono::steady_clock::now(); + auto elapse_time = static_cast(std::chrono::duration_cast(now - _start_time).count()) / 1000.0; + std::cout << elapse_time << ": "; + return std::cout; + } else return _nullstr; +} + +std::ostream &Log::Logger::info(bool append) { + if (active_level >= Info && _comm_rank == active_rank) { + if (append) return std::cout; + std::cout << _prefix; + auto now = std::chrono::steady_clock::now(); + auto elapse_time = static_cast(std::chrono::duration_cast(now - _start_time).count()) / 1000.0; + std::cout << elapse_time << ": "; + return std::cout; + } else return _nullstr; +} + +std::ostream &Log::Logger::warn(bool append) { + if (active_level >= Warn && _comm_rank == active_rank) { + if (append) return std::cout; + std::cout << _prefix; + auto now = std::chrono::steady_clock::now(); + auto elapse_time = static_cast(std::chrono::duration_cast(now - _start_time).count()) / 1000.0; + std::cout << elapse_time << ": "; + return std::cout; + } else return _nullstr; +} + +std::ostream &Log::Logger::debug(bool append) { + if (active_level >= Debug && _comm_rank == active_rank) { + if (append) return std::cout; + std::cout << _prefix; + auto now = std::chrono::steady_clock::now(); + auto elapse_time = static_cast(std::chrono::duration_cast(now - _start_time).count()) / 1000.0; + std::cout << elapse_time << ": "; + return std::cout; + } else return _nullstr; +} + +Log::MPI_TraceSync Log::Logger::trace(bool append) { + return Log::MPI_TraceSync(*this, append); +} + +std::ostream &Log::Logger::trace_impl(bool append) { + if (active_level >= All && _comm_rank == active_rank) { + if (append) return std::cout; + std::cout << _prefix << "[" << active_rank << "] "; + auto now = std::chrono::steady_clock::now(); + auto elapse_time = static_cast(std::chrono::duration_cast(now - _start_time).count()) / 1000.0; + std::cout << elapse_time << ": "; + return std::cout; + } else return 
_nullstr; +} + +void Log::Logger::progress(const std::string &prefix, int step, int max) { + if (_comm_rank != active_rank) return; + + if (step == 0) std::cout << _prefix << prefix; + + int digits = 1; + int div = 10; + while ((max / div) > 0) { + digits++; + div *= 10; + } + if (step > 0) for (int i = 0; i < digits; i++) std::cout << '\b'; + + int step_digits = 1; + div = 10; + while ((step / div) > 0) { + step_digits++; + div *= 10; + } + + for (int i = 0; i < digits - step_digits; i++) std::cout << ' '; + std::cout << step; + + if (step >= max-1) { + for (int i = 0; i < digits; i++) + std::cout << '\b'; + for (int i = 0; i < prefix.size(); i++) + std::cout << '\b'; + for (int i = 0; i < _prefix.size(); i++) + std::cout << '\b'; + } + //if (step >= max) std::cout << '\n'; + std::cout.flush(); +} + +void Log::init(int comm_rank, int comm_size, const MPI_Comm& comm) { + Level level = Log::Error; + if constexpr (VERBOSITY <= 0) level = Error; + if constexpr (VERBOSITY == 1) level = Info; + if constexpr (VERBOSITY == 2) level = Warn; + if constexpr (VERBOSITY == 3) level = Debug; + if constexpr (VERBOSITY >= 4) level = All; + + active_level = level; + general = std::make_unique("[FANS-GENERAL] ", comm_rank, comm_size, comm); + solver = std::make_unique("[FANS-SOLVER] ", comm_rank, comm_size, comm); + io = std::make_unique("[FANS-IO] ", comm_rank, comm_size, comm); +} + +void Log::finalize() { + general = nullptr; + solver = nullptr; + io = nullptr; +} + +Log::Level Log::active_level = Info; +std::unique_ptr Log::general = nullptr; +std::unique_ptr Log::solver = nullptr; +std::unique_ptr Log::io = nullptr; + +const std::string Log::level_to_string(Log::Level level) { + switch (level) { + case Error: + return "Error"; + case Info: + return "Info"; + case Warn: + return "Warn"; + case Debug: + return "Debug"; + case All: + return "All"; + } + return ""; +} + +void Log::setActiveRank(int rank) { + active_rank = rank; +} + diff --git a/src/main.cpp b/src/main.cpp index 6c69bf6..bcdce0b 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -4,6 +4,7 @@ #include "solver.h" // Version +#include "logging.h" #include "version.h" template @@ -15,17 +16,13 @@ void runSolver(Reader &reader) MaterialManager *matmanager = createMaterialManager(reader); Solver *solver = createSolver(reader, matmanager); - if (reader.world_rank == 0) { - printf("\n╔════════════════════════════════════════════════════════════ Load case %zu/%zu: %zu time steps ════════════════════════════════════════════════════════════╗\n", - load_path_idx + 1, reader.load_cases.size(), reader.load_cases[load_path_idx].n_steps); - } + Log::general->info() << "\n╔════════════════════════════════════════════════════════════ Load case " + << load_path_idx + 1 << "/" << reader.load_cases.size() << ": " << reader.load_cases[load_path_idx].n_steps + << " time steps ════════════════════════════════════════════════════════════╗\n"; for (size_t time_step_idx = 0; time_step_idx < reader.load_cases[load_path_idx].n_steps; ++time_step_idx) { - if (reader.world_rank == 0) { - printf("║ ▶ Time step %zu/%zu (load case %zu/%zu) ◀ \n", - time_step_idx + 1, reader.load_cases[load_path_idx].n_steps, - load_path_idx + 1, reader.load_cases.size()); - } + Log::general->info() << "║ ▶ Time step " << time_step_idx + 1 << "/" << reader.load_cases[load_path_idx].n_steps + << " (load case " << load_path_idx + 1 << "/" << reader.load_cases.size() << ") ◀ \n"; if (reader.load_cases[load_path_idx].mixed) { solver->enableMixedBC(reader.load_cases[load_path_idx].mbc, 
time_step_idx); } else { @@ -37,9 +34,7 @@ void runSolver(Reader &reader) } delete solver; delete matmanager; - if (reader.world_rank == 0) { - printf("╚══════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════╝\n"); - } + Log::general->info() << "╚══════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════╝\n"; } } @@ -73,6 +68,7 @@ int main(int argc, char *argv[]) } reader.CloseResultsFile(); + Log::finalize(); MPI_Finalize(); return 0; } diff --git a/src/reader.cpp b/src/reader.cpp index 288d4af..b1fe343 100644 --- a/src/reader.cpp +++ b/src/reader.cpp @@ -1,5 +1,6 @@ #include "general.h" #include "reader.h" +#include "logging.h" #include "H5Cpp.h" #include "fftw3-mpi.h" @@ -29,16 +30,14 @@ void Reader::ComputeVolumeFractions() // Find the global maximum and minimum material indices unsigned short global_max, global_min; - MPI_Allreduce(&local_max, &global_max, 1, MPI_UNSIGNED_SHORT, MPI_MAX, MPI_COMM_WORLD); - MPI_Allreduce(&local_min, &global_min, 1, MPI_UNSIGNED_SHORT, MPI_MIN, MPI_COMM_WORLD); + MPI_Allreduce(&local_max, &global_max, 1, MPI_UNSIGNED_SHORT, MPI_MAX, communicator); + MPI_Allreduce(&local_min, &global_min, 1, MPI_UNSIGNED_SHORT, MPI_MIN, communicator); // Calculate total number of materials n_mat = global_max - global_min + 1; - if (world_rank == 0) { - printf("# Number of materials: %i (from %u to %u)\n", n_mat, global_min, global_max); - printf("# Volume fractions\n"); - } + Log::io->info() << "# Number of materials: " << n_mat << " (from " << global_min << " to " << global_max << ")\n"; + Log::io->info() << "# Volume fractions\n"; // Using dynamic allocation for arrays since we don't know size at compile time std::vector vol_frac(n_mat, 0); @@ -52,26 +51,45 @@ void Reader::ComputeVolumeFractions() for (int i = 0; i < n_mat; i++) { long vf; - MPI_Allreduce(&(vol_frac[i]), &vf, 1, MPI_LONG, MPI_SUM, MPI_COMM_WORLD); + MPI_Allreduce(&(vol_frac[i]), &vf, 1, MPI_LONG, MPI_SUM, communicator); v_frac[i] = double(vf) / double(dims[0] * dims[1] * dims[2]); - if (world_rank == 0) - printf("# material %4u vol. frac. %10.4f%% \n", - static_cast(i) + global_min, 100. * v_frac[i]); + Log::io->info() << "# material " << static_cast(i) + global_min << " vol. frac. " << std::setw(10) << std::setprecision(4) << 100. 
diff --git a/src/reader.cpp b/src/reader.cpp
index 288d4af..b1fe343 100644
--- a/src/reader.cpp
+++ b/src/reader.cpp
@@ -1,5 +1,6 @@
 #include "general.h"
 #include "reader.h"
+#include "logging.h"
 #include "H5Cpp.h"
 #include "fftw3-mpi.h"
@@ -29,16 +30,14 @@ void Reader::ComputeVolumeFractions()
     // Find the global maximum and minimum material indices
     unsigned short global_max, global_min;
-    MPI_Allreduce(&local_max, &global_max, 1, MPI_UNSIGNED_SHORT, MPI_MAX, MPI_COMM_WORLD);
-    MPI_Allreduce(&local_min, &global_min, 1, MPI_UNSIGNED_SHORT, MPI_MIN, MPI_COMM_WORLD);
+    MPI_Allreduce(&local_max, &global_max, 1, MPI_UNSIGNED_SHORT, MPI_MAX, communicator);
+    MPI_Allreduce(&local_min, &global_min, 1, MPI_UNSIGNED_SHORT, MPI_MIN, communicator);
     // Calculate total number of materials
     n_mat = global_max - global_min + 1;
-    if (world_rank == 0) {
-        printf("# Number of materials: %i (from %u to %u)\n", n_mat, global_min, global_max);
-        printf("# Volume fractions\n");
-    }
+    Log::io->info() << "# Number of materials: " << n_mat << " (from " << global_min << " to " << global_max << ")\n";
+    Log::io->info() << "# Volume fractions\n";
     // Using dynamic allocation for arrays since we don't know size at compile time
     std::vector<long> vol_frac(n_mat, 0);
@@ -52,26 +51,45 @@
     for (int i = 0; i < n_mat; i++) {
         long vf;
-        MPI_Allreduce(&(vol_frac[i]), &vf, 1, MPI_LONG, MPI_SUM, MPI_COMM_WORLD);
+        MPI_Allreduce(&(vol_frac[i]), &vf, 1, MPI_LONG, MPI_SUM, communicator);
         v_frac[i] = double(vf) / double(dims[0] * dims[1] * dims[2]);
-        if (world_rank == 0)
-            printf("# material %4u vol. frac. %10.4f%% \n",
-                   static_cast<unsigned short>(i) + global_min, 100. * v_frac[i]);
+        Log::io->info() << "# material " << static_cast<unsigned short>(i) + global_min << " vol. frac. " << std::setw(10) << std::setprecision(4) << 100. * v_frac[i] << std::defaultfloat << "%\n";
     }
 }
-void Reader ::ReadInputFile(char input_fn[])
+void Reader::ReadInputFile(char input_fn[])
 {
     try {
-
-        MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);
-        MPI_Comm_size(MPI_COMM_WORLD, &world_size);
-
         ifstream i(input_fn);
         json     j;
         i >> j;
+        ReadJson(j);
+
+    } catch (const std::exception &e) {
+        Log::io->error() << "ERROR trying to read input file '" << input_fn << "' for FANS\n";
+        Log::finalize();
+        exit(10);
+    }
+}
+
+void Reader::ReadJson(json j)
+{
+    try {
         inputJson = j; // Store complete input JSON for MaterialManager
+        if (j.contains("no_mpi")) {
+            force_single_rank = true;
+            communicator      = MPI_COMM_SELF;
+        } else {
+            force_single_rank = false;
+            communicator      = MPI_COMM_WORLD;
+        }
+
+        MPI_Comm_rank(communicator, &world_rank);
+        MPI_Comm_size(communicator, &world_size);
+        Log::init(world_rank, world_size, communicator);
+        Log::io->trace() << "Running with total ranks=" << world_size << ", rank=" << world_rank << "\n";
+
         microstructure = j["microstructure"];
         strcpy(ms_filename, microstructure["filepath"].get<std::string>().c_str());
         strcpy(ms_datasetname, microstructure["datasetname"].get<std::string>().c_str());
@@ -148,22 +166,20 @@ void Reader ::ReadInputFile(char input_fn[])
             load_cases.push_back(std::move(lc));
         }
-        if (world_rank == 0) {
-            printf("# microstructure file name: \t '%s'\n", ms_filename);
-            printf("# microstructure dataset name: \t '%s'\n", ms_datasetname);
-            printf("# strain type: \t %s\n", strain_type.c_str());
-            printf("# problem type: \t %s\n", problemType.c_str());
-            printf("# FE type: \t %s\n", FE_type.c_str());
-            printf(
-                "# FANS error measure: \t %s %s error \n",
-                errorParameters["type"].get<std::string>().c_str(),
-                errorParameters["measure"].get<std::string>().c_str());
-            printf("# FANS Tolerance: \t %10.5e\n", errorParameters["tolerance"].get<double>());
-            printf("# Max iterations: \t %6i\n", n_it);
-        }
+        Log::io->info() << "# microstructure file name: \t " << ms_filename << "\n";
+        Log::io->info() << "# microstructure dataset name: \t " << ms_datasetname << "\n";
+        Log::io->info() << "# strain type: \t " << strain_type << "\n";
+        Log::io->info() << "# problem type: \t " << problemType << "\n";
+        Log::io->info() << "# FE type: \t " << FE_type << "\n";
+        Log::io->info() << "# FANS error measure: \t "
+                        << errorParameters["type"].get<std::string>() << " "
+                        << errorParameters["measure"].get<std::string>() << " error \n";
+        Log::io->info() << "# FANS Tolerance: \t " << std::setw(10) << std::scientific << std::setprecision(5) << errorParameters["tolerance"].get<double>() << std::defaultfloat << "\n";
+        Log::io->info() << "# Max iterations: \t " << n_it << "\n";
     } catch (const std::exception &e) {
-        fprintf(stderr, "ERROR trying to read input file '%s' for FANS\n", input_fn);
+        Log::io->error() << "ERROR trying to read input json for FANS\n";
+        Log::finalize();
         exit(10);
     }
 }
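The new ReadJson entry point lets an embedding application hand FANS an already-parsed configuration instead of a file path; a top-level "no_mpi" key (only its presence is checked, the value is ignored) switches the reader onto MPI_COMM_SELF and sets force_single_rank, so each process runs an independent solve. A minimal sketch of such an embedding, assuming Reader is default-constructible as it is used from main.cpp (hypothetical driver, not part of the patch):

    #include "reader.h"
    #include <string>

    // Hypothetical embedding driver: build the config in memory and bypass the file-based path.
    void run_embedded(const std::string &config_text)
    {
        json j = json::parse(config_text);
        j["no_mpi"] = true;  // key presence alone selects MPI_COMM_SELF / single-rank mode
        Reader reader;       // assumption: default-constructible, as in the CLI driver
        reader.ReadJson(j);  // stores inputJson, picks the communicator, and calls Log::init
    }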
@@ -222,8 +238,8 @@ void Reader ::ReadMS(int hm)
     hid_t  plist_id; /* property list identifier */
     herr_t status;
-    MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);
-    MPI_Comm_size(MPI_COMM_WORLD, &world_size);
+    MPI_Comm_rank(communicator, &world_rank);
+    MPI_Comm_size(communicator, &world_size);
     MPI_Info info = MPI_INFO_NULL;
     // Set up file access property list with parallel I/O access
@@ -257,13 +273,8 @@ void Reader ::ReadMS(int hm)
         H5Aclose(attr_id);
         H5Tclose(attr_type);
     }
-    if (world_rank == 0) {
-        if (is_zyx) {
-            printf("# Using Z-Y-X dimension ordering for the microstructure data\n");
-        } else {
-            printf("# Using X-Y-Z dimension ordering for the microstructure data\n");
-        }
-    }
+    if (is_zyx) Log::io->info() << "# Using Z-Y-X dimension ordering for the microstructure data\n";
+    else Log::io->info() << "# Using X-Y-Z dimension ordering for the microstructure data\n";
     dims.resize(3);
     if (is_zyx) { /* file layout Z Y X -> logical X Y Z */
@@ -281,18 +292,20 @@
     l_e[1] = L[1] / double(dims[1]);
     l_e[2] = L[2] / double(dims[2]);
-    if (world_rank == 0) {
-        printf("# grid size set to [%i x %i x %i] --> %i voxels \nMicrostructure length: [%3.6f x %3.6f x %3.6f]\n", dims[0], dims[1], dims[2], dims[0] * dims[1] * dims[2], L[0], L[1], L[2]);
-        if (dims[0] % 2 != 0)
-            fprintf(stderr, "[ FANS3D_Grid ] WARNING: n_x is not a multiple of 2\n");
-        if (dims[1] % 2 != 0)
-            fprintf(stderr, "[ FANS3D_Grid ] WARNING: n_y is not a multiple of 2\n");
-        if (dims[2] % 2 != 0)
-            fprintf(stderr, "[ FANS3D_Grid ] WARNING: n_z is not a multiple of 2\n");
-        if (dims[0] / 4 < world_size)
-            throw std::runtime_error("[ FANS3D_Grid ] ERROR: Please decrease the number of processes or increase the grid size to ensure that each process has at least 4 boxels in the x direction.");
-        printf("Voxel length: [%1.8f, %1.8f, %1.8f]\n", l_e[0], l_e[1], l_e[2]);
-    }
+    Log::io->info() << "# grid size set to [" << dims[0] << " x " << dims[1] << " x " << dims[2] << "] --> " << dims[0] * dims[1] * dims[2] << " voxels \n";
+    Log::io->info() << "Microstructure length: ["
+                    << std::setw(3) << std::setprecision(6) << L[0] << std::defaultfloat << " x "
+                    << std::setw(3) << std::setprecision(6) << L[1] << std::defaultfloat << " x "
+                    << std::setw(3) << std::setprecision(6) << L[2] << std::defaultfloat << "]\n";
+    if (dims[0] % 2 != 0) Log::io->warn() << "[ FANS3D_Grid ] WARNING: n_x is not a multiple of 2\n";
+    if (dims[1] % 2 != 0) Log::io->warn() << "[ FANS3D_Grid ] WARNING: n_y is not a multiple of 2\n";
+    if (dims[2] % 2 != 0) Log::io->warn() << "[ FANS3D_Grid ] WARNING: n_z is not a multiple of 2\n";
+    if (dims[0] / 4 < world_size)
+        throw std::runtime_error("[ FANS3D_Grid ] ERROR: Please decrease the number of processes or increase the grid size to ensure that each process has at least 4 boxels in the x direction.");
+    Log::io->info() << "Voxel length: ["
+                    << std::setw(1) << std::setprecision(8) << l_e[0] << std::defaultfloat << ", "
+                    << std::setw(1) << std::setprecision(8) << l_e[1] << std::defaultfloat << ", "
+                    << std::setw(1) << std::setprecision(8) << l_e[2] << std::defaultfloat << "]\n";
     const ptrdiff_t n[3] = {dims[0], dims[1], dims[2] / 2 + 1};
     ptrdiff_t block0 = FFTW_MPI_DEFAULT_BLOCK;
@@ -310,11 +323,11 @@
         ptrdiff_t *local_n1, ptrdiff_t *local_1_start); \
     */
-    alloc_local = fftw_mpi_local_size_many_transposed(rank, n, hm, block0, block1, MPI_COMM_WORLD, &local_n0, &local_0_start, &local_n1, &local_1_start);
+    alloc_local = fftw_mpi_local_size_many_transposed(rank, n, hm, block0, block1, communicator, &local_n0, &local_0_start, &local_n1, &local_1_start);
     if (local_n0 < 4)
         throw std::runtime_error("[ FANS3D_Grid ] ERROR: Number of voxels in x-direction is less than 4 in process " + to_string(world_rank));
-    MPI_Barrier(MPI_COMM_WORLD);
+    MPI_Barrier(communicator);
     hsize_t fcount[3], foffset[3];
     if (is_zyx) { /* file layout Z Y X */
@@ -381,6 +394,7 @@ void Reader ::ReadMS(int hm)
         FANS_free(tmp);
     } else {
         /* XYZ case: the slab is already in correct order */
+        FANS_free(ms); // dealloc mem
         ms = tmp;      // steal the buffer; no copy
     }
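The hunk that follows adds serialize_override / deserialize_override for the Serializable interface: the JSON configuration is written as a length-prefixed block (a header_len-sized size field followed by the dumped string), the fixed-size grid metadata is packed field by field, and finally the raw ms voxel slab is copied. Every field follows the same write-at-offset / advance-offset pattern; the generic sketch below illustrates that pattern with hypothetical helpers that are not in the patch (the patch spells the casts out by hand, and buffer_t / header_len come from the Serializable base):

    #include <cstddef>
    #include <cstring>

    // Copy one trivially copyable value into the buffer at 'offset' and return the
    // advanced offset. std::memcpy side-steps the alignment concerns that a raw
    // *reinterpret_cast<T *>(ptr) = value store can have on strict platforms.
    template <typename T>
    inline std::size_t put_at(unsigned char *base, std::size_t offset, const T &value)
    {
        std::memcpy(base + offset, &value, sizeof(T));
        return offset + sizeof(T);
    }

    template <typename T>
    inline std::size_t get_at(const unsigned char *base, std::size_t offset, T &value)
    {
        std::memcpy(&value, base + offset, sizeof(T));
        return offset + sizeof(T);
    }

On deserialization the same offsets are replayed in order, which is why the block sizes and the field order in the two override functions must stay in lockstep.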
@@ -396,11 +410,85 @@ void Reader ::ReadMS(int hm)
     this->ComputeVolumeFractions();
 }
+Reader::length_t Reader::serialize_override(buffer_t &buffer, length_t offset)
+{
+    // write JSON
+    std::string j_str      = inputJson.dump();
+    length_t    block_size = (j_str.size() + 1) * sizeof(char);
+    buffer.add_size(block_size + header_len);
+    *reinterpret_cast<length_t *>(std::data(buffer) + offset) = block_size;
+    offset += header_len;
+    std::memcpy(std::data(buffer) + offset, j_str.c_str(), block_size);
+    offset += block_size;
+
+    // write ms
+    block_size = sizeof(bool) +
+                 3 * sizeof(int) +
+                 3 * sizeof(double) +
+                 5 * sizeof(ptrdiff_t) +
+                 sizeof(int);
+    buffer.add_size(block_size);
+    *reinterpret_cast<bool *>(std::data(buffer) + offset) = is_zyx; offset += sizeof(bool);
+    std::memcpy(std::data(buffer) + offset, std::data(dims), sizeof(int) * 3); offset += sizeof(int) * 3;
+    std::memcpy(std::data(buffer) + offset, std::data(l_e), sizeof(double) * 3); offset += sizeof(double) * 3;
+    *reinterpret_cast<ptrdiff_t *>(std::data(buffer) + offset) = alloc_local;   offset += sizeof(ptrdiff_t);
+    *reinterpret_cast<ptrdiff_t *>(std::data(buffer) + offset) = local_n0;      offset += sizeof(ptrdiff_t);
+    *reinterpret_cast<ptrdiff_t *>(std::data(buffer) + offset) = local_0_start; offset += sizeof(ptrdiff_t);
+    *reinterpret_cast<ptrdiff_t *>(std::data(buffer) + offset) = local_n1;      offset += sizeof(ptrdiff_t);
+    *reinterpret_cast<ptrdiff_t *>(std::data(buffer) + offset) = local_1_start; offset += sizeof(ptrdiff_t);
+    *reinterpret_cast<int *>(std::data(buffer) + offset)       = n_mat;         offset += sizeof(int);
+
+    block_size = local_n0 * dims[1] * dims[2] * sizeof(unsigned short);
+    buffer.add_size(block_size);
+    std::memcpy(std::data(buffer) + offset, ms, block_size);
+    offset += block_size;
+
+    return offset;
+}
+
+Reader::length_t Reader::deserialize_override(buffer_t &buffer, length_t offset)
+{
+    // read JSON
+    length_t block_size = *reinterpret_cast<length_t *>(std::data(buffer) + offset);
+    offset += header_len;
+    std::string tmp("");
+    tmp.resize(block_size / sizeof(char), '\0');
+    std::memcpy(std::data(tmp), std::data(buffer) + offset, block_size);
+    offset += block_size;
+
+    std::istringstream iss(tmp);
+    json               j;
+    iss >> j;
+    ReadJson(j);
+
+    // read ms
+    is_zyx = *reinterpret_cast<bool *>(std::data(buffer) + offset); offset += sizeof(bool);
+    dims.resize(3);
+    std::memcpy(std::data(dims), std::data(buffer) + offset, sizeof(int) * 3); offset += sizeof(int) * 3;
+    l_e.resize(3);
+    std::memcpy(std::data(l_e), std::data(buffer) + offset, sizeof(double) * 3); offset += sizeof(double) * 3;
+    alloc_local   = *reinterpret_cast<ptrdiff_t *>(std::data(buffer) + offset); offset += sizeof(ptrdiff_t);
+    local_n0      = *reinterpret_cast<ptrdiff_t *>(std::data(buffer) + offset); offset += sizeof(ptrdiff_t);
+    local_0_start = *reinterpret_cast<ptrdiff_t *>(std::data(buffer) + offset); offset += sizeof(ptrdiff_t);
+    local_n1      = *reinterpret_cast<ptrdiff_t *>(std::data(buffer) + offset); offset += sizeof(ptrdiff_t);
+    local_1_start = *reinterpret_cast<ptrdiff_t *>(std::data(buffer) + offset); offset += sizeof(ptrdiff_t);
+    n_mat         = *reinterpret_cast<int *>(std::data(buffer) + offset);       offset += sizeof(int);
+
+    block_size = local_n0 * dims[1] * dims[2] * sizeof(unsigned short);
+    ms         = FANS_malloc<unsigned short>(static_cast<size_t>(local_n0) *
+                                             static_cast<size_t>(dims[1]) *
+                                             static_cast<size_t>(dims[2]));
+    std::memcpy(ms, std::data(buffer) + offset, block_size);
+    offset += block_size;
+
+    return offset;
+}
+
 void Reader::OpenResultsFile(const char *output_fn)
 {
     std::snprintf(results_filename, sizeof(results_filename), "%s", output_fn);
     hid_t plist_id = H5Pcreate(H5P_FILE_ACCESS);
-    H5Pset_fapl_mpio(plist_id, MPI_COMM_WORLD, MPI_INFO_NULL);
+    H5Pset_fapl_mpio(plist_id, communicator, MPI_INFO_NULL);
     results_file_id = H5Fcreate(results_filename, H5F_ACC_TRUNC, H5P_DEFAULT, plist_id);
     H5Pclose(plist_id);