Skip to content
Closed
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 10 additions & 1 deletion src/ArbConnectivity.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
#include <numeric>
#include <vector>

#include "types.h"

struct ArbLatticeConnectivity {
using Index = long;
using ZoneIndex = unsigned short;
Expand All @@ -14,8 +16,10 @@ struct ArbLatticeConnectivity {
std::unique_ptr<Index[]> og_index;
std::unique_ptr<Index[]> nbrs;
std::unique_ptr<ZoneIndex[]> zones_per_node;
std::vector<Index> cart_index;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This seems to be only encode the position in a different way. Couldn't we just search in the coords?

std::vector<ZoneIndex> zones;
double grid_size{};
int nx, ny, nz; // total region

ArbLatticeConnectivity() = default;
ArbLatticeConnectivity(size_t chunk_begin_, size_t chunk_end_, size_t num_nodes_global_, size_t Q_)
Expand All @@ -26,7 +30,8 @@ struct ArbLatticeConnectivity {
coords(std::make_unique<double[]>(3 * (chunk_end_ - chunk_begin_))),
og_index(std::make_unique<Index[]>(chunk_end_ - chunk_begin_)),
nbrs(std::make_unique<Index[]>((chunk_end_ - chunk_begin_) * Q)),
zones_per_node(std::make_unique<ZoneIndex[]>(chunk_end_ - chunk_begin_)) {
zones_per_node(std::make_unique<ZoneIndex[]>(chunk_end_ - chunk_begin_)),
cart_index(chunk_end_ - chunk_begin_) {
zones.reserve(getLocalSize());
}

Expand All @@ -46,12 +51,16 @@ struct ArbLatticeConnectivity {
}

size_t getLocalSize() const { return chunk_end - chunk_begin; }

bool isGhost(Index nbr) const { return nbr != -1 && (nbr < static_cast<Index>(chunk_begin) || nbr >= static_cast<Index>(chunk_end)); }

double& coord(size_t dim, size_t local_node_ind) { return coords[local_node_ind + dim * getLocalSize()]; }
double coord(size_t dim, size_t local_node_ind) const { return coords[local_node_ind + dim * getLocalSize()]; }
Index& cartesian_ind(size_t local_node_ind) { return cart_index[local_node_ind]; }
Index cartesian_ind(size_t local_node_ind) const { return cart_index[local_node_ind]; }
Index& neighbor(size_t q, size_t local_node_ind) { return nbrs[local_node_ind + q * getLocalSize()]; }
Index neighbor(size_t q, size_t local_node_ind) const { return nbrs[local_node_ind + q * getLocalSize()]; }

};

inline auto computeInitialNodeDist(size_t num_nodes_global, size_t comm_size) -> std::vector<long> {
Expand Down
54 changes: 51 additions & 3 deletions src/ArbLattice.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ ArbLattice::ArbLattice(size_t num_snaps_, const UnitEnv& units_, const std::map<
void ArbLattice::initialize(size_t num_snaps_, const std::map<std::string, int>& setting_zones, pugi::xml_node arb_node) {
const int rank = mpitools::MPI_Rank(comm);
sizes.snaps = num_snaps_;
sample = std::make_unique<Sampler>(model.get(), units, rank);
#ifdef ADJOINT
sizes.snaps += 2; // Adjoint snaps are appended to the total snap allocation
#endif
Expand Down Expand Up @@ -71,6 +72,12 @@ int ArbLattice::reinitialize(size_t num_snaps_, const std::map<std::string, int>
return EXIT_SUCCESS;
}

/// Calculation of the offset from x, y and z
int ArbLattice::Offset(int x, int y, int z)
{
return x+connect.nx*y + connect.nx*connect.ny*z;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This doesn't seem to be a good idea, as the size of the thing can be quite big. Even if we stick with this solution, this should be 64bit (see sizeL in region)

}

void ArbLattice::readFromCxn(const std::string& cxn_path) {
using namespace std::string_literals;
using namespace std::string_view_literals;
Expand Down Expand Up @@ -152,6 +159,14 @@ void ArbLattice::readFromCxn(const std::string& cxn_path) {
file >> grid_size;
check_file_ok("Failed to read section: GRID_SIZE");

// Total region
file >> word;
check_file_ok("Failed to read section header: TOTAL_REGION");
check_expected_word("TOTAL_REGION", word);
int nx, ny, nz;
file >> nx >> ny >> nz;
check_file_ok("Failed to read section: TOTAL_REGION");

// Labels present in the .cxn file
const auto labels = process_section("NODE_LABELS", [&file](size_t n_labels) {
std::vector<std::string> retval(n_labels);
Expand All @@ -162,24 +177,30 @@ void ArbLattice::readFromCxn(const std::string& cxn_path) {

// Nodes header
process_section("NODES", [&](size_t num_nodes_global) {
// Compute the current rank's offset and number of nodes to read
// Compute the current ranks offset and number of nodes to read
const auto chunk_offsets = computeInitialNodeDist(num_nodes_global, static_cast<size_t>(comm_size));
const auto chunk_begin = static_cast<size_t>(chunk_offsets[comm_rank]), chunk_end = static_cast<size_t>(chunk_offsets[comm_rank + 1]);
const auto num_nodes_local = chunk_end - chunk_begin;

connect = ArbLatticeConnectivity(chunk_begin, chunk_end, num_nodes_global, Q);
connect.grid_size = grid_size;
connect.nx = nx;
connect.ny = ny;
connect.nz = nz;

// Skip chunk_begin + 1 (header) newlines
for (size_t i = 0; i != chunk_begin + 1; ++i) file.ignore(std::numeric_limits<std::streamsize>::max(), '\n');
check_file_ok("Failed to skip ahead to this rank's chunk");
check_file_ok("Failed to skip ahead to this ranks chunk");

// Parse node data
std::vector<long> nbrs_in_file(q_provided.size());
for (size_t local_node_ind = 0; local_node_ind != num_nodes_local; ++local_node_ind) {
// Read coords
file >> connect.coord(0, local_node_ind) >> connect.coord(1, local_node_ind) >> connect.coord(2, local_node_ind);

// Read cartesian index
file >> connect.cartesian_ind(local_node_ind);

// Read neighbors, map to local (required) ones
for (auto& nbr : nbrs_in_file) file >> nbr;
size_t j = 0;
Expand Down Expand Up @@ -539,6 +560,17 @@ std::vector<real_t> ArbLattice::getQuantity(const Model::Quantity& q, real_t sca
return ret;
}

void ArbLattice::getSample(int quant, lbRegion over, real_t scale, real_t *buf) {
setSnapIn(Snap);
#ifdef ADJOINT
setAdjSnapIn(aSnap);
#endif
lbRegion small = getLocalBoundingBox().intersect(over);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This seem to be wrong. The indexes in read from the file are in the global region, a this seems to take smaller (local) region.

auto it = std::find(connect.cart_index.begin(), connect.cart_index.end(), Offset(small.dx, small.dy, small.dz));
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is the only place these indexes are really used. Couldn't it be replaced with search for the closest point in coords?

int lid = std::distance(connect.cart_index.begin(), it);
launcher.SampleQuantity(quant, lid, buf, scale, data);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It doesn't seem very efficient to search this point every time.

}

std::vector<real_t> ArbLattice::getCoord(const Model::Coord& d, real_t scale) {
size_t size = getLocalSize();
std::vector<real_t> ret(size);
Expand Down Expand Up @@ -843,4 +875,20 @@ void ArbLattice::resetAverage(){
CudaMemset(&getSnapPtr(Snap)[f.id*sizes.snaps_pitch], 0, sizes.snaps_pitch*sizeof(real_t));
}
}
}
}

void ArbLattice::updateAllSamples(){
const int rank = mpitools::MPI_Rank(comm);
if (sample->size != 0) {
for (size_t j = 0; j < sample->spoints.size(); j++) {
if (rank == sample->spoints[j].rank) {
for(const Model::Quantity& q : model->quantities) {
if (sample->quant->in(q.name.c_str())){
double v = sample->units->alt(q.unit.c_str());
getSample(q.id, sample->spoints[j].location, 1/v, &sample->gpu_buffer[sample->location[q.name.c_str()]+(data.iter - sample->startIter)*sample->size + sample->totalIter*j*sample->size]);
}
}
}
}
}
}
7 changes: 6 additions & 1 deletion src/ArbLattice.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

#include "ArbConnectivity.hpp"
#include "ArbLatticeLauncher.h"
#include "Sampler.h"
#include "CudaUtils.hpp"
#include "LatticeBase.hpp"
#include "Lists.h"
Expand Down Expand Up @@ -71,6 +72,7 @@ class ArbLattice : public LatticeBase {
public:
static constexpr size_t Q = Model_m::Q; /// Stencil size
static constexpr size_t NF = Model_m::NF; /// Number of fields
std::unique_ptr<Sampler> sample; /// initializing sample with zero size

ArbLattice(size_t num_snaps_, const UnitEnv& units_, const std::map<std::string, int>& setting_zones, pugi::xml_node arb_node, MPI_Comm comm_);
ArbLattice(const ArbLattice&) = delete;
Expand All @@ -86,6 +88,7 @@ class ArbLattice : public LatticeBase {

virtual std::vector<int> shape() const { return {static_cast<int>(getLocalSize())}; };
virtual std::vector<real_t> getQuantity(const Model::Quantity& q, real_t scale = 1) ;
virtual void getSample(int quant, lbRegion over, real_t scale, real_t *buf);
virtual std::vector<big_flag_t> getFlags() const;
virtual std::vector<real_t> getField(const Model::Field& f);
virtual std::vector<real_t> getFieldAdj(const Model::Field& f);
Expand All @@ -99,8 +102,10 @@ class ArbLattice : public LatticeBase {
Span<const flag_t> getNodeTypes() const { return {node_types_host.data(), node_types_host.size()}; } /// Get host view of node types (permuted)
const ArbLatticeConnectivity& getConnectivity() const { return connect; }
const std::vector<unsigned>& getLocalPermutation() const { return local_permutation; }
lbRegion getLocalBoundingBox() const; /// Compute local bounding box, assuming the arbitrary lattice is a subset of a Cartesian lattice

void resetAverage();
void updateAllSamples();

protected:
ArbLatticeLauncher launcher; /// Launcher responsible for running CUDA kernels on the lattice
Expand Down Expand Up @@ -135,6 +140,7 @@ class ArbLattice : public LatticeBase {
void clearAdjoint() final; /// TODO

void initialize(size_t num_snaps_, const std::map<std::string, int>& setting_zones, pugi::xml_node arb_node); /// Init based on args
int Offset(int x, int y, int z); /// Calculation of the offset from x, y and z
void readFromCxn(const std::string& cxn_path); /// Read the lattice info from a .cxn file
void partition(); /// Repartition the lattice, if ParMETIS is not present this is a noop
std::function<bool(int, int)> makePermCompare(pugi::xml_node arb_node, const std::map<std::string, int>& setting_zones); /// Make type-erased comparison operator for computing the local permutation, according to the strategy specified in the xml file
Expand All @@ -149,7 +155,6 @@ class ArbLattice : public LatticeBase {
void initCommManager(); /// Compute which fields need to be sent to/received from which neighbors
void initContainer(); /// Initialize the data residing in launcher.container
int fullLatticePos(double pos) const; /// Compute the position (in terms of lattice offsets) of a node, assuming the arbitrary lattice is a subset of a Cartesian lattice
lbRegion getLocalBoundingBox() const; /// Compute local bounding box, assuming the arbitrary lattice is a subset of a Cartesian lattice
ArbVTUGeom makeVTUGeom() const; /// Compute VTU geometry
void communicateBorder(); /// Send and receive border values in snap (overlapped with interior computation)
unsigned lookupLocalGhostIndex(ArbLatticeConnectivity::Index gid) const; /// For a given ghost gid, look up its local id
Expand Down
2 changes: 2 additions & 0 deletions src/ArbLatticeLauncher.h.Rt
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ struct ArbLatticeLauncher {
void unpack(CudaStream_t stream) const;

void getQuantity(int quant, real_t* host_tab, real_t scale, const LatticeData& data) const;
void SampleQuantity(int quant, int lid, real_t *host_tab, real_t scale, const LatticeData& data);

private:
<?R for (q in rows(Quantities)) { ifdef(q$adjoint);
Expand All @@ -31,6 +32,7 @@ if (q$adjoint) {
}
?>
void getQuantity<?%s q$name ?>(<?%s q$type ?>* tab, real_t scale, const LatticeData& data) const;
void getSample<?%s q$name ?>(int lid, <?%s q$type ?>* tab, real_t scale, const LatticeData& data) const;
<?R }
ifdef() ?>
};
Expand Down
56 changes: 56 additions & 0 deletions src/ArbLatticeLauncher.hpp.Rt
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,22 @@ ifdef();
}
}

void ArbLatticeLauncher::SampleQuantity(int quant, int lid, real_t* host_tab, real_t scale, const LatticeData &data) {
switch(quant) { <?R
for (q in rows(Quantities)) { ifdef(q$adjoint);
?>
case <?%s q$Index ?>: {
const auto gpu_tab = cudaMakeUnique<<?%s q$type ?>>(1);
getSample<?%s q$name ?>(lid, gpu_tab.get(), scale, data);
CudaMemcpy(host_tab, gpu_tab.get(), sizeof(<?%s q$type ?>), CudaMemcpyDeviceToHost);
break;
} <?R
}
ifdef();
?>
}
}

<?R for (q in rows(Quantities)) { ifdef(q$adjoint);
if (q$adjoint) {
node = "Node_Adj"
Expand Down Expand Up @@ -128,10 +144,50 @@ if (q$type == "vector_t") {
}
};

struct GetSampleArbExecutor<?%s q$name?> : public LinearExecutor {
ArbLatticeContainer container;
LatticeData data;

int lid;
<?%s q$type ?>* buf;
real_t scale;

CudaDeviceFunction void Execute() const {
using LA = ArbLatticeAccess; <?R
if (q$adjoint) { ?>
using N = Node< LA, Adjoint, NoGlobals, Get >; <?R
} else { ?>
using N = Node< LA, Primal, NoGlobals, Get >; <?R
}?>

LA acc(lid, container);
N now(acc, data);
acc.pop(now); <?R
if (q$adjoint) { ?>
acc.pop_adj(now); <?R
} ?>
auto val = now.get<?%s q$name ?>(); <?R
if (q$type == "vector_t") {
for (coef in c("x","y","z")) { ?>
val.<?%s coef ?> *= scale; <?R
}
} else { ?>
val *= scale; <?R
} ?>
buf[0] = val;
}
};

void ArbLatticeLauncher::getQuantity<?%s q$name ?>(<?%s q$type ?>* tab, real_t scale, const LatticeData& data) const {
const GetQuantityArbExecutor<?%s q$name?> executor{{container.num_border_nodes + container.num_interior_nodes}, container, data, tab, scale};
LaunchExecutor(executor);
}

void ArbLatticeLauncher::getSample<?%s q$name ?>(int lid, <?%s q$type ?>* tab, real_t scale, const LatticeData& data) const {
const GetSampleArbExecutor<?%s q$name?> executor{{1}, container, data, lid, tab, scale};
LaunchExecutor(executor);
}

<?R }
ifdef() ?>

Expand Down
6 changes: 3 additions & 3 deletions src/CartLatticeLauncher.hpp.Rt
Original file line number Diff line number Diff line change
Expand Up @@ -255,9 +255,9 @@ public:

CudaDeviceFunction void Execute() const {
using LA = CartLatticeAccessAll;
int x = CudaBlock.x + small.dx;
int y = CudaBlock.y + small.dy;
int z = CudaBlock.z + small.dz;
int x = (CudaBlock.x + small.dx) %container.nx;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why did this appear here? x,y,z should always be in container's region, and modulu ("%") is very expensive and generally not very reliable.

int y = (CudaBlock.y + small.dy) %container.ny;
int z = (CudaBlock.z + small.dz) %container.nz;
LA acc(x, y, z, container); <?R
if (q$adjoint) { ?>
Node< LA, Adjoint, NoGlobals, Get > now(acc, data); <?R
Expand Down
Loading
Loading