2 changes: 2 additions & 0 deletions include/flexflow/flexflow_c.h
@@ -95,6 +95,8 @@ void flexflow_model_compute_metrics(flexflow_model_t handle);

void flexflow_model_update(flexflow_model_t handle);

void flexflow_model_unified_update(flexflow_model_t handle);

void flexflow_model_compile(flexflow_model_t handle,
enum LossType loss_type,
int *metrics,
2 changes: 2 additions & 0 deletions include/flexflow/model.h
@@ -151,6 +151,7 @@ enum TaskIDs {
// Optimizer with NCCL
SGD_UPD_NCCL_TASK_ID,
ADAM_UPD_NCCL_TASK_ID,
ADAM_UNIFY_UPD_NCCL_TASK_ID,
// Initializer
GLOROT_INIT_TASK_ID,
ZERO_INIT_TASK_ID,
@@ -777,6 +778,7 @@ class FFModel {
void get_metrics();
void backward(int seq_length = -1);
void update();
void unified_update();
bool apply_fusion(std::vector<Op *> const &operators,
std::vector<Op *> &new_operators);
Op *get_final_operator() const;
7 changes: 7 additions & 0 deletions include/flexflow/ops/dropout.h
@@ -5,6 +5,13 @@
#include "flexflow/node.h"
#include "flexflow/operator.h"
#include "flexflow/ops/dropout_params.h"
#if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA)
#include <curand.h>
#include <curand_kernel.h>
#elif defined(FF_USE_HIP_ROCM)
#include <hiprand/hiprand.h>
#include <hiprand/hiprand_kernel.h>
#endif

namespace FlexFlow {

16 changes: 12 additions & 4 deletions include/flexflow/ops/kernels/dropout_kernels.h
@@ -5,6 +5,7 @@
#include "flexflow/fftype.h"
#include "flexflow/op_meta.h"
#include "flexflow/ops/dropout.h"
#include "flexflow/accessor.h"

namespace FlexFlow {

@@ -17,33 +18,40 @@ class DropoutMeta : public OpMeta {
~DropoutMeta(void);
Realm::RegionInstance reserveInst;
#if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA)
curandState *state;
cudnnTensorDescriptor_t inputTensor, outputTensor;
cudnnDropoutDescriptor_t dropoutDesc;
#else
miopenTensorDescriptor_t inputTensor, outputTensor;
miopenDropoutDescriptor_t dropoutDesc;
hiprandState *state;
#endif
void *reserveSpace, *dropoutStates;
size_t reserveSpaceSize, dropoutStateSize;
size_t num_elements;
long long seed;
float rate;
};

namespace Kernels {
namespace Dropout {
void forward_kernel_wrapper(DropoutMeta *m,
float const *input_ptr,
float *output_ptr);
GenericTensorAccessorR const &input,
GenericTensorAccessorW const &output);
void backward_kernel_wrapper(DropoutMeta *m,
float const *output_grad_ptr,
float *input_grad_ptr);
GenericTensorAccessorR const &output_grad,
GenericTensorAccessorW const &input_grad);

namespace Internal {
void forward_kernel(DropoutMeta *m,
float const *input_ptr,
float *output_ptr,
size_t num_elements,
ffStream_t stream);
void backward_kernel(DropoutMeta *m,
float const *output_grad_ptr,
float *input_grad_ptr,
size_t num_elements,
ffStream_t stream);
} // namespace Internal
} // namespace Dropout
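For context on the new wrapper signatures and the seed/rate fields above: dropout forward draws a keep/drop mask, and backward must replay the same mask on the gradient. The sketch below is a plain NumPy illustration of that semantics, not FlexFlow code; the inverted-dropout 1/(1 - rate) scaling and the explicit mask reuse are assumptions based on the conventional formulation.

```python
import numpy as np

def dropout_forward(x, rate, seed):
    # Draw the keep/drop mask once per forward pass; the saved state in
    # DropoutMeta is what lets the backward pass replay the same mask.
    rng = np.random.default_rng(seed)
    mask = (rng.random(x.shape) >= rate).astype(x.dtype)
    scale = 1.0 / (1.0 - rate)  # inverted dropout keeps E[output] == E[input]
    return x * mask * scale, mask

def dropout_backward(grad_output, mask, rate):
    # Dropped elements get zero gradient; kept elements are rescaled by the
    # same factor used in the forward pass.
    return grad_output * mask * (1.0 / (1.0 - rate))
```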
23 changes: 23 additions & 0 deletions include/flexflow/optimizer.h
@@ -18,6 +18,7 @@

#include "flexflow/parallel_tensor.h"
#include "legion.h"
#include "accessor.h"

namespace FlexFlow {

@@ -30,6 +31,7 @@ class Optimizer {
virtual void init(void) = 0;
virtual void next(void) = 0;
virtual void update(const ParallelTensor p) = 0;
virtual void unified_update(std::vector<ParallelTensor> const parameters) = 0;
FFModel const *model;
};

@@ -43,6 +45,7 @@ class SGDOptimizer : public Optimizer {
void init(void);
void next(void);
void update(const ParallelTensor p);
void unified_update(std::vector<ParallelTensor> const parameters);
void set_weight_decay(double _weight_decay);
static void ps_update_task(Legion::Task const *task,
std::vector<Legion::PhysicalRegion> const &regions,
@@ -60,6 +63,11 @@
std::vector<Legion::PhysicalRegion> const &regions,
Legion::Context ctx,
Legion::Runtime *runtime);
static void
nccl_unified_update_task(Legion::Task const *task,
std::vector<Legion::PhysicalRegion> const &regions,
Legion::Context ctx,
Legion::Runtime *runtime);
static void nccl_update_task_gpu(SGDOptimizer const *op,
OpMeta const *meta,
float const *w_grad_ptr,
@@ -85,6 +93,7 @@ class AdamOptimizer : public Optimizer {
void init(void);
void next(void);
void update(const ParallelTensor p);
void unified_update(std::vector<ParallelTensor> const parameters);
void set_weight_decay(double _weight_decay);
static void ps_update_task(Legion::Task const *task,
std::vector<Legion::PhysicalRegion> const &regions,
@@ -103,17 +112,31 @@
std::vector<Legion::PhysicalRegion> const &regions,
Legion::Context ctx,
Legion::Runtime *runtime);
static void
nccl_unified_update_task(Legion::Task const *task,
std::vector<Legion::PhysicalRegion> const &regions,
Legion::Context ctx,
Legion::Runtime *runtime);
static void nccl_update_task_gpu(AdamOptimizer const *op,
OpMeta const *meta,
float const *w_grad_ptr,
size_t size,
float *w_ptr,
float *v_ptr,
float *m_ptr);
static void nccl_unified_update_task_gpu(AdamOptimizer const *op,
OpMeta const *meta,
GenericTensorAccessorR *accWGrads,
size_t *size,
GenericTensorAccessorW *accWs,
GenericTensorAccessorW *accVs,
GenericTensorAccessorW *accMs);
#endif
double alpha, beta1, beta2, weight_decay, epsilon;
double alpha_t, beta1_t, beta2_t;
std::map<Legion::LogicalRegion, ParallelTensor> v_values, m_values;
size_t reservedWorkSpaceSize = 0;
int parameters_num = 0;
};

}; // namespace FlexFlow
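The unified_update additions above suggest a single fused optimizer launch over all parameter tensors (note the new parameters_num and reservedWorkSpaceSize fields) rather than one NCCL update task per tensor. As a rough NumPy sketch only, and assuming the per-element math is the conventional Adam step using the alpha_t, beta1, beta2, epsilon, and weight_decay fields declared above (how weight decay is folded in is an assumption):

```python
import numpy as np

def adam_unified_update(params, grads, ms, vs,
                        alpha_t, beta1, beta2, epsilon, weight_decay):
    """One fused Adam step over every parameter tensor in a single pass.

    params/grads/ms/vs are parallel lists of same-shaped arrays; alpha_t is
    assumed to already carry the bias correction (as beta1_t/beta2_t suggest).
    """
    for w, g, m, v in zip(params, grads, ms, vs):
        g = g + weight_decay * w                       # L2 weight decay (assumed convention)
        m[:] = beta1 * m + (1.0 - beta1) * g           # first-moment estimate
        v[:] = beta2 * v + (1.0 - beta2) * g * g       # second-moment estimate
        w[:] -= alpha_t * m / (np.sqrt(v) + epsilon)   # bias-corrected step
```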
7 changes: 7 additions & 0 deletions python/flexflow/core/flexflow_cffi.py
@@ -2018,6 +2018,13 @@ def update(self):
:returns: None -- no returns.
"""
ffc.flexflow_model_update(self.handle)

def unified_update(self):
"""Update weights and biases of all layers.

:returns: None -- no returns.
"""
ffc.flexflow_model_unified_update(self.handle)

def compile(self, optimizer=None, loss_type=None, metrics=None, comp_mode=None):
"""Configure the model for trainting. FlexFlow uses lazy initialization,
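A minimal usage sketch for the new Python binding, assuming the usual FFModel training-loop methods (compile, zero_gradients, forward, backward) and that unified_update is a drop-in replacement for update; the optimizer, enum values, and loop bounds below are placeholders:

```python
# Hypothetical usage; assumes ffmodel/ffoptimizer are built as in the
# existing FlexFlow Python examples, and that the LossType/MetricsType
# enums are importable from flexflow.core as usual.
from flexflow.core import LossType, MetricsType

ffmodel.compile(optimizer=ffoptimizer,
                loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY,
                metrics=[MetricsType.METRICS_ACCURACY])

num_epochs, num_batches = 10, 100   # placeholder loop bounds
for epoch in range(num_epochs):
    for batch in range(num_batches):
        ffmodel.zero_gradients()
        ffmodel.forward()
        ffmodel.backward()
        # One fused optimizer step over every trainable tensor, rather than
        # a per-parameter ffmodel.update().
        ffmodel.unified_update()
```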
5 changes: 5 additions & 0 deletions src/c/flexflow_c.cc
@@ -178,6 +178,11 @@ void flexflow_model_update(flexflow_model_t handle_) {
handle->update();
}

void flexflow_model_unified_update(flexflow_model_t handle_) {
FFModel *handle = FFCObjectWrapper::unwrap(handle_);
handle->unified_update();
}

void flexflow_model_compile(flexflow_model_t handle_,
enum LossType loss_type,
int *metrics,
38 changes: 25 additions & 13 deletions src/ops/dropout.cc
@@ -28,7 +28,7 @@ using PCG::Node;

using namespace FlexFlow::Kernels::Dropout;

Tensor FFModel::dropout(const Tensor input,
Tensor FFModel::dropout(Tensor const input,
float rate,
unsigned long long seed,
char const *name) {
@@ -86,7 +86,7 @@ bool operator==(DropoutParams const &lhs, DropoutParams const &rhs) {
}

Dropout::Dropout(FFModel &model,
const ParallelTensor _input,
ParallelTensor const _input,
float _rate,
unsigned long long _seed,
char const *name)
@@ -111,12 +111,12 @@ Dropout::Dropout(FFModel &model,

Dropout::Dropout(FFModel &model,
Dropout const &other,
const ParallelTensor input)
ParallelTensor const input)
: Dropout(model, input, other.rate, other.seed, other.name) {}

Dropout::Dropout(FFModel &model,
DropoutParams const &params,
const ParallelTensor input,
ParallelTensor const input,
char const *name)
: Dropout(model, input, params.rate, params.seed, name) {}

@@ -210,12 +210,12 @@ void Dropout::forward_task(Task const *task,
assert(task->regions.size() == 2);
// const Dropout* dropout = (const Dropout*) task->args;
DropoutMeta *m = *((DropoutMeta **)task->local_args);
float const *input_ptr = helperGetTensorPointerRO<float>(
regions[0], task->regions[0], FID_DATA, ctx, runtime);
float *output_ptr = helperGetTensorPointerWO<float>(
regions[1], task->regions[1], FID_DATA, ctx, runtime);

forward_kernel_wrapper(m, input_ptr, output_ptr);

GenericTensorAccessorR input = helperGetGenericTensorAccessorRO(
m->input_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime);
GenericTensorAccessorW output = helperGetGenericTensorAccessorWO(
m->output_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime);
forward_kernel_wrapper(m, input, output);
}

void Dropout::backward(FFModel const &ff) {
@@ -264,7 +264,13 @@ void Dropout::backward_task(Task const *task,
float const *output_grad_ptr = helperGetTensorPointerRO<float>(
regions[1], task->regions[1], FID_DATA, ctx, runtime);

backward_kernel_wrapper(m, output_grad_ptr, input_grad_ptr);

GenericTensorAccessorW input_grad = helperGetGenericTensorAccessorRW(
m->output_type[0], regions[0], task->regions[0], FID_DATA, ctx, runtime);
GenericTensorAccessorR output_grad = helperGetGenericTensorAccessorRO(
m->input_type[0], regions[1], task->regions[1], FID_DATA, ctx, runtime);

backward_kernel_wrapper(m, output_grad, input_grad);
}

void Dropout::serialize(Legion::Serializer &sez) const {
@@ -304,30 +310,36 @@ bool Dropout::measure_operator_cost(Simulator *sim,
sim->free_all();
float *input_ptr = (float *)sim->allocate(sub_input.get_volume(), DT_FLOAT);
assert(input_ptr != NULL);

GenericTensorAccessorR input_acc(m->input_type[0], sub_input.get_domain(), input_ptr);
cost_metrics.inputs_memory += cost_metrics.total_mem_diff_from(sim->offset);

float *output_ptr = (float *)sim->allocate(sub_output.get_volume(), DT_FLOAT);
assert(output_ptr != NULL);

GenericTensorAccessorW output_acc(m->output_type[0], sub_input.get_domain(), output_ptr);
cost_metrics.outputs_memory += cost_metrics.total_mem_diff_from(sim->offset);

assert(m->profiling == false);

std::function<void()> forward, backward;
forward = [&] { forward_kernel_wrapper(m, input_ptr, output_ptr); };
forward = [&] { forward_kernel_wrapper(m, input_acc, output_acc); };
if (sim->computationMode == COMP_MODE_TRAINING) {
float *input_grad_ptr =
(float *)sim->allocate(sub_input.get_volume(), DT_FLOAT);
assert(input_grad_ptr != NULL);
GenericTensorAccessorW input_grad_acc(m->output_type[0], sub_input.get_domain(), input_grad_ptr);
cost_metrics.inputs_memory += cost_metrics.total_mem_diff_from(sim->offset);

float *output_grad_ptr =
(float *)sim->allocate(sub_output.get_volume(), DT_FLOAT);
assert(output_grad_ptr != NULL);
GenericTensorAccessorR output_grad_acc(m->output_type[0], sub_input.get_domain(), output_grad_ptr);
cost_metrics.outputs_memory +=
cost_metrics.total_mem_diff_from(sim->offset);

backward = [&] {
backward_kernel_wrapper(m, output_grad_ptr, input_grad_ptr);
backward_kernel_wrapper(m, output_grad_acc, input_grad_acc);
};
}

8 changes: 2 additions & 6 deletions src/ops/fused.cu
@@ -216,9 +216,7 @@ __host__ void FusedOp::forward_task(Task const *task,
assert(fused->op_num_outputs[op] == 1);
DropoutMeta *m = (DropoutMeta *)metas->meta[op];
Kernels::Dropout::forward_kernel_wrapper(
m,
my_input_accessor[0].get_float_ptr(),
my_output_accessor[0].get_float_ptr());
m, my_input_accessor[0], my_output_accessor[0]);
break;
}
case OP_LINEAR: {
@@ -819,9 +817,7 @@ __host__ void FusedOp::backward_task(Task const *task,
assert(fused->op_num_outputs[op] == 1);
DropoutMeta *m = (DropoutMeta *)metas->meta[op];
Kernels::Dropout::backward_kernel_wrapper(
m,
my_output_grad_accessor[0].get_float_ptr(),
my_input_grad_accessor[0].get_float_ptr());
m, my_output_grad_accessor[0], my_input_grad_accessor[0]);
break;
}
case OP_EW_ADD: