Commit a224019

Record op shape data for profiler [cherry-pick PR43405 43578 43822] (#44384)
* add serialization for new field in event node (#43405)
* add serialization for new field in event node
* fix a bug
* add more field to memory record (#43578)
* Add infer shape in dygraph (#43822)
* record memory and op supplement info
* update
* update
* fix a bug
* fix memory recording
* fix a bug
* update
* update
* fix a bug
* update
* fix a bug
* fix a bug
* fix a bug
* update dygraph record
* add infer shape record
* fix
* fix
* fix
* add comments
* fix a bug
* fix
* fix
* add record op info
* fix file mode
* add op input shape info
* fix dependency
1 parent 94271bc commit a224019

27 files changed: +1975 −611 lines
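The behavioral core of this cherry-pick is small: around the interpreter's InferShape call, the infer_shape RecordEvent is now ended explicitly and platform::RecordOpInfoSupplement(...) is invoked with the op type, attributes, and the infer-shape/runtime contexts, so the profiler can attach input-shape data to the op's events. Below is a minimal, self-contained sketch of that end-the-timer-then-record-shapes pattern; RecordEvent and RecordOpInfoSupplement here are simplified stand-ins that only mirror the shape of Paddle's profiler calls, not the real classes.

// Sketch only: simplified stand-ins for platform::RecordEvent and
// platform::RecordOpInfoSupplement, showing the control flow added in
// RunInstruction (time InferShape, end the event, then record shapes).
#include <chrono>
#include <cstdint>
#include <iostream>
#include <map>
#include <string>
#include <vector>

class RecordEvent {  // stand-in: a scoped timer with an explicit End()
 public:
  explicit RecordEvent(std::string name)
      : name_(std::move(name)), start_(std::chrono::steady_clock::now()) {}
  void End() {
    if (ended_) return;
    ended_ = true;
    auto us = std::chrono::duration_cast<std::chrono::microseconds>(
                  std::chrono::steady_clock::now() - start_)
                  .count();
    std::cout << name_ << " took " << us << " us\n";
  }
  ~RecordEvent() { End(); }  // RAII: still closes the event if End() was skipped

 private:
  std::string name_;
  std::chrono::steady_clock::time_point start_;
  bool ended_ = false;
};

// Stand-in: record per-input tensor shapes for one op, keyed by input slot.
void RecordOpInfoSupplement(
    const std::string& op_type,
    const std::map<std::string, std::vector<std::vector<int64_t>>>& shapes) {
  for (const auto& kv : shapes) {
    std::cout << op_type << " input " << kv.first << ":";
    for (const auto& dims : kv.second) {
      std::cout << " [";
      for (size_t i = 0; i < dims.size(); ++i)
        std::cout << dims[i] << (i + 1 < dims.size() ? "x" : "");
      std::cout << "]";
    }
    std::cout << "\n";
  }
}

int main() {
  RecordEvent infershape_event("infer_shape");
  // ... InferShape would run here ...
  infershape_event.End();  // close the timed region first
  RecordOpInfoSupplement("matmul_v2",
                         {{"X", {{32, 128}}}, {"Y", {{128, 64}}}});
  return 0;
}

Ending the event explicitly before the supplement call keeps the shape-recording work out of the measured infer_shape interval, which appears to be why the diff calls End() instead of relying on the destructor.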

paddle/fluid/framework/new_executor/interpretercore.cc

Lines changed: 53 additions & 27 deletions
@@ -22,14 +22,17 @@
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/platform/os_info.h"
 #include "paddle/fluid/platform/profiler/event_tracing.h"
+#include "paddle/fluid/platform/profiler/supplement_tracing.h"
 #include "paddle/phi/core/kernel_context.h"
 #ifdef PADDLE_WITH_MKLDNN
 #include "paddle/fluid/platform/mkldnn_helper.h"
 #endif
 
-PADDLE_DEFINE_EXPORTED_bool(new_executor_use_inplace, true,
+PADDLE_DEFINE_EXPORTED_bool(new_executor_use_inplace,
+                            true,
                             "Use inplace in new executor");
-PADDLE_DEFINE_EXPORTED_bool(new_executor_use_local_scope, true,
+PADDLE_DEFINE_EXPORTED_bool(new_executor_use_local_scope,
+                            true,
                             "Use local_scope in new executor(especially used "
                             "in UT), can turn off for better performance");
 
@@ -167,8 +170,8 @@ paddle::framework::FetchList InterpreterCore::Run(
     // scope?
   }
   global_scope_->SetLocalScope(local_scope_);
-  paddle::framework::interpreter::build_variable_scope(block_, global_scope_,
-                                                        create_local_scope_);
+  paddle::framework::interpreter::build_variable_scope(
+      block_, global_scope_, create_local_scope_);
   std::vector<paddle::framework::OpFuncNode> op_func_nodes;
   paddle::framework::interpreter::build_op_func_list(
       place_, block_, &op_func_nodes, global_scope_, create_local_scope_);
@@ -490,7 +493,9 @@ void InterpreterCore::RunInstruction(const Instruction& instr_node) {
     // If it is OperatorBase, InferShape do nothing.
     if (op_with_kernel != nullptr) {
       platform::RecordEvent infershape_event(
-          "infer_shape", platform::TracerEventType::OperatorInner, 1,
+          "infer_shape",
+          platform::TracerEventType::OperatorInner,
+          1,
           platform::EventRole::kInnerOp);
 
       // see OperatorWithKernel::RunImpl in operator.cc for why
@@ -499,6 +504,11 @@ void InterpreterCore::RunInstruction(const Instruction& instr_node) {
         op_with_kernel->Info().infer_shape_(
             instr_node.InnerInferShapeContext().get());
       }
+      infershape_event.End();
+      platform::RecordOpInfoSupplement(op->Type(),
+                                       op->Attrs(),
+                                       *(instr_node.InnerInferShapeContext()),
+                                       *(instr_node.InnerRuntimeContext()));
     }
   }
 
@@ -516,7 +526,9 @@ void InterpreterCore::RunInstruction(const Instruction& instr_node) {
 
   {
     platform::RecordEvent compute_event(
-        "compute", platform::TracerEventType::OperatorInner, 1,
+        "compute",
+        platform::TracerEventType::OperatorInner,
+        1,
         platform::EventRole::kInnerOp);
     if (op_with_kernel == nullptr) {
       instr_node.OpBase()->Run(*local_scope, place_);
@@ -571,7 +583,8 @@ void InterpreterCore::RunInstruction(const Instruction& instr_node) {
   if (op_with_kernel != nullptr && FLAGS_check_nan_inf) {
     VLOG(4) << "Check nan/inf";
     framework::details::CheckOpHasNanOrInf(
-        *op, *global_scope_,
+        *op,
+        *global_scope_,
         place);  // TODO(xiongkun03) change it to inner scope.
   }
 }
@@ -596,10 +609,14 @@ void InterpreterCore::ExecuteInstructionList(
 
   for (size_t i = 0; i < dependecy_count_.size(); ++i) {
     if (dependecy_count_[i] == 0) {
-      async_work_queue_->AddTask(vec_instr.at(i).KernelType(), [
-        this, i, atomic_deps = atomic_deps.get(),
-        atomic_var_ref = atomic_var_ref.get()
-      ] { RunInstructionAsync(i, atomic_deps, atomic_var_ref); });
+      async_work_queue_->AddTask(vec_instr.at(i).KernelType(),
+                                 [this,
+                                  i,
+                                  atomic_deps = atomic_deps.get(),
+                                  atomic_var_ref = atomic_var_ref.get()] {
+                                   RunInstructionAsync(
+                                       i, atomic_deps, atomic_var_ref);
+                                 });
     }
   }
 
@@ -615,7 +632,8 @@ void InterpreterCore::ExecuteInstructionList(
   }
   VLOG(4) << "Cancel ok";
   PADDLE_ENFORCE_EQ(
-      main_thread_blocker_.Clear(), 0,
+      main_thread_blocker_.Clear(),
+      0,
       platform::errors::PreconditionNotMet(
           "main_thread_blocker_.Clear() return -1, clear failed"));
   VLOG(4) << "clear ok";
@@ -624,7 +642,8 @@ void InterpreterCore::ExecuteInstructionList(
 }
 
 void InterpreterCore::RunNextInstructions(
-    const Instruction& instr, std::queue<size_t>* reserved_next_ops,
+    const Instruction& instr,
+    std::queue<size_t>* reserved_next_ops,
     std::vector<std::atomic<size_t>>* atomic_deps,
     std::vector<std::atomic<size_t>>* atomic_var_ref) {
   auto& next_instr = instr.NextInstructions();
@@ -691,7 +710,8 @@ void InterpreterCore::RunNextInstructions(
 }
 
 void InterpreterCore::RunInstructionAsync(
-    size_t instr_id, std::vector<std::atomic<size_t>>* atomic_deps,
+    size_t instr_id,
+    std::vector<std::atomic<size_t>>* atomic_deps,
     std::vector<std::atomic<size_t>>* atomic_var_ref) {
   std::queue<size_t> ready_ops;
   ready_ops.push(instr_id);
@@ -700,10 +720,10 @@ void InterpreterCore::RunInstructionAsync(
     ready_ops.pop();
     auto& instr_node = vec_instruction_.at(instr_id);
     VLOG(5) << __func__ << " OP id:" << instr_node.Id()
-            << " name:" << instr_node.OpBase()->Type()
-            << " type:" << (instr_node.KernelType() == OpFuncType::kQueueSync
-                                ? "kQueueSync"
-                                : "kQueueAsync")
+            << " name:" << instr_node.OpBase()->Type() << " type:"
+            << (instr_node.KernelType() == OpFuncType::kQueueSync
+                    ? "kQueueSync"
+                    : "kQueueAsync")
             << " runs on " << platform::GetCurrentThreadName();
 
     auto* op = instr_node.OpBase();
@@ -877,12 +897,14 @@ void InterpreterCore::CheckGC(
 
       } else {
         static_cast<InterpreterCoreEventGarbageCollector*>(gc_.get())->Add(
-            var_scope.Var(var_id), &gc_event_.at(instr_id),
+            var_scope.Var(var_id),
+            &gc_event_.at(instr_id),
             &instr.DeviceContext());
       }
 #else
       static_cast<InterpreterCoreEventGarbageCollector*>(gc_.get())->Add(
-          var_scope.Var(var_id), &gc_event_.at(instr_id),
+          var_scope.Var(var_id),
+          &gc_event_.at(instr_id),
           &instr.DeviceContext());
 #endif
     }
@@ -891,20 +913,24 @@ void InterpreterCore::CheckGC(
 
 void InterpreterCore::Prepare(
     const std::vector<std::string>& feed_names,
-    const std::vector<framework::LoDTensor>& feed_tensors, bool prepare_feed) {
-  PADDLE_ENFORCE_EQ(feed_names.size(), feed_tensors.size(),
+    const std::vector<framework::LoDTensor>& feed_tensors,
+    bool prepare_feed) {
+  PADDLE_ENFORCE_EQ(feed_names.size(),
+                    feed_tensors.size(),
                     platform::errors::PreconditionNotMet(
                         "Required feed_names.size() == feed_tensors.size(), "
                         "but received %d != %d",
-                        feed_names.size(), feed_tensors.size()));
+                        feed_names.size(),
+                        feed_tensors.size()));
 
   auto FeedInput = [&] {
     VLOG(4) << "Feed inputs";
    for (size_t i = 0; i < feed_names.size(); ++i) {
      auto* feed_var = global_scope_->FindVar(feed_names[i]);
      PADDLE_ENFORCE_NOT_NULL(
-          feed_var, platform::errors::NotFound(
-                        "Variable %s should not be nullptr.", feed_names[i]));
+          feed_var,
+          platform::errors::NotFound("Variable %s should not be nullptr.",
+                                     feed_names[i]));
 
       auto feed_tensor = feed_var->GetMutable<framework::LoDTensor>();
       feed_tensor->ShareDataWith(feed_tensors[i]);
@@ -913,8 +939,8 @@ void InterpreterCore::Prepare(
   };
 
   if (!is_build_) {
-    paddle::framework::interpreter::build_variable_scope(block_, global_scope_,
-                                                          create_local_scope_);
+    paddle::framework::interpreter::build_variable_scope(
+        block_, global_scope_, create_local_scope_);
     FeedInput();
     std::vector<paddle::framework::OpFuncNode> op_func_nodes;
     paddle::framework::interpreter::build_op_func_list(
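Apart from the new supplement_tracing.h include and the infershape_event.End() / RecordOpInfoSupplement block, every hunk in this file is a formatting-only reflow (one argument per line). The reflowed ExecuteInstructionList hunk does show the interpreter's dispatch pattern, though: instructions whose dependency count is zero are submitted to the async work queue as lambdas that capture raw pointers to shared atomic counters, and RunNextInstructions takes the same counters to release downstream instructions. The toy sketch below illustrates that counter-driven dispatch with stand-in types; it is not Paddle's WorkQueue or Instruction, and it runs ready instructions in simple waves rather than through a real work queue.

// Toy model of dependency-counted dispatch: run every instruction whose
// counter is zero, then decrement its successors' counters and claim any
// that reach zero (fetch_sub returning 1 means this task drove it to zero).
#include <atomic>
#include <cstddef>
#include <future>
#include <iostream>
#include <vector>

struct Instr {
  std::vector<size_t> next;  // indices of downstream instructions
};

int main() {
  // A tiny DAG: 0 -> 2, 1 -> 2, 2 -> 3
  std::vector<Instr> instrs(4);
  instrs[0].next = {2};
  instrs[1].next = {2};
  instrs[2].next = {3};

  std::vector<std::atomic<size_t>> deps(instrs.size());
  deps[0] = 0;
  deps[1] = 0;
  deps[2] = 2;  // waits for 0 and 1
  deps[3] = 1;  // waits for 2

  auto run_one = [&](size_t id) {
    std::cout << "run instruction " << id << "\n";
    std::vector<size_t> newly_ready;
    for (size_t nxt : instrs[id].next)
      if (deps[nxt].fetch_sub(1) == 1) newly_ready.push_back(nxt);
    return newly_ready;
  };

  // Seed with instructions whose dependency count is already zero,
  // mirroring the dependecy_count_[i] == 0 check in ExecuteInstructionList.
  std::vector<size_t> ready;
  for (size_t i = 0; i < deps.size(); ++i)
    if (deps[i] == 0) ready.push_back(i);

  // Run each wave of ready instructions concurrently, then gather the next wave.
  while (!ready.empty()) {
    std::vector<std::future<std::vector<size_t>>> futures;
    for (size_t id : ready)
      futures.push_back(std::async(std::launch::async, run_one, id));
    ready.clear();
    for (auto& f : futures)
      for (size_t id : f.get()) ready.push_back(id);
  }
  return 0;
}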
