Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 24 additions & 0 deletions projects/miopen/src/include/miopen/conv/solvers.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -2804,6 +2804,9 @@ struct MIOPEN_INTERNALS_EXPORT GemmFwd1x1_0_2 final : GemmFwdBase

bool MayNeedWorkspace() const override { return true; }

bool IsSlow(const ExecutionContext& context,
const miopen::conv::ProblemDescription& problem) const override;

bool IsApplicable(const ExecutionContext&,
const miopen::conv::ProblemDescription&) const override;

Expand Down Expand Up @@ -2840,6 +2843,9 @@ struct MIOPEN_INTERNALS_EXPORT GemmFwd1x1_0_1 final : GemmFwdBase

bool MayNeedWorkspace() const override { return true; }

bool IsSlow(const ExecutionContext& context,
const miopen::conv::ProblemDescription& problem) const override;

bool IsApplicable(const ExecutionContext&,
const miopen::conv::ProblemDescription&) const override;

Expand All @@ -2858,6 +2864,9 @@ struct MIOPEN_INTERNALS_EXPORT GemmFwdRest final : GemmFwdBase

bool MayNeedWorkspace() const override { return true; }

bool IsSlow(const ExecutionContext& context,
const miopen::conv::ProblemDescription& problem) const override;

bool IsApplicable(const ExecutionContext&,
const miopen::conv::ProblemDescription&) const override;

Expand Down Expand Up @@ -2888,6 +2897,9 @@ struct MIOPEN_INTERNALS_EXPORT GemmBwd1x1_stride2 final : GemmBwdBase

bool MayNeedWorkspace() const override { return true; }

bool IsSlow(const ExecutionContext& context,
const miopen::conv::ProblemDescription& problem) const override;

bool IsApplicable(const ExecutionContext&,
const miopen::conv::ProblemDescription&) const override;

Expand All @@ -2906,6 +2918,9 @@ struct MIOPEN_INTERNALS_EXPORT GemmBwd1x1_stride1 final : GemmBwdBase

bool MayNeedWorkspace() const override { return true; }

bool IsSlow(const ExecutionContext& context,
const miopen::conv::ProblemDescription& problem) const override;

bool IsApplicable(const ExecutionContext&,
const miopen::conv::ProblemDescription& problem) const override;

Expand All @@ -2924,6 +2939,9 @@ struct MIOPEN_INTERNALS_EXPORT GemmBwdRest final : GemmBwdBase

bool MayNeedWorkspace() const override { return true; }

bool IsSlow(const ExecutionContext& context,
const miopen::conv::ProblemDescription& problem) const override;

bool IsApplicable(const ExecutionContext&,
const miopen::conv::ProblemDescription&) const override;

Expand All @@ -2948,6 +2966,9 @@ struct MIOPEN_INTERNALS_EXPORT GemmWrw1x1_stride1 final : GemmWrwBase
{
const std::string& SolverDbId() const override { return GetSolverDbId<GemmWrw1x1_stride1>(); }

bool IsSlow(const ExecutionContext& context,
const miopen::conv::ProblemDescription& problem) const override;

bool IsApplicable(const ExecutionContext&,
const miopen::conv::ProblemDescription&) const override;

Expand All @@ -2966,6 +2987,9 @@ struct MIOPEN_INTERNALS_EXPORT GemmWrwUniversal final : GemmWrwBase

bool MayNeedWorkspace() const override { return true; }

bool IsSlow(const ExecutionContext& context,
const miopen::conv::ProblemDescription& problem) const override;

bool IsApplicable(const ExecutionContext&,
const miopen::conv::ProblemDescription&) const override;

Expand Down
10 changes: 10 additions & 0 deletions projects/miopen/src/include/miopen/find_solution.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,8 @@
#include <miopen/search_options.hpp>
#include <miopen/solver_id.hpp>
#include <miopen/solver.hpp>
#include <miopen/env.hpp>
#include <miopen/generic_search_controls.hpp>

#include <limits>
#include <type_traits>
Expand Down Expand Up @@ -327,6 +329,10 @@ struct SolverContainer
{
MIOPEN_LOG_I2(solver.SolverDbId() << ": Not applicable");
}
else if(env::enabled(MIOPEN_SEARCH_CUTOFF) && solver.IsSlow(ctx, problem))
{
MIOPEN_LOG_I2(solver.SolverDbId() << ": Skipped (slow, search cutoff active)");
}
else
{
const Solution s =
Expand Down Expand Up @@ -382,6 +388,10 @@ struct SolverContainer
{
MIOPEN_LOG_I2(solver.SolverDbId() << ": Not applicable");
}
else if(env::enabled(MIOPEN_SEARCH_CUTOFF) && solver.IsSlow(ctx, problem))
{
MIOPEN_LOG_I2(solver.SolverDbId() << ": Skipped (slow, search cutoff active)");
}
else
{
auto db = [&]() -> PerformanceDb& {
Expand Down
3 changes: 3 additions & 0 deletions projects/miopen/src/include/miopen/solver.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -151,6 +151,9 @@ struct SolverInterface : SolverBase

/// Returns the workspace size required by the solver for the given Problem
virtual size_t GetWorkspaceSize(const Context&, const Problem&) const { return 0; };

/// Returns true if the solver is expected to be slow for the given problem.
virtual bool IsSlow(const Context&, const Problem&) const { return false; };
};

/// Common interface for non-tunable solvers
Expand Down
108 changes: 108 additions & 0 deletions projects/miopen/src/solver/conv/gemm.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
#include <miopen/solver/gemm_common.hpp>

#include <ranges>
#include <set>

namespace miopen {
namespace solver {
Expand Down Expand Up @@ -201,6 +202,37 @@ size_t GemmFwd1x1_0_2::GetWorkspaceSize(const ExecutionContext& context,
#endif
}

bool GemmFwd1x1_0_2::IsSlow(const ExecutionContext& context,
const ProblemDescription& problem) const
{
const std::string& arch = context.GetStream().GetDeviceName();
const std::set<std::string> mi = {"gfx942", "gfx955"};
const bool is_mi = mi.find(arch) != mi.end();
const bool is_gfx11 = StartsWith(arch, "gfx11");
const bool is_gfx12 = StartsWith(arch, "gfx12");

auto s = problem.GetOutHeight() * problem.GetOutWidth();
auto c = problem.GetInChannels() + problem.GetOutChannels();
auto g = problem.GetGroupCount();
auto channels_per_group = c / g;
auto spatial_work_per_group = s * channels_per_group;

if(is_gfx11 || is_gfx12)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why are the gfx11/12 even needed since doesn't is_mi already gate on it being exactly gfx942 or gfx950?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There is the potential that other archs outside those explicitly listed could slip in here. So this is for specificity.
In this case the gfx11/12 condition doesn't help because there is no filter. It's here because the frame of IsSlow is basically copy pasted and the filtering logic came in as blank later.

{
return false;
}
else if(is_mi)
{
// PRIMARY: Extreme low CPG detection
// SWPG < 150k: Low spatial-channel work
// CPG < 48: Extremely low channels per group (critical discriminator)
if(spatial_work_per_group < 150000 && channels_per_group < 48)
return true;
}

return false;
}

bool GemmFwd1x1_0_2::IsApplicable(const ExecutionContext& context,
const ProblemDescription& problem) const
{
Expand Down Expand Up @@ -620,6 +652,37 @@ size_t GemmFwd1x1_0_1::GetWorkspaceSize(const ExecutionContext&, const ProblemDe
return 0;
}

bool GemmFwd1x1_0_1::IsSlow(const ExecutionContext& context,
const ProblemDescription& problem) const
{
const std::string& arch = context.GetStream().GetDeviceName();
const std::set<std::string> mi = {"gfx942", "gfx955"};
const bool is_mi = mi.find(arch) != mi.end();
const bool is_gfx11 = StartsWith(arch, "gfx11");
const bool is_gfx12 = StartsWith(arch, "gfx12");

auto s = problem.GetOutHeight() * problem.GetOutWidth();
auto c = problem.GetInChannels() + problem.GetOutChannels();
auto g = problem.GetGroupCount();
auto channels_per_group = c / g;
auto spatial_work_per_group = s * channels_per_group;

if(is_gfx11 || is_gfx12)
{
return false;
}
else if(is_mi)
{
// PRIMARY: Memory-bound small problem detection
// SWPG < 200k: Low spatial-channel work (memory-bound)
// CPG < 704: Moderate channels (poor reuse)
if(spatial_work_per_group < 200000 && channels_per_group < 704)
return true;
}

return false;
}

bool GemmFwd1x1_0_1::IsApplicable(const ExecutionContext& context,
const ProblemDescription& problem) const
{
Expand Down Expand Up @@ -842,6 +905,51 @@ size_t GemmFwdRest::GetWorkspaceSize(const ExecutionContext& context,
#endif
}

bool GemmFwdRest::IsSlow(const ExecutionContext& context, const ProblemDescription& problem) const
{
const std::string& arch = context.GetStream().GetDeviceName();
const std::set<std::string> mi = {"gfx942", "gfx955"};
const bool is_mi = mi.find(arch) != mi.end();
const bool is_gfx11 = StartsWith(arch, "gfx11");
const bool is_gfx12 = StartsWith(arch, "gfx12");

auto b = problem.GetBatchSize();
auto s = problem.GetOutHeight() * problem.GetOutWidth();
auto c = problem.GetInChannels() + problem.GetOutChannels();
auto g = problem.GetGroupCount();
auto spatial_per_batch = s / b;
auto channels_per_group = c / g;
auto spatial_work_per_group = s * channels_per_group;

if(is_gfx11 || is_gfx12)
{
// GemmFwdRest - SPB-only filtering
// Analysis: 63.8% terrible cases - batch fragmentation dominant
//
// Terrible cases have high batch (32x) but low SPB (0.07x)
// This indicates batch fragmentation → poor GPU utilization
//
// SPB < 50: Low spatial-per-batch = batch fragmentation
// Performance: FPR=11-25%, TPR=61-76%, Score=1.37-1.60
if(spatial_per_batch < 50)
return true;
}
else if(is_mi)
{
// PRIMARY: Memory-bound small problem detection
// SWPG < 2.5M: Low spatial-channel work (memory-bound)
// CPG < 56: Very low channels (minimal reuse)
if(spatial_work_per_group < 2500000 && channels_per_group < 56)
return true;

// SECONDARY: Batch fragmentation detection
if(spatial_per_batch < 24.0)
return true;
}

return false;
}

bool GemmFwdRest::IsApplicable(const ExecutionContext& context,
const ProblemDescription& problem) const
{
Expand Down
112 changes: 112 additions & 0 deletions projects/miopen/src/solver/conv/gemm_bwd.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
#include <miopen/solver/gemm_common.hpp>

#include <ranges>
#include <set>

namespace miopen {
namespace solver {
Expand Down Expand Up @@ -173,6 +174,37 @@ size_t GemmBwd1x1_stride2::GetWorkspaceSize(const ExecutionContext& context,
#endif
}

bool GemmBwd1x1_stride2::IsSlow(const ExecutionContext& context,
const ProblemDescription& problem) const
{
const std::string& arch = context.GetStream().GetDeviceName();
const std::set<std::string> mi = {"gfx942", "gfx955"};
const bool is_mi = mi.find(arch) != mi.end();
const bool is_gfx11 = StartsWith(arch, "gfx11");
const bool is_gfx12 = StartsWith(arch, "gfx12");

auto s = problem.GetOutHeight() * problem.GetOutWidth();
auto c = problem.GetInChannels() + problem.GetOutChannels();
auto g = problem.GetGroupCount();
auto channels_per_group = c / g;
auto spatial_work_per_group = s * channels_per_group;

if(is_gfx11 || is_gfx12)
{
return false;
}
else if(is_mi)
{
// PRIMARY: Extreme low CPG detection
// SWPG < 400k: Moderate spatial-channel work
// CPG < 192: Low channels per group (critical discriminator)
if(spatial_work_per_group < 400000 && channels_per_group < 192)
return true;
}

return false;
}

bool GemmBwd1x1_stride2::IsApplicable(const ExecutionContext& context,
const ProblemDescription& problem) const
{
Expand Down Expand Up @@ -377,6 +409,37 @@ size_t GemmBwd1x1_stride1::GetWorkspaceSize(const ExecutionContext&,
return 0;
}

bool GemmBwd1x1_stride1::IsSlow(const ExecutionContext& context,
const ProblemDescription& problem) const
{
const std::string& arch = context.GetStream().GetDeviceName();
const std::set<std::string> mi = {"gfx942", "gfx955"};
const bool is_mi = mi.find(arch) != mi.end();
const bool is_gfx11 = StartsWith(arch, "gfx11");
const bool is_gfx12 = StartsWith(arch, "gfx12");

auto s = problem.GetOutHeight() * problem.GetOutWidth();
auto c = problem.GetInChannels() + problem.GetOutChannels();
auto g = problem.GetGroupCount();
auto channels_per_group = c / g;
auto spatial_work_per_group = s * channels_per_group;

if(is_gfx11 || is_gfx12)
{
return false;
}
else if(is_mi)
{
// PRIMARY: Memory-bound small problem detection
// SWPG < 200k: Low spatial-channel work (memory-bound)
// CPG < 640: Moderate channels (poor reuse)
if(spatial_work_per_group < 200000 && channels_per_group < 640)
return true;
}

return false;
}

bool GemmBwd1x1_stride1::IsApplicable(const ExecutionContext& context,
const ProblemDescription& problem) const
{
Expand Down Expand Up @@ -566,6 +629,55 @@ size_t GemmBwdRest::GetWorkspaceSize(const ExecutionContext& context,
#endif
}

bool GemmBwdRest::IsSlow(const ExecutionContext& context, const ProblemDescription& problem) const
{
const std::string& arch = context.GetStream().GetDeviceName();
const std::set<std::string> mi = {"gfx942", "gfx955"};
const bool is_mi = mi.find(arch) != mi.end();
const bool is_gfx11 = StartsWith(arch, "gfx11");
const bool is_gfx12 = StartsWith(arch, "gfx12");

auto b = problem.GetBatchSize();
auto s = problem.GetOutHeight() * problem.GetOutWidth();
auto c = problem.GetInChannels() + problem.GetOutChannels();
auto g = problem.GetGroupCount();
auto spatial_per_batch = s / b;
auto channels_per_group = c / g;
auto spatial_work_per_group = s * channels_per_group;

if(is_gfx11 || is_gfx12)
{
// GemmBwdRest - Multi-metric filtering
// Analysis: 51.6% terrible cases - significant filtering benefit
//
// PRIMARY: Memory-bound small problem detection
// SWPG < 1.6M: Low spatial-channel work
// CPG < 360: Low channels
if(spatial_work_per_group < 1600000 && channels_per_group < 360)
return true;

// SECONDARY: Batch fragmentation detection
// SPB < 0.8: Extreme batch fragmentation
if(spatial_per_batch < 0.8)
return true;
}
else if(is_mi)
{
// PRIMARY: Memory-bound small problem detection
// SWPG < 3M: Low spatial-channel work
// CPG < 112: Very low channels
if(spatial_work_per_group < 3000000 && channels_per_group < 112)
return true;

// SECONDARY: Batch fragmentation detection
// SPB < 40.0: Each batch item has < 40 pixels of spatial work
if(spatial_per_batch < 40.0)
return true;
}

return false;
}

bool GemmBwdRest::IsApplicable(const ExecutionContext& context,
const ProblemDescription& problem) const
{
Expand Down
Loading
Loading