From 7d35267908109e1f9b10df55abbe665e819782be Mon Sep 17 00:00:00 2001 From: Jingyue Wu Date: Sun, 4 Jan 2026 19:59:00 -0800 Subject: [PATCH 1/2] Fix several incorrect uses of try_emplace --- csrc/host_ir/lowering.cpp | 15 ++++++--------- csrc/multidevice/utils.cpp | 2 +- 2 files changed, 7 insertions(+), 10 deletions(-) diff --git a/csrc/host_ir/lowering.cpp b/csrc/host_ir/lowering.cpp index 30aa06e003b..d25e9ffcc12 100644 --- a/csrc/host_ir/lowering.cpp +++ b/csrc/host_ir/lowering.cpp @@ -192,11 +192,11 @@ void lowerSegment( out, DomainType::kLoop, {ParallelType::Stream})) { - auto [i, inserted] = replacement_map.try_emplace( - in, - hir::shardByStream(in, innermost.loop->index(), communication)); - if (inserted) { - innermost_scope.push_back(i->second->definition()); + TensorView*& sharded_in = replacement_map[in]; + if (sharded_in == nullptr) { + sharded_in = + hir::shardByStream(in, innermost.loop->index(), communication); + innermost_scope.push_back(sharded_in->definition()); } } @@ -210,7 +210,7 @@ void lowerSegment( nullptr) { innermost.parent_scope->insert( innermost.parent_insertion_point, allocate); - auto [i, inserted] = replacement_map.try_emplace( + auto [i, inserted] = replacement_map.emplace( out, hir::shardByStream(out, innermost.loop->index(), communication)); NVF_ERROR(inserted, "The input segmented fusion should be SSA."); @@ -314,9 +314,6 @@ void lowerSegment( innermost.parent_insertion_point, allocate); // Loop is stream parallelized but allocation is not. Therefore, // `out` should be allocated outside the loop. - // - // I use try_emplace here so shardByStream is called only when `out` - // is missing. TensorView* sharded_out = hir::shardByStream(out, innermost.loop->index(), e); replacement_map[out] = sharded_out; diff --git a/csrc/multidevice/utils.cpp b/csrc/multidevice/utils.cpp index c968a8545ca..e1b45cbbb52 100644 --- a/csrc/multidevice/utils.cpp +++ b/csrc/multidevice/utils.cpp @@ -114,7 +114,7 @@ std::unordered_map mapDeviceAndStreamParallelTypeToId } NVF_ERROR( - parallel_type_to_id.try_emplace(parallel_type, id).second, + parallel_type_to_id.emplace(parallel_type, id).second, "Found multiple loop IterDomains with the same parallel type (", parallel_type, "): ", From 1ec207128cef136c52ea100957c9b8fd085e6508 Mon Sep 17 00:00:00 2001 From: Jingyue Wu Date: Sun, 4 Jan 2026 22:58:37 -0800 Subject: [PATCH 2/2] Fix build --- csrc/host_ir/lowering.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/csrc/host_ir/lowering.cpp b/csrc/host_ir/lowering.cpp index d25e9ffcc12..ea1bc7ed21d 100644 --- a/csrc/host_ir/lowering.cpp +++ b/csrc/host_ir/lowering.cpp @@ -192,7 +192,7 @@ void lowerSegment( out, DomainType::kLoop, {ParallelType::Stream})) { - TensorView*& sharded_in = replacement_map[in]; + Val*& sharded_in = replacement_map[in]; if (sharded_in == nullptr) { sharded_in = hir::shardByStream(in, innermost.loop->index(), communication);