diff --git a/Project.toml b/Project.toml
index 99d1605b69..a149b51dad 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,7 +1,10 @@
 name = "Reactant"
 uuid = "3c362404-f566-11ee-1572-e11a4b42c853"
-authors = ["William Moses <wmoses@mit.edu>", "Valentin Churavy <vchuravy@mit.edu>", "Sergio Sánchez Ramírez <sergio.sanchez.ramirez@bsc.es>", "Paul Berg <paul@plutojl.org>", "Avik Pal <avikpal@mit.edu>", "Mosè Giordano <mose@gnu.org>"]
 version = "0.2.203"
+authors = ["William Moses <wmoses@mit.edu>", "Valentin Churavy <vchuravy@mit.edu>", "Sergio Sánchez Ramírez <sergio.sanchez.ramirez@bsc.es>", "Paul Berg <paul@plutojl.org>", "Avik Pal <avikpal@mit.edu>", "Mosè Giordano <mose@gnu.org>"]
+
+[workspace]
+projects = ["docs", "test", "benchmark"]
 
 [deps]
 Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
@@ -30,7 +33,6 @@ ReactantCore = "a3311ec8-5e00-46d5-b541-4f83e724a433"
 Reactant_jll = "0192cb87-2b54-54ad-80e0-3be72ad8a3c0"
 ScopedValues = "7e506255-f358-4e82-b7e4-beb19740aa63"
 Scratch = "6c6a2e73-6563-6170-7368-637461726353"
-Setfield = "efcf1570-3423-57d1-acb7-fd33fddbac46"
 Sockets = "6462fe0b-24de-5631-8697-dd941f90decc"
 UUIDs = "cf7118a7-6976-5b1a-9a39-7adc72f591a4"
 p7zip_jll = "3f19e933-33d8-53b3-aaab-bd5110c3b7a0"
@@ -124,7 +126,6 @@ ReactantCore = "0.1.16"
 Reactant_jll = "0.0.305"
 ScopedValues = "1.3.0"
 Scratch = "1.2"
-Setfield = "1.1.2"
 Sockets = "1.10"
 SparseArrays = "1.10"
 SpecialFunctions = "2.4"
@@ -141,6 +142,3 @@ ArrayInterface = "4fba245c-0d91-5ea0-9b3e-6abc04ee57a9"
 CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
 NNlib = "872c559c-99b0-510c-b3b7-b6c96a88d5cd"
 PythonCall = "6099a3de-0909-46bc-b1f4-468b9a2dfc0d"
-
-[workspace]
-projects = ["docs", "test", "benchmark"]
diff --git a/src/Compiler.jl b/src/Compiler.jl
index 966e93181d..295cfdc241 100644
--- a/src/Compiler.jl
+++ b/src/Compiler.jl
@@ -3763,8 +3763,8 @@ function compile_xla(
 
             exec = XLA.compile(
                 client,
-                mod;
-                compile_options=xla_compile_options,
+                mod,
+                xla_compile_options;
                 num_outputs=length(mlir_fn_res.linear_results),
                 num_parameters=length(mlir_fn_res.linear_args),
                 mlir_fn_res.is_sharded,
diff --git a/src/proto/google/protobuf/any_pb.jl b/src/proto/google/protobuf/any_pb.jl
index 7fb8b3bc1f..c49a0a0357 100644
--- a/src/proto/google/protobuf/any_pb.jl
+++ b/src/proto/google/protobuf/any_pb.jl
@@ -5,7 +5,7 @@ using ProtoBuf.EnumX: @enumx
 export var"#Any"
 
 
-struct var"#Any"
+mutable struct var"#Any"
     type_url::String
     value::Vector{UInt8}
 end
diff --git a/src/proto/google/protobuf/duration_pb.jl b/src/proto/google/protobuf/duration_pb.jl
index 6b763a9102..9b1ca2271d 100644
--- a/src/proto/google/protobuf/duration_pb.jl
+++ b/src/proto/google/protobuf/duration_pb.jl
@@ -5,7 +5,7 @@ using ProtoBuf.EnumX: @enumx
 export Duration
 
 
-struct Duration
+mutable struct Duration
     seconds::Int64
     nanos::Int32
 end
diff --git a/src/proto/google/protobuf/timestamp_pb.jl b/src/proto/google/protobuf/timestamp_pb.jl
index 02b5887556..4f2e19cd0a 100644
--- a/src/proto/google/protobuf/timestamp_pb.jl
+++ b/src/proto/google/protobuf/timestamp_pb.jl
@@ -5,7 +5,7 @@ using ProtoBuf.EnumX: @enumx
 export Timestamp
 
 
-struct Timestamp
+mutable struct Timestamp
     seconds::Int64
     nanos::Int32
 end
diff --git a/src/proto/google/protobuf/wrappers_pb.jl b/src/proto/google/protobuf/wrappers_pb.jl
index 397ad4f251..06b78e538d 100644
--- a/src/proto/google/protobuf/wrappers_pb.jl
+++ b/src/proto/google/protobuf/wrappers_pb.jl
@@ -6,7 +6,7 @@ export BoolValue, Int64Value, FloatValue, Int32Value, DoubleValue, UInt64Value,
 export BytesValue, StringValue
 
 
-struct BoolValue
+mutable struct BoolValue
     value::Bool
 end
 PB.default_values(::Type{BoolValue}) = (;value = false)
@@ -36,7 +36,7 @@ function PB._encoded_size(x::BoolValue)
     return encoded_size
 end
 
-struct Int64Value
+mutable struct Int64Value
     value::Int64
 end
 PB.default_values(::Type{Int64Value}) = (;value = zero(Int64))
@@ -66,7 +66,7 @@ function PB._encoded_size(x::Int64Value)
     return encoded_size
 end
 
-struct FloatValue
+mutable struct FloatValue
     value::Float32
 end
 PB.default_values(::Type{FloatValue}) = (;value = zero(Float32))
@@ -96,7 +96,7 @@ function PB._encoded_size(x::FloatValue)
     return encoded_size
 end
 
-struct Int32Value
+mutable struct Int32Value
     value::Int32
 end
 PB.default_values(::Type{Int32Value}) = (;value = zero(Int32))
@@ -126,7 +126,7 @@ function PB._encoded_size(x::Int32Value)
     return encoded_size
 end
 
-struct DoubleValue
+mutable struct DoubleValue
     value::Float64
 end
 PB.default_values(::Type{DoubleValue}) = (;value = zero(Float64))
@@ -156,7 +156,7 @@ function PB._encoded_size(x::DoubleValue)
     return encoded_size
 end
 
-struct UInt64Value
+mutable struct UInt64Value
     value::UInt64
 end
 PB.default_values(::Type{UInt64Value}) = (;value = zero(UInt64))
@@ -186,7 +186,7 @@ function PB._encoded_size(x::UInt64Value)
     return encoded_size
 end
 
-struct UInt32Value
+mutable struct UInt32Value
     value::UInt32
 end
 PB.default_values(::Type{UInt32Value}) = (;value = zero(UInt32))
@@ -216,7 +216,7 @@ function PB._encoded_size(x::UInt32Value)
     return encoded_size
 end
 
-struct BytesValue
+mutable struct BytesValue
     value::Vector{UInt8}
 end
 PB.default_values(::Type{BytesValue}) = (;value = UInt8[])
@@ -246,7 +246,7 @@ function PB._encoded_size(x::BytesValue)
     return encoded_size
 end
 
-struct StringValue
+mutable struct StringValue
     value::String
 end
 PB.default_values(::Type{StringValue}) = (;value = "")
diff --git a/src/proto/stream_executor/cuda_compute_capability_pb.jl b/src/proto/stream_executor/cuda_compute_capability_pb.jl
index 64306bb8dc..c6f1b40094 100644
--- a/src/proto/stream_executor/cuda_compute_capability_pb.jl
+++ b/src/proto/stream_executor/cuda_compute_capability_pb.jl
@@ -7,7 +7,7 @@ export var"CudaComputeCapabilityProto.FeatureExtension", CudaComputeCapabilityPr
 
 @enumx var"CudaComputeCapabilityProto.FeatureExtension" UNSPECIFIED=0 NONE=1 ACCELERATED_FEATURES=2 FAMILY_COMPATIBLE_FEATURES=3
 
-struct CudaComputeCapabilityProto
+mutable struct CudaComputeCapabilityProto
     major::Int32
     minor::Int32
     feature_extension::var"CudaComputeCapabilityProto.FeatureExtension".T
diff --git a/src/proto/stream_executor/device_description_pb.jl b/src/proto/stream_executor/device_description_pb.jl
index 86a3b39964..3ee940eb0a 100644
--- a/src/proto/stream_executor/device_description_pb.jl
+++ b/src/proto/stream_executor/device_description_pb.jl
@@ -6,7 +6,7 @@ export RocmComputeCapabilityProto, DnnVersionInfoProto, RuntimeVersionProto
 export GpuDeviceInfoProto, GpuComputeCapabilityProto, GpuTargetConfigProto
 
 
-struct RocmComputeCapabilityProto
+mutable struct RocmComputeCapabilityProto
     gcn_arch_name::String
 end
 PB.default_values(::Type{RocmComputeCapabilityProto}) = (;gcn_arch_name = "")
@@ -36,7 +36,7 @@ function PB._encoded_size(x::RocmComputeCapabilityProto)
     return encoded_size
 end
 
-struct DnnVersionInfoProto
+mutable struct DnnVersionInfoProto
     major::Int32
     minor::Int32
     patch::Int32
@@ -78,7 +78,7 @@ function PB._encoded_size(x::DnnVersionInfoProto)
     return encoded_size
 end
 
-struct RuntimeVersionProto
+mutable struct RuntimeVersionProto
     major::Int32
     minor::Int32
     patch::Int32
@@ -120,7 +120,7 @@ function PB._encoded_size(x::RuntimeVersionProto)
     return encoded_size
 end
 
-struct GpuDeviceInfoProto
+mutable struct GpuDeviceInfoProto
     threads_per_block_limit::Int32
     threads_per_warp::Int32
     shared_memory_per_block::Int32
@@ -267,7 +267,7 @@ function PB._encoded_size(x::GpuDeviceInfoProto)
     return encoded_size
 end
 
-struct GpuComputeCapabilityProto
+mutable struct GpuComputeCapabilityProto
     compute_capability::Union{Nothing,OneOf{<:Union{CudaComputeCapabilityProto,RocmComputeCapabilityProto}}}
 end
 PB.oneof_field_types(::Type{GpuComputeCapabilityProto}) = (;
@@ -312,7 +312,7 @@ function PB._encoded_size(x::GpuComputeCapabilityProto)
     return encoded_size
 end
 
-struct GpuTargetConfigProto
+mutable struct GpuTargetConfigProto
     gpu_device_info::Union{Nothing,GpuDeviceInfoProto}
     platform_name::String
     dnn_version_info::Union{Nothing,DnnVersionInfoProto}
diff --git a/src/proto/tensorflow/profiler/diagnostics_pb.jl b/src/proto/tensorflow/profiler/diagnostics_pb.jl
index c2d6990cd7..0d205b057e 100644
--- a/src/proto/tensorflow/profiler/diagnostics_pb.jl
+++ b/src/proto/tensorflow/profiler/diagnostics_pb.jl
@@ -5,7 +5,7 @@ using ProtoBuf.EnumX: @enumx
 export Diagnostics
 
 
-struct Diagnostics
+mutable struct Diagnostics
     info::Vector{String}
     warnings::Vector{String}
     errors::Vector{String}
diff --git a/src/proto/tensorflow/profiler/hardware_types_pb.jl b/src/proto/tensorflow/profiler/hardware_types_pb.jl
index 9b4f8e3f0a..c4995d4e95 100644
--- a/src/proto/tensorflow/profiler/hardware_types_pb.jl
+++ b/src/proto/tensorflow/profiler/hardware_types_pb.jl
@@ -7,7 +7,7 @@ export HardwareType, GPUComputeCapability, DeviceCapabilities
 
 @enumx HardwareType UNKNOWN_HARDWARE=0 CPU_ONLY=1 GPU=2 TPU=3
 
-struct GPUComputeCapability
+mutable struct GPUComputeCapability
     major::UInt32
     minor::UInt32
 end
@@ -43,7 +43,7 @@ function PB._encoded_size(x::GPUComputeCapability)
     return encoded_size
 end
 
-struct DeviceCapabilities
+mutable struct DeviceCapabilities
     clock_rate_in_ghz::Float64
     num_cores::UInt32
     memory_size_in_bytes::UInt64
diff --git a/src/proto/tensorflow/profiler/hlo_stats/hlo_stats_pb.jl b/src/proto/tensorflow/profiler/hlo_stats/hlo_stats_pb.jl
index d5a01b81b2..94096c455a 100644
--- a/src/proto/tensorflow/profiler/hlo_stats/hlo_stats_pb.jl
+++ b/src/proto/tensorflow/profiler/hlo_stats/hlo_stats_pb.jl
@@ -5,7 +5,7 @@ using ProtoBuf.EnumX: @enumx
 export HloStatsRecord, HloStatsDatabase
 
 
-struct HloStatsRecord
+mutable struct HloStatsRecord
     rank::UInt64
     program_id::UInt64
     hlo_category::String
@@ -240,7 +240,7 @@ function PB._encoded_size(x::HloStatsRecord)
     return encoded_size
 end
 
-struct HloStatsDatabase
+mutable struct HloStatsDatabase
     hlo_stats_record::Vector{HloStatsRecord}
 end
 PB.default_values(::Type{HloStatsDatabase}) = (;hlo_stats_record = Vector{HloStatsRecord}())
diff --git a/src/proto/tensorflow/profiler/input_pipeline_pb.jl b/src/proto/tensorflow/profiler/input_pipeline_pb.jl
index 61d9dc6242..9497f26308 100644
--- a/src/proto/tensorflow/profiler/input_pipeline_pb.jl
+++ b/src/proto/tensorflow/profiler/input_pipeline_pb.jl
@@ -7,7 +7,7 @@ export InputPipelineAnalysisRecommendation, InputOpDetails, StepSummary
 export GenericStepTimeBreakdown, InputPipelineAnalysisResult
 
 
-struct InputTimeBreakdown
+mutable struct InputTimeBreakdown
     demanded_file_read_us::Float64
     advanced_file_read_us::Float64
     preprocessing_us::Float64
@@ -61,7 +61,7 @@ function PB._encoded_size(x::InputTimeBreakdown)
     return encoded_size
 end
 
-struct PerGenericStepDetails
+mutable struct PerGenericStepDetails
     step_number::Int32
     step_name::String
     step_time_ms::Float64
@@ -164,7 +164,7 @@ function PB._encoded_size(x::PerGenericStepDetails)
     return encoded_size
 end
 
-struct BottleneckAnalysis
+mutable struct BottleneckAnalysis
     input_percent::Float64
     output_percent::Float64
     idle_percent::Float64
@@ -260,7 +260,7 @@ function PB._encoded_size(x::BottleneckAnalysis)
     return encoded_size
 end
 
-struct InputPipelineAnalysisRecommendation
+mutable struct InputPipelineAnalysisRecommendation
     details::Vector{String}
     bottleneck_analysis::Union{Nothing,google.protobuf.var"#Any"}
     summary_next_step::String
@@ -302,7 +302,7 @@ function PB._encoded_size(x::InputPipelineAnalysisRecommendation)
     return encoded_size
 end
 
-struct InputOpDetails
+mutable struct InputOpDetails
     op_name::String
     count::UInt64
     time_in_ms::Float64
@@ -368,7 +368,7 @@ function PB._encoded_size(x::InputOpDetails)
     return encoded_size
 end
 
-struct StepSummary
+mutable struct StepSummary
     average::Float64
     standard_deviation::Float64
     minimum::Float64
@@ -416,7 +416,7 @@ function PB._encoded_size(x::StepSummary)
     return encoded_size
 end
 
-struct GenericStepTimeBreakdown
+mutable struct GenericStepTimeBreakdown
     unknown_time_ms_summary::Union{Nothing,StepSummary}
     host_wait_input_ms_summary::Union{Nothing,StepSummary}
     host_to_device_ms_summary::Union{Nothing,StepSummary}
@@ -507,7 +507,7 @@ function PB._encoded_size(x::GenericStepTimeBreakdown)
     return encoded_size
 end
 
-struct InputPipelineAnalysisResult
+mutable struct InputPipelineAnalysisResult
     tag::Bool
     hardware_type::String
     step_time_summary::Union{Nothing,StepSummary}
diff --git a/src/proto/tensorflow/profiler/kernel_stats_pb.jl b/src/proto/tensorflow/profiler/kernel_stats_pb.jl
index f5d132ffe0..889f53813c 100644
--- a/src/proto/tensorflow/profiler/kernel_stats_pb.jl
+++ b/src/proto/tensorflow/profiler/kernel_stats_pb.jl
@@ -5,7 +5,7 @@ using ProtoBuf.EnumX: @enumx
 export KernelReport, KernelStatsDb
 
 
-struct KernelReport
+mutable struct KernelReport
     name::String
     registers_per_thread::UInt32
     static_shmem_bytes::UInt32
@@ -113,7 +113,7 @@ function PB._encoded_size(x::KernelReport)
     return encoded_size
 end
 
-struct KernelStatsDb
+mutable struct KernelStatsDb
     reports::Vector{KernelReport}
 end
 PB.default_values(::Type{KernelStatsDb}) = (;reports = Vector{KernelReport}())
diff --git a/src/proto/tensorflow/profiler/memory_profile_pb.jl b/src/proto/tensorflow/profiler/memory_profile_pb.jl
index 8e5dc9648b..7ad2d362e0 100644
--- a/src/proto/tensorflow/profiler/memory_profile_pb.jl
+++ b/src/proto/tensorflow/profiler/memory_profile_pb.jl
@@ -9,7 +9,7 @@ export MemoryProfile
 
 @enumx MemoryActivity UNKNOWN_ACTIVITY=0 ALLOCATION=1 DEALLOCATION=2 RESERVATION=3 EXPANSION=4
 
-struct MemoryAggregationStats
+mutable struct MemoryAggregationStats
     stack_reserved_bytes::Int64
     heap_allocated_bytes::Int64
     free_memory_bytes::Int64
@@ -63,7 +63,7 @@ function PB._encoded_size(x::MemoryAggregationStats)
     return encoded_size
 end
 
-struct ActiveAllocation
+mutable struct ActiveAllocation
     snapshot_index::Int64
     special_index::Int64
     num_occurrences::Int64
@@ -105,7 +105,7 @@ function PB._encoded_size(x::ActiveAllocation)
     return encoded_size
 end
 
-struct MemoryActivityMetadata
+mutable struct MemoryActivityMetadata
     memory_activity::MemoryActivity.T
     requested_bytes::Int64
     allocation_bytes::Int64
@@ -183,7 +183,7 @@ function PB._encoded_size(x::MemoryActivityMetadata)
     return encoded_size
 end
 
-struct MemoryProfileSummary
+mutable struct MemoryProfileSummary
     peak_bytes_usage_lifetime::Int64
     peak_stats::Union{Nothing,MemoryAggregationStats}
     peak_stats_time_ps::Int64
@@ -231,7 +231,7 @@ function PB._encoded_size(x::MemoryProfileSummary)
     return encoded_size
 end
 
-struct MemoryProfileSnapshot
+mutable struct MemoryProfileSnapshot
     time_offset_ps::Int64
     aggregation_stats::Union{Nothing,MemoryAggregationStats}
     activity_metadata::Union{Nothing,MemoryActivityMetadata}
@@ -273,7 +273,7 @@ function PB._encoded_size(x::MemoryProfileSnapshot)
     return encoded_size
 end
 
-struct PerAllocatorMemoryProfile
+mutable struct PerAllocatorMemoryProfile
     memory_profile_snapshots::Vector{MemoryProfileSnapshot}
     profile_summary::Union{Nothing,MemoryProfileSummary}
     active_allocations::Vector{ActiveAllocation}
@@ -327,7 +327,7 @@ function PB._encoded_size(x::PerAllocatorMemoryProfile)
     return encoded_size
 end
 
-struct MemoryProfile
+mutable struct MemoryProfile
     memory_profile_per_allocator::Dict{String,PerAllocatorMemoryProfile}
     num_hosts::Int32
     memory_ids::Vector{String}
diff --git a/src/proto/tensorflow/profiler/op_metrics_pb.jl b/src/proto/tensorflow/profiler/op_metrics_pb.jl
index a242f228b1..0d87ed2f7e 100644
--- a/src/proto/tensorflow/profiler/op_metrics_pb.jl
+++ b/src/proto/tensorflow/profiler/op_metrics_pb.jl
@@ -11,7 +11,7 @@ abstract type var"##Abstract#OpMetrics" end
 abstract type var"##Abstract#OpMetricsDb" end
 
 
-struct PrecisionStats
+mutable struct PrecisionStats
     compute_16bit_ps::UInt64
     compute_32bit_ps::UInt64
 end
@@ -59,7 +59,7 @@ end
 
 @enumx MemorySpace MEMORY_SPACE_UNDEFINED=0 MEMORY_SPACE_HBM=1 MEMORY_SPACE_ON_CHIP=2147483646 MEMORY_SPACE_ALL=2147483647
 
-struct var"PerformanceInfo.MemoryAccessed"
+mutable struct var"PerformanceInfo.MemoryAccessed"
     is_read::Bool
     memory_space::var"PerformanceInfo.MemoryAccessed.MemorySpace".T
     bytes_accessed::Int64
@@ -101,7 +101,7 @@ function PB._encoded_size(x::var"PerformanceInfo.MemoryAccessed")
     return encoded_size
 end
 
-struct var"LayoutAnalysis.Dimension"
+mutable struct var"LayoutAnalysis.Dimension"
     size::Int32
     alignment::Int32
     semantics::LayoutDimensionSemantics.T
@@ -143,7 +143,7 @@ function PB._encoded_size(x::var"LayoutAnalysis.Dimension")
     return encoded_size
 end
 
-struct var"OpMetrics.MemoryAccessed"
+mutable struct var"OpMetrics.MemoryAccessed"
     operation_type::var"OpMetrics.MemoryAccessed.OperationType".T
     memory_space::UInt64
     bytes_accessed::UInt64
@@ -185,7 +185,7 @@ function PB._encoded_size(x::var"OpMetrics.MemoryAccessed")
     return encoded_size
 end
 
-struct PerformanceInfo
+mutable struct PerformanceInfo
     flops::Int64
     bytes_accessed::Int64
     memory_accessed_breakdown::Vector{var"PerformanceInfo.MemoryAccessed"}
@@ -228,7 +228,7 @@ function PB._encoded_size(x::PerformanceInfo)
     return encoded_size
 end
 
-struct LayoutAnalysis
+mutable struct LayoutAnalysis
     dimensions::Vector{var"LayoutAnalysis.Dimension"}
 end
 PB.default_values(::Type{LayoutAnalysis}) = (;dimensions = Vector{var"LayoutAnalysis.Dimension"}())
@@ -258,7 +258,7 @@ function PB._encoded_size(x::LayoutAnalysis)
     return encoded_size
 end
 
-struct MemoryAccessBreakdown
+mutable struct MemoryAccessBreakdown
     memory_accessed::Vector{var"OpMetrics.MemoryAccessed"}
 end
 PB.default_values(::Type{MemoryAccessBreakdown}) = (;memory_accessed = Vector{var"OpMetrics.MemoryAccessed"}())
@@ -289,7 +289,7 @@ function PB._encoded_size(x::MemoryAccessBreakdown)
 end
 
 # Stub definitions for cyclic types
-struct var"##Stub#OpMetrics"{T1<:var"##Abstract#OpMetricsDb"} <: var"##Abstract#OpMetrics"
+mutable struct var"##Stub#OpMetrics"{T1<:var"##Abstract#OpMetricsDb"} <: var"##Abstract#OpMetrics"
     hlo_module_id::UInt64
     name::String
     long_name::String
@@ -317,7 +317,7 @@ struct var"##Stub#OpMetrics"{T1<:var"##Abstract#OpMetricsDb"} <: var"##Abstract#
     core_type::var"OpMetrics.TpuCoreType".T
 end
 
-struct var"##Stub#OpMetricsDb" <: var"##Abstract#OpMetricsDb"
+mutable struct var"##Stub#OpMetricsDb" <: var"##Abstract#OpMetricsDb"
     metrics_db::Vector{var"##Stub#OpMetrics"{var"##Stub#OpMetricsDb"}}
     total_host_infeed_enq_duration_ps::UInt64
     total_host_infeed_enq_start_timestamp_ps_diff::UInt64
diff --git a/src/proto/tensorflow/profiler/op_profile/op_profile_pb.jl b/src/proto/tensorflow/profiler/op_profile/op_profile_pb.jl
index 43e7d814bb..285fc26635 100644
--- a/src/proto/tensorflow/profiler/op_profile/op_profile_pb.jl
+++ b/src/proto/tensorflow/profiler/op_profile/op_profile_pb.jl
@@ -7,7 +7,7 @@ export Metrics, var"Node.XLAInstruction.LayoutAnalysis", var"Node.XLAInstruction
 export Profile
 
 
-struct var"Node.InstructionCategory" end
+mutable struct var"Node.InstructionCategory" end
 
 function PB.decode(d::PB.AbstractProtoDecoder, ::Type{<:var"Node.InstructionCategory"})
     while !PB.message_done(d)
@@ -26,7 +26,7 @@ function PB._encoded_size(x::var"Node.InstructionCategory")
     return encoded_size
 end
 
-struct var"Node.XLAInstruction.LayoutAnalysis.Dimension"
+mutable struct var"Node.XLAInstruction.LayoutAnalysis.Dimension"
     size::Int32
     alignment::Int32
     semantics::String
@@ -68,7 +68,7 @@ function PB._encoded_size(x::var"Node.XLAInstruction.LayoutAnalysis.Dimension")
     return encoded_size
 end
 
-struct Metrics
+mutable struct Metrics
     flops::Float64
     uncapped_flops::Float64
     bandwidth_utils::Vector{Float64}
@@ -153,7 +153,7 @@ function PB._encoded_size(x::Metrics)
     return encoded_size
 end
 
-struct var"Node.XLAInstruction.LayoutAnalysis"
+mutable struct var"Node.XLAInstruction.LayoutAnalysis"
     dimensions::Vector{var"Node.XLAInstruction.LayoutAnalysis.Dimension"}
 end
 PB.default_values(::Type{var"Node.XLAInstruction.LayoutAnalysis"}) = (;dimensions = Vector{var"Node.XLAInstruction.LayoutAnalysis.Dimension"}())
@@ -183,7 +183,7 @@ function PB._encoded_size(x::var"Node.XLAInstruction.LayoutAnalysis")
     return encoded_size
 end
 
-struct var"Node.XLAInstruction"
+mutable struct var"Node.XLAInstruction"
     op::String
     expression::String
     provenance::String
@@ -267,7 +267,7 @@ function PB._encoded_size(x::var"Node.XLAInstruction")
     return encoded_size
 end
 
-struct Node
+mutable struct Node
     name::String
     metrics::Union{Nothing,Metrics}
     children::Vector{Node}
@@ -336,7 +336,7 @@ function PB._encoded_size(x::Node)
     return encoded_size
 end
 
-struct Profile
+mutable struct Profile
     by_category::Union{Nothing,Node}
     by_program::Union{Nothing,Node}
     device_type::String
diff --git a/src/proto/tensorflow/profiler/op_stats_pb.jl b/src/proto/tensorflow/profiler/op_stats_pb.jl
index 4056282dd4..48b0d84233 100644
--- a/src/proto/tensorflow/profiler/op_stats_pb.jl
+++ b/src/proto/tensorflow/profiler/op_stats_pb.jl
@@ -6,7 +6,7 @@ export HostIndependentJobInfoResult, CoreDetails, HostDependentJobInfoResult
 export SystemTopology, PerformanceCounterResult, PerfEnv, RunEnvironment, OpStats
 
 
-struct HostIndependentJobInfoResult
+mutable struct HostIndependentJobInfoResult
     change_list::Int64
     workspace_id::String
     snapshot::Int64
@@ -66,7 +66,7 @@ function PB._encoded_size(x::HostIndependentJobInfoResult)
     return encoded_size
 end
 
-struct CoreDetails
+mutable struct CoreDetails
     hostname::String
     device_ordinal::UInt32
     core_num::UInt32
@@ -132,7 +132,7 @@ function PB._encoded_size(x::CoreDetails)
     return encoded_size
 end
 
-struct HostDependentJobInfoResult
+mutable struct HostDependentJobInfoResult
     host_id::String
     command_line::String
     start_time::Int64
@@ -186,7 +186,7 @@ function PB._encoded_size(x::HostDependentJobInfoResult)
     return encoded_size
 end
 
-struct SystemTopology
+mutable struct SystemTopology
     x_dimension::Int64
     y_dimension::Int64
     z_dimension::Int64
@@ -234,7 +234,7 @@ function PB._encoded_size(x::SystemTopology)
     return encoded_size
 end
 
-struct PerformanceCounterResult
+mutable struct PerformanceCounterResult
     matrix_unit_utilization_percent::Float64
     hbm_utilization_percent::Float64
 end
@@ -270,7 +270,7 @@ function PB._encoded_size(x::PerformanceCounterResult)
     return encoded_size
 end
 
-struct PerfEnv
+mutable struct PerfEnv
     peak_tera_flops_per_second::Float64
     peak_bw_giga_bytes_per_second::Float64
     peak_hbm_bw_giga_bytes_per_second::Float64
@@ -342,7 +342,7 @@ function PB._encoded_size(x::PerfEnv)
     return encoded_size
 end
 
-struct RunEnvironment
+mutable struct RunEnvironment
     host_count::Int32
     task_count::Int32
     hostnames::Dict{String,Bool}
@@ -451,7 +451,7 @@ function PB._encoded_size(x::RunEnvironment)
     return encoded_size
 end
 
-struct OpStats
+mutable struct OpStats
     host_op_metrics_db::Union{Nothing,OpMetricsDb}
     device_op_metrics_db::Union{Nothing,OpMetricsDb}
     hlo_metrics_db_complete_steps_only::Union{Nothing,OpMetricsDb}
diff --git a/src/proto/tensorflow/profiler/overview_page_pb.jl b/src/proto/tensorflow/profiler/overview_page_pb.jl
index c4bc5f80d3..aca5a4ac18 100644
--- a/src/proto/tensorflow/profiler/overview_page_pb.jl
+++ b/src/proto/tensorflow/profiler/overview_page_pb.jl
@@ -8,7 +8,7 @@ export OverviewPageAnalysis, OverviewInferenceLatency, OverviewPageRecommendatio
 export OverviewPageRunEnvironment, OverviewPage
 
 
-struct OverviewTfOp
+mutable struct OverviewTfOp
     name::String
     category::String
     self_time_fraction::Float64
@@ -74,7 +74,7 @@ function PB._encoded_size(x::OverviewTfOp)
     return encoded_size
 end
 
-struct OverviewPageHostDependentJobInfo
+mutable struct OverviewPageHostDependentJobInfo
     host_id::String
     command_line::String
     start_time::Int64
@@ -128,7 +128,7 @@ function PB._encoded_size(x::OverviewPageHostDependentJobInfo)
     return encoded_size
 end
 
-struct OverviewLatencyBreakdown
+mutable struct OverviewLatencyBreakdown
     total_latency_us::Float64
     host_latency_us::Float64
     device_latency_us::Float64
@@ -176,7 +176,7 @@ function PB._encoded_size(x::OverviewLatencyBreakdown)
     return encoded_size
 end
 
-struct GenericRecommendation
+mutable struct GenericRecommendation
     kernel_launch_bottleneck::String
     kernel_launch_statement::String
     all_other_bottleneck::String
@@ -242,7 +242,7 @@ function PB._encoded_size(x::GenericRecommendation)
     return encoded_size
 end
 
-struct OverviewPageTip
+mutable struct OverviewPageTip
     link::String
 end
 PB.default_values(::Type{OverviewPageTip}) = (;link = "")
@@ -272,7 +272,7 @@ function PB._encoded_size(x::OverviewPageTip)
     return encoded_size
 end
 
-struct OverviewPageHostIndependentJobInfo
+mutable struct OverviewPageHostIndependentJobInfo
     change_list::Int64
     workspace_id::String
     snapshot::Int64
@@ -332,7 +332,7 @@ function PB._encoded_size(x::OverviewPageHostIndependentJobInfo)
     return encoded_size
 end
 
-struct OverviewPageAnalysis
+mutable struct OverviewPageAnalysis
     mxu_utilization_percent::Float64
     device_idle_time_percent::Float64
     host_idle_time_percent::Float64
@@ -542,7 +542,7 @@ function PB._encoded_size(x::OverviewPageAnalysis)
     return encoded_size
 end
 
-struct OverviewInferenceLatency
+mutable struct OverviewInferenceLatency
     percentile_numbers::Vector{Float64}
     latency_breakdowns::Vector{OverviewLatencyBreakdown}
     max_latency_us::Float64
@@ -596,7 +596,7 @@ function PB._encoded_size(x::OverviewInferenceLatency)
     return encoded_size
 end
 
-struct OverviewPageRecommendation
+mutable struct OverviewPageRecommendation
     bottleneck::String
     statement::String
     input_tips::Vector{OverviewPageTip}
@@ -698,7 +698,7 @@ function PB._encoded_size(x::OverviewPageRecommendation)
     return encoded_size
 end
 
-struct OverviewPageRunEnvironment
+mutable struct OverviewPageRunEnvironment
     host_count::Int32
     task_count::Int32
     hostnames::Dict{String,Bool}
@@ -789,7 +789,7 @@ function PB._encoded_size(x::OverviewPageRunEnvironment)
     return encoded_size
 end
 
-struct OverviewPage
+mutable struct OverviewPage
     run_environment::Union{Nothing,OverviewPageRunEnvironment}
     input_analysis::Union{Nothing,InputPipelineAnalysisResult}
     analysis::Union{Nothing,OverviewPageAnalysis}
diff --git a/src/proto/tensorflow/profiler/power_metrics_pb.jl b/src/proto/tensorflow/profiler/power_metrics_pb.jl
index 290652ab3d..3064821985 100644
--- a/src/proto/tensorflow/profiler/power_metrics_pb.jl
+++ b/src/proto/tensorflow/profiler/power_metrics_pb.jl
@@ -5,7 +5,7 @@ using ProtoBuf.EnumX: @enumx
 export PowerComponentMetrics, PowerMetrics
 
 
-struct PowerComponentMetrics
+mutable struct PowerComponentMetrics
     component_name::String
     max_power::Float64
     avg_power::Float64
@@ -83,7 +83,7 @@ function PB._encoded_size(x::PowerComponentMetrics)
     return encoded_size
 end
 
-struct PowerMetrics
+mutable struct PowerMetrics
     power_component_metrics::Vector{PowerComponentMetrics}
 end
 PB.default_values(::Type{PowerMetrics}) = (;power_component_metrics = Vector{PowerComponentMetrics}())
diff --git a/src/proto/tensorflow/profiler/roofline_model/roofline_model_pb.jl b/src/proto/tensorflow/profiler/roofline_model/roofline_model_pb.jl
index e1c2732254..b51e1be3bf 100644
--- a/src/proto/tensorflow/profiler/roofline_model/roofline_model_pb.jl
+++ b/src/proto/tensorflow/profiler/roofline_model/roofline_model_pb.jl
@@ -7,7 +7,7 @@ export RecordType, RooflineModelRecord, RooflineModelDatabase
 
 @enumx RecordType INVALID_RECORD_TYPE=0 ALL=1 AVERAGE_STEP=2 PER_STEP=3 ALL_HW=4
 
-struct RooflineModelRecord
+mutable struct RooflineModelRecord
     record_type::RecordType.T
     step_num::UInt32
     rank::UInt64
@@ -278,7 +278,7 @@ function PB._encoded_size(x::RooflineModelRecord)
     return encoded_size
 end
 
-struct RooflineModelDatabase
+mutable struct RooflineModelDatabase
     device_type::String
     megacore::Bool
     has_cmem::Bool
diff --git a/src/proto/tensorflow/profiler/source_info_pb.jl b/src/proto/tensorflow/profiler/source_info_pb.jl
index d9c675f82c..673de00685 100644
--- a/src/proto/tensorflow/profiler/source_info_pb.jl
+++ b/src/proto/tensorflow/profiler/source_info_pb.jl
@@ -5,7 +5,7 @@ using ProtoBuf.EnumX: @enumx
 export SourceInfo
 
 
-struct SourceInfo
+mutable struct SourceInfo
     file_name::String
     line_number::Int32
     stack_frame::String
diff --git a/src/proto/tensorflow/profiler/source_stats_pb.jl b/src/proto/tensorflow/profiler/source_stats_pb.jl
index 133e12e87c..4c470ec83a 100644
--- a/src/proto/tensorflow/profiler/source_stats_pb.jl
+++ b/src/proto/tensorflow/profiler/source_stats_pb.jl
@@ -5,7 +5,7 @@ using ProtoBuf.EnumX: @enumx
 export var"SourceStats.Metric", var"SourceStats.FileMetrics", SourceStats
 
 
-struct var"SourceStats.Metric"
+mutable struct var"SourceStats.Metric"
     occurrences::UInt64
     self_time_ps::UInt64
     time_ps::UInt64
@@ -59,7 +59,7 @@ function PB._encoded_size(x::var"SourceStats.Metric")
     return encoded_size
 end
 
-struct var"SourceStats.FileMetrics"
+mutable struct var"SourceStats.FileMetrics"
     line_number_to_metric::Dict{Int32,var"SourceStats.Metric"}
 end
 PB.default_values(::Type{var"SourceStats.FileMetrics"}) = (;line_number_to_metric = Dict{Int32,var"SourceStats.Metric"}())
@@ -89,7 +89,7 @@ function PB._encoded_size(x::var"SourceStats.FileMetrics")
     return encoded_size
 end
 
-struct SourceStats
+mutable struct SourceStats
     file_name_to_metrics::Dict{String,var"SourceStats.FileMetrics"}
 end
 PB.default_values(::Type{SourceStats}) = (;file_name_to_metrics = Dict{String,var"SourceStats.FileMetrics"}())
diff --git a/src/proto/tensorflow/profiler/steps_db_pb.jl b/src/proto/tensorflow/profiler/steps_db_pb.jl
index 1ecc59ff1f..e488155954 100644
--- a/src/proto/tensorflow/profiler/steps_db_pb.jl
+++ b/src/proto/tensorflow/profiler/steps_db_pb.jl
@@ -7,7 +7,7 @@ export TpuStepBreakdown, StepInfoResult, AllReduceDbResult, PerCoreStepInfo
 export StepDatabaseResult
 
 
-struct DeviceMemoryTransfer
+mutable struct DeviceMemoryTransfer
     occurrence::UInt64
     time_us::Float64
     bytes_transferred::UInt64
@@ -49,7 +49,7 @@ function PB._encoded_size(x::DeviceMemoryTransfer)
     return encoded_size
 end
 
-struct GenericStepBreakdown
+mutable struct GenericStepBreakdown
     type_ps::Dict{Int32,UInt64}
     category_ps::Dict{String,UInt64}
 end
@@ -85,7 +85,7 @@ function PB._encoded_size(x::GenericStepBreakdown)
     return encoded_size
 end
 
-struct AllReduceInfo
+mutable struct AllReduceInfo
     id::UInt64
     name::String
     all_reduce_id::UInt64
@@ -145,7 +145,7 @@ function PB._encoded_size(x::AllReduceInfo)
     return encoded_size
 end
 
-struct SparseCoreStepBreakdown
+mutable struct SparseCoreStepBreakdown
     sc_compute_ps::UInt64
     sc_infeed_ps::UInt64
     sc_outfeed_ps::UInt64
@@ -199,7 +199,7 @@ function PB._encoded_size(x::SparseCoreStepBreakdown)
     return encoded_size
 end
 
-struct TpuStepBreakdown
+mutable struct TpuStepBreakdown
     infeed_duration_ps::UInt64
     host_outfeed_ps::UInt64
     wait_for_scv0_duration_ps::UInt64
@@ -332,7 +332,7 @@ function PB._encoded_size(x::TpuStepBreakdown)
     return encoded_size
 end
 
-struct StepInfoResult
+mutable struct StepInfoResult
     step_num::UInt32
     step_name::String
     duration_ps::UInt64
@@ -392,7 +392,7 @@ function PB._encoded_size(x::StepInfoResult)
     return encoded_size
 end
 
-struct AllReduceDbResult
+mutable struct AllReduceDbResult
     all_reduce_info::Vector{AllReduceInfo}
 end
 PB.default_values(::Type{AllReduceDbResult}) = (;all_reduce_info = Vector{AllReduceInfo}())
@@ -422,7 +422,7 @@ function PB._encoded_size(x::AllReduceDbResult)
     return encoded_size
 end
 
-struct PerCoreStepInfo
+mutable struct PerCoreStepInfo
     step_num::UInt32
     step_info_per_core::Dict{UInt32,StepInfoResult}
     hlo_metrics_db::Union{Nothing,OpMetricsDb}
@@ -483,7 +483,7 @@ function PB._encoded_size(x::PerCoreStepInfo)
     return encoded_size
 end
 
-struct StepDatabaseResult
+mutable struct StepDatabaseResult
     step_sequence::Vector{PerCoreStepInfo}
     use_incomplete_step::Bool
     num_steps_dropped::UInt32
diff --git a/src/proto/tensorflow/profiler/task_pb.jl b/src/proto/tensorflow/profiler/task_pb.jl
index 61b9016089..9dd1923b6e 100644
--- a/src/proto/tensorflow/profiler/task_pb.jl
+++ b/src/proto/tensorflow/profiler/task_pb.jl
@@ -5,7 +5,7 @@ using ProtoBuf.EnumX: @enumx
 export Task
 
 
-struct Task
+mutable struct Task
     changelist::Int64
     workspace_id::String
     snapshot::Int64
diff --git a/src/proto/tensorflow/profiler/tf_function_pb.jl b/src/proto/tensorflow/profiler/tf_function_pb.jl
index ca4d411cb0..08d55342a1 100644
--- a/src/proto/tensorflow/profiler/tf_function_pb.jl
+++ b/src/proto/tensorflow/profiler/tf_function_pb.jl
@@ -6,7 +6,7 @@ export TfFunctionMetrics, TfFunctionCompiler, TfFunctionExecutionMode, TfFunctio
 export TfFunctionDb
 
 
-struct TfFunctionMetrics
+mutable struct TfFunctionMetrics
     count::UInt64
     self_time_ps::UInt64
 end
@@ -46,7 +46,7 @@ end
 
 @enumx TfFunctionExecutionMode INVALID_MODE=0 EAGER_MODE=1 TRACED_MODE=2 NOT_TRACED_MODE=3 CONCRETE_MODE=4
 
-struct TfFunction
+mutable struct TfFunction
     metrics::Dict{Int32,TfFunctionMetrics}
     total_tracing_count::Int64
     compiler::TfFunctionCompiler.T
@@ -94,7 +94,7 @@ function PB._encoded_size(x::TfFunction)
     return encoded_size
 end
 
-struct TfFunctionDb
+mutable struct TfFunctionDb
     tf_functions::Dict{String,TfFunction}
 end
 PB.default_values(::Type{TfFunctionDb}) = (;tf_functions = Dict{String,TfFunction}())
diff --git a/src/proto/tensorflow/profiler/topology_pb.jl b/src/proto/tensorflow/profiler/topology_pb.jl
index bb7c14af14..67cdda8f5c 100644
--- a/src/proto/tensorflow/profiler/topology_pb.jl
+++ b/src/proto/tensorflow/profiler/topology_pb.jl
@@ -7,7 +7,7 @@ export var"LogicalTopology.LogicalDevice", Topology, var"LogicalTopology.Logical
 export var"LogicalTopology.LogicalSlice", LogicalTopology
 
 
-struct var"LogicalTopology.HostNetworkAddress"
+mutable struct var"LogicalTopology.HostNetworkAddress"
     address::String
     interface_name::String
 end
@@ -43,7 +43,7 @@ function PB._encoded_size(x::var"LogicalTopology.HostNetworkAddress")
     return encoded_size
 end
 
-struct TopologyLocation
+mutable struct TopologyLocation
     x::Int32
     y::Int32
     z::Int32
@@ -115,7 +115,7 @@ function PB._encoded_size(x::TopologyLocation)
     return encoded_size
 end
 
-struct TopologyDimension
+mutable struct TopologyDimension
     x::Int32
     y::Int32
     z::Int32
@@ -157,7 +157,7 @@ function PB._encoded_size(x::TopologyDimension)
     return encoded_size
 end
 
-struct var"LogicalTopology.LogicalDevice"
+mutable struct var"LogicalTopology.LogicalDevice"
     global_id::Int32
     slice_local_id::Int32
     host_local_id::Int32
@@ -199,7 +199,7 @@ function PB._encoded_size(x::var"LogicalTopology.LogicalDevice")
     return encoded_size
 end
 
-struct Topology
+mutable struct Topology
     chips_per_host_bounds::Union{Nothing,TopologyDimension}
     host_bounds::Union{Nothing,TopologyDimension}
     mesh_location::Vector{TopologyLocation}
@@ -241,7 +241,7 @@ function PB._encoded_size(x::Topology)
     return encoded_size
 end
 
-struct var"LogicalTopology.LogicalHost"
+mutable struct var"LogicalTopology.LogicalHost"
     slice_local_id::Int32
     network_addresses::Vector{var"LogicalTopology.HostNetworkAddress"}
     devices::Vector{var"LogicalTopology.LogicalDevice"}
@@ -283,7 +283,7 @@ function PB._encoded_size(x::var"LogicalTopology.LogicalHost")
     return encoded_size
 end
 
-struct var"LogicalTopology.LogicalSlice"
+mutable struct var"LogicalTopology.LogicalSlice"
     global_id::Int32
     hosts::Vector{var"LogicalTopology.LogicalHost"}
 end
@@ -319,7 +319,7 @@ function PB._encoded_size(x::var"LogicalTopology.LogicalSlice")
     return encoded_size
 end
 
-struct LogicalTopology
+mutable struct LogicalTopology
     slices::Vector{var"LogicalTopology.LogicalSlice"}
 end
 PB.default_values(::Type{LogicalTopology}) = (;slices = Vector{var"LogicalTopology.LogicalSlice"}())
diff --git a/src/proto/tensorflow/profiler/trace_events_pb.jl b/src/proto/tensorflow/profiler/trace_events_pb.jl
index d54024fa74..b222ebb0fb 100644
--- a/src/proto/tensorflow/profiler/trace_events_pb.jl
+++ b/src/proto/tensorflow/profiler/trace_events_pb.jl
@@ -11,7 +11,7 @@ export Device, Trace
 @enumx var"TraceEvent.EventType" EVENT_TYPE_UNSPECIFIED=0 EVENT_TYPE_COMPLETE=1 EVENT_TYPE_ASYNC=3 EVENT_TYPE_COUNTER=4
 PB.reserved_fields(::Type{var"TraceEvent.EventType".T}) = (names = String[], numbers = Union{Int,UnitRange{Int}}[2])
 
-struct Resource
+mutable struct Resource
     name::String
     resource_id::UInt64
     num_events::UInt32
@@ -53,7 +53,7 @@ function PB._encoded_size(x::Resource)
     return encoded_size
 end
 
-struct TraceEvent
+mutable struct TraceEvent
     var"#type"::var"TraceEvent.EventType".T
     device_id::UInt32
     resource_id::UInt64
@@ -165,7 +165,7 @@ function PB._encoded_size(x::TraceEvent)
     return encoded_size
 end
 
-struct Device
+mutable struct Device
     name::String
     device_id::UInt32
     resources::Dict{UInt64,Resource}
@@ -208,7 +208,7 @@ function PB._encoded_size(x::Device)
     return encoded_size
 end
 
-struct Trace
+mutable struct Trace
     devices::Dict{UInt32,Device}
     tasks::Dict{UInt32,Task}
     min_timestamp_ps::UInt64
diff --git a/src/proto/tensorflow/profiler/xplane_pb.jl b/src/proto/tensorflow/profiler/xplane_pb.jl
index 056ab661a9..5434ba765d 100644
--- a/src/proto/tensorflow/profiler/xplane_pb.jl
+++ b/src/proto/tensorflow/profiler/xplane_pb.jl
@@ -5,7 +5,7 @@ using ProtoBuf.EnumX: @enumx
 export XStatMetadata, XStat, XEventMetadata, XEvent, XLine, XPlane, XSpace
 
 
-struct XStatMetadata
+mutable struct XStatMetadata
     id::Int64
     name::String
     description::String
@@ -47,7 +47,7 @@ function PB._encoded_size(x::XStatMetadata)
     return encoded_size
 end
 
-struct XStat
+mutable struct XStat
     metadata_id::Int64
     value::Union{Nothing,OneOf{<:Union{Float64,UInt64,Int64,String,Vector{UInt8}}}}
 end
@@ -122,7 +122,7 @@ function PB._encoded_size(x::XStat)
     return encoded_size
 end
 
-struct XEventMetadata
+mutable struct XEventMetadata
     id::Int64
     name::String
     display_name::String
@@ -182,7 +182,7 @@ function PB._encoded_size(x::XEventMetadata)
     return encoded_size
 end
 
-struct XEvent
+mutable struct XEvent
     metadata_id::Int64
     data::Union{Nothing,OneOf{Int64}}
     duration_ps::Int64
@@ -245,7 +245,7 @@ function PB._encoded_size(x::XEvent)
     return encoded_size
 end
 
-struct XLine
+mutable struct XLine
     id::Int64
     display_id::Int64
     name::String
@@ -312,7 +312,7 @@ function PB._encoded_size(x::XLine)
     return encoded_size
 end
 
-struct XPlane
+mutable struct XPlane
     id::Int64
     name::String
     lines::Vector{XLine}
@@ -372,7 +372,7 @@ function PB._encoded_size(x::XPlane)
     return encoded_size
 end
 
-struct XSpace
+mutable struct XSpace
     planes::Vector{XPlane}
     errors::Vector{String}
     warnings::Vector{String}
diff --git a/src/proto/tensorflow/profiler_options_pb.jl b/src/proto/tensorflow/profiler_options_pb.jl
index 4fc2810798..8e0dc31a64 100644
--- a/src/proto/tensorflow/profiler_options_pb.jl
+++ b/src/proto/tensorflow/profiler_options_pb.jl
@@ -7,7 +7,7 @@ export var"ProfileOptions.AdvancedConfigValue", ProfileOptions
 export RemoteProfilerSessionManagerOptions
 
 
-struct var"ProfileOptions.TraceOptions"
+mutable struct var"ProfileOptions.TraceOptions"
     host_traceme_filter_mask::UInt64
 end
 PB.default_values(::Type{var"ProfileOptions.TraceOptions"}) = (;host_traceme_filter_mask = zero(UInt64))
@@ -39,7 +39,7 @@ end
 
 @enumx var"ProfileOptions.DeviceType" UNSPECIFIED=0 CPU=1 GPU=2 TPU=3 PLUGGABLE_DEVICE=4
 
-struct var"ProfileOptions.AdvancedConfigValue"
+mutable struct var"ProfileOptions.AdvancedConfigValue"
     value::Union{Nothing,OneOf{<:Union{String,Bool,Int64}}}
 end
 PB.oneof_field_types(::Type{var"ProfileOptions.AdvancedConfigValue"}) = (;
@@ -90,7 +90,7 @@ function PB._encoded_size(x::var"ProfileOptions.AdvancedConfigValue")
     return encoded_size
 end
 
-struct ProfileOptions
+mutable struct ProfileOptions
     version::UInt32
     device_type::var"ProfileOptions.DeviceType".T
     include_dataset_ops::Bool
@@ -198,7 +198,7 @@ function PB._encoded_size(x::ProfileOptions)
     return encoded_size
 end
 
-struct RemoteProfilerSessionManagerOptions
+mutable struct RemoteProfilerSessionManagerOptions
     profiler_options::Union{Nothing,ProfileOptions}
     service_addresses::Vector{String}
     session_creation_timestamp_ns::UInt64
diff --git a/src/proto/tensorflow/tfprof/pprof/profile_pb.jl b/src/proto/tensorflow/tfprof/pprof/profile_pb.jl
index a459b58aa6..42601dda14 100644
--- a/src/proto/tensorflow/tfprof/pprof/profile_pb.jl
+++ b/src/proto/tensorflow/tfprof/pprof/profile_pb.jl
@@ -5,7 +5,7 @@ using ProtoBuf.EnumX: @enumx
 export Line, Label, ValueType, Mapping, Function, Location, Sample, Profile
 
 
-struct Line
+mutable struct Line
     function_id::UInt64
     line::Int64
 end
@@ -41,7 +41,7 @@ function PB._encoded_size(x::Line)
     return encoded_size
 end
 
-struct Label
+mutable struct Label
     key::Int64
     str::Int64
     num::Int64
@@ -83,7 +83,7 @@ function PB._encoded_size(x::Label)
     return encoded_size
 end
 
-struct ValueType
+mutable struct ValueType
     var"#type"::Int64
     unit::Int64
 end
@@ -119,7 +119,7 @@ function PB._encoded_size(x::ValueType)
     return encoded_size
 end
 
-struct Mapping
+mutable struct Mapping
     id::UInt64
     memory_start::UInt64
     memory_limit::UInt64
@@ -203,7 +203,7 @@ function PB._encoded_size(x::Mapping)
     return encoded_size
 end
 
-struct Function
+mutable struct Function
     id::UInt64
     name::Int64
     system_name::Int64
@@ -257,7 +257,7 @@ function PB._encoded_size(x::Function)
     return encoded_size
 end
 
-struct Location
+mutable struct Location
     id::UInt64
     mapping_id::UInt64
     address::UInt64
@@ -305,7 +305,7 @@ function PB._encoded_size(x::Location)
     return encoded_size
 end
 
-struct Sample
+mutable struct Sample
     location_id::Vector{UInt64}
     value::Vector{Int64}
     label::Vector{Label}
@@ -347,7 +347,7 @@ function PB._encoded_size(x::Sample)
     return encoded_size
 end
 
-struct Profile
+mutable struct Profile
     sample_type::Vector{ValueType}
     sample::Vector{Sample}
     mapping::Vector{Mapping}
diff --git a/src/proto/tsl/profiler/trace_events_pb.jl b/src/proto/tsl/profiler/trace_events_pb.jl
index 1eee3aa8f1..bbaab7b90c 100644
--- a/src/proto/tsl/profiler/trace_events_pb.jl
+++ b/src/proto/tsl/profiler/trace_events_pb.jl
@@ -5,7 +5,7 @@ using ProtoBuf.EnumX: @enumx
 export Resource, TraceEvent, Device, Trace
 
 
-struct Resource
+mutable struct Resource
     name::String
     resource_id::UInt32
     sort_index::UInt32
@@ -47,7 +47,7 @@ function PB._encoded_size(x::Resource)
     return encoded_size
 end
 
-struct TraceEvent
+mutable struct TraceEvent
     device_id::UInt32
     resource_id::UInt32
     name::String
@@ -107,7 +107,7 @@ function PB._encoded_size(x::TraceEvent)
     return encoded_size
 end
 
-struct Device
+mutable struct Device
     name::String
     device_id::UInt32
     resources::Dict{UInt32,Resource}
@@ -149,7 +149,7 @@ function PB._encoded_size(x::Device)
     return encoded_size
 end
 
-struct Trace
+mutable struct Trace
     devices::Dict{UInt32,Device}
     trace_events::Vector{TraceEvent}
 end
diff --git a/src/proto/xla/compile_options_pb.jl b/src/proto/xla/compile_options_pb.jl
index 55249422c0..0b93c5f352 100644
--- a/src/proto/xla/compile_options_pb.jl
+++ b/src/proto/xla/compile_options_pb.jl
@@ -6,7 +6,7 @@ export OptionOverrideProto, ExecutableBuildOptionsProto, CompileOptionsProto
 export ExecutableAndOptionsProto
 
 
-struct OptionOverrideProto
+mutable struct OptionOverrideProto
     value::Union{Nothing,OneOf{<:Union{String,Bool,Int64,Float64}}}
 end
 PB.oneof_field_types(::Type{OptionOverrideProto}) = (;
@@ -63,7 +63,7 @@ function PB._encoded_size(x::OptionOverrideProto)
     return encoded_size
 end
 
-struct ExecutableBuildOptionsProto
+mutable struct ExecutableBuildOptionsProto
     device_ordinal::Int64
     result_layout::Union{Nothing,ShapeProto}
     comp_envs::Union{Nothing,CompilationEnvironmentsProto}
@@ -243,7 +243,7 @@ function PB._encoded_size(x::ExecutableBuildOptionsProto)
     return encoded_size
 end
 
-struct CompileOptionsProto
+mutable struct CompileOptionsProto
     argument_layouts::Vector{ShapeProto}
     parameter_is_tupled_arguments::Bool
     executable_build_options::Union{Nothing,ExecutableBuildOptionsProto}
@@ -327,7 +327,7 @@ function PB._encoded_size(x::CompileOptionsProto)
     return encoded_size
 end
 
-struct ExecutableAndOptionsProto
+mutable struct ExecutableAndOptionsProto
     serialized_executable::Vector{UInt8}
     compile_options::Union{Nothing,CompileOptionsProto}
     pjrt_client_name::String
diff --git a/src/proto/xla/execute_options_pb.jl b/src/proto/xla/execute_options_pb.jl
index 2c136f9b08..bdbfb2509a 100644
--- a/src/proto/xla/execute_options_pb.jl
+++ b/src/proto/xla/execute_options_pb.jl
@@ -7,7 +7,7 @@ export ExecutionModeProto, ExecuteOptionsProto
 
 @enumx ExecutionModeProto EXECUTION_MODE_UNSPECIFIED=0 EXECUTION_MODE_DEFAULT=1 EXECUTION_MODE_SYNCHRONOUS=2 EXECUTION_MODE_ASYNCHRONOUS=3
 
-struct ExecuteOptionsProto
+mutable struct ExecuteOptionsProto
     arguments_are_tupled::Bool
     untuple_result::Bool
     launch_id::Int32
diff --git a/src/proto/xla/hlo_pb.jl b/src/proto/xla/hlo_pb.jl
index be33af12ba..9dba2e2aee 100644
--- a/src/proto/xla/hlo_pb.jl
+++ b/src/proto/xla/hlo_pb.jl
@@ -26,7 +26,7 @@ abstract type var"##Abstract#HloModuleProto" end
 abstract type var"##Abstract#HloUnoptimizedSnapshot" end
 
 
-struct var"HloScheduleProto.InstructionSequence"
+mutable struct var"HloScheduleProto.InstructionSequence"
     instruction_ids::Vector{Int64}
 end
 PB.default_values(::Type{var"HloScheduleProto.InstructionSequence"}) = (;instruction_ids = Vector{Int64}())
@@ -58,7 +58,7 @@ end
 
 @enumx CustomCallApiVersion API_VERSION_UNSPECIFIED=0 API_VERSION_ORIGINAL=1 API_VERSION_STATUS_RETURNING=2 API_VERSION_STATUS_RETURNING_UNIFIED=3 API_VERSION_TYPED_FFI=4
 
-struct var"StackFrameIndexProto.StackFrame"
+mutable struct var"StackFrameIndexProto.StackFrame"
     file_location_id::Int32
     parent_frame_id::Int32
 end
@@ -94,7 +94,7 @@ function PB._encoded_size(x::var"StackFrameIndexProto.StackFrame")
     return encoded_size
 end
 
-struct var"HloInputs.LiteralDescriptor"
+mutable struct var"HloInputs.LiteralDescriptor"
     version::Int32
     argument_size_bytes::UInt64
 end
@@ -130,7 +130,7 @@ function PB._encoded_size(x::var"HloInputs.LiteralDescriptor")
     return encoded_size
 end
 
-struct HloPassMetadata
+mutable struct HloPassMetadata
     pass_id::Int64
     pass_name::String
     pipeline_name::String
@@ -220,7 +220,7 @@ function PB._encoded_size(x::HloPassMetadata)
     return encoded_size
 end
 
-struct var"BufferAllocationProto.Assigned"
+mutable struct var"BufferAllocationProto.Assigned"
     logical_buffer_id::Int64
     offset::Int64
     size::Int64
@@ -270,7 +270,7 @@ end
 
 @enumx Kind UNDEFINED_ALIAS=0 MAY_ALIAS=1 MUST_ALIAS=2
 
-struct var"StackFrameIndexProto.FileLocation"
+mutable struct var"StackFrameIndexProto.FileLocation"
     file_name_id::Int32
     function_name_id::Int32
     line::Int32
@@ -330,7 +330,7 @@ function PB._encoded_size(x::var"StackFrameIndexProto.FileLocation")
     return encoded_size
 end
 
-struct var"LogicalBufferProto.Location"
+mutable struct var"LogicalBufferProto.Location"
     instruction_name::String
     instruction_id::Int64
     shape_index::Vector{Int64}
@@ -373,7 +373,7 @@ function PB._encoded_size(x::var"LogicalBufferProto.Location")
     return encoded_size
 end
 
-struct CrossProgramPrefetch
+mutable struct CrossProgramPrefetch
     parameter::Int64
     index::Vector{Int64}
     offset::Int64
@@ -415,7 +415,7 @@ function PB._encoded_size(x::CrossProgramPrefetch)
     return encoded_size
 end
 
-struct var"HloBufferDonorProto.BufferDonorEntryProto"
+mutable struct var"HloBufferDonorProto.BufferDonorEntryProto"
     parameter_number::Int64
     parameter_shape_index::Vector{Int64}
 end
@@ -451,7 +451,7 @@ function PB._encoded_size(x::var"HloBufferDonorProto.BufferDonorEntryProto")
     return encoded_size
 end
 
-struct var"HloInstructionProto.SliceDimensions"
+mutable struct var"HloInstructionProto.SliceDimensions"
     start::Int64
     limit::Int64
     stride::Int64
@@ -499,7 +499,7 @@ end
 
 @enumx var"HeapSimulatorTrace.Event.Kind" ALLOC=0 FREE=1 SHARE_WITH=2
 
-struct HloScheduleProto
+mutable struct HloScheduleProto
     sequences::Dict{Int64,var"HloScheduleProto.InstructionSequence"}
 end
 PB.default_values(::Type{HloScheduleProto}) = (;sequences = Dict{Int64,var"HloScheduleProto.InstructionSequence"}())
@@ -529,7 +529,7 @@ function PB._encoded_size(x::HloScheduleProto)
     return encoded_size
 end
 
-struct HloInputs
+mutable struct HloInputs
     arguments::Vector{LiteralProto}
     arguments_descriptors::Vector{var"HloInputs.LiteralDescriptor"}
 end
@@ -565,7 +565,7 @@ function PB._encoded_size(x::HloInputs)
     return encoded_size
 end
 
-struct HloModuleMetadataProto
+mutable struct HloModuleMetadataProto
     canonical_module_id::Int64
     module_group_name::String
     original_module_id::Int64
@@ -619,7 +619,7 @@ function PB._encoded_size(x::HloModuleMetadataProto)
     return encoded_size
 end
 
-struct BufferAllocationProto
+mutable struct BufferAllocationProto
     index::Int64
     size::Int64
     is_thread_local::Bool
@@ -715,7 +715,7 @@ function PB._encoded_size(x::BufferAllocationProto)
     return encoded_size
 end
 
-struct var"HloInputOutputAliasProto.AliasEntryProto"
+mutable struct var"HloInputOutputAliasProto.AliasEntryProto"
     output_shape_index::Vector{Int64}
     parameter_number::Int64
     parameter_shape_index::Vector{Int64}
@@ -763,7 +763,7 @@ function PB._encoded_size(x::var"HloInputOutputAliasProto.AliasEntryProto")
     return encoded_size
 end
 
-struct StackFrameIndexProto
+mutable struct StackFrameIndexProto
     file_names::Vector{String}
     function_names::Vector{String}
     file_locations::Vector{var"StackFrameIndexProto.FileLocation"}
@@ -811,7 +811,7 @@ function PB._encoded_size(x::StackFrameIndexProto)
     return encoded_size
 end
 
-struct LogicalBufferProto
+mutable struct LogicalBufferProto
     id::Int64
     size::Int64
     defined_at::Union{Nothing,var"LogicalBufferProto.Location"}
@@ -859,7 +859,7 @@ function PB._encoded_size(x::LogicalBufferProto)
     return encoded_size
 end
 
-struct var"BufferAssignmentProto.BufferAlias"
+mutable struct var"BufferAssignmentProto.BufferAlias"
     source_buffer_id::Int64
     location::Union{Nothing,var"LogicalBufferProto.Location"}
 end
@@ -895,7 +895,7 @@ function PB._encoded_size(x::var"BufferAssignmentProto.BufferAlias")
     return encoded_size
 end
 
-struct HloBufferDonorProto
+mutable struct HloBufferDonorProto
     entries::Vector{var"HloBufferDonorProto.BufferDonorEntryProto"}
 end
 PB.default_values(::Type{HloBufferDonorProto}) = (;entries = Vector{var"HloBufferDonorProto.BufferDonorEntryProto"}())
@@ -925,7 +925,7 @@ function PB._encoded_size(x::HloBufferDonorProto)
     return encoded_size
 end
 
-struct var"HloModuleProto.ProfileInfo"
+mutable struct var"HloModuleProto.ProfileInfo"
     profile_type::var"HloModuleProto.ProfileType".T
     relative_speedup::Float64
     profile_source::ProfileSource.T
@@ -997,7 +997,7 @@ function PB._encoded_size(x::var"HloModuleProto.ProfileInfo")
     return encoded_size
 end
 
-struct HloInstructionProto
+mutable struct HloInstructionProto
     name::String
     opcode::String
     shape::Union{Nothing,ShapeProto}
@@ -1069,7 +1069,7 @@ struct HloInstructionProto
     k::Int64
     largest::Bool
     statistics_viz::Union{Nothing,StatisticsViz}
-    replica_group_list::Union{Nothing,OneOf{<:Union{CollectiveDeviceListProto,IotaReplicaGroupListProto,MeshAxesReplicaGroupListProto}}}
+    collective_device_list::Union{Nothing,CollectiveDeviceListProto}
     original_value::Union{Nothing,OriginalValueProto}
     is_composite::Bool
     result_accuracy::Union{Nothing,ResultAccuracy}
@@ -1077,10 +1077,9 @@ end
 PB.reserved_fields(::Type{HloInstructionProto}) = (names = ["parameter_name", "fused_instructions_computation", "operand_names", "control_predecessor_names", "called_computation_names", "replica_group_ids", "custom_call_opaque", "all_reduce_barrier"], numbers = Union{Int,UnitRange{Int}}[10, 12, 4, 5, 6, 44, 53, 46, 41, 42, 64, 78, 83, 84, 86])
 PB.oneof_field_types(::Type{HloInstructionProto}) = (;
     optional_cross_program_prefetch_index = (;cross_program_prefetch_index=Int32),
-    replica_group_list = (;collective_device_list=CollectiveDeviceListProto, iota_collective_device_list=IotaReplicaGroupListProto, mesh_axes_replica_group_list=MeshAxesReplicaGroupListProto),
 )
-PB.default_values(::Type{HloInstructionProto}) = (;name = "", opcode = "", shape = nothing, metadata = nothing, literal = nothing, parameter_number = zero(Int64), fusion_kind = "", tuple_index = zero(Int64), dimensions = Vector{Int64}(), window = nothing, convolution_dimension_numbers = nothing, feature_group_count = zero(Int64), batch_group_count = zero(Int64), slice_dimensions = Vector{var"HloInstructionProto.SliceDimensions"}(), exponent_bits = zero(Int32), mantissa_bits = zero(Int32), dynamic_slice_sizes = Vector{Int64}(), padding_config = nothing, outfeed_config = UInt8[], distribution = RandomDistribution.RNG_INVALID, epsilon = zero(Float32), feature_index = zero(Int64), channel_id = zero(Int64), infeed_config = UInt8[], custom_call_target = "", outfeed_shape = nothing, dot_dimension_numbers = nothing, ragged_dot_dimension_numbers = nothing, fft_type = FftType.FFT, fft_length = Vector{Int64}(), comparison_direction = "", gather_dimension_numbers = nothing, gather_slice_sizes = Vector{Int64}(), id = zero(Int64), operand_ids = Vector{Int64}(), control_predecessor_ids = Vector{Int64}(), called_computation_ids = Vector{Int64}(), sharding = nothing, backend_config = UInt8[], replica_groups = Vector{ReplicaGroup}(), all_reduce_id = zero(Int64), use_global_device_ids = false, is_host_transfer = false, is_stable = false, scatter_dimension_numbers = nothing, precision_config = nothing, source_target_pairs = Vector{SourceTarget}(), domain_entry_sharding = nothing, domain_exit_sharding = nothing, constrain_layout = false, operand_shapes_with_layout = Vector{ShapeProto}(), triangular_solve_options = nothing, cholesky_options = nothing, parameter_replication = nothing, custom_call_has_side_effect = false, output_operand_aliasing = Vector{OutputOperandAliasing}(), custom_call_schedule = CustomCallSchedule.SCHEDULE_NONE, delta = zero(Int64), indices_are_sorted = false, frontend_attributes = nothing, unique_indices = false, rng_algorithm = RandomAlgorithm.RNG_DEFAULT, 
comparison_type = "", is_cross_program_prefetch = false, cross_program_prefetch_index = zero(Int32), padding_type = PaddingType.PADDING_INVALID, custom_call_api_version = CustomCallApiVersion.API_VERSION_UNSPECIFIED, async_execution_thread = "", k = zero(Int64), largest = false, statistics_viz = nothing, collective_device_list = nothing, iota_collective_device_list = nothing, mesh_axes_replica_group_list = nothing, original_value = nothing, is_composite = false, result_accuracy = nothing)
-PB.field_numbers(::Type{HloInstructionProto}) = (;name = 1, opcode = 2, shape = 3, metadata = 7, literal = 8, parameter_number = 9, fusion_kind = 11, tuple_index = 13, dimensions = 14, window = 15, convolution_dimension_numbers = 16, feature_group_count = 50, batch_group_count = 58, slice_dimensions = 17, exponent_bits = 18, mantissa_bits = 19, dynamic_slice_sizes = 20, padding_config = 21, outfeed_config = 22, distribution = 23, epsilon = 24, feature_index = 25, channel_id = 26, infeed_config = 27, custom_call_target = 28, outfeed_shape = 29, dot_dimension_numbers = 30, ragged_dot_dimension_numbers = 90, fft_type = 31, fft_length = 32, comparison_direction = 63, gather_dimension_numbers = 33, gather_slice_sizes = 34, id = 35, operand_ids = 36, control_predecessor_ids = 37, called_computation_ids = 38, sharding = 40, backend_config = 43, replica_groups = 49, all_reduce_id = 45, use_global_device_ids = 71, is_host_transfer = 47, is_stable = 60, scatter_dimension_numbers = 48, precision_config = 51, source_target_pairs = 52, domain_entry_sharding = 54, domain_exit_sharding = 55, constrain_layout = 56, operand_shapes_with_layout = 57, triangular_solve_options = 59, cholesky_options = 62, parameter_replication = 61, custom_call_has_side_effect = 65, output_operand_aliasing = 74, custom_call_schedule = 76, delta = 66, indices_are_sorted = 67, frontend_attributes = 68, unique_indices = 69, rng_algorithm = 70, comparison_type = 72, is_cross_program_prefetch = 73, cross_program_prefetch_index = 80, padding_type = 75, custom_call_api_version = 77, async_execution_thread = 79, k = 81, largest = 85, statistics_viz = 82, collective_device_list = 87, iota_collective_device_list = 92, mesh_axes_replica_group_list = 93, original_value = 88, is_composite = 89, result_accuracy = 91)
+PB.default_values(::Type{HloInstructionProto}) = (;name = "", opcode = "", shape = nothing, metadata = nothing, literal = nothing, parameter_number = zero(Int64), fusion_kind = "", tuple_index = zero(Int64), dimensions = Vector{Int64}(), window = nothing, convolution_dimension_numbers = nothing, feature_group_count = zero(Int64), batch_group_count = zero(Int64), slice_dimensions = Vector{var"HloInstructionProto.SliceDimensions"}(), exponent_bits = zero(Int32), mantissa_bits = zero(Int32), dynamic_slice_sizes = Vector{Int64}(), padding_config = nothing, outfeed_config = UInt8[], distribution = RandomDistribution.RNG_INVALID, epsilon = zero(Float32), feature_index = zero(Int64), channel_id = zero(Int64), infeed_config = UInt8[], custom_call_target = "", outfeed_shape = nothing, dot_dimension_numbers = nothing, ragged_dot_dimension_numbers = nothing, fft_type = FftType.FFT, fft_length = Vector{Int64}(), comparison_direction = "", gather_dimension_numbers = nothing, gather_slice_sizes = Vector{Int64}(), id = zero(Int64), operand_ids = Vector{Int64}(), control_predecessor_ids = Vector{Int64}(), called_computation_ids = Vector{Int64}(), sharding = nothing, backend_config = UInt8[], replica_groups = Vector{ReplicaGroup}(), all_reduce_id = zero(Int64), use_global_device_ids = false, is_host_transfer = false, is_stable = false, scatter_dimension_numbers = nothing, precision_config = nothing, source_target_pairs = Vector{SourceTarget}(), domain_entry_sharding = nothing, domain_exit_sharding = nothing, constrain_layout = false, operand_shapes_with_layout = Vector{ShapeProto}(), triangular_solve_options = nothing, cholesky_options = nothing, parameter_replication = nothing, custom_call_has_side_effect = false, output_operand_aliasing = Vector{OutputOperandAliasing}(), custom_call_schedule = CustomCallSchedule.SCHEDULE_NONE, delta = zero(Int64), indices_are_sorted = false, frontend_attributes = nothing, unique_indices = false, rng_algorithm = RandomAlgorithm.RNG_DEFAULT, 
comparison_type = "", is_cross_program_prefetch = false, cross_program_prefetch_index = zero(Int32), padding_type = PaddingType.PADDING_INVALID, custom_call_api_version = CustomCallApiVersion.API_VERSION_UNSPECIFIED, async_execution_thread = "", k = zero(Int64), largest = false, statistics_viz = nothing, collective_device_list = nothing, original_value = nothing, is_composite = false, result_accuracy = nothing)
+PB.field_numbers(::Type{HloInstructionProto}) = (;name = 1, opcode = 2, shape = 3, metadata = 7, literal = 8, parameter_number = 9, fusion_kind = 11, tuple_index = 13, dimensions = 14, window = 15, convolution_dimension_numbers = 16, feature_group_count = 50, batch_group_count = 58, slice_dimensions = 17, exponent_bits = 18, mantissa_bits = 19, dynamic_slice_sizes = 20, padding_config = 21, outfeed_config = 22, distribution = 23, epsilon = 24, feature_index = 25, channel_id = 26, infeed_config = 27, custom_call_target = 28, outfeed_shape = 29, dot_dimension_numbers = 30, ragged_dot_dimension_numbers = 90, fft_type = 31, fft_length = 32, comparison_direction = 63, gather_dimension_numbers = 33, gather_slice_sizes = 34, id = 35, operand_ids = 36, control_predecessor_ids = 37, called_computation_ids = 38, sharding = 40, backend_config = 43, replica_groups = 49, all_reduce_id = 45, use_global_device_ids = 71, is_host_transfer = 47, is_stable = 60, scatter_dimension_numbers = 48, precision_config = 51, source_target_pairs = 52, domain_entry_sharding = 54, domain_exit_sharding = 55, constrain_layout = 56, operand_shapes_with_layout = 57, triangular_solve_options = 59, cholesky_options = 62, parameter_replication = 61, custom_call_has_side_effect = 65, output_operand_aliasing = 74, custom_call_schedule = 76, delta = 66, indices_are_sorted = 67, frontend_attributes = 68, unique_indices = 69, rng_algorithm = 70, comparison_type = 72, is_cross_program_prefetch = 73, cross_program_prefetch_index = 80, padding_type = 75, custom_call_api_version = 77, async_execution_thread = 79, k = 81, largest = 85, statistics_viz = 82, collective_device_list = 87, original_value = 88, is_composite = 89, result_accuracy = 91)
 
 function PB.decode(d::PB.AbstractProtoDecoder, ::Type{<:HloInstructionProto})
     name = ""
@@ -1154,7 +1153,7 @@ function PB.decode(d::PB.AbstractProtoDecoder, ::Type{<:HloInstructionProto})
     k = zero(Int64)
     largest = false
     statistics_viz = Ref{Union{Nothing,StatisticsViz}}(nothing)
-    replica_group_list = nothing
+    collective_device_list = Ref{Union{Nothing,CollectiveDeviceListProto}}(nothing)
     original_value = Ref{Union{Nothing,OriginalValueProto}}(nothing)
     is_composite = false
     result_accuracy = Ref{Union{Nothing,ResultAccuracy}}(nothing)
@@ -1303,11 +1302,7 @@ function PB.decode(d::PB.AbstractProtoDecoder, ::Type{<:HloInstructionProto})
         elseif field_number == 82
             PB.decode!(d, statistics_viz)
         elseif field_number == 87
-            replica_group_list = OneOf(:collective_device_list, PB.decode(d, Ref{CollectiveDeviceListProto}))
-        elseif field_number == 92
-            replica_group_list = OneOf(:iota_collective_device_list, PB.decode(d, Ref{IotaReplicaGroupListProto}))
-        elseif field_number == 93
-            replica_group_list = OneOf(:mesh_axes_replica_group_list, PB.decode(d, Ref{MeshAxesReplicaGroupListProto}))
+            PB.decode!(d, collective_device_list)
         elseif field_number == 88
             PB.decode!(d, original_value)
         elseif field_number == 89
@@ -1318,7 +1313,7 @@ function PB.decode(d::PB.AbstractProtoDecoder, ::Type{<:HloInstructionProto})
             Base.skip(d, wire_type)
         end
     end
-    return HloInstructionProto(name, opcode, shape[], metadata[], literal[], parameter_number, fusion_kind, tuple_index, dimensions[], window[], convolution_dimension_numbers[], feature_group_count, batch_group_count, slice_dimensions[], exponent_bits, mantissa_bits, dynamic_slice_sizes[], padding_config[], outfeed_config, distribution, epsilon, feature_index, channel_id, infeed_config, custom_call_target, outfeed_shape[], dot_dimension_numbers[], ragged_dot_dimension_numbers[], fft_type, fft_length[], comparison_direction, gather_dimension_numbers[], gather_slice_sizes[], id, operand_ids[], control_predecessor_ids[], called_computation_ids[], sharding[], backend_config, replica_groups[], all_reduce_id, use_global_device_ids, is_host_transfer, is_stable, scatter_dimension_numbers[], precision_config[], source_target_pairs[], domain_entry_sharding[], domain_exit_sharding[], constrain_layout, operand_shapes_with_layout[], triangular_solve_options[], cholesky_options[], parameter_replication[], custom_call_has_side_effect, output_operand_aliasing[], custom_call_schedule, delta, indices_are_sorted, frontend_attributes[], unique_indices, rng_algorithm, comparison_type, is_cross_program_prefetch, optional_cross_program_prefetch_index, padding_type, custom_call_api_version, async_execution_thread, k, largest, statistics_viz[], replica_group_list, original_value[], is_composite, result_accuracy[])
+    return HloInstructionProto(name, opcode, shape[], metadata[], literal[], parameter_number, fusion_kind, tuple_index, dimensions[], window[], convolution_dimension_numbers[], feature_group_count, batch_group_count, slice_dimensions[], exponent_bits, mantissa_bits, dynamic_slice_sizes[], padding_config[], outfeed_config, distribution, epsilon, feature_index, channel_id, infeed_config, custom_call_target, outfeed_shape[], dot_dimension_numbers[], ragged_dot_dimension_numbers[], fft_type, fft_length[], comparison_direction, gather_dimension_numbers[], gather_slice_sizes[], id, operand_ids[], control_predecessor_ids[], called_computation_ids[], sharding[], backend_config, replica_groups[], all_reduce_id, use_global_device_ids, is_host_transfer, is_stable, scatter_dimension_numbers[], precision_config[], source_target_pairs[], domain_entry_sharding[], domain_exit_sharding[], constrain_layout, operand_shapes_with_layout[], triangular_solve_options[], cholesky_options[], parameter_replication[], custom_call_has_side_effect, output_operand_aliasing[], custom_call_schedule, delta, indices_are_sorted, frontend_attributes[], unique_indices, rng_algorithm, comparison_type, is_cross_program_prefetch, optional_cross_program_prefetch_index, padding_type, custom_call_api_version, async_execution_thread, k, largest, statistics_viz[], collective_device_list[], original_value[], is_composite, result_accuracy[])
 end
 
 function PB.encode(e::PB.AbstractProtoEncoder, x::HloInstructionProto)
@@ -1397,14 +1392,7 @@ function PB.encode(e::PB.AbstractProtoEncoder, x::HloInstructionProto)
     x.k != zero(Int64) && PB.encode(e, 81, x.k)
     x.largest != false && PB.encode(e, 85, x.largest)
     !isnothing(x.statistics_viz) && PB.encode(e, 82, x.statistics_viz)
-    if isnothing(x.replica_group_list);
-    elseif x.replica_group_list.name === :collective_device_list
-        PB.encode(e, 87, x.replica_group_list[]::CollectiveDeviceListProto)
-    elseif x.replica_group_list.name === :iota_collective_device_list
-        PB.encode(e, 92, x.replica_group_list[]::IotaReplicaGroupListProto)
-    elseif x.replica_group_list.name === :mesh_axes_replica_group_list
-        PB.encode(e, 93, x.replica_group_list[]::MeshAxesReplicaGroupListProto)
-    end
+    !isnothing(x.collective_device_list) && PB.encode(e, 87, x.collective_device_list)
     !isnothing(x.original_value) && PB.encode(e, 88, x.original_value)
     x.is_composite != false && PB.encode(e, 89, x.is_composite)
     !isnothing(x.result_accuracy) && PB.encode(e, 91, x.result_accuracy)
@@ -1486,21 +1474,14 @@ function PB._encoded_size(x::HloInstructionProto)
     x.k != zero(Int64) && (encoded_size += PB._encoded_size(x.k, 81))
     x.largest != false && (encoded_size += PB._encoded_size(x.largest, 85))
     !isnothing(x.statistics_viz) && (encoded_size += PB._encoded_size(x.statistics_viz, 82))
-    if isnothing(x.replica_group_list);
-    elseif x.replica_group_list.name === :collective_device_list
-        encoded_size += PB._encoded_size(x.replica_group_list[]::CollectiveDeviceListProto, 87)
-    elseif x.replica_group_list.name === :iota_collective_device_list
-        encoded_size += PB._encoded_size(x.replica_group_list[]::IotaReplicaGroupListProto, 92)
-    elseif x.replica_group_list.name === :mesh_axes_replica_group_list
-        encoded_size += PB._encoded_size(x.replica_group_list[]::MeshAxesReplicaGroupListProto, 93)
-    end
+    !isnothing(x.collective_device_list) && (encoded_size += PB._encoded_size(x.collective_device_list, 87))
     !isnothing(x.original_value) && (encoded_size += PB._encoded_size(x.original_value, 88))
     x.is_composite != false && (encoded_size += PB._encoded_size(x.is_composite, 89))
     !isnothing(x.result_accuracy) && (encoded_size += PB._encoded_size(x.result_accuracy, 91))
     return encoded_size
 end
 
-struct var"HeapSimulatorTrace.Event"
+mutable struct var"HeapSimulatorTrace.Event"
     kind::var"HeapSimulatorTrace.Event.Kind".T
     buffer_id::Int64
     computation_name::String
@@ -1554,7 +1535,7 @@ function PB._encoded_size(x::var"HeapSimulatorTrace.Event")
     return encoded_size
 end
 
-struct HloInputOutputAliasProto
+mutable struct HloInputOutputAliasProto
     entries::Vector{var"HloInputOutputAliasProto.AliasEntryProto"}
 end
 PB.default_values(::Type{HloInputOutputAliasProto}) = (;entries = Vector{var"HloInputOutputAliasProto.AliasEntryProto"}())
@@ -1584,7 +1565,7 @@ function PB._encoded_size(x::HloInputOutputAliasProto)
     return encoded_size
 end
 
-struct HloComputationProto
+mutable struct HloComputationProto
     name::String
     instructions::Vector{HloInstructionProto}
     program_shape::Union{Nothing,ProgramShapeProto}
@@ -1651,7 +1632,7 @@ function PB._encoded_size(x::HloComputationProto)
     return encoded_size
 end
 
-struct HeapSimulatorTrace
+mutable struct HeapSimulatorTrace
     events::Vector{var"HeapSimulatorTrace.Event"}
     whole_module_simulation::Bool
     buffer_allocation_index::Int64
@@ -1693,7 +1674,7 @@ function PB._encoded_size(x::HeapSimulatorTrace)
     return encoded_size
 end
 
-struct BufferAssignmentProto
+mutable struct BufferAssignmentProto
     logical_buffers::Vector{LogicalBufferProto}
     buffer_aliases::Vector{var"BufferAssignmentProto.BufferAlias"}
     buffer_allocations::Vector{BufferAllocationProto}
@@ -1742,12 +1723,12 @@ function PB._encoded_size(x::BufferAssignmentProto)
 end
 
 # Stub definitions for cyclic types
-struct var"##Stub#HloModuleGroupProto"{T1<:var"##Abstract#HloModuleProto"} <: var"##Abstract#HloModuleGroupProto"
+mutable struct var"##Stub#HloModuleGroupProto"{T1<:var"##Abstract#HloModuleProto"} <: var"##Abstract#HloModuleGroupProto"
     name::String
     hlo_modules::Vector{T1}
 end
 
-struct var"##Stub#HloModuleProto"{T1<:var"##Abstract#OriginalValueRecoveryTableProto"} <: var"##Abstract#HloModuleProto"
+mutable struct var"##Stub#HloModuleProto"{T1<:var"##Abstract#OriginalValueRecoveryTableProto"} <: var"##Abstract#HloModuleProto"
     name::String
     entry_computation_name::String
     entry_computation_id::Int64
@@ -1769,29 +1750,29 @@ struct var"##Stub#HloModuleProto"{T1<:var"##Abstract#OriginalValueRecoveryTableP
     original_value_recovery_table::Union{Nothing,T1}
 end
 
-struct var"##Stub#HloProto"{T1<:var"##Abstract#OriginalValueRecoveryTableProto"} <: var"##Abstract#HloProto"
+mutable struct var"##Stub#HloProto"{T1<:var"##Abstract#OriginalValueRecoveryTableProto"} <: var"##Abstract#HloProto"
     hlo_module::Union{Nothing,var"##Stub#HloModuleProto"{T1}}
     buffer_assignment::Union{Nothing,BufferAssignmentProto}
 end
 
-struct var"##Stub#HloSnapshot"{T1<:var"##Abstract#OriginalValueRecoveryTableProto"} <: var"##Abstract#HloSnapshot"
+mutable struct var"##Stub#HloSnapshot"{T1<:var"##Abstract#OriginalValueRecoveryTableProto"} <: var"##Abstract#HloSnapshot"
     hlo::Union{Nothing,var"##Stub#HloProto"{T1}}
     arguments::Vector{LiteralProto}
     result::Union{Nothing,LiteralProto}
     execution_platform::String
 end
 
-struct var"##Stub#HloUnoptimizedSnapshot"{T1<:var"##Abstract#OriginalValueRecoveryTableProto"} <: var"##Abstract#HloUnoptimizedSnapshot"
+mutable struct var"##Stub#HloUnoptimizedSnapshot"{T1<:var"##Abstract#OriginalValueRecoveryTableProto"} <: var"##Abstract#HloUnoptimizedSnapshot"
     hlo_module::Union{Nothing,var"##Stub#HloModuleProto"{T1}}
     partitions::Vector{HloInputs}
     version::Int32
 end
 
-struct var"##Stub#OriginalValueRecoveryTableProto"{T1<:var"##Abstract#OriginalValueRecoveryTableProto.Entry"} <: var"##Abstract#OriginalValueRecoveryTableProto"
+mutable struct var"##Stub#OriginalValueRecoveryTableProto"{T1<:var"##Abstract#OriginalValueRecoveryTableProto.Entry"} <: var"##Abstract#OriginalValueRecoveryTableProto"
     entries::Vector{T1}
 end
 
-struct var"##Stub#OriginalValueRecoveryTableProto.Entry" <: var"##Abstract#OriginalValueRecoveryTableProto.Entry"
+mutable struct var"##Stub#OriginalValueRecoveryTableProto.Entry" <: var"##Abstract#OriginalValueRecoveryTableProto.Entry"
     old_original_array::Union{Nothing,OriginalArrayProto}
     new_original_array::Union{Nothing,OriginalArrayProto}
     recovery_module::Union{Nothing,var"##Stub#HloModuleProto"{var"##Stub#OriginalValueRecoveryTableProto"{var"##Stub#OriginalValueRecoveryTableProto.Entry"}}}
diff --git a/src/proto/xla/metrics_pb.jl b/src/proto/xla/metrics_pb.jl
index 0db498df8e..677847810d 100644
--- a/src/proto/xla/metrics_pb.jl
+++ b/src/proto/xla/metrics_pb.jl
@@ -6,7 +6,7 @@ export JobInfo, TagMetric, var"CompilationLogEntry.CompilationStage", KeyValueMe
 export PassMetrics, CompilationLogEntry
 
 
-struct JobInfo
+mutable struct JobInfo
     name::String
     cell::String
     user::String
@@ -78,7 +78,7 @@ function PB._encoded_size(x::JobInfo)
     return encoded_size
 end
 
-struct TagMetric
+mutable struct TagMetric
     key::String
     value::String
 end
@@ -116,7 +116,7 @@ end
 
 @enumx var"CompilationLogEntry.CompilationStage" UNSPECIFIED=0 END_TO_END=1 HLO_PASSES=2 CODE_GENERATION=3 BACKEND_PASSES=4
 
-struct KeyValueMetric
+mutable struct KeyValueMetric
     key::String
     value::Int64
 end
@@ -152,7 +152,7 @@ function PB._encoded_size(x::KeyValueMetric)
     return encoded_size
 end
 
-struct PassMetrics
+mutable struct PassMetrics
     module_id::UInt64
     pass_name::String
     pass_duration::Union{Nothing,google.protobuf.Duration}
@@ -206,7 +206,7 @@ function PB._encoded_size(x::PassMetrics)
     return encoded_size
 end
 
-struct CompilationLogEntry
+mutable struct CompilationLogEntry
     timestamp::Union{Nothing,google.protobuf.Timestamp}
     stage::var"CompilationLogEntry.CompilationStage".T
     duration::Union{Nothing,google.protobuf.Duration}
diff --git a/src/proto/xla/xla_data_pb.jl b/src/proto/xla/xla_data_pb.jl
index cbdb348368..588a7787b0 100644
--- a/src/proto/xla/xla_data_pb.jl
+++ b/src/proto/xla/xla_data_pb.jl
@@ -35,7 +35,7 @@ abstract type var"##Abstract#ProgramShapeProto" end
 
 @enumx RandomDistribution RNG_INVALID=0 RNG_UNIFORM=1 RNG_NORMAL=2
 
-struct Statistic
+mutable struct Statistic
     stat_name::String
     stat_val::Float64
 end
@@ -73,7 +73,7 @@ end
 
 @enumx var"TriangularSolveOptions.Transpose" TRANSPOSE_INVALID=0 NO_TRANSPOSE=1 TRANSPOSE=2 ADJOINT=3
 
-struct var"WhileLoopBackendConfig.KnownInitStep"
+mutable struct var"WhileLoopBackendConfig.KnownInitStep"
     init::Int64
     step::Int64
 end
@@ -111,7 +111,7 @@ end
 
 @enumx var"ResultAccuracy.Mode" DEFAULT=0 HIGHEST=1
 
-struct GatherDimensionNumbers
+mutable struct GatherDimensionNumbers
     offset_dims::Vector{Int64}
     collapsed_slice_dims::Vector{Int64}
     start_index_map::Vector{Int64}
@@ -171,7 +171,7 @@ function PB._encoded_size(x::GatherDimensionNumbers)
     return encoded_size
 end
 
-struct var"DeviceAssignmentProto.ComputationDevice"
+mutable struct var"DeviceAssignmentProto.ComputationDevice"
     replica_device_ids::Vector{Int64}
 end
 PB.default_values(::Type{var"DeviceAssignmentProto.ComputationDevice"}) = (;replica_device_ids = Vector{Int64}())
@@ -201,7 +201,7 @@ function PB._encoded_size(x::var"DeviceAssignmentProto.ComputationDevice")
     return encoded_size
 end
 
-struct SplitConfigProto
+mutable struct SplitConfigProto
     dimension::Int64
     split_indices::Vector{Int64}
 end
@@ -243,7 +243,7 @@ end
 
 @enumx DimLevelType DIM_DENSE=0 DIM_COMPRESSED=1 DIM_SINGLETON=2 DIM_LOOSE_COMPRESSED=3
 
-struct var"WhileLoopBackendConfig.KnownTripCount"
+mutable struct var"WhileLoopBackendConfig.KnownTripCount"
     n::Int64
 end
 PB.default_values(::Type{var"WhileLoopBackendConfig.KnownTripCount"}) = (;n = zero(Int64))
@@ -277,7 +277,7 @@ end
 
 @enumx PrimitiveType PRIMITIVE_TYPE_INVALID=0 PRED=1 S1=30 S2=26 S4=21 S8=2 S16=3 S32=4 S64=5 U1=31 U2=27 U4=22 U8=6 U16=7 U32=8 U64=9 F16=10 F32=11 BF16=16 F64=12 F8E5M2=19 F8E4M3=28 F8E4M3FN=20 F8E4M3B11FNUZ=23 F8E3M4=29 F8E5M2FNUZ=24 F8E4M3FNUZ=25 F4E2M1FN=32 F8E8M0FNU=33 C64=15 C128=18 TUPLE=13 OPAQUE_TYPE=14 TOKEN=17 BUFFER=34
 
-struct ParameterReplication
+mutable struct ParameterReplication
     replicated_at_leaf_buffers::Vector{Bool}
 end
 PB.default_values(::Type{ParameterReplication}) = (;replicated_at_leaf_buffers = Vector{Bool}())
@@ -311,7 +311,7 @@ end
 
 @enumx var"ChannelHandle.ChannelType" CHANNEL_TYPE_INVALID=0 DEVICE_TO_DEVICE=1 DEVICE_TO_HOST=2 HOST_TO_DEVICE=3
 
-struct SortOptions
+mutable struct SortOptions
     descending::Bool
 end
 PB.default_values(::Type{SortOptions}) = (;descending = false)
@@ -341,7 +341,7 @@ function PB._encoded_size(x::SortOptions)
     return encoded_size
 end
 
-struct ReplicaGroup
+mutable struct ReplicaGroup
     replica_ids::Vector{Int64}
 end
 PB.default_values(::Type{ReplicaGroup}) = (;replica_ids = Vector{Int64}())
@@ -371,7 +371,7 @@ function PB._encoded_size(x::ReplicaGroup)
     return encoded_size
 end
 
-struct var"ResultAccuracy.Tolerance"
+mutable struct var"ResultAccuracy.Tolerance"
     atol::Float64
     rtol::Float64
     ulps::Int64
@@ -413,7 +413,7 @@ function PB._encoded_size(x::var"ResultAccuracy.Tolerance")
     return encoded_size
 end
 
-struct TileProto
+mutable struct TileProto
     dimensions::Vector{Int64}
 end
 PB.default_values(::Type{TileProto}) = (;dimensions = Vector{Int64}())
@@ -443,7 +443,7 @@ function PB._encoded_size(x::TileProto)
     return encoded_size
 end
 
-struct ScatterDimensionNumbers
+mutable struct ScatterDimensionNumbers
     update_window_dims::Vector{Int64}
     inserted_window_dims::Vector{Int64}
     scatter_dims_to_operand_dims::Vector{Int64}
@@ -503,7 +503,7 @@ function PB._encoded_size(x::ScatterDimensionNumbers)
     return encoded_size
 end
 
-struct SourceTarget
+mutable struct SourceTarget
     source::Int64
     target::Int64
 end
@@ -539,7 +539,7 @@ function PB._encoded_size(x::SourceTarget)
     return encoded_size
 end
 
-struct ExecutionHandle
+mutable struct ExecutionHandle
     handle::Int64
 end
 PB.default_values(::Type{ExecutionHandle}) = (;handle = zero(Int64))
@@ -569,7 +569,7 @@ function PB._encoded_size(x::ExecutionHandle)
     return encoded_size
 end
 
-struct GlobalDataHandle
+mutable struct GlobalDataHandle
     handle::Int64
 end
 PB.default_values(::Type{GlobalDataHandle}) = (;handle = zero(Int64))
@@ -603,7 +603,7 @@ end
 
 @enumx ProfileSource PROFILE_SOURCE_UNKNOWN_SOURCE=0 PROFILE_SOURCE_EMBEDDED=1 PROFILE_SOURCE_REMOTE=2
 
-struct DotDimensionNumbers
+mutable struct DotDimensionNumbers
     lhs_contracting_dimensions::Vector{Int64}
     rhs_contracting_dimensions::Vector{Int64}
     lhs_batch_dimensions::Vector{Int64}
@@ -651,7 +651,7 @@ function PB._encoded_size(x::DotDimensionNumbers)
     return encoded_size
 end
 
-struct DeviceHandle
+mutable struct DeviceHandle
     handle::Int64
     device_count::Int64
 end
@@ -689,7 +689,7 @@ end
 
 @enumx var"OpSharding.Type" REPLICATED=0 MAXIMAL=1 TUPLE=2 OTHER=3 MANUAL=4 UNKNOWN=5 UNREDUCED=6
 
-struct WindowDimension
+mutable struct WindowDimension
     size::Int64
     stride::Int64
     padding_low::Int64
@@ -755,7 +755,7 @@ function PB._encoded_size(x::WindowDimension)
     return encoded_size
 end
 
-struct ConvolutionDimensionNumbers
+mutable struct ConvolutionDimensionNumbers
     input_batch_dimension::Int64
     input_feature_dimension::Int64
     input_spatial_dimensions::Vector{Int64}
@@ -833,7 +833,7 @@ function PB._encoded_size(x::ConvolutionDimensionNumbers)
     return encoded_size
 end
 
-struct IotaReplicaGroupListProto
+mutable struct IotaReplicaGroupListProto
     num_replica_groups::Int64
     num_devices_per_group::Int64
     iota_reshape_dims::Vector{Int64}
@@ -881,7 +881,7 @@ function PB._encoded_size(x::IotaReplicaGroupListProto)
     return encoded_size
 end
 
-struct OriginalArrayProto
+mutable struct OriginalArrayProto
     instruction_name::String
     shape_index::Vector{Int64}
 end
@@ -917,7 +917,7 @@ function PB._encoded_size(x::OriginalArrayProto)
     return encoded_size
 end
 
-struct ComputationStats
+mutable struct ComputationStats
     flop_count::Float64
     transcendental_count::Float64
 end
@@ -953,7 +953,7 @@ function PB._encoded_size(x::ComputationStats)
     return encoded_size
 end
 
-struct FrontendAttributes
+mutable struct FrontendAttributes
     map::Dict{String,String}
 end
 PB.default_values(::Type{FrontendAttributes}) = (;map = Dict{String,String}())
@@ -987,7 +987,7 @@ end
 
 @enumx AsyncStreamKind ASYNC_STREAM_KIND_COLLECTIVE=0 ASYNC_STREAM_KIND_P2P0=1 ASYNC_STREAM_KIND_P2P1=2 ASYNC_STREAM_KIND_MEMCPYP2P=3
 
-struct var"WhileLoopBackendConfig.KnownInductionVariable"
+mutable struct var"WhileLoopBackendConfig.KnownInductionVariable"
     tuple_index::Int64
 end
 PB.default_values(::Type{var"WhileLoopBackendConfig.KnownInductionVariable"}) = (;tuple_index = zero(Int64))
@@ -1017,7 +1017,7 @@ function PB._encoded_size(x::var"WhileLoopBackendConfig.KnownInductionVariable")
     return encoded_size
 end
 
-struct var"PaddingConfig.PaddingConfigDimension"
+mutable struct var"PaddingConfig.PaddingConfigDimension"
     edge_padding_low::Int64
     edge_padding_high::Int64
     interior_padding::Int64
@@ -1059,7 +1059,7 @@ function PB._encoded_size(x::var"PaddingConfig.PaddingConfigDimension")
     return encoded_size
 end
 
-struct GemmPerfTableEntry
+mutable struct GemmPerfTableEntry
     b::Int64
     m::Int64
     n::Int64
@@ -1113,7 +1113,7 @@ function PB._encoded_size(x::GemmPerfTableEntry)
     return encoded_size
 end
 
-struct OutputOperandAliasing
+mutable struct OutputOperandAliasing
     output_shape_index::Vector{Int64}
     operand_index::Int64
     operand_shape_index::Vector{Int64}
@@ -1157,7 +1157,7 @@ end
 
 @enumx var"PrecisionConfig.Precision" DEFAULT=0 HIGH=1 HIGHEST=2
 
-struct ExecutionProfile
+mutable struct ExecutionProfile
     compilation_cache_hit::Bool
     compile_time_ms::Int64
     compute_cycle_count::Int64
@@ -1229,7 +1229,7 @@ function PB._encoded_size(x::ExecutionProfile)
     return encoded_size
 end
 
-struct var"AxisRefProto.SubAxis"
+mutable struct var"AxisRefProto.SubAxis"
     pre_size::Int64
     size::Int64
 end
@@ -1269,7 +1269,7 @@ end
 
 @enumx PaddingType PADDING_INVALID=0 PADDING_VALID=1 PADDING_SAME=2
 
-struct var"MeshProto.MeshAxis"
+mutable struct var"MeshProto.MeshAxis"
     name::String
     size::Int64
 end
@@ -1307,7 +1307,7 @@ end
 
 @enumx var"OpSharding.ShardGroupType" AS=0 LIKE=1
 
-struct CholeskyOptions
+mutable struct CholeskyOptions
     lower::Bool
 end
 PB.default_values(::Type{CholeskyOptions}) = (;lower = false)
@@ -1337,7 +1337,7 @@ function PB._encoded_size(x::CholeskyOptions)
     return encoded_size
 end
 
-struct StatisticsViz
+mutable struct StatisticsViz
     stat_index_to_visualize::Int64
     statistics::Vector{Statistic}
 end
@@ -1373,7 +1373,7 @@ function PB._encoded_size(x::StatisticsViz)
     return encoded_size
 end
 
-struct TriangularSolveOptions
+mutable struct TriangularSolveOptions
     left_side::Bool
     lower::Bool
     unit_diagonal::Bool
@@ -1421,7 +1421,7 @@ function PB._encoded_size(x::TriangularSolveOptions)
     return encoded_size
 end
 
-struct DeviceAssignmentProto
+mutable struct DeviceAssignmentProto
     replica_count::Int32
     computation_count::Int32
     computation_devices::Vector{var"DeviceAssignmentProto.ComputationDevice"}
@@ -1463,7 +1463,7 @@ function PB._encoded_size(x::DeviceAssignmentProto)
     return encoded_size
 end
 
-struct ChannelHandle
+mutable struct ChannelHandle
     handle::Int64
     var"#type"::var"ChannelHandle.ChannelType".T
 end
@@ -1499,7 +1499,7 @@ function PB._encoded_size(x::ChannelHandle)
     return encoded_size
 end
 
-struct ResultAccuracy
+mutable struct ResultAccuracy
     specs::Union{Nothing,OneOf{<:Union{var"ResultAccuracy.Mode".T,var"ResultAccuracy.Tolerance"}}}
 end
 PB.oneof_field_types(::Type{ResultAccuracy}) = (;
@@ -1544,7 +1544,7 @@ function PB._encoded_size(x::ResultAccuracy)
     return encoded_size
 end
 
-struct RaggedDotDimensionNumbers
+mutable struct RaggedDotDimensionNumbers
     dot_dimension_numbers::Union{Nothing,DotDimensionNumbers}
     lhs_ragged_dimensions::Vector{Int64}
     rhs_group_dimensions::Vector{Int64}
@@ -1586,7 +1586,7 @@ function PB._encoded_size(x::RaggedDotDimensionNumbers)
     return encoded_size
 end
 
-struct Window
+mutable struct Window
     dimensions::Vector{WindowDimension}
 end
 PB.default_values(::Type{Window}) = (;dimensions = Vector{WindowDimension}())
@@ -1616,7 +1616,7 @@ function PB._encoded_size(x::Window)
     return encoded_size
 end
 
-struct CollectiveDeviceListProto
+mutable struct CollectiveDeviceListProto
     replica_groups::Vector{ReplicaGroup}
     iota_replica_group_list::Union{Nothing,IotaReplicaGroupListProto}
 end
@@ -1652,7 +1652,7 @@ function PB._encoded_size(x::CollectiveDeviceListProto)
     return encoded_size
 end
 
-struct OriginalValueElementProto
+mutable struct OriginalValueElementProto
     shape_index::Vector{Int64}
     original_array::Union{Nothing,OriginalArrayProto}
 end
@@ -1688,20 +1688,18 @@ function PB._encoded_size(x::OriginalValueElementProto)
     return encoded_size
 end
 
-struct WhileLoopBackendConfig
+mutable struct WhileLoopBackendConfig
     known_trip_count::Union{Nothing,var"WhileLoopBackendConfig.KnownTripCount"}
     known_init_step::Union{Nothing,var"WhileLoopBackendConfig.KnownInitStep"}
     known_induction_variable::Union{Nothing,var"WhileLoopBackendConfig.KnownInductionVariable"}
-    dynamic_variable_tuple_indices::Vector{Int64}
 end
-PB.default_values(::Type{WhileLoopBackendConfig}) = (;known_trip_count = nothing, known_init_step = nothing, known_induction_variable = nothing, dynamic_variable_tuple_indices = Vector{Int64}())
-PB.field_numbers(::Type{WhileLoopBackendConfig}) = (;known_trip_count = 1, known_init_step = 2, known_induction_variable = 3, dynamic_variable_tuple_indices = 4)
+PB.default_values(::Type{WhileLoopBackendConfig}) = (;known_trip_count = nothing, known_init_step = nothing, known_induction_variable = nothing)
+PB.field_numbers(::Type{WhileLoopBackendConfig}) = (;known_trip_count = 1, known_init_step = 2, known_induction_variable = 3)
 
 function PB.decode(d::PB.AbstractProtoDecoder, ::Type{<:WhileLoopBackendConfig})
     known_trip_count = Ref{Union{Nothing,var"WhileLoopBackendConfig.KnownTripCount"}}(nothing)
     known_init_step = Ref{Union{Nothing,var"WhileLoopBackendConfig.KnownInitStep"}}(nothing)
     known_induction_variable = Ref{Union{Nothing,var"WhileLoopBackendConfig.KnownInductionVariable"}}(nothing)
-    dynamic_variable_tuple_indices = PB.BufferedVector{Int64}()
     while !PB.message_done(d)
         field_number, wire_type = PB.decode_tag(d)
         if field_number == 1
@@ -1710,13 +1708,11 @@ function PB.decode(d::PB.AbstractProtoDecoder, ::Type{<:WhileLoopBackendConfig})
             PB.decode!(d, known_init_step)
         elseif field_number == 3
             PB.decode!(d, known_induction_variable)
-        elseif field_number == 4
-            PB.decode!(d, wire_type, dynamic_variable_tuple_indices)
         else
             Base.skip(d, wire_type)
         end
     end
-    return WhileLoopBackendConfig(known_trip_count[], known_init_step[], known_induction_variable[], dynamic_variable_tuple_indices[])
+    return WhileLoopBackendConfig(known_trip_count[], known_init_step[], known_induction_variable[])
 end
 
 function PB.encode(e::PB.AbstractProtoEncoder, x::WhileLoopBackendConfig)
@@ -1724,7 +1720,6 @@ function PB.encode(e::PB.AbstractProtoEncoder, x::WhileLoopBackendConfig)
     !isnothing(x.known_trip_count) && PB.encode(e, 1, x.known_trip_count)
     !isnothing(x.known_init_step) && PB.encode(e, 2, x.known_init_step)
     !isnothing(x.known_induction_variable) && PB.encode(e, 3, x.known_induction_variable)
-    !isempty(x.dynamic_variable_tuple_indices) && PB.encode(e, 4, x.dynamic_variable_tuple_indices)
     return position(e.io) - initpos
 end
 function PB._encoded_size(x::WhileLoopBackendConfig)
@@ -1732,11 +1727,10 @@ function PB._encoded_size(x::WhileLoopBackendConfig)
     !isnothing(x.known_trip_count) && (encoded_size += PB._encoded_size(x.known_trip_count, 1))
     !isnothing(x.known_init_step) && (encoded_size += PB._encoded_size(x.known_init_step, 2))
     !isnothing(x.known_induction_variable) && (encoded_size += PB._encoded_size(x.known_induction_variable, 3))
-    !isempty(x.dynamic_variable_tuple_indices) && (encoded_size += PB._encoded_size(x.dynamic_variable_tuple_indices, 4))
     return encoded_size
 end
 
-struct PaddingConfig
+mutable struct PaddingConfig
     dimensions::Vector{var"PaddingConfig.PaddingConfigDimension"}
 end
 PB.default_values(::Type{PaddingConfig}) = (;dimensions = Vector{var"PaddingConfig.PaddingConfigDimension"}())
@@ -1766,7 +1760,7 @@ function PB._encoded_size(x::PaddingConfig)
     return encoded_size
 end
 
-struct GemmPerfTableEntryValues
+mutable struct GemmPerfTableEntryValues
     entries::Vector{GemmPerfTableEntry}
 end
 PB.default_values(::Type{GemmPerfTableEntryValues}) = (;entries = Vector{GemmPerfTableEntry}())
@@ -1796,7 +1790,7 @@ function PB._encoded_size(x::GemmPerfTableEntryValues)
     return encoded_size
 end
 
-struct PrecisionConfig
+mutable struct PrecisionConfig
     operand_precision::Vector{var"PrecisionConfig.Precision".T}
     algorithm::var"PrecisionConfig.Algorithm".T
 end
@@ -1832,7 +1826,7 @@ function PB._encoded_size(x::PrecisionConfig)
     return encoded_size
 end
 
-struct AxisRefProto
+mutable struct AxisRefProto
     mesh_axis_index::Int64
     sub_axis_info::Union{Nothing,var"AxisRefProto.SubAxis"}
 end
@@ -1868,7 +1862,7 @@ function PB._encoded_size(x::AxisRefProto)
     return encoded_size
 end
 
-struct var"OpMetadata.ProfileInfo"
+mutable struct var"OpMetadata.ProfileInfo"
     profile_type::Vector{ProfileType.T}
     relative_speedup::Float64
     profile_source::ProfileSource.T
@@ -1922,7 +1916,7 @@ function PB._encoded_size(x::var"OpMetadata.ProfileInfo")
     return encoded_size
 end
 
-struct MeshProto
+mutable struct MeshProto
     axes::Vector{var"MeshProto.MeshAxis"}
     device_ids::Vector{Int64}
 end
@@ -1958,7 +1952,7 @@ function PB._encoded_size(x::MeshProto)
     return encoded_size
 end
 
-struct OriginalValueProto
+mutable struct OriginalValueProto
     elements::Vector{OriginalValueElementProto}
     is_synthetic_call::Bool
 end
@@ -1994,7 +1988,7 @@ function PB._encoded_size(x::OriginalValueProto)
     return encoded_size
 end
 
-struct GemmPerfTable
+mutable struct GemmPerfTable
     entries::Dict{String,GemmPerfTableEntryValues}
 end
 PB.default_values(::Type{GemmPerfTable}) = (;entries = Dict{String,GemmPerfTableEntryValues}())
@@ -2024,7 +2018,7 @@ function PB._encoded_size(x::GemmPerfTable)
     return encoded_size
 end
 
-struct var"NamedShardingProto.DimensionSharding"
+mutable struct var"NamedShardingProto.DimensionSharding"
     axes::Vector{AxisRefProto}
     is_closed::Bool
 end
@@ -2060,7 +2054,7 @@ function PB._encoded_size(x::var"NamedShardingProto.DimensionSharding")
     return encoded_size
 end
 
-struct OpMetadata
+mutable struct OpMetadata
     op_type::String
     op_name::String
     source_file::String
@@ -2169,7 +2163,7 @@ function PB._encoded_size(x::OpMetadata)
     return encoded_size
 end
 
-struct MeshAxesReplicaGroupListProto
+mutable struct MeshAxesReplicaGroupListProto
     mesh::Union{Nothing,MeshProto}
     axes::Vector{AxisRefProto}
 end
@@ -2205,7 +2199,7 @@ function PB._encoded_size(x::MeshAxesReplicaGroupListProto)
     return encoded_size
 end
 
-struct NamedShardingProto
+mutable struct NamedShardingProto
     mesh::Union{Nothing,MeshProto}
     dim_shardings::Vector{var"NamedShardingProto.DimensionSharding"}
     replicated_axes::Vector{AxisRefProto}
@@ -2261,7 +2255,7 @@ function PB._encoded_size(x::NamedShardingProto)
 end
 
 # Stub definitions for cyclic types
-struct var"##Stub#LayoutProto"{T1<:var"##Abstract#ShapeProto"} <: var"##Abstract#LayoutProto"
+mutable struct var"##Stub#LayoutProto"{T1<:var"##Abstract#ShapeProto"} <: var"##Abstract#LayoutProto"
     minor_to_major::Vector{Int64}
     dim_level_types::Vector{DimLevelType.T}
     dim_unique::Vector{Bool}
@@ -2277,7 +2271,7 @@ struct var"##Stub#LayoutProto"{T1<:var"##Abstract#ShapeProto"} <: var"##Abstract
     split_configs::Vector{SplitConfigProto}
 end
 
-struct var"##Stub#LiteralProto"{T1<:var"##Abstract#ShapeProto"} <: var"##Abstract#LiteralProto"
+mutable struct var"##Stub#LiteralProto"{T1<:var"##Abstract#ShapeProto"} <: var"##Abstract#LiteralProto"
     shape::Union{Nothing,T1}
     preds::Vector{Bool}
     s1s::Vector{UInt8}
@@ -2313,7 +2307,7 @@ struct var"##Stub#LiteralProto"{T1<:var"##Abstract#ShapeProto"} <: var"##Abstrac
     sparse_indices::Vector{Int64}
 end
 
-struct var"##Stub#OpSharding"{T1<:var"##Abstract#ShapeProto"} <: var"##Abstract#OpSharding"
+mutable struct var"##Stub#OpSharding"{T1<:var"##Abstract#ShapeProto"} <: var"##Abstract#OpSharding"
     var"#type"::var"OpSharding.Type".T
     tile_shape::Union{Nothing,T1}
     tile_assignment_dimensions::Vector{Int64}
@@ -2330,13 +2324,13 @@ struct var"##Stub#OpSharding"{T1<:var"##Abstract#ShapeProto"} <: var"##Abstract#
     named_sharding::Union{Nothing,NamedShardingProto}
 end
 
-struct var"##Stub#ProgramShapeProto"{T1<:var"##Abstract#ShapeProto"} <: var"##Abstract#ProgramShapeProto"
+mutable struct var"##Stub#ProgramShapeProto"{T1<:var"##Abstract#ShapeProto"} <: var"##Abstract#ProgramShapeProto"
     parameters::Vector{T1}
     result::Union{Nothing,T1}
     parameter_names::Vector{String}
 end
 
-struct var"##Stub#ShapeProto" <: var"##Abstract#ShapeProto"
+mutable struct var"##Stub#ShapeProto" <: var"##Abstract#ShapeProto"
     element_type::PrimitiveType.T
     dimensions::Vector{Int64}
     is_dynamic_dimension::Vector{Bool}
diff --git a/src/proto/xla/xla_pb.jl b/src/proto/xla/xla_pb.jl
index 7cb7a44090..19936eab5a 100644
--- a/src/proto/xla/xla_pb.jl
+++ b/src/proto/xla/xla_pb.jl
@@ -7,13 +7,12 @@ export var"DebugOptions.LibNvJitLinkMode", var"DebugOptions.LibraryFusionType"
 export var"DebugOptions.PipelineParallelismOptLevel", ShardableValueUpdatePairProto
 export var"DebugOptions.AutotuneCacheMode", var"DebugOptions.CommandBufferSchedulingMode"
 export var"DebugOptions.PartitioningAlgorithm", var"HloModuleConfigProto.Int64List"
-export var"ScheduleProto.SchedulerStatisticsProto", var"DebugOptions.AutotuneBackend"
-export var"HloModuleConfigProto.BoolList", var"DebugOptions.StepMarkerLocation"
-export var"DebugOptions.CollectiveOpType", var"DebugOptions.ShapeChecks"
-export var"DebugOptions.XnnGraphFusionMode", GpuCompilationEnvironment
-export var"ScheduleConfigProto.Instruction", var"DebugOptions.WhileLoopUnrolling"
-export NodeShardingConfigProto, IntRangeInclusive, CompilationEnvironmentsProto
-export var"DebugOptions.PGLEStrictnessLevel"
+export var"ScheduleProto.SchedulerStatisticsProto", var"HloModuleConfigProto.BoolList"
+export var"DebugOptions.StepMarkerLocation", var"DebugOptions.CollectiveOpType"
+export var"DebugOptions.ShapeChecks", var"DebugOptions.XnnGraphFusionMode"
+export GpuCompilationEnvironment, var"ScheduleConfigProto.Instruction"
+export var"DebugOptions.WhileLoopUnrolling", NodeShardingConfigProto, IntRangeInclusive
+export CompilationEnvironmentsProto, var"DebugOptions.PGLEStrictnessLevel"
 export var"HloModuleConfigProto.FusionConfigCollection", var"DebugOptions.DetectionMode"
 export var"DebugOptions.CommandBufferCmdType", var"HloModuleConfigProto.Int64ListList"
 export var"ScheduleProto.ComputationScheduleProto", ScheduleConfigProto
@@ -21,7 +20,7 @@ export ShardingConfigProto, ThunkBufferDebugFilter, ScheduleProto, DebugOptions
 export HloModuleConfigProto, ExecutionOptions, HloModuleProtoWithConfig
 
 
-struct var"ScheduleProto.Instruction"
+mutable struct var"ScheduleProto.Instruction"
     id::Int64
     start_timestamp_cycles::Float64
     end_timestamp_cycles::Float64
@@ -84,7 +83,7 @@ end
 @enumx var"DebugOptions.PipelineParallelismOptLevel" PIPELINE_PARALLELISM_OPT_LEVEL_DISABLE=0 PIPELINE_PARALLELISM_OPT_LEVEL_ENABLE=1
 PB.reserved_fields(::Type{var"DebugOptions.PipelineParallelismOptLevel".T}) = (names = ["PIPELINE_PARALLELISM_OPT_LEVEL_ENABLE_CYCLE_DECOMPOSER"], numbers = Union{Int,UnitRange{Int}}[2])
 
-struct ShardableValueUpdatePairProto
+mutable struct ShardableValueUpdatePairProto
     input_parameter_number::Int64
     parameter_shape_index::Vector{Int64}
     output_shape_index::Vector{Int64}
@@ -132,7 +131,7 @@ end
 
 @enumx var"DebugOptions.PartitioningAlgorithm" PARTITIONING_ALGORITHM_NOOP=0 PARTITIONING_ALGORITHM_EXP0=1 PARTITIONING_ALGORITHM_EXP1=2 PARTITIONING_ALGORITHM_EXP2=3
 
-struct var"HloModuleConfigProto.Int64List"
+mutable struct var"HloModuleConfigProto.Int64List"
     vals::Vector{Int64}
 end
 PB.default_values(::Type{var"HloModuleConfigProto.Int64List"}) = (;vals = Vector{Int64}())
@@ -162,7 +161,7 @@ function PB._encoded_size(x::var"HloModuleConfigProto.Int64List")
     return encoded_size
 end
 
-struct var"ScheduleProto.SchedulerStatisticsProto"
+mutable struct var"ScheduleProto.SchedulerStatisticsProto"
     all_gather_wasted_cycles::Float64
     all_reduce_wasted_cycles::Float64
     collective_broadcast_wasted_cycles::Float64
@@ -264,9 +263,7 @@ function PB._encoded_size(x::var"ScheduleProto.SchedulerStatisticsProto")
     return encoded_size
 end
 
-@enumx var"DebugOptions.AutotuneBackend" AUTOTUNE_BACKEND_ALL=0 AUTOTUNE_BACKEND_CUDNN=1 AUTOTUNE_BACKEND_TRITON=2 AUTOTUNE_BACKEND_CUBLAS=3 AUTOTUNE_BACKEND_CUBLASLT=4
-
-struct var"HloModuleConfigProto.BoolList"
+mutable struct var"HloModuleConfigProto.BoolList"
     vals::Vector{Bool}
 end
 PB.default_values(::Type{var"HloModuleConfigProto.BoolList"}) = (;vals = Vector{Bool}())
@@ -304,7 +301,7 @@ end
 
 @enumx var"DebugOptions.XnnGraphFusionMode" XNN_GRAPH_FUSION_MODE_DISABLED=0 XNN_GRAPH_FUSION_MODE_GREEDY=1 XNN_GRAPH_FUSION_MODE_GREEDY_SLINKY=2 XNN_GRAPH_FUSION_MODE_BYPASS_COST_MODEL=3
 
-struct GpuCompilationEnvironment
+mutable struct GpuCompilationEnvironment
     dummy_flag::Int64
 end
 PB.default_values(::Type{GpuCompilationEnvironment}) = (;dummy_flag = zero(Int64))
@@ -334,7 +331,7 @@ function PB._encoded_size(x::GpuCompilationEnvironment)
     return encoded_size
 end
 
-struct var"ScheduleConfigProto.Instruction"
+mutable struct var"ScheduleConfigProto.Instruction"
     name::String
 end
 PB.default_values(::Type{var"ScheduleConfigProto.Instruction"}) = (;name = "")
@@ -366,7 +363,7 @@ end
 
 @enumx var"DebugOptions.WhileLoopUnrolling" WHILE_LOOP_UNROLLING_NO_UNROLL=0 WHILE_LOOP_UNROLLING_DOUBLE_BUFFER=1 WHILE_LOOP_UNROLLING_FULL_UNROLL=2 WHILE_LOOP_UNROLLING_AUTO_UNROLL=3
 
-struct NodeShardingConfigProto
+mutable struct NodeShardingConfigProto
     sharding::Union{Nothing,OpSharding}
     nodes::Vector{NodeShardingConfigProto}
 end
@@ -402,7 +399,7 @@ function PB._encoded_size(x::NodeShardingConfigProto)
     return encoded_size
 end
 
-struct IntRangeInclusive
+mutable struct IntRangeInclusive
     first::Int64
     last::Int64
 end
@@ -438,7 +435,7 @@ function PB._encoded_size(x::IntRangeInclusive)
     return encoded_size
 end
 
-struct CompilationEnvironmentsProto
+mutable struct CompilationEnvironmentsProto
     environments::Vector{google.protobuf.var"#Any"}
 end
 PB.default_values(::Type{CompilationEnvironmentsProto}) = (;environments = Vector{google.protobuf.var"#Any"}())
@@ -476,7 +473,7 @@ end
 
 @enumx var"DebugOptions.CommandBufferCmdType" INVALID=0 FUSION=1 CUBLAS=2 CUDNN=3 COLLECTIVES=4 CONDITIONAL=5 WHILE=6 CUSTOM_CALL=7 CUBLASLT=8 DYNAMIC_SLICE_FUSION=9 DYNAMIC_SLICE_COPY_FUSION=10
 
-struct var"HloModuleConfigProto.Int64ListList"
+mutable struct var"HloModuleConfigProto.Int64ListList"
     lists::Vector{var"HloModuleConfigProto.Int64List"}
 end
 PB.default_values(::Type{var"HloModuleConfigProto.Int64ListList"}) = (;lists = Vector{var"HloModuleConfigProto.Int64List"}())
@@ -506,7 +503,7 @@ function PB._encoded_size(x::var"HloModuleConfigProto.Int64ListList")
     return encoded_size
 end
 
-struct var"ScheduleProto.ComputationScheduleProto"
+mutable struct var"ScheduleProto.ComputationScheduleProto"
     computation_id::Int64
     instructions::Vector{var"ScheduleProto.Instruction"}
     scheduler_statistics::Union{Nothing,var"ScheduleProto.SchedulerStatisticsProto"}
@@ -554,7 +551,7 @@ function PB._encoded_size(x::var"ScheduleProto.ComputationScheduleProto")
     return encoded_size
 end
 
-struct ScheduleConfigProto
+mutable struct ScheduleConfigProto
     sequence::Vector{var"ScheduleConfigProto.Instruction"}
 end
 PB.default_values(::Type{ScheduleConfigProto}) = (;sequence = Vector{var"ScheduleConfigProto.Instruction"}())
@@ -584,7 +581,7 @@ function PB._encoded_size(x::ScheduleConfigProto)
     return encoded_size
 end
 
-struct ShardingConfigProto
+mutable struct ShardingConfigProto
     nodes::Vector{NodeShardingConfigProto}
 end
 PB.default_values(::Type{ShardingConfigProto}) = (;nodes = Vector{NodeShardingConfigProto}())
@@ -614,7 +611,7 @@ function PB._encoded_size(x::ShardingConfigProto)
     return encoded_size
 end
 
-struct ThunkBufferDebugFilter
+mutable struct ThunkBufferDebugFilter
     thunk_id_ranges::Vector{IntRangeInclusive}
     profile_annotation_regexes::Vector{String}
 end
@@ -650,7 +647,7 @@ function PB._encoded_size(x::ThunkBufferDebugFilter)
     return encoded_size
 end
 
-struct ScheduleProto
+mutable struct ScheduleProto
     hlo_module::Union{Nothing,HloModuleProto}
     computation_schedules::Vector{var"ScheduleProto.ComputationScheduleProto"}
 end
@@ -686,7 +683,7 @@ function PB._encoded_size(x::ScheduleProto)
     return encoded_size
 end
 
-struct DebugOptions
+mutable struct DebugOptions
     xla_allow_h2h_copy_when_automatic_host_compute_offload_disabled::Bool
     xla_disable_automatic_host_compute_offload::Bool
     xla_enable_scoped_logging_timers::Bool
@@ -747,7 +744,6 @@ struct DebugOptions
     xla_gpu_cuda_data_dir::String
     xla_gpu_cudnn_gemm_fusion_level::Int32
     xla_gpu_cudnn_gemm_max_plans::Int32
-    xla_gpu_default_to_alg_dot_bf16_bf16_f32::Bool
     xla_gpu_deterministic_ops::Bool
     xla_gpu_disable_async_collectives::Vector{var"DebugOptions.CollectiveOpType".T}
     xla_gpu_disable_gpuasm_optimizations::Bool
@@ -778,7 +774,6 @@ struct DebugOptions
     xla_gpu_enable_pipelined_all_gather::Bool
     xla_gpu_enable_pipelined_all_reduce::Bool
     xla_gpu_enable_pipelined_collectives::Bool
-    xla_gpu_enable_pipelined_host_offloading::Bool
     xla_gpu_enable_pipelined_p2p::Bool
     xla_gpu_enable_pipelined_reduce_scatter::Bool
     xla_gpu_enable_reassociation_for_converted_ar::Bool
@@ -798,7 +793,6 @@ struct DebugOptions
     xla_gpu_exhaustive_tiling_search::Bool
     xla_gpu_experimental_allow_unroll_factor_eight::Bool
     xla_gpu_experimental_aot_compiled_thunks::Bool
-    xla_gpu_experimental_autotune_backends::Vector{var"DebugOptions.AutotuneBackend".T}
     xla_gpu_experimental_autotune_cache_mode::var"DebugOptions.AutotuneCacheMode".T
     xla_gpu_experimental_autotuner_cache_dir::String
     xla_gpu_experimental_collective_cse_distance_threshold::Int64
@@ -959,8 +953,8 @@ struct DebugOptions
     xla_backend_extra_options::Dict{String,String}
 end
 PB.reserved_fields(::Type{DebugOptions}) = (names = ["hlo_reduce_precision_options", "legacy_command_buffer_custom_call_targets", "xla_allow_get_default_platform", "xla_cpu_dump_unoptimized_hlo_snapshots", "xla_cpu_enable_custom_matmul_tiling", "xla_cpu_enable_experimental_deallocation", "xla_cpu_enable_mlir_fusion_outlining", "xla_cpu_enable_mlir_lowering", "xla_cpu_enable_mlir_tiling_and_fusion", "xla_cpu_matmul_tiling_k_dim", "xla_cpu_matmul_tiling_m_dim", "xla_cpu_matmul_tiling_n_dim", "xla_cpu_sparse_cuda_threads", "xla_cpu_use_thunk_runtime", "xla_cpu_use_xla_runtime", "xla_detailed_logging_and_dumping", "xla_dump_ir", "xla_experimental_exec_time_optimization_effort", "xla_gpu_all_reduce_contiguous", "xla_gpu_allow_all_reduce_kernel", "xla_gpu_asm_extra_flags", "xla_gpu_bef_executable", "xla_gpu_bef_thunk", "xla_gpu_deterministic_reductions", "xla_gpu_disable_multi_streaming", "xla_gpu_dump_hlo_unoptimized_snapshots", "xla_gpu_enable_all_reduce_splitter", "xla_gpu_enable_async_all_gather", "xla_gpu_enable_async_all_reduce", "xla_gpu_enable_async_all_to_all", "xla_gpu_enable_async_collective_broadcast", "xla_gpu_enable_async_collective_permute", "xla_gpu_enable_async_collectives", "xla_gpu_enable_async_reduce_scatter", "xla_gpu_enable_bf16_3way_gemm", "xla_gpu_enable_bf16_6way_gemm", "xla_gpu_enable_cuda_graphs", "xla_gpu_enable_cudnn_fmha", "xla_gpu_enable_cudnn_frontend", "xla_gpu_enable_custom_fusions_re", "xla_gpu_enable_custom_fusions", "xla_gpu_enable_dot_strength_reduction", "xla_gpu_enable_experimental_block_size", "xla_gpu_enable_gpu2_hal", "xla_gpu_enable_gpu2_runtime", "xla_gpu_enable_heuristic_pass_configuration", "xla_gpu_enable_libnvjitlink", "xla_gpu_enable_mlir_emitters", "xla_gpu_enable_mlir_lowering", "xla_gpu_enable_nccl_per_stream_comms", "xla_gpu_enable_persistent_temp_buffers", "xla_gpu_enable_pgle_accuracy_checker", "xla_gpu_enable_priority_fusion", "xla_gpu_enable_softmax_fusion", "xla_gpu_enable_triton_gemm_int4", 
"xla_gpu_enable_triton_hopper", "xla_gpu_enable_triton_softmax_fusion", "xla_gpu_enable_triton_softmax_priority_fusion", "xla_gpu_enable_xla_runtime_executable", "xla_gpu_ensure_minor_dot_contraction_dims", "xla_gpu_experimental_enable_dynamic_dot_search_space", "xla_gpu_experimental_enable_nan_counter_on_thunks", "xla_gpu_experimental_enable_triton_i4_rewrites", "xla_gpu_experimental_enable_triton_softmax_priority_fusion", "xla_gpu_graph_eviction_timeout_seconds", "xla_gpu_graph_level", "xla_gpu_graph_num_runs_to_instantiate", "xla_gpu_lhs_enable_gpu_async_tracker", "xla_gpu_max_kernel_unroll_factor", "xla_gpu_max_mlir_kernels", "xla_gpu_mlir_emitter_level", "xla_gpu_normalize_layouts", "xla_gpu_redzone_scratch_max_megabytes", "xla_gpu_run_post_layout_collective_pipeliner", "xla_gpu_simplify_all_fp_conversions", "xla_gpu_simplify_gathers", "xla_gpu_simplify_scatters", "xla_gpu_single_wave_autotuning", "xla_gpu_skip_mlir_kernels", "xla_gpu_triton_fusion_level", "xla_gpu_triton_gemm_disable_reduced_precision_reduction", "xla_gpu_unsafe_fallback_to_driver_on_ptxas_error", "xla_gpu_unsafe_pipelined_loop_annotator", "xla_gpu_unsupported_enable_generic_triton_emitter_for_gemms", "xla_gpu_unsupported_force_triton_gemm", "xla_gpu_unsupported_generic_triton_emitter_features", "xla_gpu_use_cudnn_batchnorm", "xla_gpu_use_horizontal_fusion", "xla_gpu_use_random_streams", "xla_hlo_dump_as_graphdef", "xla_hlo_tfgraph_device_scopes", "xla_use_shardy", "xla_gpu_unsupported_annotate_with_emitter_loc", "xla_gpu_experimental_enable_command_buffer_on_thunks", "xla_gpu_experimental_enable_triton_tma"], numbers = Union{Int,UnitRange{Int}}[5, 63, 80, 93, 94, 98, 117, 130, 133, 134, 139, 141, 143, 152, 158, 160, 161, 162, 167, 168, 169, 171, 172, 173, 176, 177, 178, 179, 180, 183, 184, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 204, 206, 207, 211, 214, 218, 220, 221, 226, 229, 230, 233, 234, 238, 242, 249, 263, 264, 266, 270, 271, 275, 276, 278, 279, 281, 282, 286, 298, 
299, 302, 303, 309, 313, 314, 319, 320, 325, 326, 332, 346, 352, 355, 358, 361, 367, 369, 371, 385, 394, 398, 402, 423])
-PB.default_values(::Type{DebugOptions}) = (;xla_allow_h2h_copy_when_automatic_host_compute_offload_disabled = false, xla_disable_automatic_host_compute_offload = false, xla_enable_scoped_logging_timers = false, xla_hlo_pass_fix_detect_cycles = false, xla_keep_shardings_after_spmd = false, xla_unsupported_crash_on_hlo_pass_fix_max_iterations = false, xla_unsupported_crash_on_hlo_pass_noop_change = false, xla_unsupported_crash_on_hlo_pass_silent_hlo_change = false, xla_cpu_collective_call_terminate_timeout_seconds = zero(Int32), xla_cpu_collective_call_warn_stuck_seconds = zero(Int32), xla_cpu_collective_timeout_seconds = zero(Int32), xla_cpu_copy_insertion_use_region_analysis = false, xla_cpu_emitter_verification_level = zero(Int32), xla_cpu_enable_concurrency_optimized_scheduler = false, xla_cpu_enable_fast_math = false, xla_cpu_enable_fast_min_max = false, xla_cpu_enable_platform_dependent_math = false, xla_cpu_experimental_onednn_custom_call = false, xla_cpu_experimental_onednn_fusion_type = Vector{var"DebugOptions.LibraryFusionType".T}(), xla_cpu_experimental_xnn_fusion_type = Vector{var"DebugOptions.LibraryFusionType".T}(), xla_cpu_experimental_xnn_graph_fusion_mode = var"DebugOptions.XnnGraphFusionMode".XNN_GRAPH_FUSION_MODE_DISABLED, xla_cpu_experimental_ynn_fusion_type = Vector{var"DebugOptions.LibraryFusionType".T}(), xla_cpu_fast_math_honor_division = false, xla_cpu_fast_math_honor_functions = false, xla_cpu_fast_math_honor_infs = false, xla_cpu_fast_math_honor_nans = false, xla_cpu_generate_unique_c_style_kernel_entry_points = false, xla_cpu_max_isa = "", xla_cpu_parallel_codegen_split_count = zero(Int32), xla_cpu_prefer_vector_width = zero(Int32), xla_cpu_use_fusion_emitters = false, xla_cpu_use_xnnpack = false, xla_enable_fast_math = false, xla_gpu_experimental_thunk_buffer_debug_filter = nothing, xla_dump_hlo_unoptimized_snapshots = false, xla_enable_enzyme_comms_opt = false, xla_gpu_algorithm_denylist_path = "", 
xla_gpu_all_gather_combine_threshold_bytes = zero(Int64), xla_gpu_all_reduce_blueconnect_num_devices_per_host = zero(Int32), xla_gpu_all_reduce_combine_threshold_bytes = zero(Int64), xla_gpu_analytical_latency_estimator_options = Dict{String,String}(), xla_gpu_async_dot = false, xla_gpu_auto_spmd_partitioning_memory_budget_gb = zero(Int32), xla_gpu_auto_spmd_partitioning_memory_budget_ratio = zero(Float32), xla_gpu_autotune_gemm_rtol = zero(Float32), xla_gpu_autotune_level = zero(Int32), xla_gpu_autotune_max_solutions = zero(Int64), xla_gpu_collect_cost_model_stats = false, xla_gpu_collective_inflation_factor = zero(Int32), xla_gpu_collective_permute_combine_threshold_bytes = zero(Int64), xla_gpu_collective_permute_decomposer_threshold = zero(Int64), xla_gpu_collectives_use_persistent_cliques = false, xla_gpu_command_buffer_scheduling_mode = var"DebugOptions.CommandBufferSchedulingMode".SERIALIZE, xla_gpu_command_buffer_unroll_loops = false, xla_gpu_copy_insertion_use_region_analysis = false, xla_gpu_crash_on_verification_failures = false, xla_gpu_cublas_fallback = false, xla_gpu_cuda_data_dir = "", xla_gpu_cudnn_gemm_fusion_level = zero(Int32), xla_gpu_cudnn_gemm_max_plans = zero(Int32), xla_gpu_default_to_alg_dot_bf16_bf16_f32 = false, xla_gpu_deterministic_ops = false, xla_gpu_disable_async_collectives = Vector{var"DebugOptions.CollectiveOpType".T}(), xla_gpu_disable_gpuasm_optimizations = false, xla_gpu_dot_merger_threshold_mb = zero(Int32), xla_gpu_dump_autotune_logs_to = "", xla_gpu_dump_autotune_results_to = "", xla_gpu_dump_autotuned_gemm_fusions = false, xla_gpu_dump_llvmir = false, xla_gpu_enable_all_gather_combine_by_dim = false, xla_gpu_enable_analytical_latency_estimator = false, xla_gpu_enable_analytical_sol_latency_estimator = false, xla_gpu_enable_approx_costly_collectives = false, xla_gpu_enable_command_buffer = Vector{var"DebugOptions.CommandBufferCmdType".T}(), xla_gpu_enable_cub_radix_sort = false, xla_gpu_enable_cublaslt = false, 
xla_gpu_enable_cudnn_int8x32_convolution_reordering = false, xla_gpu_enable_cudnn_layer_norm = false, xla_gpu_enable_dynamic_slice_fusion = false, xla_gpu_enable_fast_min_max = false, xla_gpu_enable_highest_priority_async_stream = false, xla_gpu_enable_host_memory_offloading = false, xla_gpu_enable_latency_hiding_scheduler = false, xla_gpu_enable_libnvptxcompiler = false, xla_gpu_enable_llvm_module_compilation_parallelism = false, xla_gpu_enable_nccl_clique_optimization = false, xla_gpu_enable_nccl_comm_splitting = false, xla_gpu_enable_nccl_user_buffers = false, xla_gpu_enable_pipelined_all_gather = false, xla_gpu_enable_pipelined_all_reduce = false, xla_gpu_enable_pipelined_collectives = false, xla_gpu_enable_pipelined_host_offloading = false, xla_gpu_enable_pipelined_p2p = false, xla_gpu_enable_pipelined_reduce_scatter = false, xla_gpu_enable_reassociation_for_converted_ar = false, xla_gpu_enable_reduce_scatter_combine_by_dim = false, xla_gpu_enable_reduction_epilogue_fusion = false, xla_gpu_enable_scatter_determinism_expander = false, xla_gpu_enable_shared_constants = false, xla_gpu_enable_split_k_autotuning = false, xla_gpu_enable_triton_gemm = false, xla_gpu_enable_while_loop_double_buffering = false, xla_gpu_enable_while_loop_reduce_scatter_code_motion = false, xla_gpu_enable_while_loop_unrolling = var"DebugOptions.WhileLoopUnrolling".WHILE_LOOP_UNROLLING_NO_UNROLL, xla_gpu_exclude_nondeterministic_ops = false, xla_gpu_executable_embed_debug_info = false, xla_gpu_executable_terminate_timeout_seconds = zero(Int32), xla_gpu_executable_warn_stuck_timeout_seconds = zero(Int32), xla_gpu_exhaustive_tiling_search = false, xla_gpu_experimental_allow_unroll_factor_eight = false, xla_gpu_experimental_aot_compiled_thunks = false, xla_gpu_experimental_autotune_backends = Vector{var"DebugOptions.AutotuneBackend".T}(), xla_gpu_experimental_autotune_cache_mode = var"DebugOptions.AutotuneCacheMode".AUTOTUNE_CACHE_MODE_UNSPECIFIED, xla_gpu_experimental_autotuner_cache_dir = 
"", xla_gpu_experimental_collective_cse_distance_threshold = zero(Int64), xla_gpu_experimental_collective_perf_table_path = "", xla_gpu_experimental_disable_binary_libraries = false, xla_gpu_experimental_dump_fdo_profiles = false, xla_gpu_experimental_dump_gpu_executable = false, xla_gpu_experimental_enable_alltoall_windowed_einsum = false, xla_gpu_experimental_enable_buffer_saver_on_thunks = false, xla_gpu_experimental_enable_checksum_tracing_on_thunks = false, xla_gpu_experimental_enable_fusion_autotuner = false, xla_gpu_experimental_enable_fusion_block_level_rewriter = false, xla_gpu_experimental_enable_heuristic_collective_combining = false, xla_gpu_experimental_enable_nccl_symmetric_buffers = false, xla_gpu_experimental_enable_nvshmem = false, xla_gpu_experimental_enable_split_k_rewrite = false, xla_gpu_experimental_enable_subchannel_dequantisation_fusion = false, xla_gpu_experimental_enable_triton_heroless_priority_fusion = false, xla_gpu_experimental_enable_triton_warp_specialization = false, xla_gpu_experimental_pack_dot_operands_along_k_dimension = false, xla_gpu_experimental_parallel_collective_overlap_limit = zero(Int32), xla_gpu_experimental_pipeline_parallelism_opt_level = var"DebugOptions.PipelineParallelismOptLevel".PIPELINE_PARALLELISM_OPT_LEVEL_DISABLE, xla_gpu_experimental_stream_annotation = false, xla_gpu_experimental_use_autotuner_pass = false, xla_gpu_experimental_use_ragged_dot_fusion = false, xla_gpu_fail_ptx_compilation_on_register_spilling = false, xla_gpu_filter_kernels_spilling_registers_on_autotuning = false, xla_gpu_first_collective_call_terminate_timeout_seconds = zero(Int32), xla_gpu_first_collective_call_warn_stuck_timeout_seconds = zero(Int32), xla_gpu_force_compilation_parallelism = zero(Int32), xla_gpu_force_conv_nchw = false, xla_gpu_force_conv_nhwc = false, xla_gpu_ftz = false, xla_gpu_fused_attention_use_cudnn_rng = false, xla_gpu_gemm_autotuner_override_file = "", xla_gpu_gemm_rewrite_size_threshold = zero(Int64), 
xla_gpu_generate_debug_info = false, xla_gpu_generate_line_info = false, xla_gpu_graph_enable_concurrent_region = false, xla_gpu_graph_min_graph_size = zero(Int32), xla_gpu_kernel_cache_file = "", xla_gpu_libnvjitlink_mode = var"DebugOptions.LibNvJitLinkMode".LIB_NV_JIT_LINK_MODE_AUTO, xla_gpu_llvm_ir_file = Vector{String}(), xla_gpu_llvm_verification_level = zero(Int32), xla_gpu_load_autotune_results_from = "", xla_gpu_memory_limit_slop_factor = zero(Int32), xla_gpu_mock_custom_calls = false, xla_gpu_multi_streamed_windowed_einsum = false, xla_gpu_nccl_async_execution = false, xla_gpu_nccl_blocking_communicators = false, xla_gpu_nccl_collective_max_nchannels = zero(Int64), xla_gpu_nccl_init_max_rank_per_root_ratio = zero(Int64), xla_gpu_nccl_p2p_max_nchannels = zero(Int64), xla_gpu_nccl_terminate_on_error = false, xla_gpu_nccl_termination_timeout_seconds = zero(Int64), xla_gpu_operand_bytes_threshold_for_windowed_einsum = zero(Int64), xla_gpu_override_gemm_autotuner = "", xla_gpu_per_fusion_autotune_cache_dir = "", xla_gpu_pgle_accuracy_checker = var"DebugOptions.PGLEStrictnessLevel".PGLE_STRICTNESS_LEVEL_OFF, xla_gpu_pgle_profile_file_or_directory_path = "", xla_gpu_ptx_file = Vector{String}(), xla_gpu_reduce_scatter_combine_threshold_bytes = zero(Int64), xla_gpu_redzone_padding_bytes = zero(Int64), xla_gpu_require_complete_aot_autotune_results = false, xla_gpu_require_exclusive_lock = false, xla_gpu_shape_checks = var"DebugOptions.ShapeChecks".IGNORE, xla_gpu_shard_autotuning = false, xla_gpu_strict_conv_algorithm_picker = false, xla_gpu_target_config_filename = "", xla_gpu_temp_buffer_use_separate_color = false, xla_gpu_threshold_for_windowed_einsum_mib = zero(Int64), xla_gpu_triton_gemm_any = false, xla_gpu_unsafe_fallback_to_driver_on_ptxas_not_found = false, xla_gpu_unsupported_enable_all_reduce_decomposer = false, xla_gpu_unsupported_enable_ragged_all_to_all_decomposer = false, xla_gpu_unsupported_enable_ragged_all_to_all_multi_host_decomposer = false, 
xla_gpu_unsupported_enable_triton_gemm = false, xla_gpu_unsupported_enable_triton_multi_output_fusion = false, xla_gpu_unsupported_override_fast_interconnect_slice_size = zero(Int64), xla_gpu_unsupported_use_all_reduce_one_shot_kernel = false, xla_gpu_unsupported_use_ragged_all_to_all_one_shot_kernel = false, xla_gpu_use_embeded_device_lib = false, xla_gpu_use_inprocess_lld = false, xla_gpu_use_memcpy_local_p2p = false, xla_gpu_use_runtime_fusion = false, xla_gpu_verify_triton_fusion_numerics = false, xla_hlo_graph_addresses = false, xla_hlo_profile = false, xla_disable_hlo_passes = Vector{String}(), xla_enable_hlo_passes_only = Vector{String}(), xla_disable_all_hlo_passes = false, xla_backend_optimization_level = zero(Int32), xla_embed_ir_in_executable = false, xla_eliminate_hlo_implicit_broadcast = false, xla_cpu_multi_thread_eigen = false, xla_llvm_enable_alias_scope_metadata = false, xla_llvm_enable_noalias_metadata = false, xla_llvm_enable_invariant_load_metadata = false, xla_llvm_disable_expensive_passes = false, xla_test_all_output_layouts = false, xla_test_all_input_layouts = false, xla_hlo_graph_sharding_color = false, xla_cpu_use_onednn = false, xla_allow_excess_precision = false, xla_force_host_platform_device_count = zero(Int32), xla_hlo_evaluator_use_fast_path = false, xla_allow_scalar_index_dynamic_ops = false, xla_step_marker_location = var"DebugOptions.StepMarkerLocation".STEP_MARK_AT_ENTRY, xla_dump_to = "", xla_flags_reset = false, xla_dump_hlo_module_re = "", xla_dump_hlo_pass_re = "", xla_dump_emitter_re = "", xla_dump_hlo_as_text = false, xla_dump_hlo_as_proto = false, xla_dump_hlo_as_dot = false, xla_dump_hlo_as_url = false, xla_dump_hlo_as_html = false, xla_dump_fusion_visualization = false, xla_dump_hlo_snapshots = false, xla_dump_include_timestamp = false, xla_dump_max_hlo_modules = zero(Int32), xla_dump_module_metadata = false, xla_dump_compress_protos = false, xla_dump_hlo_as_long_text = false, xla_dump_enable_mlir_pretty_form = false, 
xla_dump_full_hlo_config = false, xla_tpu_detect_nan = false, xla_tpu_detect_inf = false, xla_cpu_enable_xprof_traceme = false, xla_multiheap_size_constraint_per_heap = zero(Int32), xla_detailed_logging = false, xla_enable_dumping = false, xla_llvm_force_inline_before_split = false, xla_dump_disable_metadata = false, xla_dump_hlo_pipeline_re = "", xla_cpu_use_acl = false, xla_cpu_strict_dot_conv_math = false, xla_dump_latency_hiding_schedule = false, xla_partitioning_algorithm = var"DebugOptions.PartitioningAlgorithm".PARTITIONING_ALGORITHM_NOOP, xla_debug_buffer_assignment_show_max = zero(Int64), xla_detect_unstable_reductions = var"DebugOptions.DetectionMode".DETECTION_MODE_NONE, xla_detect_unstable_reductions_post_optimizations = var"DebugOptions.DetectionMode".DETECTION_MODE_NONE, xla_gpu_detect_nan = var"DebugOptions.DetectionMode".DETECTION_MODE_NONE, xla_gpu_detect_inf = var"DebugOptions.DetectionMode".DETECTION_MODE_NONE, xla_dump_large_constants = false, xla_reduce_window_rewrite_base_length = zero(Int64), xla_cmd_buffer_trace_cache_size = zero(Int64), xla_syntax_sugar_async_ops = false, xla_enable_command_buffers_during_profiling = false, xla_ignore_channel_id = false, xla_pjrt_allow_auto_layout_in_hlo = false, xla_test_add_command_buffer_mode = false, xla_gpu_experimental_matmul_perf_table_path = "", xla_early_exit_with_layouts = false, xla_gpu_experimental_scaled_dot_with_triton = false, xla_gpu_experimental_use_raft_select_k = false, xla_backend_extra_options = Dict{String,String}())
-PB.field_numbers(::Type{DebugOptions}) = (;xla_allow_h2h_copy_when_automatic_host_compute_offload_disabled = 439, xla_disable_automatic_host_compute_offload = 408, xla_enable_scoped_logging_timers = 436, xla_hlo_pass_fix_detect_cycles = 370, xla_keep_shardings_after_spmd = 419, xla_unsupported_crash_on_hlo_pass_fix_max_iterations = 363, xla_unsupported_crash_on_hlo_pass_noop_change = 379, xla_unsupported_crash_on_hlo_pass_silent_hlo_change = 380, xla_cpu_collective_call_terminate_timeout_seconds = 417, xla_cpu_collective_call_warn_stuck_seconds = 418, xla_cpu_collective_timeout_seconds = 438, xla_cpu_copy_insertion_use_region_analysis = 337, xla_cpu_emitter_verification_level = 395, xla_cpu_enable_concurrency_optimized_scheduler = 307, xla_cpu_enable_fast_math = 99, xla_cpu_enable_fast_min_max = 140, xla_cpu_enable_platform_dependent_math = 425, xla_cpu_experimental_onednn_custom_call = 412, xla_cpu_experimental_onednn_fusion_type = 399, xla_cpu_experimental_xnn_fusion_type = 400, xla_cpu_experimental_xnn_graph_fusion_mode = 365, xla_cpu_experimental_ynn_fusion_type = 422, xla_cpu_fast_math_honor_division = 126, xla_cpu_fast_math_honor_functions = 129, xla_cpu_fast_math_honor_infs = 121, xla_cpu_fast_math_honor_nans = 120, xla_cpu_generate_unique_c_style_kernel_entry_points = 372, xla_cpu_max_isa = 333, xla_cpu_parallel_codegen_split_count = 323, xla_cpu_prefer_vector_width = 308, xla_cpu_use_fusion_emitters = 376, xla_cpu_use_xnnpack = 359, xla_enable_fast_math = 335, xla_gpu_experimental_thunk_buffer_debug_filter = 424, xla_dump_hlo_unoptimized_snapshots = 405, xla_enable_enzyme_comms_opt = 429, xla_gpu_algorithm_denylist_path = 128, xla_gpu_all_gather_combine_threshold_bytes = 212, xla_gpu_all_reduce_blueconnect_num_devices_per_host = 159, xla_gpu_all_reduce_combine_threshold_bytes = 157, xla_gpu_analytical_latency_estimator_options = 357, xla_gpu_async_dot = 321, xla_gpu_auto_spmd_partitioning_memory_budget_gb = 224, 
xla_gpu_auto_spmd_partitioning_memory_budget_ratio = 225, xla_gpu_autotune_gemm_rtol = 316, xla_gpu_autotune_level = 123, xla_gpu_autotune_max_solutions = 288, xla_gpu_collect_cost_model_stats = 240, xla_gpu_collective_inflation_factor = 205, xla_gpu_collective_permute_combine_threshold_bytes = 378, xla_gpu_collective_permute_decomposer_threshold = 237, xla_gpu_collectives_use_persistent_cliques = 354, xla_gpu_command_buffer_scheduling_mode = 404, xla_gpu_command_buffer_unroll_loops = 411, xla_gpu_copy_insertion_use_region_analysis = 236, xla_gpu_crash_on_verification_failures = 101, xla_gpu_cublas_fallback = 247, xla_gpu_cuda_data_dir = 61, xla_gpu_cudnn_gemm_fusion_level = 285, xla_gpu_cudnn_gemm_max_plans = 318, xla_gpu_default_to_alg_dot_bf16_bf16_f32 = 441, xla_gpu_deterministic_ops = 148, xla_gpu_disable_async_collectives = 289, xla_gpu_disable_gpuasm_optimizations = 103, xla_gpu_dot_merger_threshold_mb = 331, xla_gpu_dump_autotune_logs_to = 292, xla_gpu_dump_autotune_results_to = 222, xla_gpu_dump_autotuned_gemm_fusions = 232, xla_gpu_dump_llvmir = 155, xla_gpu_enable_all_gather_combine_by_dim = 254, xla_gpu_enable_analytical_latency_estimator = 255, xla_gpu_enable_analytical_sol_latency_estimator = 356, xla_gpu_enable_approx_costly_collectives = 305, xla_gpu_enable_command_buffer = 258, xla_gpu_enable_cub_radix_sort = 259, xla_gpu_enable_cublaslt = 166, xla_gpu_enable_cudnn_int8x32_convolution_reordering = 189, xla_gpu_enable_cudnn_layer_norm = 262, xla_gpu_enable_dynamic_slice_fusion = 105, xla_gpu_enable_fast_min_max = 100, xla_gpu_enable_highest_priority_async_stream = 216, xla_gpu_enable_host_memory_offloading = 296, xla_gpu_enable_latency_hiding_scheduler = 186, xla_gpu_enable_libnvptxcompiler = 269, xla_gpu_enable_llvm_module_compilation_parallelism = 268, xla_gpu_enable_nccl_clique_optimization = 244, xla_gpu_enable_nccl_comm_splitting = 272, xla_gpu_enable_nccl_user_buffers = 267, xla_gpu_enable_pipelined_all_gather = 227, 
xla_gpu_enable_pipelined_all_reduce = 217, xla_gpu_enable_pipelined_collectives = 239, xla_gpu_enable_pipelined_host_offloading = 440, xla_gpu_enable_pipelined_p2p = 246, xla_gpu_enable_pipelined_reduce_scatter = 231, xla_gpu_enable_reassociation_for_converted_ar = 209, xla_gpu_enable_reduce_scatter_combine_by_dim = 257, xla_gpu_enable_reduction_epilogue_fusion = 243, xla_gpu_enable_scatter_determinism_expander = 345, xla_gpu_enable_shared_constants = 165, xla_gpu_enable_split_k_autotuning = 241, xla_gpu_enable_triton_gemm = 188, xla_gpu_enable_while_loop_double_buffering = 248, xla_gpu_enable_while_loop_reduce_scatter_code_motion = 203, xla_gpu_enable_while_loop_unrolling = 294, xla_gpu_exclude_nondeterministic_ops = 297, xla_gpu_executable_embed_debug_info = 437, xla_gpu_executable_terminate_timeout_seconds = 328, xla_gpu_executable_warn_stuck_timeout_seconds = 327, xla_gpu_exhaustive_tiling_search = 219, xla_gpu_experimental_allow_unroll_factor_eight = 430, xla_gpu_experimental_aot_compiled_thunks = 435, xla_gpu_experimental_autotune_backends = 442, xla_gpu_experimental_autotune_cache_mode = 324, xla_gpu_experimental_autotuner_cache_dir = 407, xla_gpu_experimental_collective_cse_distance_threshold = 374, xla_gpu_experimental_collective_perf_table_path = 377, xla_gpu_experimental_disable_binary_libraries = 329, xla_gpu_experimental_dump_fdo_profiles = 338, xla_gpu_experimental_dump_gpu_executable = 427, xla_gpu_experimental_enable_alltoall_windowed_einsum = 360, xla_gpu_experimental_enable_buffer_saver_on_thunks = 431, xla_gpu_experimental_enable_checksum_tracing_on_thunks = 414, xla_gpu_experimental_enable_fusion_autotuner = 409, xla_gpu_experimental_enable_fusion_block_level_rewriter = 334, xla_gpu_experimental_enable_heuristic_collective_combining = 366, xla_gpu_experimental_enable_nccl_symmetric_buffers = 406, xla_gpu_experimental_enable_nvshmem = 388, xla_gpu_experimental_enable_split_k_rewrite = 386, 
xla_gpu_experimental_enable_subchannel_dequantisation_fusion = 368, xla_gpu_experimental_enable_triton_heroless_priority_fusion = 340, xla_gpu_experimental_enable_triton_warp_specialization = 421, xla_gpu_experimental_pack_dot_operands_along_k_dimension = 362, xla_gpu_experimental_parallel_collective_overlap_limit = 336, xla_gpu_experimental_pipeline_parallelism_opt_level = 351, xla_gpu_experimental_stream_annotation = 342, xla_gpu_experimental_use_autotuner_pass = 396, xla_gpu_experimental_use_ragged_dot_fusion = 401, xla_gpu_fail_ptx_compilation_on_register_spilling = 353, xla_gpu_filter_kernels_spilling_registers_on_autotuning = 250, xla_gpu_first_collective_call_terminate_timeout_seconds = 392, xla_gpu_first_collective_call_warn_stuck_timeout_seconds = 391, xla_gpu_force_compilation_parallelism = 147, xla_gpu_force_conv_nchw = 125, xla_gpu_force_conv_nhwc = 146, xla_gpu_ftz = 62, xla_gpu_fused_attention_use_cudnn_rng = 235, xla_gpu_gemm_autotuner_override_file = 434, xla_gpu_gemm_rewrite_size_threshold = 283, xla_gpu_generate_debug_info = 348, xla_gpu_generate_line_info = 349, xla_gpu_graph_enable_concurrent_region = 215, xla_gpu_graph_min_graph_size = 208, xla_gpu_kernel_cache_file = 306, xla_gpu_libnvjitlink_mode = 343, xla_gpu_llvm_ir_file = 150, xla_gpu_llvm_verification_level = 256, xla_gpu_load_autotune_results_from = 223, xla_gpu_memory_limit_slop_factor = 260, xla_gpu_mock_custom_calls = 245, xla_gpu_multi_streamed_windowed_einsum = 280, xla_gpu_nccl_async_execution = 393, xla_gpu_nccl_blocking_communicators = 390, xla_gpu_nccl_collective_max_nchannels = 273, xla_gpu_nccl_init_max_rank_per_root_ratio = 277, xla_gpu_nccl_p2p_max_nchannels = 274, xla_gpu_nccl_terminate_on_error = 301, xla_gpu_nccl_termination_timeout_seconds = 163, xla_gpu_operand_bytes_threshold_for_windowed_einsum = 339, xla_gpu_override_gemm_autotuner = 295, xla_gpu_per_fusion_autotune_cache_dir = 310, xla_gpu_pgle_accuracy_checker = 341, xla_gpu_pgle_profile_file_or_directory_path = 
210, xla_gpu_ptx_file = 127, xla_gpu_reduce_scatter_combine_threshold_bytes = 213, xla_gpu_redzone_padding_bytes = 228, xla_gpu_require_complete_aot_autotune_results = 284, xla_gpu_require_exclusive_lock = 347, xla_gpu_shape_checks = 170, xla_gpu_shard_autotuning = 304, xla_gpu_strict_conv_algorithm_picker = 156, xla_gpu_target_config_filename = 261, xla_gpu_temp_buffer_use_separate_color = 312, xla_gpu_threshold_for_windowed_einsum_mib = 265, xla_gpu_triton_gemm_any = 190, xla_gpu_unsafe_fallback_to_driver_on_ptxas_not_found = 138, xla_gpu_unsupported_enable_all_reduce_decomposer = 384, xla_gpu_unsupported_enable_ragged_all_to_all_decomposer = 350, xla_gpu_unsupported_enable_ragged_all_to_all_multi_host_decomposer = 415, xla_gpu_unsupported_enable_triton_gemm = 322, xla_gpu_unsupported_enable_triton_multi_output_fusion = 382, xla_gpu_unsupported_override_fast_interconnect_slice_size = 416, xla_gpu_unsupported_use_all_reduce_one_shot_kernel = 387, xla_gpu_unsupported_use_ragged_all_to_all_one_shot_kernel = 375, xla_gpu_use_embeded_device_lib = 420, xla_gpu_use_inprocess_lld = 389, xla_gpu_use_memcpy_local_p2p = 287, xla_gpu_use_runtime_fusion = 181, xla_gpu_verify_triton_fusion_numerics = 291, xla_hlo_graph_addresses = 2, xla_hlo_profile = 9, xla_disable_hlo_passes = 30, xla_enable_hlo_passes_only = 124, xla_disable_all_hlo_passes = 104, xla_backend_optimization_level = 31, xla_embed_ir_in_executable = 33, xla_eliminate_hlo_implicit_broadcast = 35, xla_cpu_multi_thread_eigen = 60, xla_llvm_enable_alias_scope_metadata = 70, xla_llvm_enable_noalias_metadata = 71, xla_llvm_enable_invariant_load_metadata = 72, xla_llvm_disable_expensive_passes = 73, xla_test_all_output_layouts = 90, xla_test_all_input_layouts = 91, xla_hlo_graph_sharding_color = 92, xla_cpu_use_onednn = 97, xla_allow_excess_precision = 122, xla_force_host_platform_device_count = 102, xla_hlo_evaluator_use_fast_path = 106, xla_allow_scalar_index_dynamic_ops = 107, xla_step_marker_location = 108, 
xla_dump_to = 109, xla_flags_reset = 364, xla_dump_hlo_module_re = 110, xla_dump_hlo_pass_re = 111, xla_dump_emitter_re = 433, xla_dump_hlo_as_text = 112, xla_dump_hlo_as_proto = 113, xla_dump_hlo_as_dot = 114, xla_dump_hlo_as_url = 115, xla_dump_hlo_as_html = 116, xla_dump_fusion_visualization = 149, xla_dump_hlo_snapshots = 118, xla_dump_include_timestamp = 131, xla_dump_max_hlo_modules = 132, xla_dump_module_metadata = 144, xla_dump_compress_protos = 151, xla_dump_hlo_as_long_text = 164, xla_dump_enable_mlir_pretty_form = 185, xla_dump_full_hlo_config = 381, xla_tpu_detect_nan = 135, xla_tpu_detect_inf = 136, xla_cpu_enable_xprof_traceme = 137, xla_multiheap_size_constraint_per_heap = 142, xla_detailed_logging = 252, xla_enable_dumping = 253, xla_llvm_force_inline_before_split = 300, xla_dump_disable_metadata = 153, xla_dump_hlo_pipeline_re = 154, xla_cpu_use_acl = 174, xla_cpu_strict_dot_conv_math = 175, xla_dump_latency_hiding_schedule = 182, xla_partitioning_algorithm = 187, xla_debug_buffer_assignment_show_max = 251, xla_detect_unstable_reductions = 403, xla_detect_unstable_reductions_post_optimizations = 432, xla_gpu_detect_nan = 426, xla_gpu_detect_inf = 428, xla_dump_large_constants = 290, xla_reduce_window_rewrite_base_length = 293, xla_cmd_buffer_trace_cache_size = 311, xla_syntax_sugar_async_ops = 315, xla_enable_command_buffers_during_profiling = 317, xla_ignore_channel_id = 330, xla_pjrt_allow_auto_layout_in_hlo = 344, xla_test_add_command_buffer_mode = 373, xla_gpu_experimental_matmul_perf_table_path = 383, xla_early_exit_with_layouts = 397, xla_gpu_experimental_scaled_dot_with_triton = 410, xla_gpu_experimental_use_raft_select_k = 413, xla_backend_extra_options = 500)
+PB.default_values(::Type{DebugOptions}) = (;xla_allow_h2h_copy_when_automatic_host_compute_offload_disabled = false, xla_disable_automatic_host_compute_offload = false, xla_enable_scoped_logging_timers = false, xla_hlo_pass_fix_detect_cycles = false, xla_keep_shardings_after_spmd = false, xla_unsupported_crash_on_hlo_pass_fix_max_iterations = false, xla_unsupported_crash_on_hlo_pass_noop_change = false, xla_unsupported_crash_on_hlo_pass_silent_hlo_change = false, xla_cpu_collective_call_terminate_timeout_seconds = zero(Int32), xla_cpu_collective_call_warn_stuck_seconds = zero(Int32), xla_cpu_collective_timeout_seconds = zero(Int32), xla_cpu_copy_insertion_use_region_analysis = false, xla_cpu_emitter_verification_level = zero(Int32), xla_cpu_enable_concurrency_optimized_scheduler = false, xla_cpu_enable_fast_math = false, xla_cpu_enable_fast_min_max = false, xla_cpu_enable_platform_dependent_math = false, xla_cpu_experimental_onednn_custom_call = false, xla_cpu_experimental_onednn_fusion_type = Vector{var"DebugOptions.LibraryFusionType".T}(), xla_cpu_experimental_xnn_fusion_type = Vector{var"DebugOptions.LibraryFusionType".T}(), xla_cpu_experimental_xnn_graph_fusion_mode = var"DebugOptions.XnnGraphFusionMode".XNN_GRAPH_FUSION_MODE_DISABLED, xla_cpu_experimental_ynn_fusion_type = Vector{var"DebugOptions.LibraryFusionType".T}(), xla_cpu_fast_math_honor_division = false, xla_cpu_fast_math_honor_functions = false, xla_cpu_fast_math_honor_infs = false, xla_cpu_fast_math_honor_nans = false, xla_cpu_generate_unique_c_style_kernel_entry_points = false, xla_cpu_max_isa = "", xla_cpu_parallel_codegen_split_count = zero(Int32), xla_cpu_prefer_vector_width = zero(Int32), xla_cpu_use_fusion_emitters = false, xla_cpu_use_xnnpack = false, xla_enable_fast_math = false, xla_gpu_experimental_thunk_buffer_debug_filter = nothing, xla_dump_hlo_unoptimized_snapshots = false, xla_enable_enzyme_comms_opt = false, xla_gpu_algorithm_denylist_path = "", 
xla_gpu_all_gather_combine_threshold_bytes = zero(Int64), xla_gpu_all_reduce_blueconnect_num_devices_per_host = zero(Int32), xla_gpu_all_reduce_combine_threshold_bytes = zero(Int64), xla_gpu_analytical_latency_estimator_options = Dict{String,String}(), xla_gpu_async_dot = false, xla_gpu_auto_spmd_partitioning_memory_budget_gb = zero(Int32), xla_gpu_auto_spmd_partitioning_memory_budget_ratio = zero(Float32), xla_gpu_autotune_gemm_rtol = zero(Float32), xla_gpu_autotune_level = zero(Int32), xla_gpu_autotune_max_solutions = zero(Int64), xla_gpu_collect_cost_model_stats = false, xla_gpu_collective_inflation_factor = zero(Int32), xla_gpu_collective_permute_combine_threshold_bytes = zero(Int64), xla_gpu_collective_permute_decomposer_threshold = zero(Int64), xla_gpu_collectives_use_persistent_cliques = false, xla_gpu_command_buffer_scheduling_mode = var"DebugOptions.CommandBufferSchedulingMode".SERIALIZE, xla_gpu_command_buffer_unroll_loops = false, xla_gpu_copy_insertion_use_region_analysis = false, xla_gpu_crash_on_verification_failures = false, xla_gpu_cublas_fallback = false, xla_gpu_cuda_data_dir = "", xla_gpu_cudnn_gemm_fusion_level = zero(Int32), xla_gpu_cudnn_gemm_max_plans = zero(Int32), xla_gpu_deterministic_ops = false, xla_gpu_disable_async_collectives = Vector{var"DebugOptions.CollectiveOpType".T}(), xla_gpu_disable_gpuasm_optimizations = false, xla_gpu_dot_merger_threshold_mb = zero(Int32), xla_gpu_dump_autotune_logs_to = "", xla_gpu_dump_autotune_results_to = "", xla_gpu_dump_autotuned_gemm_fusions = false, xla_gpu_dump_llvmir = false, xla_gpu_enable_all_gather_combine_by_dim = false, xla_gpu_enable_analytical_latency_estimator = false, xla_gpu_enable_analytical_sol_latency_estimator = false, xla_gpu_enable_approx_costly_collectives = false, xla_gpu_enable_command_buffer = Vector{var"DebugOptions.CommandBufferCmdType".T}(), xla_gpu_enable_cub_radix_sort = false, xla_gpu_enable_cublaslt = false, xla_gpu_enable_cudnn_int8x32_convolution_reordering = false, 
xla_gpu_enable_cudnn_layer_norm = false, xla_gpu_enable_dynamic_slice_fusion = false, xla_gpu_enable_fast_min_max = false, xla_gpu_enable_highest_priority_async_stream = false, xla_gpu_enable_host_memory_offloading = false, xla_gpu_enable_latency_hiding_scheduler = false, xla_gpu_enable_libnvptxcompiler = false, xla_gpu_enable_llvm_module_compilation_parallelism = false, xla_gpu_enable_nccl_clique_optimization = false, xla_gpu_enable_nccl_comm_splitting = false, xla_gpu_enable_nccl_user_buffers = false, xla_gpu_enable_pipelined_all_gather = false, xla_gpu_enable_pipelined_all_reduce = false, xla_gpu_enable_pipelined_collectives = false, xla_gpu_enable_pipelined_p2p = false, xla_gpu_enable_pipelined_reduce_scatter = false, xla_gpu_enable_reassociation_for_converted_ar = false, xla_gpu_enable_reduce_scatter_combine_by_dim = false, xla_gpu_enable_reduction_epilogue_fusion = false, xla_gpu_enable_scatter_determinism_expander = false, xla_gpu_enable_shared_constants = false, xla_gpu_enable_split_k_autotuning = false, xla_gpu_enable_triton_gemm = false, xla_gpu_enable_while_loop_double_buffering = false, xla_gpu_enable_while_loop_reduce_scatter_code_motion = false, xla_gpu_enable_while_loop_unrolling = var"DebugOptions.WhileLoopUnrolling".WHILE_LOOP_UNROLLING_NO_UNROLL, xla_gpu_exclude_nondeterministic_ops = false, xla_gpu_executable_embed_debug_info = false, xla_gpu_executable_terminate_timeout_seconds = zero(Int32), xla_gpu_executable_warn_stuck_timeout_seconds = zero(Int32), xla_gpu_exhaustive_tiling_search = false, xla_gpu_experimental_allow_unroll_factor_eight = false, xla_gpu_experimental_aot_compiled_thunks = false, xla_gpu_experimental_autotune_cache_mode = var"DebugOptions.AutotuneCacheMode".AUTOTUNE_CACHE_MODE_UNSPECIFIED, xla_gpu_experimental_autotuner_cache_dir = "", xla_gpu_experimental_collective_cse_distance_threshold = zero(Int64), xla_gpu_experimental_collective_perf_table_path = "", xla_gpu_experimental_disable_binary_libraries = false, 
xla_gpu_experimental_dump_fdo_profiles = false, xla_gpu_experimental_dump_gpu_executable = false, xla_gpu_experimental_enable_alltoall_windowed_einsum = false, xla_gpu_experimental_enable_buffer_saver_on_thunks = false, xla_gpu_experimental_enable_checksum_tracing_on_thunks = false, xla_gpu_experimental_enable_fusion_autotuner = false, xla_gpu_experimental_enable_fusion_block_level_rewriter = false, xla_gpu_experimental_enable_heuristic_collective_combining = false, xla_gpu_experimental_enable_nccl_symmetric_buffers = false, xla_gpu_experimental_enable_nvshmem = false, xla_gpu_experimental_enable_split_k_rewrite = false, xla_gpu_experimental_enable_subchannel_dequantisation_fusion = false, xla_gpu_experimental_enable_triton_heroless_priority_fusion = false, xla_gpu_experimental_enable_triton_warp_specialization = false, xla_gpu_experimental_pack_dot_operands_along_k_dimension = false, xla_gpu_experimental_parallel_collective_overlap_limit = zero(Int32), xla_gpu_experimental_pipeline_parallelism_opt_level = var"DebugOptions.PipelineParallelismOptLevel".PIPELINE_PARALLELISM_OPT_LEVEL_DISABLE, xla_gpu_experimental_stream_annotation = false, xla_gpu_experimental_use_autotuner_pass = false, xla_gpu_experimental_use_ragged_dot_fusion = false, xla_gpu_fail_ptx_compilation_on_register_spilling = false, xla_gpu_filter_kernels_spilling_registers_on_autotuning = false, xla_gpu_first_collective_call_terminate_timeout_seconds = zero(Int32), xla_gpu_first_collective_call_warn_stuck_timeout_seconds = zero(Int32), xla_gpu_force_compilation_parallelism = zero(Int32), xla_gpu_force_conv_nchw = false, xla_gpu_force_conv_nhwc = false, xla_gpu_ftz = false, xla_gpu_fused_attention_use_cudnn_rng = false, xla_gpu_gemm_autotuner_override_file = "", xla_gpu_gemm_rewrite_size_threshold = zero(Int64), xla_gpu_generate_debug_info = false, xla_gpu_generate_line_info = false, xla_gpu_graph_enable_concurrent_region = false, xla_gpu_graph_min_graph_size = zero(Int32), xla_gpu_kernel_cache_file = 
"", xla_gpu_libnvjitlink_mode = var"DebugOptions.LibNvJitLinkMode".LIB_NV_JIT_LINK_MODE_AUTO, xla_gpu_llvm_ir_file = Vector{String}(), xla_gpu_llvm_verification_level = zero(Int32), xla_gpu_load_autotune_results_from = "", xla_gpu_memory_limit_slop_factor = zero(Int32), xla_gpu_mock_custom_calls = false, xla_gpu_multi_streamed_windowed_einsum = false, xla_gpu_nccl_async_execution = false, xla_gpu_nccl_blocking_communicators = false, xla_gpu_nccl_collective_max_nchannels = zero(Int64), xla_gpu_nccl_init_max_rank_per_root_ratio = zero(Int64), xla_gpu_nccl_p2p_max_nchannels = zero(Int64), xla_gpu_nccl_terminate_on_error = false, xla_gpu_nccl_termination_timeout_seconds = zero(Int64), xla_gpu_operand_bytes_threshold_for_windowed_einsum = zero(Int64), xla_gpu_override_gemm_autotuner = "", xla_gpu_per_fusion_autotune_cache_dir = "", xla_gpu_pgle_accuracy_checker = var"DebugOptions.PGLEStrictnessLevel".PGLE_STRICTNESS_LEVEL_OFF, xla_gpu_pgle_profile_file_or_directory_path = "", xla_gpu_ptx_file = Vector{String}(), xla_gpu_reduce_scatter_combine_threshold_bytes = zero(Int64), xla_gpu_redzone_padding_bytes = zero(Int64), xla_gpu_require_complete_aot_autotune_results = false, xla_gpu_require_exclusive_lock = false, xla_gpu_shape_checks = var"DebugOptions.ShapeChecks".IGNORE, xla_gpu_shard_autotuning = false, xla_gpu_strict_conv_algorithm_picker = false, xla_gpu_target_config_filename = "", xla_gpu_temp_buffer_use_separate_color = false, xla_gpu_threshold_for_windowed_einsum_mib = zero(Int64), xla_gpu_triton_gemm_any = false, xla_gpu_unsafe_fallback_to_driver_on_ptxas_not_found = false, xla_gpu_unsupported_enable_all_reduce_decomposer = false, xla_gpu_unsupported_enable_ragged_all_to_all_decomposer = false, xla_gpu_unsupported_enable_ragged_all_to_all_multi_host_decomposer = false, xla_gpu_unsupported_enable_triton_gemm = false, xla_gpu_unsupported_enable_triton_multi_output_fusion = false, xla_gpu_unsupported_override_fast_interconnect_slice_size = zero(Int64), 
xla_gpu_unsupported_use_all_reduce_one_shot_kernel = false, xla_gpu_unsupported_use_ragged_all_to_all_one_shot_kernel = false, xla_gpu_use_embeded_device_lib = false, xla_gpu_use_inprocess_lld = false, xla_gpu_use_memcpy_local_p2p = false, xla_gpu_use_runtime_fusion = false, xla_gpu_verify_triton_fusion_numerics = false, xla_hlo_graph_addresses = false, xla_hlo_profile = false, xla_disable_hlo_passes = Vector{String}(), xla_enable_hlo_passes_only = Vector{String}(), xla_disable_all_hlo_passes = false, xla_backend_optimization_level = zero(Int32), xla_embed_ir_in_executable = false, xla_eliminate_hlo_implicit_broadcast = false, xla_cpu_multi_thread_eigen = false, xla_llvm_enable_alias_scope_metadata = false, xla_llvm_enable_noalias_metadata = false, xla_llvm_enable_invariant_load_metadata = false, xla_llvm_disable_expensive_passes = false, xla_test_all_output_layouts = false, xla_test_all_input_layouts = false, xla_hlo_graph_sharding_color = false, xla_cpu_use_onednn = false, xla_allow_excess_precision = false, xla_force_host_platform_device_count = zero(Int32), xla_hlo_evaluator_use_fast_path = false, xla_allow_scalar_index_dynamic_ops = false, xla_step_marker_location = var"DebugOptions.StepMarkerLocation".STEP_MARK_AT_ENTRY, xla_dump_to = "", xla_flags_reset = false, xla_dump_hlo_module_re = "", xla_dump_hlo_pass_re = "", xla_dump_emitter_re = "", xla_dump_hlo_as_text = false, xla_dump_hlo_as_proto = false, xla_dump_hlo_as_dot = false, xla_dump_hlo_as_url = false, xla_dump_hlo_as_html = false, xla_dump_fusion_visualization = false, xla_dump_hlo_snapshots = false, xla_dump_include_timestamp = false, xla_dump_max_hlo_modules = zero(Int32), xla_dump_module_metadata = false, xla_dump_compress_protos = false, xla_dump_hlo_as_long_text = false, xla_dump_enable_mlir_pretty_form = false, xla_dump_full_hlo_config = false, xla_tpu_detect_nan = false, xla_tpu_detect_inf = false, xla_cpu_enable_xprof_traceme = false, xla_multiheap_size_constraint_per_heap = zero(Int32), 
xla_detailed_logging = false, xla_enable_dumping = false, xla_llvm_force_inline_before_split = false, xla_dump_disable_metadata = false, xla_dump_hlo_pipeline_re = "", xla_cpu_use_acl = false, xla_cpu_strict_dot_conv_math = false, xla_dump_latency_hiding_schedule = false, xla_partitioning_algorithm = var"DebugOptions.PartitioningAlgorithm".PARTITIONING_ALGORITHM_NOOP, xla_debug_buffer_assignment_show_max = zero(Int64), xla_detect_unstable_reductions = var"DebugOptions.DetectionMode".DETECTION_MODE_NONE, xla_detect_unstable_reductions_post_optimizations = var"DebugOptions.DetectionMode".DETECTION_MODE_NONE, xla_gpu_detect_nan = var"DebugOptions.DetectionMode".DETECTION_MODE_NONE, xla_gpu_detect_inf = var"DebugOptions.DetectionMode".DETECTION_MODE_NONE, xla_dump_large_constants = false, xla_reduce_window_rewrite_base_length = zero(Int64), xla_cmd_buffer_trace_cache_size = zero(Int64), xla_syntax_sugar_async_ops = false, xla_enable_command_buffers_during_profiling = false, xla_ignore_channel_id = false, xla_pjrt_allow_auto_layout_in_hlo = false, xla_test_add_command_buffer_mode = false, xla_gpu_experimental_matmul_perf_table_path = "", xla_early_exit_with_layouts = false, xla_gpu_experimental_scaled_dot_with_triton = false, xla_gpu_experimental_use_raft_select_k = false, xla_backend_extra_options = Dict{String,String}())
+PB.field_numbers(::Type{DebugOptions}) = (;xla_allow_h2h_copy_when_automatic_host_compute_offload_disabled = 439, xla_disable_automatic_host_compute_offload = 408, xla_enable_scoped_logging_timers = 436, xla_hlo_pass_fix_detect_cycles = 370, xla_keep_shardings_after_spmd = 419, xla_unsupported_crash_on_hlo_pass_fix_max_iterations = 363, xla_unsupported_crash_on_hlo_pass_noop_change = 379, xla_unsupported_crash_on_hlo_pass_silent_hlo_change = 380, xla_cpu_collective_call_terminate_timeout_seconds = 417, xla_cpu_collective_call_warn_stuck_seconds = 418, xla_cpu_collective_timeout_seconds = 438, xla_cpu_copy_insertion_use_region_analysis = 337, xla_cpu_emitter_verification_level = 395, xla_cpu_enable_concurrency_optimized_scheduler = 307, xla_cpu_enable_fast_math = 99, xla_cpu_enable_fast_min_max = 140, xla_cpu_enable_platform_dependent_math = 425, xla_cpu_experimental_onednn_custom_call = 412, xla_cpu_experimental_onednn_fusion_type = 399, xla_cpu_experimental_xnn_fusion_type = 400, xla_cpu_experimental_xnn_graph_fusion_mode = 365, xla_cpu_experimental_ynn_fusion_type = 422, xla_cpu_fast_math_honor_division = 126, xla_cpu_fast_math_honor_functions = 129, xla_cpu_fast_math_honor_infs = 121, xla_cpu_fast_math_honor_nans = 120, xla_cpu_generate_unique_c_style_kernel_entry_points = 372, xla_cpu_max_isa = 333, xla_cpu_parallel_codegen_split_count = 323, xla_cpu_prefer_vector_width = 308, xla_cpu_use_fusion_emitters = 376, xla_cpu_use_xnnpack = 359, xla_enable_fast_math = 335, xla_gpu_experimental_thunk_buffer_debug_filter = 424, xla_dump_hlo_unoptimized_snapshots = 405, xla_enable_enzyme_comms_opt = 429, xla_gpu_algorithm_denylist_path = 128, xla_gpu_all_gather_combine_threshold_bytes = 212, xla_gpu_all_reduce_blueconnect_num_devices_per_host = 159, xla_gpu_all_reduce_combine_threshold_bytes = 157, xla_gpu_analytical_latency_estimator_options = 357, xla_gpu_async_dot = 321, xla_gpu_auto_spmd_partitioning_memory_budget_gb = 224, 
xla_gpu_auto_spmd_partitioning_memory_budget_ratio = 225, xla_gpu_autotune_gemm_rtol = 316, xla_gpu_autotune_level = 123, xla_gpu_autotune_max_solutions = 288, xla_gpu_collect_cost_model_stats = 240, xla_gpu_collective_inflation_factor = 205, xla_gpu_collective_permute_combine_threshold_bytes = 378, xla_gpu_collective_permute_decomposer_threshold = 237, xla_gpu_collectives_use_persistent_cliques = 354, xla_gpu_command_buffer_scheduling_mode = 404, xla_gpu_command_buffer_unroll_loops = 411, xla_gpu_copy_insertion_use_region_analysis = 236, xla_gpu_crash_on_verification_failures = 101, xla_gpu_cublas_fallback = 247, xla_gpu_cuda_data_dir = 61, xla_gpu_cudnn_gemm_fusion_level = 285, xla_gpu_cudnn_gemm_max_plans = 318, xla_gpu_deterministic_ops = 148, xla_gpu_disable_async_collectives = 289, xla_gpu_disable_gpuasm_optimizations = 103, xla_gpu_dot_merger_threshold_mb = 331, xla_gpu_dump_autotune_logs_to = 292, xla_gpu_dump_autotune_results_to = 222, xla_gpu_dump_autotuned_gemm_fusions = 232, xla_gpu_dump_llvmir = 155, xla_gpu_enable_all_gather_combine_by_dim = 254, xla_gpu_enable_analytical_latency_estimator = 255, xla_gpu_enable_analytical_sol_latency_estimator = 356, xla_gpu_enable_approx_costly_collectives = 305, xla_gpu_enable_command_buffer = 258, xla_gpu_enable_cub_radix_sort = 259, xla_gpu_enable_cublaslt = 166, xla_gpu_enable_cudnn_int8x32_convolution_reordering = 189, xla_gpu_enable_cudnn_layer_norm = 262, xla_gpu_enable_dynamic_slice_fusion = 105, xla_gpu_enable_fast_min_max = 100, xla_gpu_enable_highest_priority_async_stream = 216, xla_gpu_enable_host_memory_offloading = 296, xla_gpu_enable_latency_hiding_scheduler = 186, xla_gpu_enable_libnvptxcompiler = 269, xla_gpu_enable_llvm_module_compilation_parallelism = 268, xla_gpu_enable_nccl_clique_optimization = 244, xla_gpu_enable_nccl_comm_splitting = 272, xla_gpu_enable_nccl_user_buffers = 267, xla_gpu_enable_pipelined_all_gather = 227, xla_gpu_enable_pipelined_all_reduce = 217, 
xla_gpu_enable_pipelined_collectives = 239, xla_gpu_enable_pipelined_p2p = 246, xla_gpu_enable_pipelined_reduce_scatter = 231, xla_gpu_enable_reassociation_for_converted_ar = 209, xla_gpu_enable_reduce_scatter_combine_by_dim = 257, xla_gpu_enable_reduction_epilogue_fusion = 243, xla_gpu_enable_scatter_determinism_expander = 345, xla_gpu_enable_shared_constants = 165, xla_gpu_enable_split_k_autotuning = 241, xla_gpu_enable_triton_gemm = 188, xla_gpu_enable_while_loop_double_buffering = 248, xla_gpu_enable_while_loop_reduce_scatter_code_motion = 203, xla_gpu_enable_while_loop_unrolling = 294, xla_gpu_exclude_nondeterministic_ops = 297, xla_gpu_executable_embed_debug_info = 437, xla_gpu_executable_terminate_timeout_seconds = 328, xla_gpu_executable_warn_stuck_timeout_seconds = 327, xla_gpu_exhaustive_tiling_search = 219, xla_gpu_experimental_allow_unroll_factor_eight = 430, xla_gpu_experimental_aot_compiled_thunks = 435, xla_gpu_experimental_autotune_cache_mode = 324, xla_gpu_experimental_autotuner_cache_dir = 407, xla_gpu_experimental_collective_cse_distance_threshold = 374, xla_gpu_experimental_collective_perf_table_path = 377, xla_gpu_experimental_disable_binary_libraries = 329, xla_gpu_experimental_dump_fdo_profiles = 338, xla_gpu_experimental_dump_gpu_executable = 427, xla_gpu_experimental_enable_alltoall_windowed_einsum = 360, xla_gpu_experimental_enable_buffer_saver_on_thunks = 431, xla_gpu_experimental_enable_checksum_tracing_on_thunks = 414, xla_gpu_experimental_enable_fusion_autotuner = 409, xla_gpu_experimental_enable_fusion_block_level_rewriter = 334, xla_gpu_experimental_enable_heuristic_collective_combining = 366, xla_gpu_experimental_enable_nccl_symmetric_buffers = 406, xla_gpu_experimental_enable_nvshmem = 388, xla_gpu_experimental_enable_split_k_rewrite = 386, xla_gpu_experimental_enable_subchannel_dequantisation_fusion = 368, xla_gpu_experimental_enable_triton_heroless_priority_fusion = 340, xla_gpu_experimental_enable_triton_warp_specialization = 
421, xla_gpu_experimental_pack_dot_operands_along_k_dimension = 362, xla_gpu_experimental_parallel_collective_overlap_limit = 336, xla_gpu_experimental_pipeline_parallelism_opt_level = 351, xla_gpu_experimental_stream_annotation = 342, xla_gpu_experimental_use_autotuner_pass = 396, xla_gpu_experimental_use_ragged_dot_fusion = 401, xla_gpu_fail_ptx_compilation_on_register_spilling = 353, xla_gpu_filter_kernels_spilling_registers_on_autotuning = 250, xla_gpu_first_collective_call_terminate_timeout_seconds = 392, xla_gpu_first_collective_call_warn_stuck_timeout_seconds = 391, xla_gpu_force_compilation_parallelism = 147, xla_gpu_force_conv_nchw = 125, xla_gpu_force_conv_nhwc = 146, xla_gpu_ftz = 62, xla_gpu_fused_attention_use_cudnn_rng = 235, xla_gpu_gemm_autotuner_override_file = 434, xla_gpu_gemm_rewrite_size_threshold = 283, xla_gpu_generate_debug_info = 348, xla_gpu_generate_line_info = 349, xla_gpu_graph_enable_concurrent_region = 215, xla_gpu_graph_min_graph_size = 208, xla_gpu_kernel_cache_file = 306, xla_gpu_libnvjitlink_mode = 343, xla_gpu_llvm_ir_file = 150, xla_gpu_llvm_verification_level = 256, xla_gpu_load_autotune_results_from = 223, xla_gpu_memory_limit_slop_factor = 260, xla_gpu_mock_custom_calls = 245, xla_gpu_multi_streamed_windowed_einsum = 280, xla_gpu_nccl_async_execution = 393, xla_gpu_nccl_blocking_communicators = 390, xla_gpu_nccl_collective_max_nchannels = 273, xla_gpu_nccl_init_max_rank_per_root_ratio = 277, xla_gpu_nccl_p2p_max_nchannels = 274, xla_gpu_nccl_terminate_on_error = 301, xla_gpu_nccl_termination_timeout_seconds = 163, xla_gpu_operand_bytes_threshold_for_windowed_einsum = 339, xla_gpu_override_gemm_autotuner = 295, xla_gpu_per_fusion_autotune_cache_dir = 310, xla_gpu_pgle_accuracy_checker = 341, xla_gpu_pgle_profile_file_or_directory_path = 210, xla_gpu_ptx_file = 127, xla_gpu_reduce_scatter_combine_threshold_bytes = 213, xla_gpu_redzone_padding_bytes = 228, xla_gpu_require_complete_aot_autotune_results = 284, 
xla_gpu_require_exclusive_lock = 347, xla_gpu_shape_checks = 170, xla_gpu_shard_autotuning = 304, xla_gpu_strict_conv_algorithm_picker = 156, xla_gpu_target_config_filename = 261, xla_gpu_temp_buffer_use_separate_color = 312, xla_gpu_threshold_for_windowed_einsum_mib = 265, xla_gpu_triton_gemm_any = 190, xla_gpu_unsafe_fallback_to_driver_on_ptxas_not_found = 138, xla_gpu_unsupported_enable_all_reduce_decomposer = 384, xla_gpu_unsupported_enable_ragged_all_to_all_decomposer = 350, xla_gpu_unsupported_enable_ragged_all_to_all_multi_host_decomposer = 415, xla_gpu_unsupported_enable_triton_gemm = 322, xla_gpu_unsupported_enable_triton_multi_output_fusion = 382, xla_gpu_unsupported_override_fast_interconnect_slice_size = 416, xla_gpu_unsupported_use_all_reduce_one_shot_kernel = 387, xla_gpu_unsupported_use_ragged_all_to_all_one_shot_kernel = 375, xla_gpu_use_embeded_device_lib = 420, xla_gpu_use_inprocess_lld = 389, xla_gpu_use_memcpy_local_p2p = 287, xla_gpu_use_runtime_fusion = 181, xla_gpu_verify_triton_fusion_numerics = 291, xla_hlo_graph_addresses = 2, xla_hlo_profile = 9, xla_disable_hlo_passes = 30, xla_enable_hlo_passes_only = 124, xla_disable_all_hlo_passes = 104, xla_backend_optimization_level = 31, xla_embed_ir_in_executable = 33, xla_eliminate_hlo_implicit_broadcast = 35, xla_cpu_multi_thread_eigen = 60, xla_llvm_enable_alias_scope_metadata = 70, xla_llvm_enable_noalias_metadata = 71, xla_llvm_enable_invariant_load_metadata = 72, xla_llvm_disable_expensive_passes = 73, xla_test_all_output_layouts = 90, xla_test_all_input_layouts = 91, xla_hlo_graph_sharding_color = 92, xla_cpu_use_onednn = 97, xla_allow_excess_precision = 122, xla_force_host_platform_device_count = 102, xla_hlo_evaluator_use_fast_path = 106, xla_allow_scalar_index_dynamic_ops = 107, xla_step_marker_location = 108, xla_dump_to = 109, xla_flags_reset = 364, xla_dump_hlo_module_re = 110, xla_dump_hlo_pass_re = 111, xla_dump_emitter_re = 433, xla_dump_hlo_as_text = 112, xla_dump_hlo_as_proto = 
113, xla_dump_hlo_as_dot = 114, xla_dump_hlo_as_url = 115, xla_dump_hlo_as_html = 116, xla_dump_fusion_visualization = 149, xla_dump_hlo_snapshots = 118, xla_dump_include_timestamp = 131, xla_dump_max_hlo_modules = 132, xla_dump_module_metadata = 144, xla_dump_compress_protos = 151, xla_dump_hlo_as_long_text = 164, xla_dump_enable_mlir_pretty_form = 185, xla_dump_full_hlo_config = 381, xla_tpu_detect_nan = 135, xla_tpu_detect_inf = 136, xla_cpu_enable_xprof_traceme = 137, xla_multiheap_size_constraint_per_heap = 142, xla_detailed_logging = 252, xla_enable_dumping = 253, xla_llvm_force_inline_before_split = 300, xla_dump_disable_metadata = 153, xla_dump_hlo_pipeline_re = 154, xla_cpu_use_acl = 174, xla_cpu_strict_dot_conv_math = 175, xla_dump_latency_hiding_schedule = 182, xla_partitioning_algorithm = 187, xla_debug_buffer_assignment_show_max = 251, xla_detect_unstable_reductions = 403, xla_detect_unstable_reductions_post_optimizations = 432, xla_gpu_detect_nan = 426, xla_gpu_detect_inf = 428, xla_dump_large_constants = 290, xla_reduce_window_rewrite_base_length = 293, xla_cmd_buffer_trace_cache_size = 311, xla_syntax_sugar_async_ops = 315, xla_enable_command_buffers_during_profiling = 317, xla_ignore_channel_id = 330, xla_pjrt_allow_auto_layout_in_hlo = 344, xla_test_add_command_buffer_mode = 373, xla_gpu_experimental_matmul_perf_table_path = 383, xla_early_exit_with_layouts = 397, xla_gpu_experimental_scaled_dot_with_triton = 410, xla_gpu_experimental_use_raft_select_k = 413, xla_backend_extra_options = 500)
 
 function PB.decode(d::PB.AbstractProtoDecoder, ::Type{<:DebugOptions})
     xla_allow_h2h_copy_when_automatic_host_compute_offload_disabled = false
@@ -1023,7 +1017,6 @@ function PB.decode(d::PB.AbstractProtoDecoder, ::Type{<:DebugOptions})
     xla_gpu_cuda_data_dir = ""
     xla_gpu_cudnn_gemm_fusion_level = zero(Int32)
     xla_gpu_cudnn_gemm_max_plans = zero(Int32)
-    xla_gpu_default_to_alg_dot_bf16_bf16_f32 = false
     xla_gpu_deterministic_ops = false
     xla_gpu_disable_async_collectives = PB.BufferedVector{var"DebugOptions.CollectiveOpType".T}()
     xla_gpu_disable_gpuasm_optimizations = false
@@ -1054,7 +1047,6 @@ function PB.decode(d::PB.AbstractProtoDecoder, ::Type{<:DebugOptions})
     xla_gpu_enable_pipelined_all_gather = false
     xla_gpu_enable_pipelined_all_reduce = false
     xla_gpu_enable_pipelined_collectives = false
-    xla_gpu_enable_pipelined_host_offloading = false
     xla_gpu_enable_pipelined_p2p = false
     xla_gpu_enable_pipelined_reduce_scatter = false
     xla_gpu_enable_reassociation_for_converted_ar = false
@@ -1074,7 +1066,6 @@ function PB.decode(d::PB.AbstractProtoDecoder, ::Type{<:DebugOptions})
     xla_gpu_exhaustive_tiling_search = false
     xla_gpu_experimental_allow_unroll_factor_eight = false
     xla_gpu_experimental_aot_compiled_thunks = false
-    xla_gpu_experimental_autotune_backends = PB.BufferedVector{var"DebugOptions.AutotuneBackend".T}()
     xla_gpu_experimental_autotune_cache_mode = var"DebugOptions.AutotuneCacheMode".AUTOTUNE_CACHE_MODE_UNSPECIFIED
     xla_gpu_experimental_autotuner_cache_dir = ""
     xla_gpu_experimental_collective_cse_distance_threshold = zero(Int64)
@@ -1355,8 +1346,6 @@ function PB.decode(d::PB.AbstractProtoDecoder, ::Type{<:DebugOptions})
             xla_gpu_cudnn_gemm_fusion_level = PB.decode(d, Int32)
         elseif field_number == 318
             xla_gpu_cudnn_gemm_max_plans = PB.decode(d, Int32)
-        elseif field_number == 441
-            xla_gpu_default_to_alg_dot_bf16_bf16_f32 = PB.decode(d, Bool)
         elseif field_number == 148
             xla_gpu_deterministic_ops = PB.decode(d, Bool)
         elseif field_number == 289
@@ -1417,8 +1406,6 @@ function PB.decode(d::PB.AbstractProtoDecoder, ::Type{<:DebugOptions})
             xla_gpu_enable_pipelined_all_reduce = PB.decode(d, Bool)
         elseif field_number == 239
             xla_gpu_enable_pipelined_collectives = PB.decode(d, Bool)
-        elseif field_number == 440
-            xla_gpu_enable_pipelined_host_offloading = PB.decode(d, Bool)
         elseif field_number == 246
             xla_gpu_enable_pipelined_p2p = PB.decode(d, Bool)
         elseif field_number == 231
@@ -1457,8 +1444,6 @@ function PB.decode(d::PB.AbstractProtoDecoder, ::Type{<:DebugOptions})
             xla_gpu_experimental_allow_unroll_factor_eight = PB.decode(d, Bool)
         elseif field_number == 435
             xla_gpu_experimental_aot_compiled_thunks = PB.decode(d, Bool)
-        elseif field_number == 442
-            PB.decode!(d, wire_type, xla_gpu_experimental_autotune_backends)
         elseif field_number == 324
             xla_gpu_experimental_autotune_cache_mode = PB.decode(d, var"DebugOptions.AutotuneCacheMode".T)
         elseif field_number == 407
@@ -1779,7 +1764,7 @@ function PB.decode(d::PB.AbstractProtoDecoder, ::Type{<:DebugOptions})
             Base.skip(d, wire_type)
         end
     end
-    return DebugOptions(xla_allow_h2h_copy_when_automatic_host_compute_offload_disabled, xla_disable_automatic_host_compute_offload, xla_enable_scoped_logging_timers, xla_hlo_pass_fix_detect_cycles, xla_keep_shardings_after_spmd, xla_unsupported_crash_on_hlo_pass_fix_max_iterations, xla_unsupported_crash_on_hlo_pass_noop_change, xla_unsupported_crash_on_hlo_pass_silent_hlo_change, xla_cpu_collective_call_terminate_timeout_seconds, xla_cpu_collective_call_warn_stuck_seconds, xla_cpu_collective_timeout_seconds, xla_cpu_copy_insertion_use_region_analysis, xla_cpu_emitter_verification_level, xla_cpu_enable_concurrency_optimized_scheduler, xla_cpu_enable_fast_math, xla_cpu_enable_fast_min_max, xla_cpu_enable_platform_dependent_math, xla_cpu_experimental_onednn_custom_call, xla_cpu_experimental_onednn_fusion_type[], xla_cpu_experimental_xnn_fusion_type[], xla_cpu_experimental_xnn_graph_fusion_mode, xla_cpu_experimental_ynn_fusion_type[], xla_cpu_fast_math_honor_division, xla_cpu_fast_math_honor_functions, xla_cpu_fast_math_honor_infs, xla_cpu_fast_math_honor_nans, xla_cpu_generate_unique_c_style_kernel_entry_points, xla_cpu_max_isa, xla_cpu_parallel_codegen_split_count, xla_cpu_prefer_vector_width, xla_cpu_use_fusion_emitters, xla_cpu_use_xnnpack, xla_enable_fast_math, xla_gpu_experimental_thunk_buffer_debug_filter[], xla_dump_hlo_unoptimized_snapshots, xla_enable_enzyme_comms_opt, xla_gpu_algorithm_denylist_path, xla_gpu_all_gather_combine_threshold_bytes, xla_gpu_all_reduce_blueconnect_num_devices_per_host, xla_gpu_all_reduce_combine_threshold_bytes, xla_gpu_analytical_latency_estimator_options, xla_gpu_async_dot, xla_gpu_auto_spmd_partitioning_memory_budget_gb, xla_gpu_auto_spmd_partitioning_memory_budget_ratio, xla_gpu_autotune_gemm_rtol, xla_gpu_autotune_level, xla_gpu_autotune_max_solutions, xla_gpu_collect_cost_model_stats, xla_gpu_collective_inflation_factor, xla_gpu_collective_permute_combine_threshold_bytes, xla_gpu_collective_permute_decomposer_threshold, 
xla_gpu_collectives_use_persistent_cliques, xla_gpu_command_buffer_scheduling_mode, xla_gpu_command_buffer_unroll_loops, xla_gpu_copy_insertion_use_region_analysis, xla_gpu_crash_on_verification_failures, xla_gpu_cublas_fallback, xla_gpu_cuda_data_dir, xla_gpu_cudnn_gemm_fusion_level, xla_gpu_cudnn_gemm_max_plans, xla_gpu_default_to_alg_dot_bf16_bf16_f32, xla_gpu_deterministic_ops, xla_gpu_disable_async_collectives[], xla_gpu_disable_gpuasm_optimizations, xla_gpu_dot_merger_threshold_mb, xla_gpu_dump_autotune_logs_to, xla_gpu_dump_autotune_results_to, xla_gpu_dump_autotuned_gemm_fusions, xla_gpu_dump_llvmir, xla_gpu_enable_all_gather_combine_by_dim, xla_gpu_enable_analytical_latency_estimator, xla_gpu_enable_analytical_sol_latency_estimator, xla_gpu_enable_approx_costly_collectives, xla_gpu_enable_command_buffer[], xla_gpu_enable_cub_radix_sort, xla_gpu_enable_cublaslt, xla_gpu_enable_cudnn_int8x32_convolution_reordering, xla_gpu_enable_cudnn_layer_norm, xla_gpu_enable_dynamic_slice_fusion, xla_gpu_enable_fast_min_max, xla_gpu_enable_highest_priority_async_stream, xla_gpu_enable_host_memory_offloading, xla_gpu_enable_latency_hiding_scheduler, xla_gpu_enable_libnvptxcompiler, xla_gpu_enable_llvm_module_compilation_parallelism, xla_gpu_enable_nccl_clique_optimization, xla_gpu_enable_nccl_comm_splitting, xla_gpu_enable_nccl_user_buffers, xla_gpu_enable_pipelined_all_gather, xla_gpu_enable_pipelined_all_reduce, xla_gpu_enable_pipelined_collectives, xla_gpu_enable_pipelined_host_offloading, xla_gpu_enable_pipelined_p2p, xla_gpu_enable_pipelined_reduce_scatter, xla_gpu_enable_reassociation_for_converted_ar, xla_gpu_enable_reduce_scatter_combine_by_dim, xla_gpu_enable_reduction_epilogue_fusion, xla_gpu_enable_scatter_determinism_expander, xla_gpu_enable_shared_constants, xla_gpu_enable_split_k_autotuning, xla_gpu_enable_triton_gemm, xla_gpu_enable_while_loop_double_buffering, xla_gpu_enable_while_loop_reduce_scatter_code_motion, xla_gpu_enable_while_loop_unrolling, 
xla_gpu_exclude_nondeterministic_ops, xla_gpu_executable_embed_debug_info, xla_gpu_executable_terminate_timeout_seconds, xla_gpu_executable_warn_stuck_timeout_seconds, xla_gpu_exhaustive_tiling_search, xla_gpu_experimental_allow_unroll_factor_eight, xla_gpu_experimental_aot_compiled_thunks, xla_gpu_experimental_autotune_backends[], xla_gpu_experimental_autotune_cache_mode, xla_gpu_experimental_autotuner_cache_dir, xla_gpu_experimental_collective_cse_distance_threshold, xla_gpu_experimental_collective_perf_table_path, xla_gpu_experimental_disable_binary_libraries, xla_gpu_experimental_dump_fdo_profiles, xla_gpu_experimental_dump_gpu_executable, xla_gpu_experimental_enable_alltoall_windowed_einsum, xla_gpu_experimental_enable_buffer_saver_on_thunks, xla_gpu_experimental_enable_checksum_tracing_on_thunks, xla_gpu_experimental_enable_fusion_autotuner, xla_gpu_experimental_enable_fusion_block_level_rewriter, xla_gpu_experimental_enable_heuristic_collective_combining, xla_gpu_experimental_enable_nccl_symmetric_buffers, xla_gpu_experimental_enable_nvshmem, xla_gpu_experimental_enable_split_k_rewrite, xla_gpu_experimental_enable_subchannel_dequantisation_fusion, xla_gpu_experimental_enable_triton_heroless_priority_fusion, xla_gpu_experimental_enable_triton_warp_specialization, xla_gpu_experimental_pack_dot_operands_along_k_dimension, xla_gpu_experimental_parallel_collective_overlap_limit, xla_gpu_experimental_pipeline_parallelism_opt_level, xla_gpu_experimental_stream_annotation, xla_gpu_experimental_use_autotuner_pass, xla_gpu_experimental_use_ragged_dot_fusion, xla_gpu_fail_ptx_compilation_on_register_spilling, xla_gpu_filter_kernels_spilling_registers_on_autotuning, xla_gpu_first_collective_call_terminate_timeout_seconds, xla_gpu_first_collective_call_warn_stuck_timeout_seconds, xla_gpu_force_compilation_parallelism, xla_gpu_force_conv_nchw, xla_gpu_force_conv_nhwc, xla_gpu_ftz, xla_gpu_fused_attention_use_cudnn_rng, xla_gpu_gemm_autotuner_override_file, 
xla_gpu_gemm_rewrite_size_threshold, xla_gpu_generate_debug_info, xla_gpu_generate_line_info, xla_gpu_graph_enable_concurrent_region, xla_gpu_graph_min_graph_size, xla_gpu_kernel_cache_file, xla_gpu_libnvjitlink_mode, xla_gpu_llvm_ir_file[], xla_gpu_llvm_verification_level, xla_gpu_load_autotune_results_from, xla_gpu_memory_limit_slop_factor, xla_gpu_mock_custom_calls, xla_gpu_multi_streamed_windowed_einsum, xla_gpu_nccl_async_execution, xla_gpu_nccl_blocking_communicators, xla_gpu_nccl_collective_max_nchannels, xla_gpu_nccl_init_max_rank_per_root_ratio, xla_gpu_nccl_p2p_max_nchannels, xla_gpu_nccl_terminate_on_error, xla_gpu_nccl_termination_timeout_seconds, xla_gpu_operand_bytes_threshold_for_windowed_einsum, xla_gpu_override_gemm_autotuner, xla_gpu_per_fusion_autotune_cache_dir, xla_gpu_pgle_accuracy_checker, xla_gpu_pgle_profile_file_or_directory_path, xla_gpu_ptx_file[], xla_gpu_reduce_scatter_combine_threshold_bytes, xla_gpu_redzone_padding_bytes, xla_gpu_require_complete_aot_autotune_results, xla_gpu_require_exclusive_lock, xla_gpu_shape_checks, xla_gpu_shard_autotuning, xla_gpu_strict_conv_algorithm_picker, xla_gpu_target_config_filename, xla_gpu_temp_buffer_use_separate_color, xla_gpu_threshold_for_windowed_einsum_mib, xla_gpu_triton_gemm_any, xla_gpu_unsafe_fallback_to_driver_on_ptxas_not_found, xla_gpu_unsupported_enable_all_reduce_decomposer, xla_gpu_unsupported_enable_ragged_all_to_all_decomposer, xla_gpu_unsupported_enable_ragged_all_to_all_multi_host_decomposer, xla_gpu_unsupported_enable_triton_gemm, xla_gpu_unsupported_enable_triton_multi_output_fusion, xla_gpu_unsupported_override_fast_interconnect_slice_size, xla_gpu_unsupported_use_all_reduce_one_shot_kernel, xla_gpu_unsupported_use_ragged_all_to_all_one_shot_kernel, xla_gpu_use_embeded_device_lib, xla_gpu_use_inprocess_lld, xla_gpu_use_memcpy_local_p2p, xla_gpu_use_runtime_fusion, xla_gpu_verify_triton_fusion_numerics, xla_hlo_graph_addresses, xla_hlo_profile, xla_disable_hlo_passes[], 
xla_enable_hlo_passes_only[], xla_disable_all_hlo_passes, xla_backend_optimization_level, xla_embed_ir_in_executable, xla_eliminate_hlo_implicit_broadcast, xla_cpu_multi_thread_eigen, xla_llvm_enable_alias_scope_metadata, xla_llvm_enable_noalias_metadata, xla_llvm_enable_invariant_load_metadata, xla_llvm_disable_expensive_passes, xla_test_all_output_layouts, xla_test_all_input_layouts, xla_hlo_graph_sharding_color, xla_cpu_use_onednn, xla_allow_excess_precision, xla_force_host_platform_device_count, xla_hlo_evaluator_use_fast_path, xla_allow_scalar_index_dynamic_ops, xla_step_marker_location, xla_dump_to, xla_flags_reset, xla_dump_hlo_module_re, xla_dump_hlo_pass_re, xla_dump_emitter_re, xla_dump_hlo_as_text, xla_dump_hlo_as_proto, xla_dump_hlo_as_dot, xla_dump_hlo_as_url, xla_dump_hlo_as_html, xla_dump_fusion_visualization, xla_dump_hlo_snapshots, xla_dump_include_timestamp, xla_dump_max_hlo_modules, xla_dump_module_metadata, xla_dump_compress_protos, xla_dump_hlo_as_long_text, xla_dump_enable_mlir_pretty_form, xla_dump_full_hlo_config, xla_tpu_detect_nan, xla_tpu_detect_inf, xla_cpu_enable_xprof_traceme, xla_multiheap_size_constraint_per_heap, xla_detailed_logging, xla_enable_dumping, xla_llvm_force_inline_before_split, xla_dump_disable_metadata, xla_dump_hlo_pipeline_re, xla_cpu_use_acl, xla_cpu_strict_dot_conv_math, xla_dump_latency_hiding_schedule, xla_partitioning_algorithm, xla_debug_buffer_assignment_show_max, xla_detect_unstable_reductions, xla_detect_unstable_reductions_post_optimizations, xla_gpu_detect_nan, xla_gpu_detect_inf, xla_dump_large_constants, xla_reduce_window_rewrite_base_length, xla_cmd_buffer_trace_cache_size, xla_syntax_sugar_async_ops, xla_enable_command_buffers_during_profiling, xla_ignore_channel_id, xla_pjrt_allow_auto_layout_in_hlo, xla_test_add_command_buffer_mode, xla_gpu_experimental_matmul_perf_table_path, xla_early_exit_with_layouts, xla_gpu_experimental_scaled_dot_with_triton, xla_gpu_experimental_use_raft_select_k, 
xla_backend_extra_options)
+    return DebugOptions(xla_allow_h2h_copy_when_automatic_host_compute_offload_disabled, xla_disable_automatic_host_compute_offload, xla_enable_scoped_logging_timers, xla_hlo_pass_fix_detect_cycles, xla_keep_shardings_after_spmd, xla_unsupported_crash_on_hlo_pass_fix_max_iterations, xla_unsupported_crash_on_hlo_pass_noop_change, xla_unsupported_crash_on_hlo_pass_silent_hlo_change, xla_cpu_collective_call_terminate_timeout_seconds, xla_cpu_collective_call_warn_stuck_seconds, xla_cpu_collective_timeout_seconds, xla_cpu_copy_insertion_use_region_analysis, xla_cpu_emitter_verification_level, xla_cpu_enable_concurrency_optimized_scheduler, xla_cpu_enable_fast_math, xla_cpu_enable_fast_min_max, xla_cpu_enable_platform_dependent_math, xla_cpu_experimental_onednn_custom_call, xla_cpu_experimental_onednn_fusion_type[], xla_cpu_experimental_xnn_fusion_type[], xla_cpu_experimental_xnn_graph_fusion_mode, xla_cpu_experimental_ynn_fusion_type[], xla_cpu_fast_math_honor_division, xla_cpu_fast_math_honor_functions, xla_cpu_fast_math_honor_infs, xla_cpu_fast_math_honor_nans, xla_cpu_generate_unique_c_style_kernel_entry_points, xla_cpu_max_isa, xla_cpu_parallel_codegen_split_count, xla_cpu_prefer_vector_width, xla_cpu_use_fusion_emitters, xla_cpu_use_xnnpack, xla_enable_fast_math, xla_gpu_experimental_thunk_buffer_debug_filter[], xla_dump_hlo_unoptimized_snapshots, xla_enable_enzyme_comms_opt, xla_gpu_algorithm_denylist_path, xla_gpu_all_gather_combine_threshold_bytes, xla_gpu_all_reduce_blueconnect_num_devices_per_host, xla_gpu_all_reduce_combine_threshold_bytes, xla_gpu_analytical_latency_estimator_options, xla_gpu_async_dot, xla_gpu_auto_spmd_partitioning_memory_budget_gb, xla_gpu_auto_spmd_partitioning_memory_budget_ratio, xla_gpu_autotune_gemm_rtol, xla_gpu_autotune_level, xla_gpu_autotune_max_solutions, xla_gpu_collect_cost_model_stats, xla_gpu_collective_inflation_factor, xla_gpu_collective_permute_combine_threshold_bytes, xla_gpu_collective_permute_decomposer_threshold, 
xla_gpu_collectives_use_persistent_cliques, xla_gpu_command_buffer_scheduling_mode, xla_gpu_command_buffer_unroll_loops, xla_gpu_copy_insertion_use_region_analysis, xla_gpu_crash_on_verification_failures, xla_gpu_cublas_fallback, xla_gpu_cuda_data_dir, xla_gpu_cudnn_gemm_fusion_level, xla_gpu_cudnn_gemm_max_plans, xla_gpu_deterministic_ops, xla_gpu_disable_async_collectives[], xla_gpu_disable_gpuasm_optimizations, xla_gpu_dot_merger_threshold_mb, xla_gpu_dump_autotune_logs_to, xla_gpu_dump_autotune_results_to, xla_gpu_dump_autotuned_gemm_fusions, xla_gpu_dump_llvmir, xla_gpu_enable_all_gather_combine_by_dim, xla_gpu_enable_analytical_latency_estimator, xla_gpu_enable_analytical_sol_latency_estimator, xla_gpu_enable_approx_costly_collectives, xla_gpu_enable_command_buffer[], xla_gpu_enable_cub_radix_sort, xla_gpu_enable_cublaslt, xla_gpu_enable_cudnn_int8x32_convolution_reordering, xla_gpu_enable_cudnn_layer_norm, xla_gpu_enable_dynamic_slice_fusion, xla_gpu_enable_fast_min_max, xla_gpu_enable_highest_priority_async_stream, xla_gpu_enable_host_memory_offloading, xla_gpu_enable_latency_hiding_scheduler, xla_gpu_enable_libnvptxcompiler, xla_gpu_enable_llvm_module_compilation_parallelism, xla_gpu_enable_nccl_clique_optimization, xla_gpu_enable_nccl_comm_splitting, xla_gpu_enable_nccl_user_buffers, xla_gpu_enable_pipelined_all_gather, xla_gpu_enable_pipelined_all_reduce, xla_gpu_enable_pipelined_collectives, xla_gpu_enable_pipelined_p2p, xla_gpu_enable_pipelined_reduce_scatter, xla_gpu_enable_reassociation_for_converted_ar, xla_gpu_enable_reduce_scatter_combine_by_dim, xla_gpu_enable_reduction_epilogue_fusion, xla_gpu_enable_scatter_determinism_expander, xla_gpu_enable_shared_constants, xla_gpu_enable_split_k_autotuning, xla_gpu_enable_triton_gemm, xla_gpu_enable_while_loop_double_buffering, xla_gpu_enable_while_loop_reduce_scatter_code_motion, xla_gpu_enable_while_loop_unrolling, xla_gpu_exclude_nondeterministic_ops, xla_gpu_executable_embed_debug_info, 
xla_gpu_executable_terminate_timeout_seconds, xla_gpu_executable_warn_stuck_timeout_seconds, xla_gpu_exhaustive_tiling_search, xla_gpu_experimental_allow_unroll_factor_eight, xla_gpu_experimental_aot_compiled_thunks, xla_gpu_experimental_autotune_cache_mode, xla_gpu_experimental_autotuner_cache_dir, xla_gpu_experimental_collective_cse_distance_threshold, xla_gpu_experimental_collective_perf_table_path, xla_gpu_experimental_disable_binary_libraries, xla_gpu_experimental_dump_fdo_profiles, xla_gpu_experimental_dump_gpu_executable, xla_gpu_experimental_enable_alltoall_windowed_einsum, xla_gpu_experimental_enable_buffer_saver_on_thunks, xla_gpu_experimental_enable_checksum_tracing_on_thunks, xla_gpu_experimental_enable_fusion_autotuner, xla_gpu_experimental_enable_fusion_block_level_rewriter, xla_gpu_experimental_enable_heuristic_collective_combining, xla_gpu_experimental_enable_nccl_symmetric_buffers, xla_gpu_experimental_enable_nvshmem, xla_gpu_experimental_enable_split_k_rewrite, xla_gpu_experimental_enable_subchannel_dequantisation_fusion, xla_gpu_experimental_enable_triton_heroless_priority_fusion, xla_gpu_experimental_enable_triton_warp_specialization, xla_gpu_experimental_pack_dot_operands_along_k_dimension, xla_gpu_experimental_parallel_collective_overlap_limit, xla_gpu_experimental_pipeline_parallelism_opt_level, xla_gpu_experimental_stream_annotation, xla_gpu_experimental_use_autotuner_pass, xla_gpu_experimental_use_ragged_dot_fusion, xla_gpu_fail_ptx_compilation_on_register_spilling, xla_gpu_filter_kernels_spilling_registers_on_autotuning, xla_gpu_first_collective_call_terminate_timeout_seconds, xla_gpu_first_collective_call_warn_stuck_timeout_seconds, xla_gpu_force_compilation_parallelism, xla_gpu_force_conv_nchw, xla_gpu_force_conv_nhwc, xla_gpu_ftz, xla_gpu_fused_attention_use_cudnn_rng, xla_gpu_gemm_autotuner_override_file, xla_gpu_gemm_rewrite_size_threshold, xla_gpu_generate_debug_info, xla_gpu_generate_line_info, 
xla_gpu_graph_enable_concurrent_region, xla_gpu_graph_min_graph_size, xla_gpu_kernel_cache_file, xla_gpu_libnvjitlink_mode, xla_gpu_llvm_ir_file[], xla_gpu_llvm_verification_level, xla_gpu_load_autotune_results_from, xla_gpu_memory_limit_slop_factor, xla_gpu_mock_custom_calls, xla_gpu_multi_streamed_windowed_einsum, xla_gpu_nccl_async_execution, xla_gpu_nccl_blocking_communicators, xla_gpu_nccl_collective_max_nchannels, xla_gpu_nccl_init_max_rank_per_root_ratio, xla_gpu_nccl_p2p_max_nchannels, xla_gpu_nccl_terminate_on_error, xla_gpu_nccl_termination_timeout_seconds, xla_gpu_operand_bytes_threshold_for_windowed_einsum, xla_gpu_override_gemm_autotuner, xla_gpu_per_fusion_autotune_cache_dir, xla_gpu_pgle_accuracy_checker, xla_gpu_pgle_profile_file_or_directory_path, xla_gpu_ptx_file[], xla_gpu_reduce_scatter_combine_threshold_bytes, xla_gpu_redzone_padding_bytes, xla_gpu_require_complete_aot_autotune_results, xla_gpu_require_exclusive_lock, xla_gpu_shape_checks, xla_gpu_shard_autotuning, xla_gpu_strict_conv_algorithm_picker, xla_gpu_target_config_filename, xla_gpu_temp_buffer_use_separate_color, xla_gpu_threshold_for_windowed_einsum_mib, xla_gpu_triton_gemm_any, xla_gpu_unsafe_fallback_to_driver_on_ptxas_not_found, xla_gpu_unsupported_enable_all_reduce_decomposer, xla_gpu_unsupported_enable_ragged_all_to_all_decomposer, xla_gpu_unsupported_enable_ragged_all_to_all_multi_host_decomposer, xla_gpu_unsupported_enable_triton_gemm, xla_gpu_unsupported_enable_triton_multi_output_fusion, xla_gpu_unsupported_override_fast_interconnect_slice_size, xla_gpu_unsupported_use_all_reduce_one_shot_kernel, xla_gpu_unsupported_use_ragged_all_to_all_one_shot_kernel, xla_gpu_use_embeded_device_lib, xla_gpu_use_inprocess_lld, xla_gpu_use_memcpy_local_p2p, xla_gpu_use_runtime_fusion, xla_gpu_verify_triton_fusion_numerics, xla_hlo_graph_addresses, xla_hlo_profile, xla_disable_hlo_passes[], xla_enable_hlo_passes_only[], xla_disable_all_hlo_passes, xla_backend_optimization_level, 
xla_embed_ir_in_executable, xla_eliminate_hlo_implicit_broadcast, xla_cpu_multi_thread_eigen, xla_llvm_enable_alias_scope_metadata, xla_llvm_enable_noalias_metadata, xla_llvm_enable_invariant_load_metadata, xla_llvm_disable_expensive_passes, xla_test_all_output_layouts, xla_test_all_input_layouts, xla_hlo_graph_sharding_color, xla_cpu_use_onednn, xla_allow_excess_precision, xla_force_host_platform_device_count, xla_hlo_evaluator_use_fast_path, xla_allow_scalar_index_dynamic_ops, xla_step_marker_location, xla_dump_to, xla_flags_reset, xla_dump_hlo_module_re, xla_dump_hlo_pass_re, xla_dump_emitter_re, xla_dump_hlo_as_text, xla_dump_hlo_as_proto, xla_dump_hlo_as_dot, xla_dump_hlo_as_url, xla_dump_hlo_as_html, xla_dump_fusion_visualization, xla_dump_hlo_snapshots, xla_dump_include_timestamp, xla_dump_max_hlo_modules, xla_dump_module_metadata, xla_dump_compress_protos, xla_dump_hlo_as_long_text, xla_dump_enable_mlir_pretty_form, xla_dump_full_hlo_config, xla_tpu_detect_nan, xla_tpu_detect_inf, xla_cpu_enable_xprof_traceme, xla_multiheap_size_constraint_per_heap, xla_detailed_logging, xla_enable_dumping, xla_llvm_force_inline_before_split, xla_dump_disable_metadata, xla_dump_hlo_pipeline_re, xla_cpu_use_acl, xla_cpu_strict_dot_conv_math, xla_dump_latency_hiding_schedule, xla_partitioning_algorithm, xla_debug_buffer_assignment_show_max, xla_detect_unstable_reductions, xla_detect_unstable_reductions_post_optimizations, xla_gpu_detect_nan, xla_gpu_detect_inf, xla_dump_large_constants, xla_reduce_window_rewrite_base_length, xla_cmd_buffer_trace_cache_size, xla_syntax_sugar_async_ops, xla_enable_command_buffers_during_profiling, xla_ignore_channel_id, xla_pjrt_allow_auto_layout_in_hlo, xla_test_add_command_buffer_mode, xla_gpu_experimental_matmul_perf_table_path, xla_early_exit_with_layouts, xla_gpu_experimental_scaled_dot_with_triton, xla_gpu_experimental_use_raft_select_k, xla_backend_extra_options)
 end
 
 function PB.encode(e::PB.AbstractProtoEncoder, x::DebugOptions)
@@ -1844,7 +1829,6 @@ function PB.encode(e::PB.AbstractProtoEncoder, x::DebugOptions)
     !isempty(x.xla_gpu_cuda_data_dir) && PB.encode(e, 61, x.xla_gpu_cuda_data_dir)
     x.xla_gpu_cudnn_gemm_fusion_level != zero(Int32) && PB.encode(e, 285, x.xla_gpu_cudnn_gemm_fusion_level)
     x.xla_gpu_cudnn_gemm_max_plans != zero(Int32) && PB.encode(e, 318, x.xla_gpu_cudnn_gemm_max_plans)
-    x.xla_gpu_default_to_alg_dot_bf16_bf16_f32 != false && PB.encode(e, 441, x.xla_gpu_default_to_alg_dot_bf16_bf16_f32)
     x.xla_gpu_deterministic_ops != false && PB.encode(e, 148, x.xla_gpu_deterministic_ops)
     !isempty(x.xla_gpu_disable_async_collectives) && PB.encode(e, 289, x.xla_gpu_disable_async_collectives)
     x.xla_gpu_disable_gpuasm_optimizations != false && PB.encode(e, 103, x.xla_gpu_disable_gpuasm_optimizations)
@@ -1875,7 +1859,6 @@ function PB.encode(e::PB.AbstractProtoEncoder, x::DebugOptions)
     x.xla_gpu_enable_pipelined_all_gather != false && PB.encode(e, 227, x.xla_gpu_enable_pipelined_all_gather)
     x.xla_gpu_enable_pipelined_all_reduce != false && PB.encode(e, 217, x.xla_gpu_enable_pipelined_all_reduce)
     x.xla_gpu_enable_pipelined_collectives != false && PB.encode(e, 239, x.xla_gpu_enable_pipelined_collectives)
-    x.xla_gpu_enable_pipelined_host_offloading != false && PB.encode(e, 440, x.xla_gpu_enable_pipelined_host_offloading)
     x.xla_gpu_enable_pipelined_p2p != false && PB.encode(e, 246, x.xla_gpu_enable_pipelined_p2p)
     x.xla_gpu_enable_pipelined_reduce_scatter != false && PB.encode(e, 231, x.xla_gpu_enable_pipelined_reduce_scatter)
     x.xla_gpu_enable_reassociation_for_converted_ar != false && PB.encode(e, 209, x.xla_gpu_enable_reassociation_for_converted_ar)
@@ -1895,7 +1878,6 @@ function PB.encode(e::PB.AbstractProtoEncoder, x::DebugOptions)
     x.xla_gpu_exhaustive_tiling_search != false && PB.encode(e, 219, x.xla_gpu_exhaustive_tiling_search)
     x.xla_gpu_experimental_allow_unroll_factor_eight != false && PB.encode(e, 430, x.xla_gpu_experimental_allow_unroll_factor_eight)
     x.xla_gpu_experimental_aot_compiled_thunks != false && PB.encode(e, 435, x.xla_gpu_experimental_aot_compiled_thunks)
-    !isempty(x.xla_gpu_experimental_autotune_backends) && PB.encode(e, 442, x.xla_gpu_experimental_autotune_backends)
     x.xla_gpu_experimental_autotune_cache_mode != var"DebugOptions.AutotuneCacheMode".AUTOTUNE_CACHE_MODE_UNSPECIFIED && PB.encode(e, 324, x.xla_gpu_experimental_autotune_cache_mode)
     !isempty(x.xla_gpu_experimental_autotuner_cache_dir) && PB.encode(e, 407, x.xla_gpu_experimental_autotuner_cache_dir)
     x.xla_gpu_experimental_collective_cse_distance_threshold != zero(Int64) && PB.encode(e, 374, x.xla_gpu_experimental_collective_cse_distance_threshold)
@@ -2118,7 +2100,6 @@ function PB._encoded_size(x::DebugOptions)
     !isempty(x.xla_gpu_cuda_data_dir) && (encoded_size += PB._encoded_size(x.xla_gpu_cuda_data_dir, 61))
     x.xla_gpu_cudnn_gemm_fusion_level != zero(Int32) && (encoded_size += PB._encoded_size(x.xla_gpu_cudnn_gemm_fusion_level, 285))
     x.xla_gpu_cudnn_gemm_max_plans != zero(Int32) && (encoded_size += PB._encoded_size(x.xla_gpu_cudnn_gemm_max_plans, 318))
-    x.xla_gpu_default_to_alg_dot_bf16_bf16_f32 != false && (encoded_size += PB._encoded_size(x.xla_gpu_default_to_alg_dot_bf16_bf16_f32, 441))
     x.xla_gpu_deterministic_ops != false && (encoded_size += PB._encoded_size(x.xla_gpu_deterministic_ops, 148))
     !isempty(x.xla_gpu_disable_async_collectives) && (encoded_size += PB._encoded_size(x.xla_gpu_disable_async_collectives, 289))
     x.xla_gpu_disable_gpuasm_optimizations != false && (encoded_size += PB._encoded_size(x.xla_gpu_disable_gpuasm_optimizations, 103))
@@ -2149,7 +2130,6 @@ function PB._encoded_size(x::DebugOptions)
     x.xla_gpu_enable_pipelined_all_gather != false && (encoded_size += PB._encoded_size(x.xla_gpu_enable_pipelined_all_gather, 227))
     x.xla_gpu_enable_pipelined_all_reduce != false && (encoded_size += PB._encoded_size(x.xla_gpu_enable_pipelined_all_reduce, 217))
     x.xla_gpu_enable_pipelined_collectives != false && (encoded_size += PB._encoded_size(x.xla_gpu_enable_pipelined_collectives, 239))
-    x.xla_gpu_enable_pipelined_host_offloading != false && (encoded_size += PB._encoded_size(x.xla_gpu_enable_pipelined_host_offloading, 440))
     x.xla_gpu_enable_pipelined_p2p != false && (encoded_size += PB._encoded_size(x.xla_gpu_enable_pipelined_p2p, 246))
     x.xla_gpu_enable_pipelined_reduce_scatter != false && (encoded_size += PB._encoded_size(x.xla_gpu_enable_pipelined_reduce_scatter, 231))
     x.xla_gpu_enable_reassociation_for_converted_ar != false && (encoded_size += PB._encoded_size(x.xla_gpu_enable_reassociation_for_converted_ar, 209))
@@ -2169,7 +2149,6 @@ function PB._encoded_size(x::DebugOptions)
     x.xla_gpu_exhaustive_tiling_search != false && (encoded_size += PB._encoded_size(x.xla_gpu_exhaustive_tiling_search, 219))
     x.xla_gpu_experimental_allow_unroll_factor_eight != false && (encoded_size += PB._encoded_size(x.xla_gpu_experimental_allow_unroll_factor_eight, 430))
     x.xla_gpu_experimental_aot_compiled_thunks != false && (encoded_size += PB._encoded_size(x.xla_gpu_experimental_aot_compiled_thunks, 435))
-    !isempty(x.xla_gpu_experimental_autotune_backends) && (encoded_size += PB._encoded_size(x.xla_gpu_experimental_autotune_backends, 442))
     x.xla_gpu_experimental_autotune_cache_mode != var"DebugOptions.AutotuneCacheMode".AUTOTUNE_CACHE_MODE_UNSPECIFIED && (encoded_size += PB._encoded_size(x.xla_gpu_experimental_autotune_cache_mode, 324))
     !isempty(x.xla_gpu_experimental_autotuner_cache_dir) && (encoded_size += PB._encoded_size(x.xla_gpu_experimental_autotuner_cache_dir, 407))
     x.xla_gpu_experimental_collective_cse_distance_threshold != zero(Int64) && (encoded_size += PB._encoded_size(x.xla_gpu_experimental_collective_cse_distance_threshold, 374))
@@ -2331,7 +2310,7 @@ function PB._encoded_size(x::DebugOptions)
     return encoded_size
 end
 
-struct HloModuleConfigProto
+mutable struct HloModuleConfigProto
     entry_computation_layout::Union{Nothing,ProgramShapeProto}
     seed::UInt64
     launch_id::Int32
@@ -2602,7 +2581,7 @@ function PB._encoded_size(x::HloModuleConfigProto)
     return encoded_size
 end
 
-struct ExecutionOptions
+mutable struct ExecutionOptions
     shape_with_output_layout::Union{Nothing,ShapeProto}
     seed::UInt64
     debug_options::Union{Nothing,DebugOptions}
@@ -2783,7 +2762,7 @@ function PB._encoded_size(x::ExecutionOptions)
     return encoded_size
 end
 
-struct HloModuleProtoWithConfig
+mutable struct HloModuleProtoWithConfig
     hlo_module::Union{Nothing,HloModuleProto}
     config::Union{Nothing,HloModuleConfigProto}
 end
diff --git a/src/proto/xla_autotuning/autotune_results_pb.jl b/src/proto/xla_autotuning/autotune_results_pb.jl
index 653e78ccd2..70feec955e 100644
--- a/src/proto/xla_autotuning/autotune_results_pb.jl
+++ b/src/proto/xla_autotuning/autotune_results_pb.jl
@@ -5,7 +5,7 @@ using ProtoBuf.EnumX: @enumx
 export AutotuningLogs, var"AutotuneResults.Entry", AutotuneResults
 
 
-struct AutotuningLogs
+mutable struct AutotuningLogs
     logs::Vector{AutotuningLog}
 end
 PB.default_values(::Type{AutotuningLogs}) = (;logs = Vector{AutotuningLog}())
@@ -35,7 +35,7 @@ function PB._encoded_size(x::AutotuningLogs)
     return encoded_size
 end
 
-struct var"AutotuneResults.Entry"
+mutable struct var"AutotuneResults.Entry"
     device::String
     hlo::String
     result::Union{Nothing,AutotuneResult}
@@ -83,7 +83,7 @@ function PB._encoded_size(x::var"AutotuneResults.Entry")
     return encoded_size
 end
 
-struct AutotuneResults
+mutable struct AutotuneResults
     version::Int32
     results::Vector{var"AutotuneResults.Entry"}
 end
diff --git a/src/proto/xla_autotuning/autotuning_pb.jl b/src/proto/xla_autotuning/autotuning_pb.jl
index 57eeeafe8d..dcdcd193e9 100644
--- a/src/proto/xla_autotuning/autotuning_pb.jl
+++ b/src/proto/xla_autotuning/autotuning_pb.jl
@@ -9,7 +9,7 @@ export var"AutotuneResult.CudaConvPlanKey", ComputeCapability, TritonGemmConfigs
 export var"AutotuneResult.FailureResult", AutotuneResult, AutotuningLog
 
 
-struct var"AutotuneResult.TritonGemmKey"
+mutable struct var"AutotuneResult.TritonGemmKey"
     block_m::Int64
     block_n::Int64
     block_k::Int64
@@ -87,7 +87,7 @@ function PB._encoded_size(x::var"AutotuneResult.TritonGemmKey")
     return encoded_size
 end
 
-struct var"AutotuneResult.BackendConfigKey"
+mutable struct var"AutotuneResult.BackendConfigKey"
     name::String
     config::Union{Nothing,google.protobuf.var"#Any"}
 end
@@ -123,7 +123,7 @@ function PB._encoded_size(x::var"AutotuneResult.BackendConfigKey")
     return encoded_size
 end
 
-struct var"AutotuneResult.ConvKey"
+mutable struct var"AutotuneResult.ConvKey"
     algorithm::Int64
     tensor_ops_enabled::Bool
 end
@@ -159,7 +159,7 @@ function PB._encoded_size(x::var"AutotuneResult.ConvKey")
     return encoded_size
 end
 
-struct CudnnVersion
+mutable struct CudnnVersion
     major::Int32
     minor::Int32
     patch::Int32
@@ -203,43 +203,37 @@ end
 
 @enumx var"AutotuneResult.FailureKind" UNKNOWN=0 REDZONE_MODIFIED=1 WRONG_RESULT=2 DISQUALIFIED=3
 
-struct var"AutotuneResult.GemmKey"
+mutable struct var"AutotuneResult.GemmKey"
     algorithm::Int64
-    autotune_workspace_size::Int64
 end
-PB.default_values(::Type{var"AutotuneResult.GemmKey"}) = (;algorithm = zero(Int64), autotune_workspace_size = zero(Int64))
-PB.field_numbers(::Type{var"AutotuneResult.GemmKey"}) = (;algorithm = 1, autotune_workspace_size = 2)
+PB.default_values(::Type{var"AutotuneResult.GemmKey"}) = (;algorithm = zero(Int64))
+PB.field_numbers(::Type{var"AutotuneResult.GemmKey"}) = (;algorithm = 1)
 
 function PB.decode(d::PB.AbstractProtoDecoder, ::Type{<:var"AutotuneResult.GemmKey"})
     algorithm = zero(Int64)
-    autotune_workspace_size = zero(Int64)
     while !PB.message_done(d)
         field_number, wire_type = PB.decode_tag(d)
         if field_number == 1
             algorithm = PB.decode(d, Int64)
-        elseif field_number == 2
-            autotune_workspace_size = PB.decode(d, Int64)
         else
             Base.skip(d, wire_type)
         end
     end
-    return var"AutotuneResult.GemmKey"(algorithm, autotune_workspace_size)
+    return var"AutotuneResult.GemmKey"(algorithm)
 end
 
 function PB.encode(e::PB.AbstractProtoEncoder, x::var"AutotuneResult.GemmKey")
     initpos = position(e.io)
     x.algorithm != zero(Int64) && PB.encode(e, 1, x.algorithm)
-    x.autotune_workspace_size != zero(Int64) && PB.encode(e, 2, x.autotune_workspace_size)
     return position(e.io) - initpos
 end
 function PB._encoded_size(x::var"AutotuneResult.GemmKey")
     encoded_size = 0
     x.algorithm != zero(Int64) && (encoded_size += PB._encoded_size(x.algorithm, 1))
-    x.autotune_workspace_size != zero(Int64) && (encoded_size += PB._encoded_size(x.autotune_workspace_size, 2))
     return encoded_size
 end
 
-struct var"AutotuneResult.CustomKernelFusionKey"
+mutable struct var"AutotuneResult.CustomKernelFusionKey"
     kernel_index::Int64
 end
 PB.default_values(::Type{var"AutotuneResult.CustomKernelFusionKey"}) = (;kernel_index = zero(Int64))
@@ -269,7 +263,7 @@ function PB._encoded_size(x::var"AutotuneResult.CustomKernelFusionKey")
     return encoded_size
 end
 
-struct var"AutotuneResult.CudaConvPlanKey"
+mutable struct var"AutotuneResult.CudaConvPlanKey"
     exec_plan_id::String
 end
 PB.default_values(::Type{var"AutotuneResult.CudaConvPlanKey"}) = (;exec_plan_id = "")
@@ -299,7 +293,7 @@ function PB._encoded_size(x::var"AutotuneResult.CudaConvPlanKey")
     return encoded_size
 end
 
-struct ComputeCapability
+mutable struct ComputeCapability
     major::Int32
     minor::Int32
 end
@@ -335,7 +329,7 @@ function PB._encoded_size(x::ComputeCapability)
     return encoded_size
 end
 
-struct TritonGemmConfigsProto
+mutable struct TritonGemmConfigsProto
     config::Vector{var"AutotuneResult.TritonGemmKey"}
 end
 PB.default_values(::Type{TritonGemmConfigsProto}) = (;config = Vector{var"AutotuneResult.TritonGemmKey"}())
@@ -365,7 +359,7 @@ function PB._encoded_size(x::TritonGemmConfigsProto)
     return encoded_size
 end
 
-struct var"AutotuneResult.FailureResult"
+mutable struct var"AutotuneResult.FailureResult"
     kind::var"AutotuneResult.FailureKind".T
     msg::String
     key::Union{Nothing,OneOf{<:Union{var"AutotuneResult.ConvKey",var"AutotuneResult.GemmKey",var"AutotuneResult.CudaConvPlanKey",xla_tsl_dnn.AlgorithmProto}}}
@@ -440,7 +434,7 @@ function PB._encoded_size(x::var"AutotuneResult.FailureResult")
     return encoded_size
 end
 
-struct AutotuneResult
+mutable struct AutotuneResult
     scratch_bytes::Int64
     run_time::Union{Nothing,google.protobuf.Duration}
     failure::Union{Nothing,var"AutotuneResult.FailureResult"}
@@ -533,7 +527,7 @@ function PB._encoded_size(x::AutotuneResult)
     return encoded_size
 end
 
-struct AutotuningLog
+mutable struct AutotuningLog
     instr::Union{Nothing,google.protobuf.var"#Any"}
     results::Vector{AutotuneResult}
     cudnn_version::Union{Nothing,CudnnVersion}
diff --git a/src/proto/xla_tsl_dnn/dnn_pb.jl b/src/proto/xla_tsl_dnn/dnn_pb.jl
index aa361c237d..61414b51b8 100644
--- a/src/proto/xla_tsl_dnn/dnn_pb.jl
+++ b/src/proto/xla_tsl_dnn/dnn_pb.jl
@@ -28,7 +28,7 @@ export AlgorithmConfigProto
 
 @enumx FilterLayout kOutputInputYX=0 kOutputYXInput=1 kOutputInputYX4=2 kOutputInputYX32=5 kOutputInputYX32_CudnnReordered=6 kInputYXOutput=3 kYXInputOutput=4
 
-struct ConvolutionDescriptorProto
+mutable struct ConvolutionDescriptorProto
     paddings::Vector{Int64}
     strides::Vector{Int64}
     dilations::Vector{Int64}
@@ -94,7 +94,7 @@ function PB._encoded_size(x::ConvolutionDescriptorProto)
     return encoded_size
 end
 
-struct AlgorithmProto
+mutable struct AlgorithmProto
     algo_id::Int64
     math_type::var"AlgorithmProto.MathType".T
     tuning_knobs::Dict{Int64,Int64}
@@ -149,7 +149,7 @@ function PB._encoded_size(x::AlgorithmProto)
     return encoded_size
 end
 
-struct TensorDescriptorProto
+mutable struct TensorDescriptorProto
     dimensions::Vector{Int64}
     data_type::var"#DataType".T
     layout_oneof::Union{Nothing,OneOf{<:Union{DataLayout.T,FilterLayout.T}}}
@@ -206,7 +206,7 @@ function PB._encoded_size(x::TensorDescriptorProto)
     return encoded_size
 end
 
-struct AlgorithmConfigProto
+mutable struct AlgorithmConfigProto
     optional_algorithm::Union{Nothing,OneOf{AlgorithmProto}}
     optional_algorithm_no_scratch::Union{Nothing,OneOf{AlgorithmProto}}
     optional_scratch_size::Union{Nothing,OneOf{Int64}}
diff --git a/src/xla/CompileOptions.jl b/src/xla/CompileOptions.jl
index d7e188b5b3..c3469a20bc 100644
--- a/src/xla/CompileOptions.jl
+++ b/src/xla/CompileOptions.jl
@@ -1,29 +1,13 @@
-const DEFAULT_XLA_DEBUG_OPTIONS = Ref{Union{Nothing,Reactant.Proto.xla.DebugOptions}}(
-    nothing
-)
-const DEFAULT_XLA_COMPILE_OPTIONS = Ref{
-    Union{Nothing,Reactant.Proto.xla.CompileOptionsProto}
-}(
-    nothing
-)
-
 function get_default_debug_options()
-    if !isnothing(DEFAULT_XLA_DEBUG_OPTIONS[])
-        return DEFAULT_XLA_DEBUG_OPTIONS[]::Reactant.Proto.xla.DebugOptions
-    end
     size = Ref{Csize_t}(0)
     data = @ccall MLIR.API.mlir_c.ReactantGetDebugOptions(size::Ptr{Csize_t})::Ptr{UInt8}
     bytes = unsafe_wrap(Array, data, (size[],); own=false)
     proto = Reactant.ProtoUtils.proto_from_bytes(Reactant.Proto.xla.DebugOptions, bytes)
     @ccall free(data::Ptr{UInt8})::Cvoid
-    DEFAULT_XLA_DEBUG_OPTIONS[] = proto
     return proto
 end
 
 function get_default_compile_options()
-    if !isnothing(DEFAULT_XLA_COMPILE_OPTIONS[])
-        return DEFAULT_XLA_COMPILE_OPTIONS[]::Reactant.Proto.xla.CompileOptionsProto
-    end
     size = Ref{Csize_t}(0)
     data = @ccall MLIR.API.mlir_c.ReactantGetCompileOptions(size::Ptr{Csize_t})::Ptr{UInt8}
     bytes = unsafe_wrap(Array, data, (size[],); own=false)
@@ -31,40 +15,46 @@ function get_default_compile_options()
         Reactant.Proto.xla.CompileOptionsProto, bytes
     )
     @ccall free(data::Ptr{UInt8})::Cvoid
-    DEFAULT_XLA_COMPILE_OPTIONS[] = proto
     return proto
 end
 
 function get_debug_options(; kwargs...)
     debug_options = get_default_debug_options()
 
-    # default overrides. can we changed by the user by passing in kwargs
-    @set! debug_options.xla_gpu_cuda_data_dir = CUDA_DATA_DIR[]
-    @set! debug_options.xla_enable_enzyme_comms_opt = true
-    @set! debug_options.xla_gpu_experimental_use_raft_select_k = true
+    # Default overrides, assigned in place on the proto; kwargs are applied last and can replace any of them.
+    debug_options.xla_gpu_cuda_data_dir = CUDA_DATA_DIR[]
+    debug_options.xla_enable_enzyme_comms_opt = true
+    debug_options.xla_gpu_experimental_use_raft_select_k = true
 
     if Reactant.PersistentCompileCache.kernel_cache_enabled()
-        @set! debug_options.xla_gpu_kernel_cache_file = Reactant.PersistentCompileCache.get_kernel_cache_path()
-        @set! debug_options.xla_gpu_enable_llvm_module_compilation_parallelism = true
+        debug_options.xla_gpu_kernel_cache_file = Reactant.PersistentCompileCache.get_kernel_cache_path()
+        debug_options.xla_gpu_enable_llvm_module_compilation_parallelism = true  # only set together with the kernel cache file
     end
     if Reactant.PersistentCompileCache.autotune_cache_enabled()
-        @set! debug_options.xla_gpu_per_fusion_autotune_cache_dir = Reactant.PersistentCompileCache.get_autotune_cache_directory()
+        debug_options.xla_gpu_per_fusion_autotune_cache_dir = Reactant.PersistentCompileCache.get_autotune_cache_directory()
         if Reactant.Distributed.local_rank() <= 0
-            @set! debug_options.xla_gpu_experimental_autotune_cache_mode =
+            debug_options.xla_gpu_experimental_autotune_cache_mode =  # UPDATE when local_rank() <= 0; presumably a single writer per node - confirm
                 Reactant.Proto.xla.var"DebugOptions.AutotuneCacheMode".AUTOTUNE_CACHE_MODE_UPDATE
         else
-            @set! debug_options.xla_gpu_experimental_autotune_cache_mode =
+            debug_options.xla_gpu_experimental_autotune_cache_mode =  # READ for every other rank
                 Reactant.Proto.xla.var"DebugOptions.AutotuneCacheMode".AUTOTUNE_CACHE_MODE_READ
         end
     end
 
     for (key, value) in pairs(kwargs)
-        debug_options = Setfield.set(debug_options, Setfield.PropertyLens{key}(), value)
+        setproperty!(debug_options, key, value)  # in-place write; runs after the defaults above, so user kwargs win
     end
 
     return debug_options
 end
 
+struct CompileOptionsWithoutProto  # plain-field options returned by make_compile_options' fast path instead of a CompileOptionsProto
+    device_id::Int64  # device_id keyword of make_compile_options, stored unchanged
+    global_device_ids::Vector{Int64}  # mesh_ids, or an empty vector when mesh_ids === nothing
+    use_shardy_partitioner::Bool  # read from xla_executable_build_options; false when the key is absent
+    use_spmd_partitioning::Bool  # read from xla_executable_build_options; false when the key is absent
+end
+
 function make_compile_options(;
     device_id::Int64,
     num_replicas::Int64=1,
@@ -74,16 +64,35 @@ function make_compile_options(;
     xla_executable_build_options=(;),
     xla_compile_options=(;),
 )
+    if (
+        isempty(xla_debug_options) &&
+        (
+            isempty(xla_executable_build_options) || (
+                length(xla_executable_build_options) == 2 &&
+                haskey(xla_executable_build_options, :use_shardy_partitioner) &&
+                haskey(xla_executable_build_options, :use_spmd_partitioning)
+            )
+        ) &&
+        isempty(xla_compile_options)
+    )
+        return CompileOptionsWithoutProto(
+            device_id,
+            mesh_ids === nothing ? Int64[] : mesh_ids,
+            get(xla_executable_build_options, :use_shardy_partitioner, false),
+            get(xla_executable_build_options, :use_spmd_partitioning, false),
+        )
+    end
+
     compile_options = get_default_compile_options()
     executable_build_options = compile_options.executable_build_options
 
-    @set! executable_build_options.debug_options = get_debug_options(; xla_debug_options...)
-    @set! executable_build_options.num_replicas = num_replicas
-    @set! executable_build_options.num_partitions = num_partitions
+    executable_build_options.debug_options = get_debug_options(; xla_debug_options...)
+    executable_build_options.num_replicas = num_replicas
+    executable_build_options.num_partitions = num_partitions
 
-    # default overrides. can we changed by the user by passing in kwargs
-    @set! executable_build_options.allow_spmd_sharding_propagation_to_parameters = [false]
-    @set! executable_build_options.allow_spmd_sharding_propagation_to_output = [false]
+    # default overrides. can be changed by the user by passing in kwargs
+    executable_build_options.allow_spmd_sharding_propagation_to_parameters = [false]
+    executable_build_options.allow_spmd_sharding_propagation_to_output = [false]
 
     if device_id < 0
         @assert mesh_ids !== nothing
@@ -94,12 +103,12 @@ function make_compile_options(;
                 mesh_ids[(i - 1) * num_partitions + j] for i in 1:num_replicas
             ]) for j in 1:num_partitions
         ]
-        @set! executable_build_options.device_assignment = Reactant.Proto.xla.DeviceAssignmentProto(
+        executable_build_options.device_assignment = Reactant.Proto.xla.DeviceAssignmentProto(
             Int32(num_replicas), Int32(num_partitions), computation_devices
         )
     else
-        @set! executable_build_options.device_ordinal = device_id
-        @set! executable_build_options.device_assignment = Reactant.Proto.xla.DeviceAssignmentProto(
+        executable_build_options.device_ordinal = device_id
+        executable_build_options.device_assignment = Reactant.Proto.xla.DeviceAssignmentProto(
             Int32(1),
             Int32(1),
             [Reactant.Proto.xla.var"DeviceAssignmentProto.ComputationDevice"([device_id])],
@@ -107,15 +116,13 @@ function make_compile_options(;
     end
 
     for (key, val) in pairs(xla_executable_build_options)
-        executable_build_options = Setfield.set(
-            executable_build_options, Setfield.PropertyLens{key}(), val
-        )
+        setproperty!(executable_build_options, key, val)
     end
 
-    @set! compile_options.executable_build_options = executable_build_options
+    compile_options.executable_build_options = executable_build_options
 
     for (key, val) in pairs(xla_compile_options)
-        compile_options = Setfield.set(compile_options, Setfield.PropertyLens{key}(), val)
+        setproperty!(compile_options, key, val)
     end
 
     return compile_options
diff --git a/src/xla/IFRT/Array.jl b/src/xla/IFRT/Array.jl
index d3d6af708e..f3eef62fcd 100644
--- a/src/xla/IFRT/Array.jl
+++ b/src/xla/IFRT/Array.jl
@@ -370,8 +370,8 @@ function replicate_array_to_all_devices(array::Array, sharding, mesh, size_arr)
 
         exec = XLA.compile(
             XLA.client(array),
-            mod;
-            compile_options,
+            mod,
+            compile_options;
             num_outputs=1,                # unused
             num_parameters=1,             # unused
             is_sharded=true,
diff --git a/src/xla/IFRT/LoadedExecutable.jl b/src/xla/IFRT/LoadedExecutable.jl
index 45bfeaddb4..6162a67852 100644
--- a/src/xla/IFRT/LoadedExecutable.jl
+++ b/src/xla/IFRT/LoadedExecutable.jl
@@ -73,8 +73,8 @@ end
 
 function XLA.compile(
     client::Client,
-    mod::MLIR.IR.Module;
-    compile_options::Reactant.Proto.xla.CompileOptionsProto,
+    mod::MLIR.IR.Module,
+    compile_options::Reactant.Proto.xla.CompileOptionsProto;
     num_parameters::Int64,
     num_outputs::Int64,
     is_sharded::Bool,
@@ -97,6 +97,42 @@ function XLA.compile(
     )
 end
 
+function XLA.compile(  # IFRT overload for CompileOptionsWithoutProto: passes plain fields to mlir_c.ifrt_compile
+    client::Client,
+    mod::MLIR.IR.Module,
+    compile_options::Reactant.XLA.CompileOptionsWithoutProto;
+    num_parameters::Int64,  # only stored on the returned LoadedExecutable
+    num_outputs::Int64,  # only stored on the returned LoadedExecutable
+    is_sharded::Bool,  # only stored on the returned LoadedExecutable
+    num_replicas::Int64,  # passed to the native call and stored on the result
+    num_partitions::Int64,  # passed to the native call and stored on the result
+)
+    GC.@preserve client mod begin  # keep both wrappers alive while their raw handles are in use
+        exec = MLIR.IR.try_compile_dump_mlir(mod) do  # wrapper name suggests it dumps the MLIR on failure - confirm
+            @ccall MLIR.API.mlir_c.ifrt_compile(
+                client.client::Ptr{Cvoid},
+                mod.module_::MLIR.API.MlirModule,
+                compile_options.device_id::Clong,
+                compile_options.global_device_ids::Ptr{Clong},  # NOTE(review): Vector{Int64} passed as Ptr{Clong}; Clong is 32-bit on Windows - confirm C signature
+                length(compile_options.global_device_ids)::Clong,  # element count for the pointer above
+                XLA.CUDA_DATA_DIR[]::Cstring,
+                compile_options.use_shardy_partitioner::Bool,
+                num_replicas::Int64,
+                num_partitions::Int64,
+                compile_options.use_spmd_partitioning::Bool,
+                Reactant.PersistentCompileCache.kernel_cache_enabled()::Bool,
+                Reactant.PersistentCompileCache.get_kernel_cache_path()::Cstring,  # NOTE(review): called even when the kernel cache is disabled - confirm that is safe
+                Reactant.PersistentCompileCache.autotune_cache_enabled()::Bool,
+                Reactant.PersistentCompileCache.get_autotune_cache_directory()::Cstring,  # NOTE(review): called even when the autotune cache is disabled - confirm that is safe
+                Reactant.Distributed.local_rank()::Cint,  # presumably selects the autotune cache mode on the native side - TODO confirm
+            )::Ptr{Cvoid}
+        end
+    end
+    return LoadedExecutable(  # wraps the raw pointer returned by the native compile call
+        exec, num_outputs, num_parameters, is_sharded, num_replicas, num_partitions
+    )
+end
+
 @inline function XLA.execute(
     exec::LoadedExecutable,
     inputs::NTuple{N,Ptr{Cvoid}},
diff --git a/src/xla/PJRT/LoadedExecutable.jl b/src/xla/PJRT/LoadedExecutable.jl
index 14add5b4e6..7066c3b4d5 100644
--- a/src/xla/PJRT/LoadedExecutable.jl
+++ b/src/xla/PJRT/LoadedExecutable.jl
@@ -67,8 +67,8 @@ end
 
 function XLA.compile(
     client::Client,
-    mod::MLIR.IR.Module;
-    compile_options::Reactant.Proto.xla.CompileOptionsProto,
+    mod::MLIR.IR.Module,
+    compile_options::Reactant.Proto.xla.CompileOptionsProto;
     num_parameters::Int64,
     num_outputs::Int64,
     is_sharded::Bool,
@@ -91,6 +91,42 @@ function XLA.compile(
     )
 end
 
+function XLA.compile(  # PJRT overload for CompileOptionsWithoutProto: passes plain fields to mlir_c.ClientCompile
+    client::Client,
+    mod::MLIR.IR.Module,
+    compile_options::Reactant.XLA.CompileOptionsWithoutProto;
+    num_parameters::Int64,  # only stored on the returned LoadedExecutable
+    num_outputs::Int64,  # only stored on the returned LoadedExecutable
+    is_sharded::Bool,  # only stored on the returned LoadedExecutable
+    num_replicas::Int64,  # passed to the native call and stored on the result
+    num_partitions::Int64,  # passed to the native call and stored on the result
+)
+    GC.@preserve client mod begin  # keep both wrappers alive while their raw handles are in use
+        exec = MLIR.IR.try_compile_dump_mlir(mod) do  # wrapper name suggests it dumps the MLIR on failure - confirm
+            @ccall MLIR.API.mlir_c.ClientCompile(
+                client.client::Ptr{Cvoid},
+                mod.module_::MLIR.API.MlirModule,
+                compile_options.device_id::Clong,
+                compile_options.global_device_ids::Ptr{Clong},  # NOTE(review): Vector{Int64} passed as Ptr{Clong}; Clong is 32-bit on Windows - confirm C signature
+                length(compile_options.global_device_ids)::Clong,  # element count for the pointer above
+                XLA.CUDA_DATA_DIR[]::Cstring,
+                compile_options.use_shardy_partitioner::Bool,
+                num_replicas::Int64,
+                num_partitions::Int64,
+                compile_options.use_spmd_partitioning::Bool,
+                Reactant.PersistentCompileCache.kernel_cache_enabled()::Bool,
+                Reactant.PersistentCompileCache.get_kernel_cache_path()::Cstring,  # NOTE(review): called even when the kernel cache is disabled - confirm that is safe
+                Reactant.PersistentCompileCache.autotune_cache_enabled()::Bool,
+                Reactant.PersistentCompileCache.get_autotune_cache_directory()::Cstring,  # NOTE(review): called even when the autotune cache is disabled - confirm that is safe
+                Reactant.Distributed.local_rank()::Cint,  # presumably selects the autotune cache mode on the native side - TODO confirm
+            )::Ptr{Cvoid}
+        end
+    end
+    return LoadedExecutable(  # wraps the raw pointer returned by the native compile call
+        exec, num_outputs, num_parameters, is_sharded, num_replicas, num_partitions
+    )
+end
+
 function execute_ir(N, M, n_outs, with_device::Bool, nmesh_ids::Int64)
     ptr = @static if VERSION < v"1.12"
         sizeof(Int) == sizeof(Int64) ? "i64" : "i32"
diff --git a/src/xla/XLA.jl b/src/xla/XLA.jl
index 8cff7d44c2..6e56c020bc 100644
--- a/src/xla/XLA.jl
+++ b/src/xla/XLA.jl
@@ -9,8 +9,6 @@ using Enzyme: Compiler
 using Preferences: load_preference
 using UUIDs: UUID
 
-using Setfield: Setfield, @set!
-
 const XLA_REACTANT_GPU_MEM_FRACTION = Ref{Float64}(0.75)
 const XLA_REACTANT_GPU_PREALLOCATE = Ref{Bool}(true)
 const REACTANT_XLA_RUNTIME = load_preference(
@@ -28,6 +26,8 @@ function LLVMclopts(opts...)
     )::Cvoid
 end
 
+include("CompileOptions.jl")  # must load before PJRT/IFRT: their XLA.compile methods name CompileOptionsWithoutProto in their signatures
+
 include("Distributed.jl")
 include("Client.jl")
 include("Device.jl")
@@ -48,8 +48,6 @@ include("PJRT/PJRT.jl")
 
 include("IFRT/IFRT.jl")
 
-include("CompileOptions.jl")
-
 abstract type AbstractBackendState end
 
 for runtime in (:PJRT, :IFRT)