diff --git a/Project.toml b/Project.toml index 99d1605b69..a149b51dad 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,10 @@ name = "Reactant" uuid = "3c362404-f566-11ee-1572-e11a4b42c853" -authors = ["William Moses ", "Valentin Churavy ", "Sergio Sánchez Ramírez ", "Paul Berg ", "Avik Pal ", "Mosè Giordano "] version = "0.2.203" +authors = ["William Moses ", "Valentin Churavy ", "Sergio Sánchez Ramírez ", "Paul Berg ", "Avik Pal ", "Mosè Giordano "] + +[workspace] +projects = ["docs", "test", "benchmark"] [deps] Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" @@ -30,7 +33,6 @@ ReactantCore = "a3311ec8-5e00-46d5-b541-4f83e724a433" Reactant_jll = "0192cb87-2b54-54ad-80e0-3be72ad8a3c0" ScopedValues = "7e506255-f358-4e82-b7e4-beb19740aa63" Scratch = "6c6a2e73-6563-6170-7368-637461726353" -Setfield = "efcf1570-3423-57d1-acb7-fd33fddbac46" Sockets = "6462fe0b-24de-5631-8697-dd941f90decc" UUIDs = "cf7118a7-6976-5b1a-9a39-7adc72f591a4" p7zip_jll = "3f19e933-33d8-53b3-aaab-bd5110c3b7a0" @@ -124,7 +126,6 @@ ReactantCore = "0.1.16" Reactant_jll = "0.0.305" ScopedValues = "1.3.0" Scratch = "1.2" -Setfield = "1.1.2" Sockets = "1.10" SparseArrays = "1.10" SpecialFunctions = "2.4" @@ -141,6 +142,3 @@ ArrayInterface = "4fba245c-0d91-5ea0-9b3e-6abc04ee57a9" CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" NNlib = "872c559c-99b0-510c-b3b7-b6c96a88d5cd" PythonCall = "6099a3de-0909-46bc-b1f4-468b9a2dfc0d" - -[workspace] -projects = ["docs", "test", "benchmark"] diff --git a/src/Compiler.jl b/src/Compiler.jl index 966e93181d..295cfdc241 100644 --- a/src/Compiler.jl +++ b/src/Compiler.jl @@ -3763,8 +3763,8 @@ function compile_xla( exec = XLA.compile( client, - mod; - compile_options=xla_compile_options, + mod, + xla_compile_options; num_outputs=length(mlir_fn_res.linear_results), num_parameters=length(mlir_fn_res.linear_args), mlir_fn_res.is_sharded, diff --git a/src/proto/google/protobuf/any_pb.jl b/src/proto/google/protobuf/any_pb.jl index 7fb8b3bc1f..c49a0a0357 100644 --- a/src/proto/google/protobuf/any_pb.jl +++ b/src/proto/google/protobuf/any_pb.jl @@ -5,7 +5,7 @@ using ProtoBuf.EnumX: @enumx export var"#Any" -struct var"#Any" +mutable struct var"#Any" type_url::String value::Vector{UInt8} end diff --git a/src/proto/google/protobuf/duration_pb.jl b/src/proto/google/protobuf/duration_pb.jl index 6b763a9102..9b1ca2271d 100644 --- a/src/proto/google/protobuf/duration_pb.jl +++ b/src/proto/google/protobuf/duration_pb.jl @@ -5,7 +5,7 @@ using ProtoBuf.EnumX: @enumx export Duration -struct Duration +mutable struct Duration seconds::Int64 nanos::Int32 end diff --git a/src/proto/google/protobuf/timestamp_pb.jl b/src/proto/google/protobuf/timestamp_pb.jl index 02b5887556..4f2e19cd0a 100644 --- a/src/proto/google/protobuf/timestamp_pb.jl +++ b/src/proto/google/protobuf/timestamp_pb.jl @@ -5,7 +5,7 @@ using ProtoBuf.EnumX: @enumx export Timestamp -struct Timestamp +mutable struct Timestamp seconds::Int64 nanos::Int32 end diff --git a/src/proto/google/protobuf/wrappers_pb.jl b/src/proto/google/protobuf/wrappers_pb.jl index 397ad4f251..06b78e538d 100644 --- a/src/proto/google/protobuf/wrappers_pb.jl +++ b/src/proto/google/protobuf/wrappers_pb.jl @@ -6,7 +6,7 @@ export BoolValue, Int64Value, FloatValue, Int32Value, DoubleValue, UInt64Value, export BytesValue, StringValue -struct BoolValue +mutable struct BoolValue value::Bool end PB.default_values(::Type{BoolValue}) = (;value = false) @@ -36,7 +36,7 @@ function PB._encoded_size(x::BoolValue) return encoded_size end -struct Int64Value +mutable struct Int64Value value::Int64 end PB.default_values(::Type{Int64Value}) = (;value = zero(Int64)) @@ -66,7 +66,7 @@ function PB._encoded_size(x::Int64Value) return encoded_size end -struct FloatValue +mutable struct FloatValue value::Float32 end PB.default_values(::Type{FloatValue}) = (;value = zero(Float32)) @@ -96,7 +96,7 @@ function PB._encoded_size(x::FloatValue) return encoded_size end -struct Int32Value +mutable struct Int32Value value::Int32 end PB.default_values(::Type{Int32Value}) = (;value = zero(Int32)) @@ -126,7 +126,7 @@ function PB._encoded_size(x::Int32Value) return encoded_size end -struct DoubleValue +mutable struct DoubleValue value::Float64 end PB.default_values(::Type{DoubleValue}) = (;value = zero(Float64)) @@ -156,7 +156,7 @@ function PB._encoded_size(x::DoubleValue) return encoded_size end -struct UInt64Value +mutable struct UInt64Value value::UInt64 end PB.default_values(::Type{UInt64Value}) = (;value = zero(UInt64)) @@ -186,7 +186,7 @@ function PB._encoded_size(x::UInt64Value) return encoded_size end -struct UInt32Value +mutable struct UInt32Value value::UInt32 end PB.default_values(::Type{UInt32Value}) = (;value = zero(UInt32)) @@ -216,7 +216,7 @@ function PB._encoded_size(x::UInt32Value) return encoded_size end -struct BytesValue +mutable struct BytesValue value::Vector{UInt8} end PB.default_values(::Type{BytesValue}) = (;value = UInt8[]) @@ -246,7 +246,7 @@ function PB._encoded_size(x::BytesValue) return encoded_size end -struct StringValue +mutable struct StringValue value::String end PB.default_values(::Type{StringValue}) = (;value = "") diff --git a/src/proto/stream_executor/cuda_compute_capability_pb.jl b/src/proto/stream_executor/cuda_compute_capability_pb.jl index 64306bb8dc..c6f1b40094 100644 --- a/src/proto/stream_executor/cuda_compute_capability_pb.jl +++ b/src/proto/stream_executor/cuda_compute_capability_pb.jl @@ -7,7 +7,7 @@ export var"CudaComputeCapabilityProto.FeatureExtension", CudaComputeCapabilityPr @enumx var"CudaComputeCapabilityProto.FeatureExtension" UNSPECIFIED=0 NONE=1 ACCELERATED_FEATURES=2 FAMILY_COMPATIBLE_FEATURES=3 -struct CudaComputeCapabilityProto +mutable struct CudaComputeCapabilityProto major::Int32 minor::Int32 feature_extension::var"CudaComputeCapabilityProto.FeatureExtension".T diff --git a/src/proto/stream_executor/device_description_pb.jl b/src/proto/stream_executor/device_description_pb.jl index 86a3b39964..3ee940eb0a 100644 --- a/src/proto/stream_executor/device_description_pb.jl +++ b/src/proto/stream_executor/device_description_pb.jl @@ -6,7 +6,7 @@ export RocmComputeCapabilityProto, DnnVersionInfoProto, RuntimeVersionProto export GpuDeviceInfoProto, GpuComputeCapabilityProto, GpuTargetConfigProto -struct RocmComputeCapabilityProto +mutable struct RocmComputeCapabilityProto gcn_arch_name::String end PB.default_values(::Type{RocmComputeCapabilityProto}) = (;gcn_arch_name = "") @@ -36,7 +36,7 @@ function PB._encoded_size(x::RocmComputeCapabilityProto) return encoded_size end -struct DnnVersionInfoProto +mutable struct DnnVersionInfoProto major::Int32 minor::Int32 patch::Int32 @@ -78,7 +78,7 @@ function PB._encoded_size(x::DnnVersionInfoProto) return encoded_size end -struct RuntimeVersionProto +mutable struct RuntimeVersionProto major::Int32 minor::Int32 patch::Int32 @@ -120,7 +120,7 @@ function PB._encoded_size(x::RuntimeVersionProto) return encoded_size end -struct GpuDeviceInfoProto +mutable struct GpuDeviceInfoProto threads_per_block_limit::Int32 threads_per_warp::Int32 shared_memory_per_block::Int32 @@ -267,7 +267,7 @@ function PB._encoded_size(x::GpuDeviceInfoProto) return encoded_size end -struct GpuComputeCapabilityProto +mutable struct GpuComputeCapabilityProto compute_capability::Union{Nothing,OneOf{<:Union{CudaComputeCapabilityProto,RocmComputeCapabilityProto}}} end PB.oneof_field_types(::Type{GpuComputeCapabilityProto}) = (; @@ -312,7 +312,7 @@ function PB._encoded_size(x::GpuComputeCapabilityProto) return encoded_size end -struct GpuTargetConfigProto +mutable struct GpuTargetConfigProto gpu_device_info::Union{Nothing,GpuDeviceInfoProto} platform_name::String dnn_version_info::Union{Nothing,DnnVersionInfoProto} diff --git a/src/proto/tensorflow/profiler/diagnostics_pb.jl b/src/proto/tensorflow/profiler/diagnostics_pb.jl index c2d6990cd7..0d205b057e 100644 --- a/src/proto/tensorflow/profiler/diagnostics_pb.jl +++ b/src/proto/tensorflow/profiler/diagnostics_pb.jl @@ -5,7 +5,7 @@ using ProtoBuf.EnumX: @enumx export Diagnostics -struct Diagnostics +mutable struct Diagnostics info::Vector{String} warnings::Vector{String} errors::Vector{String} diff --git a/src/proto/tensorflow/profiler/hardware_types_pb.jl b/src/proto/tensorflow/profiler/hardware_types_pb.jl index 9b4f8e3f0a..c4995d4e95 100644 --- a/src/proto/tensorflow/profiler/hardware_types_pb.jl +++ b/src/proto/tensorflow/profiler/hardware_types_pb.jl @@ -7,7 +7,7 @@ export HardwareType, GPUComputeCapability, DeviceCapabilities @enumx HardwareType UNKNOWN_HARDWARE=0 CPU_ONLY=1 GPU=2 TPU=3 -struct GPUComputeCapability +mutable struct GPUComputeCapability major::UInt32 minor::UInt32 end @@ -43,7 +43,7 @@ function PB._encoded_size(x::GPUComputeCapability) return encoded_size end -struct DeviceCapabilities +mutable struct DeviceCapabilities clock_rate_in_ghz::Float64 num_cores::UInt32 memory_size_in_bytes::UInt64 diff --git a/src/proto/tensorflow/profiler/hlo_stats/hlo_stats_pb.jl b/src/proto/tensorflow/profiler/hlo_stats/hlo_stats_pb.jl index d5a01b81b2..94096c455a 100644 --- a/src/proto/tensorflow/profiler/hlo_stats/hlo_stats_pb.jl +++ b/src/proto/tensorflow/profiler/hlo_stats/hlo_stats_pb.jl @@ -5,7 +5,7 @@ using ProtoBuf.EnumX: @enumx export HloStatsRecord, HloStatsDatabase -struct HloStatsRecord +mutable struct HloStatsRecord rank::UInt64 program_id::UInt64 hlo_category::String @@ -240,7 +240,7 @@ function PB._encoded_size(x::HloStatsRecord) return encoded_size end -struct HloStatsDatabase +mutable struct HloStatsDatabase hlo_stats_record::Vector{HloStatsRecord} end PB.default_values(::Type{HloStatsDatabase}) = (;hlo_stats_record = Vector{HloStatsRecord}()) diff --git a/src/proto/tensorflow/profiler/input_pipeline_pb.jl b/src/proto/tensorflow/profiler/input_pipeline_pb.jl index 61d9dc6242..9497f26308 100644 --- a/src/proto/tensorflow/profiler/input_pipeline_pb.jl +++ b/src/proto/tensorflow/profiler/input_pipeline_pb.jl @@ -7,7 +7,7 @@ export InputPipelineAnalysisRecommendation, InputOpDetails, StepSummary export GenericStepTimeBreakdown, InputPipelineAnalysisResult -struct InputTimeBreakdown +mutable struct InputTimeBreakdown demanded_file_read_us::Float64 advanced_file_read_us::Float64 preprocessing_us::Float64 @@ -61,7 +61,7 @@ function PB._encoded_size(x::InputTimeBreakdown) return encoded_size end -struct PerGenericStepDetails +mutable struct PerGenericStepDetails step_number::Int32 step_name::String step_time_ms::Float64 @@ -164,7 +164,7 @@ function PB._encoded_size(x::PerGenericStepDetails) return encoded_size end -struct BottleneckAnalysis +mutable struct BottleneckAnalysis input_percent::Float64 output_percent::Float64 idle_percent::Float64 @@ -260,7 +260,7 @@ function PB._encoded_size(x::BottleneckAnalysis) return encoded_size end -struct InputPipelineAnalysisRecommendation +mutable struct InputPipelineAnalysisRecommendation details::Vector{String} bottleneck_analysis::Union{Nothing,google.protobuf.var"#Any"} summary_next_step::String @@ -302,7 +302,7 @@ function PB._encoded_size(x::InputPipelineAnalysisRecommendation) return encoded_size end -struct InputOpDetails +mutable struct InputOpDetails op_name::String count::UInt64 time_in_ms::Float64 @@ -368,7 +368,7 @@ function PB._encoded_size(x::InputOpDetails) return encoded_size end -struct StepSummary +mutable struct StepSummary average::Float64 standard_deviation::Float64 minimum::Float64 @@ -416,7 +416,7 @@ function PB._encoded_size(x::StepSummary) return encoded_size end -struct GenericStepTimeBreakdown +mutable struct GenericStepTimeBreakdown unknown_time_ms_summary::Union{Nothing,StepSummary} host_wait_input_ms_summary::Union{Nothing,StepSummary} host_to_device_ms_summary::Union{Nothing,StepSummary} @@ -507,7 +507,7 @@ function PB._encoded_size(x::GenericStepTimeBreakdown) return encoded_size end -struct InputPipelineAnalysisResult +mutable struct InputPipelineAnalysisResult tag::Bool hardware_type::String step_time_summary::Union{Nothing,StepSummary} diff --git a/src/proto/tensorflow/profiler/kernel_stats_pb.jl b/src/proto/tensorflow/profiler/kernel_stats_pb.jl index f5d132ffe0..889f53813c 100644 --- a/src/proto/tensorflow/profiler/kernel_stats_pb.jl +++ b/src/proto/tensorflow/profiler/kernel_stats_pb.jl @@ -5,7 +5,7 @@ using ProtoBuf.EnumX: @enumx export KernelReport, KernelStatsDb -struct KernelReport +mutable struct KernelReport name::String registers_per_thread::UInt32 static_shmem_bytes::UInt32 @@ -113,7 +113,7 @@ function PB._encoded_size(x::KernelReport) return encoded_size end -struct KernelStatsDb +mutable struct KernelStatsDb reports::Vector{KernelReport} end PB.default_values(::Type{KernelStatsDb}) = (;reports = Vector{KernelReport}()) diff --git a/src/proto/tensorflow/profiler/memory_profile_pb.jl b/src/proto/tensorflow/profiler/memory_profile_pb.jl index 8e5dc9648b..7ad2d362e0 100644 --- a/src/proto/tensorflow/profiler/memory_profile_pb.jl +++ b/src/proto/tensorflow/profiler/memory_profile_pb.jl @@ -9,7 +9,7 @@ export MemoryProfile @enumx MemoryActivity UNKNOWN_ACTIVITY=0 ALLOCATION=1 DEALLOCATION=2 RESERVATION=3 EXPANSION=4 -struct MemoryAggregationStats +mutable struct MemoryAggregationStats stack_reserved_bytes::Int64 heap_allocated_bytes::Int64 free_memory_bytes::Int64 @@ -63,7 +63,7 @@ function PB._encoded_size(x::MemoryAggregationStats) return encoded_size end -struct ActiveAllocation +mutable struct ActiveAllocation snapshot_index::Int64 special_index::Int64 num_occurrences::Int64 @@ -105,7 +105,7 @@ function PB._encoded_size(x::ActiveAllocation) return encoded_size end -struct MemoryActivityMetadata +mutable struct MemoryActivityMetadata memory_activity::MemoryActivity.T requested_bytes::Int64 allocation_bytes::Int64 @@ -183,7 +183,7 @@ function PB._encoded_size(x::MemoryActivityMetadata) return encoded_size end -struct MemoryProfileSummary +mutable struct MemoryProfileSummary peak_bytes_usage_lifetime::Int64 peak_stats::Union{Nothing,MemoryAggregationStats} peak_stats_time_ps::Int64 @@ -231,7 +231,7 @@ function PB._encoded_size(x::MemoryProfileSummary) return encoded_size end -struct MemoryProfileSnapshot +mutable struct MemoryProfileSnapshot time_offset_ps::Int64 aggregation_stats::Union{Nothing,MemoryAggregationStats} activity_metadata::Union{Nothing,MemoryActivityMetadata} @@ -273,7 +273,7 @@ function PB._encoded_size(x::MemoryProfileSnapshot) return encoded_size end -struct PerAllocatorMemoryProfile +mutable struct PerAllocatorMemoryProfile memory_profile_snapshots::Vector{MemoryProfileSnapshot} profile_summary::Union{Nothing,MemoryProfileSummary} active_allocations::Vector{ActiveAllocation} @@ -327,7 +327,7 @@ function PB._encoded_size(x::PerAllocatorMemoryProfile) return encoded_size end -struct MemoryProfile +mutable struct MemoryProfile memory_profile_per_allocator::Dict{String,PerAllocatorMemoryProfile} num_hosts::Int32 memory_ids::Vector{String} diff --git a/src/proto/tensorflow/profiler/op_metrics_pb.jl b/src/proto/tensorflow/profiler/op_metrics_pb.jl index a242f228b1..0d87ed2f7e 100644 --- a/src/proto/tensorflow/profiler/op_metrics_pb.jl +++ b/src/proto/tensorflow/profiler/op_metrics_pb.jl @@ -11,7 +11,7 @@ abstract type var"##Abstract#OpMetrics" end abstract type var"##Abstract#OpMetricsDb" end -struct PrecisionStats +mutable struct PrecisionStats compute_16bit_ps::UInt64 compute_32bit_ps::UInt64 end @@ -59,7 +59,7 @@ end @enumx MemorySpace MEMORY_SPACE_UNDEFINED=0 MEMORY_SPACE_HBM=1 MEMORY_SPACE_ON_CHIP=2147483646 MEMORY_SPACE_ALL=2147483647 -struct var"PerformanceInfo.MemoryAccessed" +mutable struct var"PerformanceInfo.MemoryAccessed" is_read::Bool memory_space::var"PerformanceInfo.MemoryAccessed.MemorySpace".T bytes_accessed::Int64 @@ -101,7 +101,7 @@ function PB._encoded_size(x::var"PerformanceInfo.MemoryAccessed") return encoded_size end -struct var"LayoutAnalysis.Dimension" +mutable struct var"LayoutAnalysis.Dimension" size::Int32 alignment::Int32 semantics::LayoutDimensionSemantics.T @@ -143,7 +143,7 @@ function PB._encoded_size(x::var"LayoutAnalysis.Dimension") return encoded_size end -struct var"OpMetrics.MemoryAccessed" +mutable struct var"OpMetrics.MemoryAccessed" operation_type::var"OpMetrics.MemoryAccessed.OperationType".T memory_space::UInt64 bytes_accessed::UInt64 @@ -185,7 +185,7 @@ function PB._encoded_size(x::var"OpMetrics.MemoryAccessed") return encoded_size end -struct PerformanceInfo +mutable struct PerformanceInfo flops::Int64 bytes_accessed::Int64 memory_accessed_breakdown::Vector{var"PerformanceInfo.MemoryAccessed"} @@ -228,7 +228,7 @@ function PB._encoded_size(x::PerformanceInfo) return encoded_size end -struct LayoutAnalysis +mutable struct LayoutAnalysis dimensions::Vector{var"LayoutAnalysis.Dimension"} end PB.default_values(::Type{LayoutAnalysis}) = (;dimensions = Vector{var"LayoutAnalysis.Dimension"}()) @@ -258,7 +258,7 @@ function PB._encoded_size(x::LayoutAnalysis) return encoded_size end -struct MemoryAccessBreakdown +mutable struct MemoryAccessBreakdown memory_accessed::Vector{var"OpMetrics.MemoryAccessed"} end PB.default_values(::Type{MemoryAccessBreakdown}) = (;memory_accessed = Vector{var"OpMetrics.MemoryAccessed"}()) @@ -289,7 +289,7 @@ function PB._encoded_size(x::MemoryAccessBreakdown) end # Stub definitions for cyclic types -struct var"##Stub#OpMetrics"{T1<:var"##Abstract#OpMetricsDb"} <: var"##Abstract#OpMetrics" +mutable struct var"##Stub#OpMetrics"{T1<:var"##Abstract#OpMetricsDb"} <: var"##Abstract#OpMetrics" hlo_module_id::UInt64 name::String long_name::String @@ -317,7 +317,7 @@ struct var"##Stub#OpMetrics"{T1<:var"##Abstract#OpMetricsDb"} <: var"##Abstract# core_type::var"OpMetrics.TpuCoreType".T end -struct var"##Stub#OpMetricsDb" <: var"##Abstract#OpMetricsDb" +mutable struct var"##Stub#OpMetricsDb" <: var"##Abstract#OpMetricsDb" metrics_db::Vector{var"##Stub#OpMetrics"{var"##Stub#OpMetricsDb"}} total_host_infeed_enq_duration_ps::UInt64 total_host_infeed_enq_start_timestamp_ps_diff::UInt64 diff --git a/src/proto/tensorflow/profiler/op_profile/op_profile_pb.jl b/src/proto/tensorflow/profiler/op_profile/op_profile_pb.jl index 43e7d814bb..285fc26635 100644 --- a/src/proto/tensorflow/profiler/op_profile/op_profile_pb.jl +++ b/src/proto/tensorflow/profiler/op_profile/op_profile_pb.jl @@ -7,7 +7,7 @@ export Metrics, var"Node.XLAInstruction.LayoutAnalysis", var"Node.XLAInstruction export Profile -struct var"Node.InstructionCategory" end +mutable struct var"Node.InstructionCategory" end function PB.decode(d::PB.AbstractProtoDecoder, ::Type{<:var"Node.InstructionCategory"}) while !PB.message_done(d) @@ -26,7 +26,7 @@ function PB._encoded_size(x::var"Node.InstructionCategory") return encoded_size end -struct var"Node.XLAInstruction.LayoutAnalysis.Dimension" +mutable struct var"Node.XLAInstruction.LayoutAnalysis.Dimension" size::Int32 alignment::Int32 semantics::String @@ -68,7 +68,7 @@ function PB._encoded_size(x::var"Node.XLAInstruction.LayoutAnalysis.Dimension") return encoded_size end -struct Metrics +mutable struct Metrics flops::Float64 uncapped_flops::Float64 bandwidth_utils::Vector{Float64} @@ -153,7 +153,7 @@ function PB._encoded_size(x::Metrics) return encoded_size end -struct var"Node.XLAInstruction.LayoutAnalysis" +mutable struct var"Node.XLAInstruction.LayoutAnalysis" dimensions::Vector{var"Node.XLAInstruction.LayoutAnalysis.Dimension"} end PB.default_values(::Type{var"Node.XLAInstruction.LayoutAnalysis"}) = (;dimensions = Vector{var"Node.XLAInstruction.LayoutAnalysis.Dimension"}()) @@ -183,7 +183,7 @@ function PB._encoded_size(x::var"Node.XLAInstruction.LayoutAnalysis") return encoded_size end -struct var"Node.XLAInstruction" +mutable struct var"Node.XLAInstruction" op::String expression::String provenance::String @@ -267,7 +267,7 @@ function PB._encoded_size(x::var"Node.XLAInstruction") return encoded_size end -struct Node +mutable struct Node name::String metrics::Union{Nothing,Metrics} children::Vector{Node} @@ -336,7 +336,7 @@ function PB._encoded_size(x::Node) return encoded_size end -struct Profile +mutable struct Profile by_category::Union{Nothing,Node} by_program::Union{Nothing,Node} device_type::String diff --git a/src/proto/tensorflow/profiler/op_stats_pb.jl b/src/proto/tensorflow/profiler/op_stats_pb.jl index 4056282dd4..48b0d84233 100644 --- a/src/proto/tensorflow/profiler/op_stats_pb.jl +++ b/src/proto/tensorflow/profiler/op_stats_pb.jl @@ -6,7 +6,7 @@ export HostIndependentJobInfoResult, CoreDetails, HostDependentJobInfoResult export SystemTopology, PerformanceCounterResult, PerfEnv, RunEnvironment, OpStats -struct HostIndependentJobInfoResult +mutable struct HostIndependentJobInfoResult change_list::Int64 workspace_id::String snapshot::Int64 @@ -66,7 +66,7 @@ function PB._encoded_size(x::HostIndependentJobInfoResult) return encoded_size end -struct CoreDetails +mutable struct CoreDetails hostname::String device_ordinal::UInt32 core_num::UInt32 @@ -132,7 +132,7 @@ function PB._encoded_size(x::CoreDetails) return encoded_size end -struct HostDependentJobInfoResult +mutable struct HostDependentJobInfoResult host_id::String command_line::String start_time::Int64 @@ -186,7 +186,7 @@ function PB._encoded_size(x::HostDependentJobInfoResult) return encoded_size end -struct SystemTopology +mutable struct SystemTopology x_dimension::Int64 y_dimension::Int64 z_dimension::Int64 @@ -234,7 +234,7 @@ function PB._encoded_size(x::SystemTopology) return encoded_size end -struct PerformanceCounterResult +mutable struct PerformanceCounterResult matrix_unit_utilization_percent::Float64 hbm_utilization_percent::Float64 end @@ -270,7 +270,7 @@ function PB._encoded_size(x::PerformanceCounterResult) return encoded_size end -struct PerfEnv +mutable struct PerfEnv peak_tera_flops_per_second::Float64 peak_bw_giga_bytes_per_second::Float64 peak_hbm_bw_giga_bytes_per_second::Float64 @@ -342,7 +342,7 @@ function PB._encoded_size(x::PerfEnv) return encoded_size end -struct RunEnvironment +mutable struct RunEnvironment host_count::Int32 task_count::Int32 hostnames::Dict{String,Bool} @@ -451,7 +451,7 @@ function PB._encoded_size(x::RunEnvironment) return encoded_size end -struct OpStats +mutable struct OpStats host_op_metrics_db::Union{Nothing,OpMetricsDb} device_op_metrics_db::Union{Nothing,OpMetricsDb} hlo_metrics_db_complete_steps_only::Union{Nothing,OpMetricsDb} diff --git a/src/proto/tensorflow/profiler/overview_page_pb.jl b/src/proto/tensorflow/profiler/overview_page_pb.jl index c4bc5f80d3..aca5a4ac18 100644 --- a/src/proto/tensorflow/profiler/overview_page_pb.jl +++ b/src/proto/tensorflow/profiler/overview_page_pb.jl @@ -8,7 +8,7 @@ export OverviewPageAnalysis, OverviewInferenceLatency, OverviewPageRecommendatio export OverviewPageRunEnvironment, OverviewPage -struct OverviewTfOp +mutable struct OverviewTfOp name::String category::String self_time_fraction::Float64 @@ -74,7 +74,7 @@ function PB._encoded_size(x::OverviewTfOp) return encoded_size end -struct OverviewPageHostDependentJobInfo +mutable struct OverviewPageHostDependentJobInfo host_id::String command_line::String start_time::Int64 @@ -128,7 +128,7 @@ function PB._encoded_size(x::OverviewPageHostDependentJobInfo) return encoded_size end -struct OverviewLatencyBreakdown +mutable struct OverviewLatencyBreakdown total_latency_us::Float64 host_latency_us::Float64 device_latency_us::Float64 @@ -176,7 +176,7 @@ function PB._encoded_size(x::OverviewLatencyBreakdown) return encoded_size end -struct GenericRecommendation +mutable struct GenericRecommendation kernel_launch_bottleneck::String kernel_launch_statement::String all_other_bottleneck::String @@ -242,7 +242,7 @@ function PB._encoded_size(x::GenericRecommendation) return encoded_size end -struct OverviewPageTip +mutable struct OverviewPageTip link::String end PB.default_values(::Type{OverviewPageTip}) = (;link = "") @@ -272,7 +272,7 @@ function PB._encoded_size(x::OverviewPageTip) return encoded_size end -struct OverviewPageHostIndependentJobInfo +mutable struct OverviewPageHostIndependentJobInfo change_list::Int64 workspace_id::String snapshot::Int64 @@ -332,7 +332,7 @@ function PB._encoded_size(x::OverviewPageHostIndependentJobInfo) return encoded_size end -struct OverviewPageAnalysis +mutable struct OverviewPageAnalysis mxu_utilization_percent::Float64 device_idle_time_percent::Float64 host_idle_time_percent::Float64 @@ -542,7 +542,7 @@ function PB._encoded_size(x::OverviewPageAnalysis) return encoded_size end -struct OverviewInferenceLatency +mutable struct OverviewInferenceLatency percentile_numbers::Vector{Float64} latency_breakdowns::Vector{OverviewLatencyBreakdown} max_latency_us::Float64 @@ -596,7 +596,7 @@ function PB._encoded_size(x::OverviewInferenceLatency) return encoded_size end -struct OverviewPageRecommendation +mutable struct OverviewPageRecommendation bottleneck::String statement::String input_tips::Vector{OverviewPageTip} @@ -698,7 +698,7 @@ function PB._encoded_size(x::OverviewPageRecommendation) return encoded_size end -struct OverviewPageRunEnvironment +mutable struct OverviewPageRunEnvironment host_count::Int32 task_count::Int32 hostnames::Dict{String,Bool} @@ -789,7 +789,7 @@ function PB._encoded_size(x::OverviewPageRunEnvironment) return encoded_size end -struct OverviewPage +mutable struct OverviewPage run_environment::Union{Nothing,OverviewPageRunEnvironment} input_analysis::Union{Nothing,InputPipelineAnalysisResult} analysis::Union{Nothing,OverviewPageAnalysis} diff --git a/src/proto/tensorflow/profiler/power_metrics_pb.jl b/src/proto/tensorflow/profiler/power_metrics_pb.jl index 290652ab3d..3064821985 100644 --- a/src/proto/tensorflow/profiler/power_metrics_pb.jl +++ b/src/proto/tensorflow/profiler/power_metrics_pb.jl @@ -5,7 +5,7 @@ using ProtoBuf.EnumX: @enumx export PowerComponentMetrics, PowerMetrics -struct PowerComponentMetrics +mutable struct PowerComponentMetrics component_name::String max_power::Float64 avg_power::Float64 @@ -83,7 +83,7 @@ function PB._encoded_size(x::PowerComponentMetrics) return encoded_size end -struct PowerMetrics +mutable struct PowerMetrics power_component_metrics::Vector{PowerComponentMetrics} end PB.default_values(::Type{PowerMetrics}) = (;power_component_metrics = Vector{PowerComponentMetrics}()) diff --git a/src/proto/tensorflow/profiler/roofline_model/roofline_model_pb.jl b/src/proto/tensorflow/profiler/roofline_model/roofline_model_pb.jl index e1c2732254..b51e1be3bf 100644 --- a/src/proto/tensorflow/profiler/roofline_model/roofline_model_pb.jl +++ b/src/proto/tensorflow/profiler/roofline_model/roofline_model_pb.jl @@ -7,7 +7,7 @@ export RecordType, RooflineModelRecord, RooflineModelDatabase @enumx RecordType INVALID_RECORD_TYPE=0 ALL=1 AVERAGE_STEP=2 PER_STEP=3 ALL_HW=4 -struct RooflineModelRecord +mutable struct RooflineModelRecord record_type::RecordType.T step_num::UInt32 rank::UInt64 @@ -278,7 +278,7 @@ function PB._encoded_size(x::RooflineModelRecord) return encoded_size end -struct RooflineModelDatabase +mutable struct RooflineModelDatabase device_type::String megacore::Bool has_cmem::Bool diff --git a/src/proto/tensorflow/profiler/source_info_pb.jl b/src/proto/tensorflow/profiler/source_info_pb.jl index d9c675f82c..673de00685 100644 --- a/src/proto/tensorflow/profiler/source_info_pb.jl +++ b/src/proto/tensorflow/profiler/source_info_pb.jl @@ -5,7 +5,7 @@ using ProtoBuf.EnumX: @enumx export SourceInfo -struct SourceInfo +mutable struct SourceInfo file_name::String line_number::Int32 stack_frame::String diff --git a/src/proto/tensorflow/profiler/source_stats_pb.jl b/src/proto/tensorflow/profiler/source_stats_pb.jl index 133e12e87c..4c470ec83a 100644 --- a/src/proto/tensorflow/profiler/source_stats_pb.jl +++ b/src/proto/tensorflow/profiler/source_stats_pb.jl @@ -5,7 +5,7 @@ using ProtoBuf.EnumX: @enumx export var"SourceStats.Metric", var"SourceStats.FileMetrics", SourceStats -struct var"SourceStats.Metric" +mutable struct var"SourceStats.Metric" occurrences::UInt64 self_time_ps::UInt64 time_ps::UInt64 @@ -59,7 +59,7 @@ function PB._encoded_size(x::var"SourceStats.Metric") return encoded_size end -struct var"SourceStats.FileMetrics" +mutable struct var"SourceStats.FileMetrics" line_number_to_metric::Dict{Int32,var"SourceStats.Metric"} end PB.default_values(::Type{var"SourceStats.FileMetrics"}) = (;line_number_to_metric = Dict{Int32,var"SourceStats.Metric"}()) @@ -89,7 +89,7 @@ function PB._encoded_size(x::var"SourceStats.FileMetrics") return encoded_size end -struct SourceStats +mutable struct SourceStats file_name_to_metrics::Dict{String,var"SourceStats.FileMetrics"} end PB.default_values(::Type{SourceStats}) = (;file_name_to_metrics = Dict{String,var"SourceStats.FileMetrics"}()) diff --git a/src/proto/tensorflow/profiler/steps_db_pb.jl b/src/proto/tensorflow/profiler/steps_db_pb.jl index 1ecc59ff1f..e488155954 100644 --- a/src/proto/tensorflow/profiler/steps_db_pb.jl +++ b/src/proto/tensorflow/profiler/steps_db_pb.jl @@ -7,7 +7,7 @@ export TpuStepBreakdown, StepInfoResult, AllReduceDbResult, PerCoreStepInfo export StepDatabaseResult -struct DeviceMemoryTransfer +mutable struct DeviceMemoryTransfer occurrence::UInt64 time_us::Float64 bytes_transferred::UInt64 @@ -49,7 +49,7 @@ function PB._encoded_size(x::DeviceMemoryTransfer) return encoded_size end -struct GenericStepBreakdown +mutable struct GenericStepBreakdown type_ps::Dict{Int32,UInt64} category_ps::Dict{String,UInt64} end @@ -85,7 +85,7 @@ function PB._encoded_size(x::GenericStepBreakdown) return encoded_size end -struct AllReduceInfo +mutable struct AllReduceInfo id::UInt64 name::String all_reduce_id::UInt64 @@ -145,7 +145,7 @@ function PB._encoded_size(x::AllReduceInfo) return encoded_size end -struct SparseCoreStepBreakdown +mutable struct SparseCoreStepBreakdown sc_compute_ps::UInt64 sc_infeed_ps::UInt64 sc_outfeed_ps::UInt64 @@ -199,7 +199,7 @@ function PB._encoded_size(x::SparseCoreStepBreakdown) return encoded_size end -struct TpuStepBreakdown +mutable struct TpuStepBreakdown infeed_duration_ps::UInt64 host_outfeed_ps::UInt64 wait_for_scv0_duration_ps::UInt64 @@ -332,7 +332,7 @@ function PB._encoded_size(x::TpuStepBreakdown) return encoded_size end -struct StepInfoResult +mutable struct StepInfoResult step_num::UInt32 step_name::String duration_ps::UInt64 @@ -392,7 +392,7 @@ function PB._encoded_size(x::StepInfoResult) return encoded_size end -struct AllReduceDbResult +mutable struct AllReduceDbResult all_reduce_info::Vector{AllReduceInfo} end PB.default_values(::Type{AllReduceDbResult}) = (;all_reduce_info = Vector{AllReduceInfo}()) @@ -422,7 +422,7 @@ function PB._encoded_size(x::AllReduceDbResult) return encoded_size end -struct PerCoreStepInfo +mutable struct PerCoreStepInfo step_num::UInt32 step_info_per_core::Dict{UInt32,StepInfoResult} hlo_metrics_db::Union{Nothing,OpMetricsDb} @@ -483,7 +483,7 @@ function PB._encoded_size(x::PerCoreStepInfo) return encoded_size end -struct StepDatabaseResult +mutable struct StepDatabaseResult step_sequence::Vector{PerCoreStepInfo} use_incomplete_step::Bool num_steps_dropped::UInt32 diff --git a/src/proto/tensorflow/profiler/task_pb.jl b/src/proto/tensorflow/profiler/task_pb.jl index 61b9016089..9dd1923b6e 100644 --- a/src/proto/tensorflow/profiler/task_pb.jl +++ b/src/proto/tensorflow/profiler/task_pb.jl @@ -5,7 +5,7 @@ using ProtoBuf.EnumX: @enumx export Task -struct Task +mutable struct Task changelist::Int64 workspace_id::String snapshot::Int64 diff --git a/src/proto/tensorflow/profiler/tf_function_pb.jl b/src/proto/tensorflow/profiler/tf_function_pb.jl index ca4d411cb0..08d55342a1 100644 --- a/src/proto/tensorflow/profiler/tf_function_pb.jl +++ b/src/proto/tensorflow/profiler/tf_function_pb.jl @@ -6,7 +6,7 @@ export TfFunctionMetrics, TfFunctionCompiler, TfFunctionExecutionMode, TfFunctio export TfFunctionDb -struct TfFunctionMetrics +mutable struct TfFunctionMetrics count::UInt64 self_time_ps::UInt64 end @@ -46,7 +46,7 @@ end @enumx TfFunctionExecutionMode INVALID_MODE=0 EAGER_MODE=1 TRACED_MODE=2 NOT_TRACED_MODE=3 CONCRETE_MODE=4 -struct TfFunction +mutable struct TfFunction metrics::Dict{Int32,TfFunctionMetrics} total_tracing_count::Int64 compiler::TfFunctionCompiler.T @@ -94,7 +94,7 @@ function PB._encoded_size(x::TfFunction) return encoded_size end -struct TfFunctionDb +mutable struct TfFunctionDb tf_functions::Dict{String,TfFunction} end PB.default_values(::Type{TfFunctionDb}) = (;tf_functions = Dict{String,TfFunction}()) diff --git a/src/proto/tensorflow/profiler/topology_pb.jl b/src/proto/tensorflow/profiler/topology_pb.jl index bb7c14af14..67cdda8f5c 100644 --- a/src/proto/tensorflow/profiler/topology_pb.jl +++ b/src/proto/tensorflow/profiler/topology_pb.jl @@ -7,7 +7,7 @@ export var"LogicalTopology.LogicalDevice", Topology, var"LogicalTopology.Logical export var"LogicalTopology.LogicalSlice", LogicalTopology -struct var"LogicalTopology.HostNetworkAddress" +mutable struct var"LogicalTopology.HostNetworkAddress" address::String interface_name::String end @@ -43,7 +43,7 @@ function PB._encoded_size(x::var"LogicalTopology.HostNetworkAddress") return encoded_size end -struct TopologyLocation +mutable struct TopologyLocation x::Int32 y::Int32 z::Int32 @@ -115,7 +115,7 @@ function PB._encoded_size(x::TopologyLocation) return encoded_size end -struct TopologyDimension +mutable struct TopologyDimension x::Int32 y::Int32 z::Int32 @@ -157,7 +157,7 @@ function PB._encoded_size(x::TopologyDimension) return encoded_size end -struct var"LogicalTopology.LogicalDevice" +mutable struct var"LogicalTopology.LogicalDevice" global_id::Int32 slice_local_id::Int32 host_local_id::Int32 @@ -199,7 +199,7 @@ function PB._encoded_size(x::var"LogicalTopology.LogicalDevice") return encoded_size end -struct Topology +mutable struct Topology chips_per_host_bounds::Union{Nothing,TopologyDimension} host_bounds::Union{Nothing,TopologyDimension} mesh_location::Vector{TopologyLocation} @@ -241,7 +241,7 @@ function PB._encoded_size(x::Topology) return encoded_size end -struct var"LogicalTopology.LogicalHost" +mutable struct var"LogicalTopology.LogicalHost" slice_local_id::Int32 network_addresses::Vector{var"LogicalTopology.HostNetworkAddress"} devices::Vector{var"LogicalTopology.LogicalDevice"} @@ -283,7 +283,7 @@ function PB._encoded_size(x::var"LogicalTopology.LogicalHost") return encoded_size end -struct var"LogicalTopology.LogicalSlice" +mutable struct var"LogicalTopology.LogicalSlice" global_id::Int32 hosts::Vector{var"LogicalTopology.LogicalHost"} end @@ -319,7 +319,7 @@ function PB._encoded_size(x::var"LogicalTopology.LogicalSlice") return encoded_size end -struct LogicalTopology +mutable struct LogicalTopology slices::Vector{var"LogicalTopology.LogicalSlice"} end PB.default_values(::Type{LogicalTopology}) = (;slices = Vector{var"LogicalTopology.LogicalSlice"}()) diff --git a/src/proto/tensorflow/profiler/trace_events_pb.jl b/src/proto/tensorflow/profiler/trace_events_pb.jl index d54024fa74..b222ebb0fb 100644 --- a/src/proto/tensorflow/profiler/trace_events_pb.jl +++ b/src/proto/tensorflow/profiler/trace_events_pb.jl @@ -11,7 +11,7 @@ export Device, Trace @enumx var"TraceEvent.EventType" EVENT_TYPE_UNSPECIFIED=0 EVENT_TYPE_COMPLETE=1 EVENT_TYPE_ASYNC=3 EVENT_TYPE_COUNTER=4 PB.reserved_fields(::Type{var"TraceEvent.EventType".T}) = (names = String[], numbers = Union{Int,UnitRange{Int}}[2]) -struct Resource +mutable struct Resource name::String resource_id::UInt64 num_events::UInt32 @@ -53,7 +53,7 @@ function PB._encoded_size(x::Resource) return encoded_size end -struct TraceEvent +mutable struct TraceEvent var"#type"::var"TraceEvent.EventType".T device_id::UInt32 resource_id::UInt64 @@ -165,7 +165,7 @@ function PB._encoded_size(x::TraceEvent) return encoded_size end -struct Device +mutable struct Device name::String device_id::UInt32 resources::Dict{UInt64,Resource} @@ -208,7 +208,7 @@ function PB._encoded_size(x::Device) return encoded_size end -struct Trace +mutable struct Trace devices::Dict{UInt32,Device} tasks::Dict{UInt32,Task} min_timestamp_ps::UInt64 diff --git a/src/proto/tensorflow/profiler/xplane_pb.jl b/src/proto/tensorflow/profiler/xplane_pb.jl index 056ab661a9..5434ba765d 100644 --- a/src/proto/tensorflow/profiler/xplane_pb.jl +++ b/src/proto/tensorflow/profiler/xplane_pb.jl @@ -5,7 +5,7 @@ using ProtoBuf.EnumX: @enumx export XStatMetadata, XStat, XEventMetadata, XEvent, XLine, XPlane, XSpace -struct XStatMetadata +mutable struct XStatMetadata id::Int64 name::String description::String @@ -47,7 +47,7 @@ function PB._encoded_size(x::XStatMetadata) return encoded_size end -struct XStat +mutable struct XStat metadata_id::Int64 value::Union{Nothing,OneOf{<:Union{Float64,UInt64,Int64,String,Vector{UInt8}}}} end @@ -122,7 +122,7 @@ function PB._encoded_size(x::XStat) return encoded_size end -struct XEventMetadata +mutable struct XEventMetadata id::Int64 name::String display_name::String @@ -182,7 +182,7 @@ function PB._encoded_size(x::XEventMetadata) return encoded_size end -struct XEvent +mutable struct XEvent metadata_id::Int64 data::Union{Nothing,OneOf{Int64}} duration_ps::Int64 @@ -245,7 +245,7 @@ function PB._encoded_size(x::XEvent) return encoded_size end -struct XLine +mutable struct XLine id::Int64 display_id::Int64 name::String @@ -312,7 +312,7 @@ function PB._encoded_size(x::XLine) return encoded_size end -struct XPlane +mutable struct XPlane id::Int64 name::String lines::Vector{XLine} @@ -372,7 +372,7 @@ function PB._encoded_size(x::XPlane) return encoded_size end -struct XSpace +mutable struct XSpace planes::Vector{XPlane} errors::Vector{String} warnings::Vector{String} diff --git a/src/proto/tensorflow/profiler_options_pb.jl b/src/proto/tensorflow/profiler_options_pb.jl index 4fc2810798..8e0dc31a64 100644 --- a/src/proto/tensorflow/profiler_options_pb.jl +++ b/src/proto/tensorflow/profiler_options_pb.jl @@ -7,7 +7,7 @@ export var"ProfileOptions.AdvancedConfigValue", ProfileOptions export RemoteProfilerSessionManagerOptions -struct var"ProfileOptions.TraceOptions" +mutable struct var"ProfileOptions.TraceOptions" host_traceme_filter_mask::UInt64 end PB.default_values(::Type{var"ProfileOptions.TraceOptions"}) = (;host_traceme_filter_mask = zero(UInt64)) @@ -39,7 +39,7 @@ end @enumx var"ProfileOptions.DeviceType" UNSPECIFIED=0 CPU=1 GPU=2 TPU=3 PLUGGABLE_DEVICE=4 -struct var"ProfileOptions.AdvancedConfigValue" +mutable struct var"ProfileOptions.AdvancedConfigValue" value::Union{Nothing,OneOf{<:Union{String,Bool,Int64}}} end PB.oneof_field_types(::Type{var"ProfileOptions.AdvancedConfigValue"}) = (; @@ -90,7 +90,7 @@ function PB._encoded_size(x::var"ProfileOptions.AdvancedConfigValue") return encoded_size end -struct ProfileOptions +mutable struct ProfileOptions version::UInt32 device_type::var"ProfileOptions.DeviceType".T include_dataset_ops::Bool @@ -198,7 +198,7 @@ function PB._encoded_size(x::ProfileOptions) return encoded_size end -struct RemoteProfilerSessionManagerOptions +mutable struct RemoteProfilerSessionManagerOptions profiler_options::Union{Nothing,ProfileOptions} service_addresses::Vector{String} session_creation_timestamp_ns::UInt64 diff --git a/src/proto/tensorflow/tfprof/pprof/profile_pb.jl b/src/proto/tensorflow/tfprof/pprof/profile_pb.jl index a459b58aa6..42601dda14 100644 --- a/src/proto/tensorflow/tfprof/pprof/profile_pb.jl +++ b/src/proto/tensorflow/tfprof/pprof/profile_pb.jl @@ -5,7 +5,7 @@ using ProtoBuf.EnumX: @enumx export Line, Label, ValueType, Mapping, Function, Location, Sample, Profile -struct Line +mutable struct Line function_id::UInt64 line::Int64 end @@ -41,7 +41,7 @@ function PB._encoded_size(x::Line) return encoded_size end -struct Label +mutable struct Label key::Int64 str::Int64 num::Int64 @@ -83,7 +83,7 @@ function PB._encoded_size(x::Label) return encoded_size end -struct ValueType +mutable struct ValueType var"#type"::Int64 unit::Int64 end @@ -119,7 +119,7 @@ function PB._encoded_size(x::ValueType) return encoded_size end -struct Mapping +mutable struct Mapping id::UInt64 memory_start::UInt64 memory_limit::UInt64 @@ -203,7 +203,7 @@ function PB._encoded_size(x::Mapping) return encoded_size end -struct Function +mutable struct Function id::UInt64 name::Int64 system_name::Int64 @@ -257,7 +257,7 @@ function PB._encoded_size(x::Function) return encoded_size end -struct Location +mutable struct Location id::UInt64 mapping_id::UInt64 address::UInt64 @@ -305,7 +305,7 @@ function PB._encoded_size(x::Location) return encoded_size end -struct Sample +mutable struct Sample location_id::Vector{UInt64} value::Vector{Int64} label::Vector{Label} @@ -347,7 +347,7 @@ function PB._encoded_size(x::Sample) return encoded_size end -struct Profile +mutable struct Profile sample_type::Vector{ValueType} sample::Vector{Sample} mapping::Vector{Mapping} diff --git a/src/proto/tsl/profiler/trace_events_pb.jl b/src/proto/tsl/profiler/trace_events_pb.jl index 1eee3aa8f1..bbaab7b90c 100644 --- a/src/proto/tsl/profiler/trace_events_pb.jl +++ b/src/proto/tsl/profiler/trace_events_pb.jl @@ -5,7 +5,7 @@ using ProtoBuf.EnumX: @enumx export Resource, TraceEvent, Device, Trace -struct Resource +mutable struct Resource name::String resource_id::UInt32 sort_index::UInt32 @@ -47,7 +47,7 @@ function PB._encoded_size(x::Resource) return encoded_size end -struct TraceEvent +mutable struct TraceEvent device_id::UInt32 resource_id::UInt32 name::String @@ -107,7 +107,7 @@ function PB._encoded_size(x::TraceEvent) return encoded_size end -struct Device +mutable struct Device name::String device_id::UInt32 resources::Dict{UInt32,Resource} @@ -149,7 +149,7 @@ function PB._encoded_size(x::Device) return encoded_size end -struct Trace +mutable struct Trace devices::Dict{UInt32,Device} trace_events::Vector{TraceEvent} end diff --git a/src/proto/xla/compile_options_pb.jl b/src/proto/xla/compile_options_pb.jl index 55249422c0..0b93c5f352 100644 --- a/src/proto/xla/compile_options_pb.jl +++ b/src/proto/xla/compile_options_pb.jl @@ -6,7 +6,7 @@ export OptionOverrideProto, ExecutableBuildOptionsProto, CompileOptionsProto export ExecutableAndOptionsProto -struct OptionOverrideProto +mutable struct OptionOverrideProto value::Union{Nothing,OneOf{<:Union{String,Bool,Int64,Float64}}} end PB.oneof_field_types(::Type{OptionOverrideProto}) = (; @@ -63,7 +63,7 @@ function PB._encoded_size(x::OptionOverrideProto) return encoded_size end -struct ExecutableBuildOptionsProto +mutable struct ExecutableBuildOptionsProto device_ordinal::Int64 result_layout::Union{Nothing,ShapeProto} comp_envs::Union{Nothing,CompilationEnvironmentsProto} @@ -243,7 +243,7 @@ function PB._encoded_size(x::ExecutableBuildOptionsProto) return encoded_size end -struct CompileOptionsProto +mutable struct CompileOptionsProto argument_layouts::Vector{ShapeProto} parameter_is_tupled_arguments::Bool executable_build_options::Union{Nothing,ExecutableBuildOptionsProto} @@ -327,7 +327,7 @@ function PB._encoded_size(x::CompileOptionsProto) return encoded_size end -struct ExecutableAndOptionsProto +mutable struct ExecutableAndOptionsProto serialized_executable::Vector{UInt8} compile_options::Union{Nothing,CompileOptionsProto} pjrt_client_name::String diff --git a/src/proto/xla/execute_options_pb.jl b/src/proto/xla/execute_options_pb.jl index 2c136f9b08..bdbfb2509a 100644 --- a/src/proto/xla/execute_options_pb.jl +++ b/src/proto/xla/execute_options_pb.jl @@ -7,7 +7,7 @@ export ExecutionModeProto, ExecuteOptionsProto @enumx ExecutionModeProto EXECUTION_MODE_UNSPECIFIED=0 EXECUTION_MODE_DEFAULT=1 EXECUTION_MODE_SYNCHRONOUS=2 EXECUTION_MODE_ASYNCHRONOUS=3 -struct ExecuteOptionsProto +mutable struct ExecuteOptionsProto arguments_are_tupled::Bool untuple_result::Bool launch_id::Int32 diff --git a/src/proto/xla/hlo_pb.jl b/src/proto/xla/hlo_pb.jl index be33af12ba..9dba2e2aee 100644 --- a/src/proto/xla/hlo_pb.jl +++ b/src/proto/xla/hlo_pb.jl @@ -26,7 +26,7 @@ abstract type var"##Abstract#HloModuleProto" end abstract type var"##Abstract#HloUnoptimizedSnapshot" end -struct var"HloScheduleProto.InstructionSequence" +mutable struct var"HloScheduleProto.InstructionSequence" instruction_ids::Vector{Int64} end PB.default_values(::Type{var"HloScheduleProto.InstructionSequence"}) = (;instruction_ids = Vector{Int64}()) @@ -58,7 +58,7 @@ end @enumx CustomCallApiVersion API_VERSION_UNSPECIFIED=0 API_VERSION_ORIGINAL=1 API_VERSION_STATUS_RETURNING=2 API_VERSION_STATUS_RETURNING_UNIFIED=3 API_VERSION_TYPED_FFI=4 -struct var"StackFrameIndexProto.StackFrame" +mutable struct var"StackFrameIndexProto.StackFrame" file_location_id::Int32 parent_frame_id::Int32 end @@ -94,7 +94,7 @@ function PB._encoded_size(x::var"StackFrameIndexProto.StackFrame") return encoded_size end -struct var"HloInputs.LiteralDescriptor" +mutable struct var"HloInputs.LiteralDescriptor" version::Int32 argument_size_bytes::UInt64 end @@ -130,7 +130,7 @@ function PB._encoded_size(x::var"HloInputs.LiteralDescriptor") return encoded_size end -struct HloPassMetadata +mutable struct HloPassMetadata pass_id::Int64 pass_name::String pipeline_name::String @@ -220,7 +220,7 @@ function PB._encoded_size(x::HloPassMetadata) return encoded_size end -struct var"BufferAllocationProto.Assigned" +mutable struct var"BufferAllocationProto.Assigned" logical_buffer_id::Int64 offset::Int64 size::Int64 @@ -270,7 +270,7 @@ end @enumx Kind UNDEFINED_ALIAS=0 MAY_ALIAS=1 MUST_ALIAS=2 -struct var"StackFrameIndexProto.FileLocation" +mutable struct var"StackFrameIndexProto.FileLocation" file_name_id::Int32 function_name_id::Int32 line::Int32 @@ -330,7 +330,7 @@ function PB._encoded_size(x::var"StackFrameIndexProto.FileLocation") return encoded_size end -struct var"LogicalBufferProto.Location" +mutable struct var"LogicalBufferProto.Location" instruction_name::String instruction_id::Int64 shape_index::Vector{Int64} @@ -373,7 +373,7 @@ function PB._encoded_size(x::var"LogicalBufferProto.Location") return encoded_size end -struct CrossProgramPrefetch +mutable struct CrossProgramPrefetch parameter::Int64 index::Vector{Int64} offset::Int64 @@ -415,7 +415,7 @@ function PB._encoded_size(x::CrossProgramPrefetch) return encoded_size end -struct var"HloBufferDonorProto.BufferDonorEntryProto" +mutable struct var"HloBufferDonorProto.BufferDonorEntryProto" parameter_number::Int64 parameter_shape_index::Vector{Int64} end @@ -451,7 +451,7 @@ function PB._encoded_size(x::var"HloBufferDonorProto.BufferDonorEntryProto") return encoded_size end -struct var"HloInstructionProto.SliceDimensions" +mutable struct var"HloInstructionProto.SliceDimensions" start::Int64 limit::Int64 stride::Int64 @@ -499,7 +499,7 @@ end @enumx var"HeapSimulatorTrace.Event.Kind" ALLOC=0 FREE=1 SHARE_WITH=2 -struct HloScheduleProto +mutable struct HloScheduleProto sequences::Dict{Int64,var"HloScheduleProto.InstructionSequence"} end PB.default_values(::Type{HloScheduleProto}) = (;sequences = Dict{Int64,var"HloScheduleProto.InstructionSequence"}()) @@ -529,7 +529,7 @@ function PB._encoded_size(x::HloScheduleProto) return encoded_size end -struct HloInputs +mutable struct HloInputs arguments::Vector{LiteralProto} arguments_descriptors::Vector{var"HloInputs.LiteralDescriptor"} end @@ -565,7 +565,7 @@ function PB._encoded_size(x::HloInputs) return encoded_size end -struct HloModuleMetadataProto +mutable struct HloModuleMetadataProto canonical_module_id::Int64 module_group_name::String original_module_id::Int64 @@ -619,7 +619,7 @@ function PB._encoded_size(x::HloModuleMetadataProto) return encoded_size end -struct BufferAllocationProto +mutable struct BufferAllocationProto index::Int64 size::Int64 is_thread_local::Bool @@ -715,7 +715,7 @@ function PB._encoded_size(x::BufferAllocationProto) return encoded_size end -struct var"HloInputOutputAliasProto.AliasEntryProto" +mutable struct var"HloInputOutputAliasProto.AliasEntryProto" output_shape_index::Vector{Int64} parameter_number::Int64 parameter_shape_index::Vector{Int64} @@ -763,7 +763,7 @@ function PB._encoded_size(x::var"HloInputOutputAliasProto.AliasEntryProto") return encoded_size end -struct StackFrameIndexProto +mutable struct StackFrameIndexProto file_names::Vector{String} function_names::Vector{String} file_locations::Vector{var"StackFrameIndexProto.FileLocation"} @@ -811,7 +811,7 @@ function PB._encoded_size(x::StackFrameIndexProto) return encoded_size end -struct LogicalBufferProto +mutable struct LogicalBufferProto id::Int64 size::Int64 defined_at::Union{Nothing,var"LogicalBufferProto.Location"} @@ -859,7 +859,7 @@ function PB._encoded_size(x::LogicalBufferProto) return encoded_size end -struct var"BufferAssignmentProto.BufferAlias" +mutable struct var"BufferAssignmentProto.BufferAlias" source_buffer_id::Int64 location::Union{Nothing,var"LogicalBufferProto.Location"} end @@ -895,7 +895,7 @@ function PB._encoded_size(x::var"BufferAssignmentProto.BufferAlias") return encoded_size end -struct HloBufferDonorProto +mutable struct HloBufferDonorProto entries::Vector{var"HloBufferDonorProto.BufferDonorEntryProto"} end PB.default_values(::Type{HloBufferDonorProto}) = (;entries = Vector{var"HloBufferDonorProto.BufferDonorEntryProto"}()) @@ -925,7 +925,7 @@ function PB._encoded_size(x::HloBufferDonorProto) return encoded_size end -struct var"HloModuleProto.ProfileInfo" +mutable struct var"HloModuleProto.ProfileInfo" profile_type::var"HloModuleProto.ProfileType".T relative_speedup::Float64 profile_source::ProfileSource.T @@ -997,7 +997,7 @@ function PB._encoded_size(x::var"HloModuleProto.ProfileInfo") return encoded_size end -struct HloInstructionProto +mutable struct HloInstructionProto name::String opcode::String shape::Union{Nothing,ShapeProto} @@ -1069,7 +1069,7 @@ struct HloInstructionProto k::Int64 largest::Bool statistics_viz::Union{Nothing,StatisticsViz} - replica_group_list::Union{Nothing,OneOf{<:Union{CollectiveDeviceListProto,IotaReplicaGroupListProto,MeshAxesReplicaGroupListProto}}} + collective_device_list::Union{Nothing,CollectiveDeviceListProto} original_value::Union{Nothing,OriginalValueProto} is_composite::Bool result_accuracy::Union{Nothing,ResultAccuracy} @@ -1077,10 +1077,9 @@ end PB.reserved_fields(::Type{HloInstructionProto}) = (names = ["parameter_name", "fused_instructions_computation", "operand_names", "control_predecessor_names", "called_computation_names", "replica_group_ids", "custom_call_opaque", "all_reduce_barrier"], numbers = Union{Int,UnitRange{Int}}[10, 12, 4, 5, 6, 44, 53, 46, 41, 42, 64, 78, 83, 84, 86]) PB.oneof_field_types(::Type{HloInstructionProto}) = (; optional_cross_program_prefetch_index = (;cross_program_prefetch_index=Int32), - replica_group_list = (;collective_device_list=CollectiveDeviceListProto, iota_collective_device_list=IotaReplicaGroupListProto, mesh_axes_replica_group_list=MeshAxesReplicaGroupListProto), ) -PB.default_values(::Type{HloInstructionProto}) = (;name = "", opcode = "", shape = nothing, metadata = nothing, literal = nothing, parameter_number = zero(Int64), fusion_kind = "", tuple_index = zero(Int64), dimensions = Vector{Int64}(), window = nothing, convolution_dimension_numbers = nothing, feature_group_count = zero(Int64), batch_group_count = zero(Int64), slice_dimensions = Vector{var"HloInstructionProto.SliceDimensions"}(), exponent_bits = zero(Int32), mantissa_bits = zero(Int32), dynamic_slice_sizes = Vector{Int64}(), padding_config = nothing, outfeed_config = UInt8[], distribution = RandomDistribution.RNG_INVALID, epsilon = zero(Float32), feature_index = zero(Int64), channel_id = zero(Int64), infeed_config = UInt8[], custom_call_target = "", outfeed_shape = nothing, dot_dimension_numbers = nothing, ragged_dot_dimension_numbers = nothing, fft_type = FftType.FFT, fft_length = Vector{Int64}(), comparison_direction = "", gather_dimension_numbers = nothing, gather_slice_sizes = Vector{Int64}(), id = zero(Int64), operand_ids = Vector{Int64}(), control_predecessor_ids = Vector{Int64}(), called_computation_ids = Vector{Int64}(), sharding = nothing, backend_config = UInt8[], replica_groups = Vector{ReplicaGroup}(), all_reduce_id = zero(Int64), use_global_device_ids = false, is_host_transfer = false, is_stable = false, scatter_dimension_numbers = nothing, precision_config = nothing, source_target_pairs = Vector{SourceTarget}(), domain_entry_sharding = nothing, domain_exit_sharding = nothing, constrain_layout = false, operand_shapes_with_layout = Vector{ShapeProto}(), triangular_solve_options = nothing, cholesky_options = nothing, parameter_replication = nothing, custom_call_has_side_effect = false, output_operand_aliasing = Vector{OutputOperandAliasing}(), custom_call_schedule = CustomCallSchedule.SCHEDULE_NONE, delta = zero(Int64), indices_are_sorted = false, frontend_attributes = nothing, unique_indices = false, rng_algorithm = RandomAlgorithm.RNG_DEFAULT, comparison_type = "", is_cross_program_prefetch = false, cross_program_prefetch_index = zero(Int32), padding_type = PaddingType.PADDING_INVALID, custom_call_api_version = CustomCallApiVersion.API_VERSION_UNSPECIFIED, async_execution_thread = "", k = zero(Int64), largest = false, statistics_viz = nothing, collective_device_list = nothing, iota_collective_device_list = nothing, mesh_axes_replica_group_list = nothing, original_value = nothing, is_composite = false, result_accuracy = nothing) -PB.field_numbers(::Type{HloInstructionProto}) = (;name = 1, opcode = 2, shape = 3, metadata = 7, literal = 8, parameter_number = 9, fusion_kind = 11, tuple_index = 13, dimensions = 14, window = 15, convolution_dimension_numbers = 16, feature_group_count = 50, batch_group_count = 58, slice_dimensions = 17, exponent_bits = 18, mantissa_bits = 19, dynamic_slice_sizes = 20, padding_config = 21, outfeed_config = 22, distribution = 23, epsilon = 24, feature_index = 25, channel_id = 26, infeed_config = 27, custom_call_target = 28, outfeed_shape = 29, dot_dimension_numbers = 30, ragged_dot_dimension_numbers = 90, fft_type = 31, fft_length = 32, comparison_direction = 63, gather_dimension_numbers = 33, gather_slice_sizes = 34, id = 35, operand_ids = 36, control_predecessor_ids = 37, called_computation_ids = 38, sharding = 40, backend_config = 43, replica_groups = 49, all_reduce_id = 45, use_global_device_ids = 71, is_host_transfer = 47, is_stable = 60, scatter_dimension_numbers = 48, precision_config = 51, source_target_pairs = 52, domain_entry_sharding = 54, domain_exit_sharding = 55, constrain_layout = 56, operand_shapes_with_layout = 57, triangular_solve_options = 59, cholesky_options = 62, parameter_replication = 61, custom_call_has_side_effect = 65, output_operand_aliasing = 74, custom_call_schedule = 76, delta = 66, indices_are_sorted = 67, frontend_attributes = 68, unique_indices = 69, rng_algorithm = 70, comparison_type = 72, is_cross_program_prefetch = 73, cross_program_prefetch_index = 80, padding_type = 75, custom_call_api_version = 77, async_execution_thread = 79, k = 81, largest = 85, statistics_viz = 82, collective_device_list = 87, iota_collective_device_list = 92, mesh_axes_replica_group_list = 93, original_value = 88, is_composite = 89, result_accuracy = 91) +PB.default_values(::Type{HloInstructionProto}) = (;name = "", opcode = "", shape = nothing, metadata = nothing, literal = nothing, parameter_number = zero(Int64), fusion_kind = "", tuple_index = zero(Int64), dimensions = Vector{Int64}(), window = nothing, convolution_dimension_numbers = nothing, feature_group_count = zero(Int64), batch_group_count = zero(Int64), slice_dimensions = Vector{var"HloInstructionProto.SliceDimensions"}(), exponent_bits = zero(Int32), mantissa_bits = zero(Int32), dynamic_slice_sizes = Vector{Int64}(), padding_config = nothing, outfeed_config = UInt8[], distribution = RandomDistribution.RNG_INVALID, epsilon = zero(Float32), feature_index = zero(Int64), channel_id = zero(Int64), infeed_config = UInt8[], custom_call_target = "", outfeed_shape = nothing, dot_dimension_numbers = nothing, ragged_dot_dimension_numbers = nothing, fft_type = FftType.FFT, fft_length = Vector{Int64}(), comparison_direction = "", gather_dimension_numbers = nothing, gather_slice_sizes = Vector{Int64}(), id = zero(Int64), operand_ids = Vector{Int64}(), control_predecessor_ids = Vector{Int64}(), called_computation_ids = Vector{Int64}(), sharding = nothing, backend_config = UInt8[], replica_groups = Vector{ReplicaGroup}(), all_reduce_id = zero(Int64), use_global_device_ids = false, is_host_transfer = false, is_stable = false, scatter_dimension_numbers = nothing, precision_config = nothing, source_target_pairs = Vector{SourceTarget}(), domain_entry_sharding = nothing, domain_exit_sharding = nothing, constrain_layout = false, operand_shapes_with_layout = Vector{ShapeProto}(), triangular_solve_options = nothing, cholesky_options = nothing, parameter_replication = nothing, custom_call_has_side_effect = false, output_operand_aliasing = Vector{OutputOperandAliasing}(), custom_call_schedule = CustomCallSchedule.SCHEDULE_NONE, delta = zero(Int64), indices_are_sorted = false, frontend_attributes = nothing, unique_indices = false, rng_algorithm = RandomAlgorithm.RNG_DEFAULT, comparison_type = "", is_cross_program_prefetch = false, cross_program_prefetch_index = zero(Int32), padding_type = PaddingType.PADDING_INVALID, custom_call_api_version = CustomCallApiVersion.API_VERSION_UNSPECIFIED, async_execution_thread = "", k = zero(Int64), largest = false, statistics_viz = nothing, collective_device_list = nothing, original_value = nothing, is_composite = false, result_accuracy = nothing) +PB.field_numbers(::Type{HloInstructionProto}) = (;name = 1, opcode = 2, shape = 3, metadata = 7, literal = 8, parameter_number = 9, fusion_kind = 11, tuple_index = 13, dimensions = 14, window = 15, convolution_dimension_numbers = 16, feature_group_count = 50, batch_group_count = 58, slice_dimensions = 17, exponent_bits = 18, mantissa_bits = 19, dynamic_slice_sizes = 20, padding_config = 21, outfeed_config = 22, distribution = 23, epsilon = 24, feature_index = 25, channel_id = 26, infeed_config = 27, custom_call_target = 28, outfeed_shape = 29, dot_dimension_numbers = 30, ragged_dot_dimension_numbers = 90, fft_type = 31, fft_length = 32, comparison_direction = 63, gather_dimension_numbers = 33, gather_slice_sizes = 34, id = 35, operand_ids = 36, control_predecessor_ids = 37, called_computation_ids = 38, sharding = 40, backend_config = 43, replica_groups = 49, all_reduce_id = 45, use_global_device_ids = 71, is_host_transfer = 47, is_stable = 60, scatter_dimension_numbers = 48, precision_config = 51, source_target_pairs = 52, domain_entry_sharding = 54, domain_exit_sharding = 55, constrain_layout = 56, operand_shapes_with_layout = 57, triangular_solve_options = 59, cholesky_options = 62, parameter_replication = 61, custom_call_has_side_effect = 65, output_operand_aliasing = 74, custom_call_schedule = 76, delta = 66, indices_are_sorted = 67, frontend_attributes = 68, unique_indices = 69, rng_algorithm = 70, comparison_type = 72, is_cross_program_prefetch = 73, cross_program_prefetch_index = 80, padding_type = 75, custom_call_api_version = 77, async_execution_thread = 79, k = 81, largest = 85, statistics_viz = 82, collective_device_list = 87, original_value = 88, is_composite = 89, result_accuracy = 91) function PB.decode(d::PB.AbstractProtoDecoder, ::Type{<:HloInstructionProto}) name = "" @@ -1154,7 +1153,7 @@ function PB.decode(d::PB.AbstractProtoDecoder, ::Type{<:HloInstructionProto}) k = zero(Int64) largest = false statistics_viz = Ref{Union{Nothing,StatisticsViz}}(nothing) - replica_group_list = nothing + collective_device_list = Ref{Union{Nothing,CollectiveDeviceListProto}}(nothing) original_value = Ref{Union{Nothing,OriginalValueProto}}(nothing) is_composite = false result_accuracy = Ref{Union{Nothing,ResultAccuracy}}(nothing) @@ -1303,11 +1302,7 @@ function PB.decode(d::PB.AbstractProtoDecoder, ::Type{<:HloInstructionProto}) elseif field_number == 82 PB.decode!(d, statistics_viz) elseif field_number == 87 - replica_group_list = OneOf(:collective_device_list, PB.decode(d, Ref{CollectiveDeviceListProto})) - elseif field_number == 92 - replica_group_list = OneOf(:iota_collective_device_list, PB.decode(d, Ref{IotaReplicaGroupListProto})) - elseif field_number == 93 - replica_group_list = OneOf(:mesh_axes_replica_group_list, PB.decode(d, Ref{MeshAxesReplicaGroupListProto})) + PB.decode!(d, collective_device_list) elseif field_number == 88 PB.decode!(d, original_value) elseif field_number == 89 @@ -1318,7 +1313,7 @@ function PB.decode(d::PB.AbstractProtoDecoder, ::Type{<:HloInstructionProto}) Base.skip(d, wire_type) end end - return HloInstructionProto(name, opcode, shape[], metadata[], literal[], parameter_number, fusion_kind, tuple_index, dimensions[], window[], convolution_dimension_numbers[], feature_group_count, batch_group_count, slice_dimensions[], exponent_bits, mantissa_bits, dynamic_slice_sizes[], padding_config[], outfeed_config, distribution, epsilon, feature_index, channel_id, infeed_config, custom_call_target, outfeed_shape[], dot_dimension_numbers[], ragged_dot_dimension_numbers[], fft_type, fft_length[], comparison_direction, gather_dimension_numbers[], gather_slice_sizes[], id, operand_ids[], control_predecessor_ids[], called_computation_ids[], sharding[], backend_config, replica_groups[], all_reduce_id, use_global_device_ids, is_host_transfer, is_stable, scatter_dimension_numbers[], precision_config[], source_target_pairs[], domain_entry_sharding[], domain_exit_sharding[], constrain_layout, operand_shapes_with_layout[], triangular_solve_options[], cholesky_options[], parameter_replication[], custom_call_has_side_effect, output_operand_aliasing[], custom_call_schedule, delta, indices_are_sorted, frontend_attributes[], unique_indices, rng_algorithm, comparison_type, is_cross_program_prefetch, optional_cross_program_prefetch_index, padding_type, custom_call_api_version, async_execution_thread, k, largest, statistics_viz[], replica_group_list, original_value[], is_composite, result_accuracy[]) + return HloInstructionProto(name, opcode, shape[], metadata[], literal[], parameter_number, fusion_kind, tuple_index, dimensions[], window[], convolution_dimension_numbers[], feature_group_count, batch_group_count, slice_dimensions[], exponent_bits, mantissa_bits, dynamic_slice_sizes[], padding_config[], outfeed_config, distribution, epsilon, feature_index, channel_id, infeed_config, custom_call_target, outfeed_shape[], dot_dimension_numbers[], ragged_dot_dimension_numbers[], fft_type, fft_length[], comparison_direction, gather_dimension_numbers[], gather_slice_sizes[], id, operand_ids[], control_predecessor_ids[], called_computation_ids[], sharding[], backend_config, replica_groups[], all_reduce_id, use_global_device_ids, is_host_transfer, is_stable, scatter_dimension_numbers[], precision_config[], source_target_pairs[], domain_entry_sharding[], domain_exit_sharding[], constrain_layout, operand_shapes_with_layout[], triangular_solve_options[], cholesky_options[], parameter_replication[], custom_call_has_side_effect, output_operand_aliasing[], custom_call_schedule, delta, indices_are_sorted, frontend_attributes[], unique_indices, rng_algorithm, comparison_type, is_cross_program_prefetch, optional_cross_program_prefetch_index, padding_type, custom_call_api_version, async_execution_thread, k, largest, statistics_viz[], collective_device_list[], original_value[], is_composite, result_accuracy[]) end function PB.encode(e::PB.AbstractProtoEncoder, x::HloInstructionProto) @@ -1397,14 +1392,7 @@ function PB.encode(e::PB.AbstractProtoEncoder, x::HloInstructionProto) x.k != zero(Int64) && PB.encode(e, 81, x.k) x.largest != false && PB.encode(e, 85, x.largest) !isnothing(x.statistics_viz) && PB.encode(e, 82, x.statistics_viz) - if isnothing(x.replica_group_list); - elseif x.replica_group_list.name === :collective_device_list - PB.encode(e, 87, x.replica_group_list[]::CollectiveDeviceListProto) - elseif x.replica_group_list.name === :iota_collective_device_list - PB.encode(e, 92, x.replica_group_list[]::IotaReplicaGroupListProto) - elseif x.replica_group_list.name === :mesh_axes_replica_group_list - PB.encode(e, 93, x.replica_group_list[]::MeshAxesReplicaGroupListProto) - end + !isnothing(x.collective_device_list) && PB.encode(e, 87, x.collective_device_list) !isnothing(x.original_value) && PB.encode(e, 88, x.original_value) x.is_composite != false && PB.encode(e, 89, x.is_composite) !isnothing(x.result_accuracy) && PB.encode(e, 91, x.result_accuracy) @@ -1486,21 +1474,14 @@ function PB._encoded_size(x::HloInstructionProto) x.k != zero(Int64) && (encoded_size += PB._encoded_size(x.k, 81)) x.largest != false && (encoded_size += PB._encoded_size(x.largest, 85)) !isnothing(x.statistics_viz) && (encoded_size += PB._encoded_size(x.statistics_viz, 82)) - if isnothing(x.replica_group_list); - elseif x.replica_group_list.name === :collective_device_list - encoded_size += PB._encoded_size(x.replica_group_list[]::CollectiveDeviceListProto, 87) - elseif x.replica_group_list.name === :iota_collective_device_list - encoded_size += PB._encoded_size(x.replica_group_list[]::IotaReplicaGroupListProto, 92) - elseif x.replica_group_list.name === :mesh_axes_replica_group_list - encoded_size += PB._encoded_size(x.replica_group_list[]::MeshAxesReplicaGroupListProto, 93) - end + !isnothing(x.collective_device_list) && (encoded_size += PB._encoded_size(x.collective_device_list, 87)) !isnothing(x.original_value) && (encoded_size += PB._encoded_size(x.original_value, 88)) x.is_composite != false && (encoded_size += PB._encoded_size(x.is_composite, 89)) !isnothing(x.result_accuracy) && (encoded_size += PB._encoded_size(x.result_accuracy, 91)) return encoded_size end -struct var"HeapSimulatorTrace.Event" +mutable struct var"HeapSimulatorTrace.Event" kind::var"HeapSimulatorTrace.Event.Kind".T buffer_id::Int64 computation_name::String @@ -1554,7 +1535,7 @@ function PB._encoded_size(x::var"HeapSimulatorTrace.Event") return encoded_size end -struct HloInputOutputAliasProto +mutable struct HloInputOutputAliasProto entries::Vector{var"HloInputOutputAliasProto.AliasEntryProto"} end PB.default_values(::Type{HloInputOutputAliasProto}) = (;entries = Vector{var"HloInputOutputAliasProto.AliasEntryProto"}()) @@ -1584,7 +1565,7 @@ function PB._encoded_size(x::HloInputOutputAliasProto) return encoded_size end -struct HloComputationProto +mutable struct HloComputationProto name::String instructions::Vector{HloInstructionProto} program_shape::Union{Nothing,ProgramShapeProto} @@ -1651,7 +1632,7 @@ function PB._encoded_size(x::HloComputationProto) return encoded_size end -struct HeapSimulatorTrace +mutable struct HeapSimulatorTrace events::Vector{var"HeapSimulatorTrace.Event"} whole_module_simulation::Bool buffer_allocation_index::Int64 @@ -1693,7 +1674,7 @@ function PB._encoded_size(x::HeapSimulatorTrace) return encoded_size end -struct BufferAssignmentProto +mutable struct BufferAssignmentProto logical_buffers::Vector{LogicalBufferProto} buffer_aliases::Vector{var"BufferAssignmentProto.BufferAlias"} buffer_allocations::Vector{BufferAllocationProto} @@ -1742,12 +1723,12 @@ function PB._encoded_size(x::BufferAssignmentProto) end # Stub definitions for cyclic types -struct var"##Stub#HloModuleGroupProto"{T1<:var"##Abstract#HloModuleProto"} <: var"##Abstract#HloModuleGroupProto" +mutable struct var"##Stub#HloModuleGroupProto"{T1<:var"##Abstract#HloModuleProto"} <: var"##Abstract#HloModuleGroupProto" name::String hlo_modules::Vector{T1} end -struct var"##Stub#HloModuleProto"{T1<:var"##Abstract#OriginalValueRecoveryTableProto"} <: var"##Abstract#HloModuleProto" +mutable struct var"##Stub#HloModuleProto"{T1<:var"##Abstract#OriginalValueRecoveryTableProto"} <: var"##Abstract#HloModuleProto" name::String entry_computation_name::String entry_computation_id::Int64 @@ -1769,29 +1750,29 @@ struct var"##Stub#HloModuleProto"{T1<:var"##Abstract#OriginalValueRecoveryTableP original_value_recovery_table::Union{Nothing,T1} end -struct var"##Stub#HloProto"{T1<:var"##Abstract#OriginalValueRecoveryTableProto"} <: var"##Abstract#HloProto" +mutable struct var"##Stub#HloProto"{T1<:var"##Abstract#OriginalValueRecoveryTableProto"} <: var"##Abstract#HloProto" hlo_module::Union{Nothing,var"##Stub#HloModuleProto"{T1}} buffer_assignment::Union{Nothing,BufferAssignmentProto} end -struct var"##Stub#HloSnapshot"{T1<:var"##Abstract#OriginalValueRecoveryTableProto"} <: var"##Abstract#HloSnapshot" +mutable struct var"##Stub#HloSnapshot"{T1<:var"##Abstract#OriginalValueRecoveryTableProto"} <: var"##Abstract#HloSnapshot" hlo::Union{Nothing,var"##Stub#HloProto"{T1}} arguments::Vector{LiteralProto} result::Union{Nothing,LiteralProto} execution_platform::String end -struct var"##Stub#HloUnoptimizedSnapshot"{T1<:var"##Abstract#OriginalValueRecoveryTableProto"} <: var"##Abstract#HloUnoptimizedSnapshot" +mutable struct var"##Stub#HloUnoptimizedSnapshot"{T1<:var"##Abstract#OriginalValueRecoveryTableProto"} <: var"##Abstract#HloUnoptimizedSnapshot" hlo_module::Union{Nothing,var"##Stub#HloModuleProto"{T1}} partitions::Vector{HloInputs} version::Int32 end -struct var"##Stub#OriginalValueRecoveryTableProto"{T1<:var"##Abstract#OriginalValueRecoveryTableProto.Entry"} <: var"##Abstract#OriginalValueRecoveryTableProto" +mutable struct var"##Stub#OriginalValueRecoveryTableProto"{T1<:var"##Abstract#OriginalValueRecoveryTableProto.Entry"} <: var"##Abstract#OriginalValueRecoveryTableProto" entries::Vector{T1} end -struct var"##Stub#OriginalValueRecoveryTableProto.Entry" <: var"##Abstract#OriginalValueRecoveryTableProto.Entry" +mutable struct var"##Stub#OriginalValueRecoveryTableProto.Entry" <: var"##Abstract#OriginalValueRecoveryTableProto.Entry" old_original_array::Union{Nothing,OriginalArrayProto} new_original_array::Union{Nothing,OriginalArrayProto} recovery_module::Union{Nothing,var"##Stub#HloModuleProto"{var"##Stub#OriginalValueRecoveryTableProto"{var"##Stub#OriginalValueRecoveryTableProto.Entry"}}} diff --git a/src/proto/xla/metrics_pb.jl b/src/proto/xla/metrics_pb.jl index 0db498df8e..677847810d 100644 --- a/src/proto/xla/metrics_pb.jl +++ b/src/proto/xla/metrics_pb.jl @@ -6,7 +6,7 @@ export JobInfo, TagMetric, var"CompilationLogEntry.CompilationStage", KeyValueMe export PassMetrics, CompilationLogEntry -struct JobInfo +mutable struct JobInfo name::String cell::String user::String @@ -78,7 +78,7 @@ function PB._encoded_size(x::JobInfo) return encoded_size end -struct TagMetric +mutable struct TagMetric key::String value::String end @@ -116,7 +116,7 @@ end @enumx var"CompilationLogEntry.CompilationStage" UNSPECIFIED=0 END_TO_END=1 HLO_PASSES=2 CODE_GENERATION=3 BACKEND_PASSES=4 -struct KeyValueMetric +mutable struct KeyValueMetric key::String value::Int64 end @@ -152,7 +152,7 @@ function PB._encoded_size(x::KeyValueMetric) return encoded_size end -struct PassMetrics +mutable struct PassMetrics module_id::UInt64 pass_name::String pass_duration::Union{Nothing,google.protobuf.Duration} @@ -206,7 +206,7 @@ function PB._encoded_size(x::PassMetrics) return encoded_size end -struct CompilationLogEntry +mutable struct CompilationLogEntry timestamp::Union{Nothing,google.protobuf.Timestamp} stage::var"CompilationLogEntry.CompilationStage".T duration::Union{Nothing,google.protobuf.Duration} diff --git a/src/proto/xla/xla_data_pb.jl b/src/proto/xla/xla_data_pb.jl index cbdb348368..588a7787b0 100644 --- a/src/proto/xla/xla_data_pb.jl +++ b/src/proto/xla/xla_data_pb.jl @@ -35,7 +35,7 @@ abstract type var"##Abstract#ProgramShapeProto" end @enumx RandomDistribution RNG_INVALID=0 RNG_UNIFORM=1 RNG_NORMAL=2 -struct Statistic +mutable struct Statistic stat_name::String stat_val::Float64 end @@ -73,7 +73,7 @@ end @enumx var"TriangularSolveOptions.Transpose" TRANSPOSE_INVALID=0 NO_TRANSPOSE=1 TRANSPOSE=2 ADJOINT=3 -struct var"WhileLoopBackendConfig.KnownInitStep" +mutable struct var"WhileLoopBackendConfig.KnownInitStep" init::Int64 step::Int64 end @@ -111,7 +111,7 @@ end @enumx var"ResultAccuracy.Mode" DEFAULT=0 HIGHEST=1 -struct GatherDimensionNumbers +mutable struct GatherDimensionNumbers offset_dims::Vector{Int64} collapsed_slice_dims::Vector{Int64} start_index_map::Vector{Int64} @@ -171,7 +171,7 @@ function PB._encoded_size(x::GatherDimensionNumbers) return encoded_size end -struct var"DeviceAssignmentProto.ComputationDevice" +mutable struct var"DeviceAssignmentProto.ComputationDevice" replica_device_ids::Vector{Int64} end PB.default_values(::Type{var"DeviceAssignmentProto.ComputationDevice"}) = (;replica_device_ids = Vector{Int64}()) @@ -201,7 +201,7 @@ function PB._encoded_size(x::var"DeviceAssignmentProto.ComputationDevice") return encoded_size end -struct SplitConfigProto +mutable struct SplitConfigProto dimension::Int64 split_indices::Vector{Int64} end @@ -243,7 +243,7 @@ end @enumx DimLevelType DIM_DENSE=0 DIM_COMPRESSED=1 DIM_SINGLETON=2 DIM_LOOSE_COMPRESSED=3 -struct var"WhileLoopBackendConfig.KnownTripCount" +mutable struct var"WhileLoopBackendConfig.KnownTripCount" n::Int64 end PB.default_values(::Type{var"WhileLoopBackendConfig.KnownTripCount"}) = (;n = zero(Int64)) @@ -277,7 +277,7 @@ end @enumx PrimitiveType PRIMITIVE_TYPE_INVALID=0 PRED=1 S1=30 S2=26 S4=21 S8=2 S16=3 S32=4 S64=5 U1=31 U2=27 U4=22 U8=6 U16=7 U32=8 U64=9 F16=10 F32=11 BF16=16 F64=12 F8E5M2=19 F8E4M3=28 F8E4M3FN=20 F8E4M3B11FNUZ=23 F8E3M4=29 F8E5M2FNUZ=24 F8E4M3FNUZ=25 F4E2M1FN=32 F8E8M0FNU=33 C64=15 C128=18 TUPLE=13 OPAQUE_TYPE=14 TOKEN=17 BUFFER=34 -struct ParameterReplication +mutable struct ParameterReplication replicated_at_leaf_buffers::Vector{Bool} end PB.default_values(::Type{ParameterReplication}) = (;replicated_at_leaf_buffers = Vector{Bool}()) @@ -311,7 +311,7 @@ end @enumx var"ChannelHandle.ChannelType" CHANNEL_TYPE_INVALID=0 DEVICE_TO_DEVICE=1 DEVICE_TO_HOST=2 HOST_TO_DEVICE=3 -struct SortOptions +mutable struct SortOptions descending::Bool end PB.default_values(::Type{SortOptions}) = (;descending = false) @@ -341,7 +341,7 @@ function PB._encoded_size(x::SortOptions) return encoded_size end -struct ReplicaGroup +mutable struct ReplicaGroup replica_ids::Vector{Int64} end PB.default_values(::Type{ReplicaGroup}) = (;replica_ids = Vector{Int64}()) @@ -371,7 +371,7 @@ function PB._encoded_size(x::ReplicaGroup) return encoded_size end -struct var"ResultAccuracy.Tolerance" +mutable struct var"ResultAccuracy.Tolerance" atol::Float64 rtol::Float64 ulps::Int64 @@ -413,7 +413,7 @@ function PB._encoded_size(x::var"ResultAccuracy.Tolerance") return encoded_size end -struct TileProto +mutable struct TileProto dimensions::Vector{Int64} end PB.default_values(::Type{TileProto}) = (;dimensions = Vector{Int64}()) @@ -443,7 +443,7 @@ function PB._encoded_size(x::TileProto) return encoded_size end -struct ScatterDimensionNumbers +mutable struct ScatterDimensionNumbers update_window_dims::Vector{Int64} inserted_window_dims::Vector{Int64} scatter_dims_to_operand_dims::Vector{Int64} @@ -503,7 +503,7 @@ function PB._encoded_size(x::ScatterDimensionNumbers) return encoded_size end -struct SourceTarget +mutable struct SourceTarget source::Int64 target::Int64 end @@ -539,7 +539,7 @@ function PB._encoded_size(x::SourceTarget) return encoded_size end -struct ExecutionHandle +mutable struct ExecutionHandle handle::Int64 end PB.default_values(::Type{ExecutionHandle}) = (;handle = zero(Int64)) @@ -569,7 +569,7 @@ function PB._encoded_size(x::ExecutionHandle) return encoded_size end -struct GlobalDataHandle +mutable struct GlobalDataHandle handle::Int64 end PB.default_values(::Type{GlobalDataHandle}) = (;handle = zero(Int64)) @@ -603,7 +603,7 @@ end @enumx ProfileSource PROFILE_SOURCE_UNKNOWN_SOURCE=0 PROFILE_SOURCE_EMBEDDED=1 PROFILE_SOURCE_REMOTE=2 -struct DotDimensionNumbers +mutable struct DotDimensionNumbers lhs_contracting_dimensions::Vector{Int64} rhs_contracting_dimensions::Vector{Int64} lhs_batch_dimensions::Vector{Int64} @@ -651,7 +651,7 @@ function PB._encoded_size(x::DotDimensionNumbers) return encoded_size end -struct DeviceHandle +mutable struct DeviceHandle handle::Int64 device_count::Int64 end @@ -689,7 +689,7 @@ end @enumx var"OpSharding.Type" REPLICATED=0 MAXIMAL=1 TUPLE=2 OTHER=3 MANUAL=4 UNKNOWN=5 UNREDUCED=6 -struct WindowDimension +mutable struct WindowDimension size::Int64 stride::Int64 padding_low::Int64 @@ -755,7 +755,7 @@ function PB._encoded_size(x::WindowDimension) return encoded_size end -struct ConvolutionDimensionNumbers +mutable struct ConvolutionDimensionNumbers input_batch_dimension::Int64 input_feature_dimension::Int64 input_spatial_dimensions::Vector{Int64} @@ -833,7 +833,7 @@ function PB._encoded_size(x::ConvolutionDimensionNumbers) return encoded_size end -struct IotaReplicaGroupListProto +mutable struct IotaReplicaGroupListProto num_replica_groups::Int64 num_devices_per_group::Int64 iota_reshape_dims::Vector{Int64} @@ -881,7 +881,7 @@ function PB._encoded_size(x::IotaReplicaGroupListProto) return encoded_size end -struct OriginalArrayProto +mutable struct OriginalArrayProto instruction_name::String shape_index::Vector{Int64} end @@ -917,7 +917,7 @@ function PB._encoded_size(x::OriginalArrayProto) return encoded_size end -struct ComputationStats +mutable struct ComputationStats flop_count::Float64 transcendental_count::Float64 end @@ -953,7 +953,7 @@ function PB._encoded_size(x::ComputationStats) return encoded_size end -struct FrontendAttributes +mutable struct FrontendAttributes map::Dict{String,String} end PB.default_values(::Type{FrontendAttributes}) = (;map = Dict{String,String}()) @@ -987,7 +987,7 @@ end @enumx AsyncStreamKind ASYNC_STREAM_KIND_COLLECTIVE=0 ASYNC_STREAM_KIND_P2P0=1 ASYNC_STREAM_KIND_P2P1=2 ASYNC_STREAM_KIND_MEMCPYP2P=3 -struct var"WhileLoopBackendConfig.KnownInductionVariable" +mutable struct var"WhileLoopBackendConfig.KnownInductionVariable" tuple_index::Int64 end PB.default_values(::Type{var"WhileLoopBackendConfig.KnownInductionVariable"}) = (;tuple_index = zero(Int64)) @@ -1017,7 +1017,7 @@ function PB._encoded_size(x::var"WhileLoopBackendConfig.KnownInductionVariable") return encoded_size end -struct var"PaddingConfig.PaddingConfigDimension" +mutable struct var"PaddingConfig.PaddingConfigDimension" edge_padding_low::Int64 edge_padding_high::Int64 interior_padding::Int64 @@ -1059,7 +1059,7 @@ function PB._encoded_size(x::var"PaddingConfig.PaddingConfigDimension") return encoded_size end -struct GemmPerfTableEntry +mutable struct GemmPerfTableEntry b::Int64 m::Int64 n::Int64 @@ -1113,7 +1113,7 @@ function PB._encoded_size(x::GemmPerfTableEntry) return encoded_size end -struct OutputOperandAliasing +mutable struct OutputOperandAliasing output_shape_index::Vector{Int64} operand_index::Int64 operand_shape_index::Vector{Int64} @@ -1157,7 +1157,7 @@ end @enumx var"PrecisionConfig.Precision" DEFAULT=0 HIGH=1 HIGHEST=2 -struct ExecutionProfile +mutable struct ExecutionProfile compilation_cache_hit::Bool compile_time_ms::Int64 compute_cycle_count::Int64 @@ -1229,7 +1229,7 @@ function PB._encoded_size(x::ExecutionProfile) return encoded_size end -struct var"AxisRefProto.SubAxis" +mutable struct var"AxisRefProto.SubAxis" pre_size::Int64 size::Int64 end @@ -1269,7 +1269,7 @@ end @enumx PaddingType PADDING_INVALID=0 PADDING_VALID=1 PADDING_SAME=2 -struct var"MeshProto.MeshAxis" +mutable struct var"MeshProto.MeshAxis" name::String size::Int64 end @@ -1307,7 +1307,7 @@ end @enumx var"OpSharding.ShardGroupType" AS=0 LIKE=1 -struct CholeskyOptions +mutable struct CholeskyOptions lower::Bool end PB.default_values(::Type{CholeskyOptions}) = (;lower = false) @@ -1337,7 +1337,7 @@ function PB._encoded_size(x::CholeskyOptions) return encoded_size end -struct StatisticsViz +mutable struct StatisticsViz stat_index_to_visualize::Int64 statistics::Vector{Statistic} end @@ -1373,7 +1373,7 @@ function PB._encoded_size(x::StatisticsViz) return encoded_size end -struct TriangularSolveOptions +mutable struct TriangularSolveOptions left_side::Bool lower::Bool unit_diagonal::Bool @@ -1421,7 +1421,7 @@ function PB._encoded_size(x::TriangularSolveOptions) return encoded_size end -struct DeviceAssignmentProto +mutable struct DeviceAssignmentProto replica_count::Int32 computation_count::Int32 computation_devices::Vector{var"DeviceAssignmentProto.ComputationDevice"} @@ -1463,7 +1463,7 @@ function PB._encoded_size(x::DeviceAssignmentProto) return encoded_size end -struct ChannelHandle +mutable struct ChannelHandle handle::Int64 var"#type"::var"ChannelHandle.ChannelType".T end @@ -1499,7 +1499,7 @@ function PB._encoded_size(x::ChannelHandle) return encoded_size end -struct ResultAccuracy +mutable struct ResultAccuracy specs::Union{Nothing,OneOf{<:Union{var"ResultAccuracy.Mode".T,var"ResultAccuracy.Tolerance"}}} end PB.oneof_field_types(::Type{ResultAccuracy}) = (; @@ -1544,7 +1544,7 @@ function PB._encoded_size(x::ResultAccuracy) return encoded_size end -struct RaggedDotDimensionNumbers +mutable struct RaggedDotDimensionNumbers dot_dimension_numbers::Union{Nothing,DotDimensionNumbers} lhs_ragged_dimensions::Vector{Int64} rhs_group_dimensions::Vector{Int64} @@ -1586,7 +1586,7 @@ function PB._encoded_size(x::RaggedDotDimensionNumbers) return encoded_size end -struct Window +mutable struct Window dimensions::Vector{WindowDimension} end PB.default_values(::Type{Window}) = (;dimensions = Vector{WindowDimension}()) @@ -1616,7 +1616,7 @@ function PB._encoded_size(x::Window) return encoded_size end -struct CollectiveDeviceListProto +mutable struct CollectiveDeviceListProto replica_groups::Vector{ReplicaGroup} iota_replica_group_list::Union{Nothing,IotaReplicaGroupListProto} end @@ -1652,7 +1652,7 @@ function PB._encoded_size(x::CollectiveDeviceListProto) return encoded_size end -struct OriginalValueElementProto +mutable struct OriginalValueElementProto shape_index::Vector{Int64} original_array::Union{Nothing,OriginalArrayProto} end @@ -1688,20 +1688,18 @@ function PB._encoded_size(x::OriginalValueElementProto) return encoded_size end -struct WhileLoopBackendConfig +mutable struct WhileLoopBackendConfig known_trip_count::Union{Nothing,var"WhileLoopBackendConfig.KnownTripCount"} known_init_step::Union{Nothing,var"WhileLoopBackendConfig.KnownInitStep"} known_induction_variable::Union{Nothing,var"WhileLoopBackendConfig.KnownInductionVariable"} - dynamic_variable_tuple_indices::Vector{Int64} end -PB.default_values(::Type{WhileLoopBackendConfig}) = (;known_trip_count = nothing, known_init_step = nothing, known_induction_variable = nothing, dynamic_variable_tuple_indices = Vector{Int64}()) -PB.field_numbers(::Type{WhileLoopBackendConfig}) = (;known_trip_count = 1, known_init_step = 2, known_induction_variable = 3, dynamic_variable_tuple_indices = 4) +PB.default_values(::Type{WhileLoopBackendConfig}) = (;known_trip_count = nothing, known_init_step = nothing, known_induction_variable = nothing) +PB.field_numbers(::Type{WhileLoopBackendConfig}) = (;known_trip_count = 1, known_init_step = 2, known_induction_variable = 3) function PB.decode(d::PB.AbstractProtoDecoder, ::Type{<:WhileLoopBackendConfig}) known_trip_count = Ref{Union{Nothing,var"WhileLoopBackendConfig.KnownTripCount"}}(nothing) known_init_step = Ref{Union{Nothing,var"WhileLoopBackendConfig.KnownInitStep"}}(nothing) known_induction_variable = Ref{Union{Nothing,var"WhileLoopBackendConfig.KnownInductionVariable"}}(nothing) - dynamic_variable_tuple_indices = PB.BufferedVector{Int64}() while !PB.message_done(d) field_number, wire_type = PB.decode_tag(d) if field_number == 1 @@ -1710,13 +1708,11 @@ function PB.decode(d::PB.AbstractProtoDecoder, ::Type{<:WhileLoopBackendConfig}) PB.decode!(d, known_init_step) elseif field_number == 3 PB.decode!(d, known_induction_variable) - elseif field_number == 4 - PB.decode!(d, wire_type, dynamic_variable_tuple_indices) else Base.skip(d, wire_type) end end - return WhileLoopBackendConfig(known_trip_count[], known_init_step[], known_induction_variable[], dynamic_variable_tuple_indices[]) + return WhileLoopBackendConfig(known_trip_count[], known_init_step[], known_induction_variable[]) end function PB.encode(e::PB.AbstractProtoEncoder, x::WhileLoopBackendConfig) @@ -1724,7 +1720,6 @@ function PB.encode(e::PB.AbstractProtoEncoder, x::WhileLoopBackendConfig) !isnothing(x.known_trip_count) && PB.encode(e, 1, x.known_trip_count) !isnothing(x.known_init_step) && PB.encode(e, 2, x.known_init_step) !isnothing(x.known_induction_variable) && PB.encode(e, 3, x.known_induction_variable) - !isempty(x.dynamic_variable_tuple_indices) && PB.encode(e, 4, x.dynamic_variable_tuple_indices) return position(e.io) - initpos end function PB._encoded_size(x::WhileLoopBackendConfig) @@ -1732,11 +1727,10 @@ function PB._encoded_size(x::WhileLoopBackendConfig) !isnothing(x.known_trip_count) && (encoded_size += PB._encoded_size(x.known_trip_count, 1)) !isnothing(x.known_init_step) && (encoded_size += PB._encoded_size(x.known_init_step, 2)) !isnothing(x.known_induction_variable) && (encoded_size += PB._encoded_size(x.known_induction_variable, 3)) - !isempty(x.dynamic_variable_tuple_indices) && (encoded_size += PB._encoded_size(x.dynamic_variable_tuple_indices, 4)) return encoded_size end -struct PaddingConfig +mutable struct PaddingConfig dimensions::Vector{var"PaddingConfig.PaddingConfigDimension"} end PB.default_values(::Type{PaddingConfig}) = (;dimensions = Vector{var"PaddingConfig.PaddingConfigDimension"}()) @@ -1766,7 +1760,7 @@ function PB._encoded_size(x::PaddingConfig) return encoded_size end -struct GemmPerfTableEntryValues +mutable struct GemmPerfTableEntryValues entries::Vector{GemmPerfTableEntry} end PB.default_values(::Type{GemmPerfTableEntryValues}) = (;entries = Vector{GemmPerfTableEntry}()) @@ -1796,7 +1790,7 @@ function PB._encoded_size(x::GemmPerfTableEntryValues) return encoded_size end -struct PrecisionConfig +mutable struct PrecisionConfig operand_precision::Vector{var"PrecisionConfig.Precision".T} algorithm::var"PrecisionConfig.Algorithm".T end @@ -1832,7 +1826,7 @@ function PB._encoded_size(x::PrecisionConfig) return encoded_size end -struct AxisRefProto +mutable struct AxisRefProto mesh_axis_index::Int64 sub_axis_info::Union{Nothing,var"AxisRefProto.SubAxis"} end @@ -1868,7 +1862,7 @@ function PB._encoded_size(x::AxisRefProto) return encoded_size end -struct var"OpMetadata.ProfileInfo" +mutable struct var"OpMetadata.ProfileInfo" profile_type::Vector{ProfileType.T} relative_speedup::Float64 profile_source::ProfileSource.T @@ -1922,7 +1916,7 @@ function PB._encoded_size(x::var"OpMetadata.ProfileInfo") return encoded_size end -struct MeshProto +mutable struct MeshProto axes::Vector{var"MeshProto.MeshAxis"} device_ids::Vector{Int64} end @@ -1958,7 +1952,7 @@ function PB._encoded_size(x::MeshProto) return encoded_size end -struct OriginalValueProto +mutable struct OriginalValueProto elements::Vector{OriginalValueElementProto} is_synthetic_call::Bool end @@ -1994,7 +1988,7 @@ function PB._encoded_size(x::OriginalValueProto) return encoded_size end -struct GemmPerfTable +mutable struct GemmPerfTable entries::Dict{String,GemmPerfTableEntryValues} end PB.default_values(::Type{GemmPerfTable}) = (;entries = Dict{String,GemmPerfTableEntryValues}()) @@ -2024,7 +2018,7 @@ function PB._encoded_size(x::GemmPerfTable) return encoded_size end -struct var"NamedShardingProto.DimensionSharding" +mutable struct var"NamedShardingProto.DimensionSharding" axes::Vector{AxisRefProto} is_closed::Bool end @@ -2060,7 +2054,7 @@ function PB._encoded_size(x::var"NamedShardingProto.DimensionSharding") return encoded_size end -struct OpMetadata +mutable struct OpMetadata op_type::String op_name::String source_file::String @@ -2169,7 +2163,7 @@ function PB._encoded_size(x::OpMetadata) return encoded_size end -struct MeshAxesReplicaGroupListProto +mutable struct MeshAxesReplicaGroupListProto mesh::Union{Nothing,MeshProto} axes::Vector{AxisRefProto} end @@ -2205,7 +2199,7 @@ function PB._encoded_size(x::MeshAxesReplicaGroupListProto) return encoded_size end -struct NamedShardingProto +mutable struct NamedShardingProto mesh::Union{Nothing,MeshProto} dim_shardings::Vector{var"NamedShardingProto.DimensionSharding"} replicated_axes::Vector{AxisRefProto} @@ -2261,7 +2255,7 @@ function PB._encoded_size(x::NamedShardingProto) end # Stub definitions for cyclic types -struct var"##Stub#LayoutProto"{T1<:var"##Abstract#ShapeProto"} <: var"##Abstract#LayoutProto" +mutable struct var"##Stub#LayoutProto"{T1<:var"##Abstract#ShapeProto"} <: var"##Abstract#LayoutProto" minor_to_major::Vector{Int64} dim_level_types::Vector{DimLevelType.T} dim_unique::Vector{Bool} @@ -2277,7 +2271,7 @@ struct var"##Stub#LayoutProto"{T1<:var"##Abstract#ShapeProto"} <: var"##Abstract split_configs::Vector{SplitConfigProto} end -struct var"##Stub#LiteralProto"{T1<:var"##Abstract#ShapeProto"} <: var"##Abstract#LiteralProto" +mutable struct var"##Stub#LiteralProto"{T1<:var"##Abstract#ShapeProto"} <: var"##Abstract#LiteralProto" shape::Union{Nothing,T1} preds::Vector{Bool} s1s::Vector{UInt8} @@ -2313,7 +2307,7 @@ struct var"##Stub#LiteralProto"{T1<:var"##Abstract#ShapeProto"} <: var"##Abstrac sparse_indices::Vector{Int64} end -struct var"##Stub#OpSharding"{T1<:var"##Abstract#ShapeProto"} <: var"##Abstract#OpSharding" +mutable struct var"##Stub#OpSharding"{T1<:var"##Abstract#ShapeProto"} <: var"##Abstract#OpSharding" var"#type"::var"OpSharding.Type".T tile_shape::Union{Nothing,T1} tile_assignment_dimensions::Vector{Int64} @@ -2330,13 +2324,13 @@ struct var"##Stub#OpSharding"{T1<:var"##Abstract#ShapeProto"} <: var"##Abstract# named_sharding::Union{Nothing,NamedShardingProto} end -struct var"##Stub#ProgramShapeProto"{T1<:var"##Abstract#ShapeProto"} <: var"##Abstract#ProgramShapeProto" +mutable struct var"##Stub#ProgramShapeProto"{T1<:var"##Abstract#ShapeProto"} <: var"##Abstract#ProgramShapeProto" parameters::Vector{T1} result::Union{Nothing,T1} parameter_names::Vector{String} end -struct var"##Stub#ShapeProto" <: var"##Abstract#ShapeProto" +mutable struct var"##Stub#ShapeProto" <: var"##Abstract#ShapeProto" element_type::PrimitiveType.T dimensions::Vector{Int64} is_dynamic_dimension::Vector{Bool} diff --git a/src/proto/xla/xla_pb.jl b/src/proto/xla/xla_pb.jl index 7cb7a44090..19936eab5a 100644 --- a/src/proto/xla/xla_pb.jl +++ b/src/proto/xla/xla_pb.jl @@ -7,13 +7,12 @@ export var"DebugOptions.LibNvJitLinkMode", var"DebugOptions.LibraryFusionType" export var"DebugOptions.PipelineParallelismOptLevel", ShardableValueUpdatePairProto export var"DebugOptions.AutotuneCacheMode", var"DebugOptions.CommandBufferSchedulingMode" export var"DebugOptions.PartitioningAlgorithm", var"HloModuleConfigProto.Int64List" -export var"ScheduleProto.SchedulerStatisticsProto", var"DebugOptions.AutotuneBackend" -export var"HloModuleConfigProto.BoolList", var"DebugOptions.StepMarkerLocation" -export var"DebugOptions.CollectiveOpType", var"DebugOptions.ShapeChecks" -export var"DebugOptions.XnnGraphFusionMode", GpuCompilationEnvironment -export var"ScheduleConfigProto.Instruction", var"DebugOptions.WhileLoopUnrolling" -export NodeShardingConfigProto, IntRangeInclusive, CompilationEnvironmentsProto -export var"DebugOptions.PGLEStrictnessLevel" +export var"ScheduleProto.SchedulerStatisticsProto", var"HloModuleConfigProto.BoolList" +export var"DebugOptions.StepMarkerLocation", var"DebugOptions.CollectiveOpType" +export var"DebugOptions.ShapeChecks", var"DebugOptions.XnnGraphFusionMode" +export GpuCompilationEnvironment, var"ScheduleConfigProto.Instruction" +export var"DebugOptions.WhileLoopUnrolling", NodeShardingConfigProto, IntRangeInclusive +export CompilationEnvironmentsProto, var"DebugOptions.PGLEStrictnessLevel" export var"HloModuleConfigProto.FusionConfigCollection", var"DebugOptions.DetectionMode" export var"DebugOptions.CommandBufferCmdType", var"HloModuleConfigProto.Int64ListList" export var"ScheduleProto.ComputationScheduleProto", ScheduleConfigProto @@ -21,7 +20,7 @@ export ShardingConfigProto, ThunkBufferDebugFilter, ScheduleProto, DebugOptions export HloModuleConfigProto, ExecutionOptions, HloModuleProtoWithConfig -struct var"ScheduleProto.Instruction" +mutable struct var"ScheduleProto.Instruction" id::Int64 start_timestamp_cycles::Float64 end_timestamp_cycles::Float64 @@ -84,7 +83,7 @@ end @enumx var"DebugOptions.PipelineParallelismOptLevel" PIPELINE_PARALLELISM_OPT_LEVEL_DISABLE=0 PIPELINE_PARALLELISM_OPT_LEVEL_ENABLE=1 PB.reserved_fields(::Type{var"DebugOptions.PipelineParallelismOptLevel".T}) = (names = ["PIPELINE_PARALLELISM_OPT_LEVEL_ENABLE_CYCLE_DECOMPOSER"], numbers = Union{Int,UnitRange{Int}}[2]) -struct ShardableValueUpdatePairProto +mutable struct ShardableValueUpdatePairProto input_parameter_number::Int64 parameter_shape_index::Vector{Int64} output_shape_index::Vector{Int64} @@ -132,7 +131,7 @@ end @enumx var"DebugOptions.PartitioningAlgorithm" PARTITIONING_ALGORITHM_NOOP=0 PARTITIONING_ALGORITHM_EXP0=1 PARTITIONING_ALGORITHM_EXP1=2 PARTITIONING_ALGORITHM_EXP2=3 -struct var"HloModuleConfigProto.Int64List" +mutable struct var"HloModuleConfigProto.Int64List" vals::Vector{Int64} end PB.default_values(::Type{var"HloModuleConfigProto.Int64List"}) = (;vals = Vector{Int64}()) @@ -162,7 +161,7 @@ function PB._encoded_size(x::var"HloModuleConfigProto.Int64List") return encoded_size end -struct var"ScheduleProto.SchedulerStatisticsProto" +mutable struct var"ScheduleProto.SchedulerStatisticsProto" all_gather_wasted_cycles::Float64 all_reduce_wasted_cycles::Float64 collective_broadcast_wasted_cycles::Float64 @@ -264,9 +263,7 @@ function PB._encoded_size(x::var"ScheduleProto.SchedulerStatisticsProto") return encoded_size end -@enumx var"DebugOptions.AutotuneBackend" AUTOTUNE_BACKEND_ALL=0 AUTOTUNE_BACKEND_CUDNN=1 AUTOTUNE_BACKEND_TRITON=2 AUTOTUNE_BACKEND_CUBLAS=3 AUTOTUNE_BACKEND_CUBLASLT=4 - -struct var"HloModuleConfigProto.BoolList" +mutable struct var"HloModuleConfigProto.BoolList" vals::Vector{Bool} end PB.default_values(::Type{var"HloModuleConfigProto.BoolList"}) = (;vals = Vector{Bool}()) @@ -304,7 +301,7 @@ end @enumx var"DebugOptions.XnnGraphFusionMode" XNN_GRAPH_FUSION_MODE_DISABLED=0 XNN_GRAPH_FUSION_MODE_GREEDY=1 XNN_GRAPH_FUSION_MODE_GREEDY_SLINKY=2 XNN_GRAPH_FUSION_MODE_BYPASS_COST_MODEL=3 -struct GpuCompilationEnvironment +mutable struct GpuCompilationEnvironment dummy_flag::Int64 end PB.default_values(::Type{GpuCompilationEnvironment}) = (;dummy_flag = zero(Int64)) @@ -334,7 +331,7 @@ function PB._encoded_size(x::GpuCompilationEnvironment) return encoded_size end -struct var"ScheduleConfigProto.Instruction" +mutable struct var"ScheduleConfigProto.Instruction" name::String end PB.default_values(::Type{var"ScheduleConfigProto.Instruction"}) = (;name = "") @@ -366,7 +363,7 @@ end @enumx var"DebugOptions.WhileLoopUnrolling" WHILE_LOOP_UNROLLING_NO_UNROLL=0 WHILE_LOOP_UNROLLING_DOUBLE_BUFFER=1 WHILE_LOOP_UNROLLING_FULL_UNROLL=2 WHILE_LOOP_UNROLLING_AUTO_UNROLL=3 -struct NodeShardingConfigProto +mutable struct NodeShardingConfigProto sharding::Union{Nothing,OpSharding} nodes::Vector{NodeShardingConfigProto} end @@ -402,7 +399,7 @@ function PB._encoded_size(x::NodeShardingConfigProto) return encoded_size end -struct IntRangeInclusive +mutable struct IntRangeInclusive first::Int64 last::Int64 end @@ -438,7 +435,7 @@ function PB._encoded_size(x::IntRangeInclusive) return encoded_size end -struct CompilationEnvironmentsProto +mutable struct CompilationEnvironmentsProto environments::Vector{google.protobuf.var"#Any"} end PB.default_values(::Type{CompilationEnvironmentsProto}) = (;environments = Vector{google.protobuf.var"#Any"}()) @@ -476,7 +473,7 @@ end @enumx var"DebugOptions.CommandBufferCmdType" INVALID=0 FUSION=1 CUBLAS=2 CUDNN=3 COLLECTIVES=4 CONDITIONAL=5 WHILE=6 CUSTOM_CALL=7 CUBLASLT=8 DYNAMIC_SLICE_FUSION=9 DYNAMIC_SLICE_COPY_FUSION=10 -struct var"HloModuleConfigProto.Int64ListList" +mutable struct var"HloModuleConfigProto.Int64ListList" lists::Vector{var"HloModuleConfigProto.Int64List"} end PB.default_values(::Type{var"HloModuleConfigProto.Int64ListList"}) = (;lists = Vector{var"HloModuleConfigProto.Int64List"}()) @@ -506,7 +503,7 @@ function PB._encoded_size(x::var"HloModuleConfigProto.Int64ListList") return encoded_size end -struct var"ScheduleProto.ComputationScheduleProto" +mutable struct var"ScheduleProto.ComputationScheduleProto" computation_id::Int64 instructions::Vector{var"ScheduleProto.Instruction"} scheduler_statistics::Union{Nothing,var"ScheduleProto.SchedulerStatisticsProto"} @@ -554,7 +551,7 @@ function PB._encoded_size(x::var"ScheduleProto.ComputationScheduleProto") return encoded_size end -struct ScheduleConfigProto +mutable struct ScheduleConfigProto sequence::Vector{var"ScheduleConfigProto.Instruction"} end PB.default_values(::Type{ScheduleConfigProto}) = (;sequence = Vector{var"ScheduleConfigProto.Instruction"}()) @@ -584,7 +581,7 @@ function PB._encoded_size(x::ScheduleConfigProto) return encoded_size end -struct ShardingConfigProto +mutable struct ShardingConfigProto nodes::Vector{NodeShardingConfigProto} end PB.default_values(::Type{ShardingConfigProto}) = (;nodes = Vector{NodeShardingConfigProto}()) @@ -614,7 +611,7 @@ function PB._encoded_size(x::ShardingConfigProto) return encoded_size end -struct ThunkBufferDebugFilter +mutable struct ThunkBufferDebugFilter thunk_id_ranges::Vector{IntRangeInclusive} profile_annotation_regexes::Vector{String} end @@ -650,7 +647,7 @@ function PB._encoded_size(x::ThunkBufferDebugFilter) return encoded_size end -struct ScheduleProto +mutable struct ScheduleProto hlo_module::Union{Nothing,HloModuleProto} computation_schedules::Vector{var"ScheduleProto.ComputationScheduleProto"} end @@ -686,7 +683,7 @@ function PB._encoded_size(x::ScheduleProto) return encoded_size end -struct DebugOptions +mutable struct DebugOptions xla_allow_h2h_copy_when_automatic_host_compute_offload_disabled::Bool xla_disable_automatic_host_compute_offload::Bool xla_enable_scoped_logging_timers::Bool @@ -747,7 +744,6 @@ struct DebugOptions xla_gpu_cuda_data_dir::String xla_gpu_cudnn_gemm_fusion_level::Int32 xla_gpu_cudnn_gemm_max_plans::Int32 - xla_gpu_default_to_alg_dot_bf16_bf16_f32::Bool xla_gpu_deterministic_ops::Bool xla_gpu_disable_async_collectives::Vector{var"DebugOptions.CollectiveOpType".T} xla_gpu_disable_gpuasm_optimizations::Bool @@ -778,7 +774,6 @@ struct DebugOptions xla_gpu_enable_pipelined_all_gather::Bool xla_gpu_enable_pipelined_all_reduce::Bool xla_gpu_enable_pipelined_collectives::Bool - xla_gpu_enable_pipelined_host_offloading::Bool xla_gpu_enable_pipelined_p2p::Bool xla_gpu_enable_pipelined_reduce_scatter::Bool xla_gpu_enable_reassociation_for_converted_ar::Bool @@ -798,7 +793,6 @@ struct DebugOptions xla_gpu_exhaustive_tiling_search::Bool xla_gpu_experimental_allow_unroll_factor_eight::Bool xla_gpu_experimental_aot_compiled_thunks::Bool - xla_gpu_experimental_autotune_backends::Vector{var"DebugOptions.AutotuneBackend".T} xla_gpu_experimental_autotune_cache_mode::var"DebugOptions.AutotuneCacheMode".T xla_gpu_experimental_autotuner_cache_dir::String xla_gpu_experimental_collective_cse_distance_threshold::Int64 @@ -959,8 +953,8 @@ struct DebugOptions xla_backend_extra_options::Dict{String,String} end PB.reserved_fields(::Type{DebugOptions}) = (names = ["hlo_reduce_precision_options", "legacy_command_buffer_custom_call_targets", "xla_allow_get_default_platform", "xla_cpu_dump_unoptimized_hlo_snapshots", "xla_cpu_enable_custom_matmul_tiling", "xla_cpu_enable_experimental_deallocation", "xla_cpu_enable_mlir_fusion_outlining", "xla_cpu_enable_mlir_lowering", "xla_cpu_enable_mlir_tiling_and_fusion", "xla_cpu_matmul_tiling_k_dim", "xla_cpu_matmul_tiling_m_dim", "xla_cpu_matmul_tiling_n_dim", "xla_cpu_sparse_cuda_threads", "xla_cpu_use_thunk_runtime", "xla_cpu_use_xla_runtime", "xla_detailed_logging_and_dumping", "xla_dump_ir", "xla_experimental_exec_time_optimization_effort", "xla_gpu_all_reduce_contiguous", "xla_gpu_allow_all_reduce_kernel", "xla_gpu_asm_extra_flags", "xla_gpu_bef_executable", "xla_gpu_bef_thunk", "xla_gpu_deterministic_reductions", "xla_gpu_disable_multi_streaming", "xla_gpu_dump_hlo_unoptimized_snapshots", "xla_gpu_enable_all_reduce_splitter", "xla_gpu_enable_async_all_gather", "xla_gpu_enable_async_all_reduce", "xla_gpu_enable_async_all_to_all", "xla_gpu_enable_async_collective_broadcast", "xla_gpu_enable_async_collective_permute", "xla_gpu_enable_async_collectives", "xla_gpu_enable_async_reduce_scatter", "xla_gpu_enable_bf16_3way_gemm", "xla_gpu_enable_bf16_6way_gemm", "xla_gpu_enable_cuda_graphs", "xla_gpu_enable_cudnn_fmha", "xla_gpu_enable_cudnn_frontend", "xla_gpu_enable_custom_fusions_re", "xla_gpu_enable_custom_fusions", "xla_gpu_enable_dot_strength_reduction", "xla_gpu_enable_experimental_block_size", "xla_gpu_enable_gpu2_hal", "xla_gpu_enable_gpu2_runtime", "xla_gpu_enable_heuristic_pass_configuration", "xla_gpu_enable_libnvjitlink", "xla_gpu_enable_mlir_emitters", "xla_gpu_enable_mlir_lowering", "xla_gpu_enable_nccl_per_stream_comms", "xla_gpu_enable_persistent_temp_buffers", "xla_gpu_enable_pgle_accuracy_checker", "xla_gpu_enable_priority_fusion", "xla_gpu_enable_softmax_fusion", "xla_gpu_enable_triton_gemm_int4", "xla_gpu_enable_triton_hopper", "xla_gpu_enable_triton_softmax_fusion", "xla_gpu_enable_triton_softmax_priority_fusion", "xla_gpu_enable_xla_runtime_executable", "xla_gpu_ensure_minor_dot_contraction_dims", "xla_gpu_experimental_enable_dynamic_dot_search_space", "xla_gpu_experimental_enable_nan_counter_on_thunks", "xla_gpu_experimental_enable_triton_i4_rewrites", "xla_gpu_experimental_enable_triton_softmax_priority_fusion", "xla_gpu_graph_eviction_timeout_seconds", "xla_gpu_graph_level", "xla_gpu_graph_num_runs_to_instantiate", "xla_gpu_lhs_enable_gpu_async_tracker", "xla_gpu_max_kernel_unroll_factor", "xla_gpu_max_mlir_kernels", "xla_gpu_mlir_emitter_level", "xla_gpu_normalize_layouts", "xla_gpu_redzone_scratch_max_megabytes", "xla_gpu_run_post_layout_collective_pipeliner", "xla_gpu_simplify_all_fp_conversions", "xla_gpu_simplify_gathers", "xla_gpu_simplify_scatters", "xla_gpu_single_wave_autotuning", "xla_gpu_skip_mlir_kernels", "xla_gpu_triton_fusion_level", "xla_gpu_triton_gemm_disable_reduced_precision_reduction", "xla_gpu_unsafe_fallback_to_driver_on_ptxas_error", "xla_gpu_unsafe_pipelined_loop_annotator", "xla_gpu_unsupported_enable_generic_triton_emitter_for_gemms", "xla_gpu_unsupported_force_triton_gemm", "xla_gpu_unsupported_generic_triton_emitter_features", "xla_gpu_use_cudnn_batchnorm", "xla_gpu_use_horizontal_fusion", "xla_gpu_use_random_streams", "xla_hlo_dump_as_graphdef", "xla_hlo_tfgraph_device_scopes", "xla_use_shardy", "xla_gpu_unsupported_annotate_with_emitter_loc", "xla_gpu_experimental_enable_command_buffer_on_thunks", "xla_gpu_experimental_enable_triton_tma"], numbers = Union{Int,UnitRange{Int}}[5, 63, 80, 93, 94, 98, 117, 130, 133, 134, 139, 141, 143, 152, 158, 160, 161, 162, 167, 168, 169, 171, 172, 173, 176, 177, 178, 179, 180, 183, 184, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 204, 206, 207, 211, 214, 218, 220, 221, 226, 229, 230, 233, 234, 238, 242, 249, 263, 264, 266, 270, 271, 275, 276, 278, 279, 281, 282, 286, 298, 299, 302, 303, 309, 313, 314, 319, 320, 325, 326, 332, 346, 352, 355, 358, 361, 367, 369, 371, 385, 394, 398, 402, 423]) -PB.default_values(::Type{DebugOptions}) = (;xla_allow_h2h_copy_when_automatic_host_compute_offload_disabled = false, xla_disable_automatic_host_compute_offload = false, xla_enable_scoped_logging_timers = false, xla_hlo_pass_fix_detect_cycles = false, xla_keep_shardings_after_spmd = false, xla_unsupported_crash_on_hlo_pass_fix_max_iterations = false, xla_unsupported_crash_on_hlo_pass_noop_change = false, xla_unsupported_crash_on_hlo_pass_silent_hlo_change = false, xla_cpu_collective_call_terminate_timeout_seconds = zero(Int32), xla_cpu_collective_call_warn_stuck_seconds = zero(Int32), xla_cpu_collective_timeout_seconds = zero(Int32), xla_cpu_copy_insertion_use_region_analysis = false, xla_cpu_emitter_verification_level = zero(Int32), xla_cpu_enable_concurrency_optimized_scheduler = false, xla_cpu_enable_fast_math = false, xla_cpu_enable_fast_min_max = false, xla_cpu_enable_platform_dependent_math = false, xla_cpu_experimental_onednn_custom_call = false, xla_cpu_experimental_onednn_fusion_type = Vector{var"DebugOptions.LibraryFusionType".T}(), xla_cpu_experimental_xnn_fusion_type = Vector{var"DebugOptions.LibraryFusionType".T}(), xla_cpu_experimental_xnn_graph_fusion_mode = var"DebugOptions.XnnGraphFusionMode".XNN_GRAPH_FUSION_MODE_DISABLED, xla_cpu_experimental_ynn_fusion_type = Vector{var"DebugOptions.LibraryFusionType".T}(), xla_cpu_fast_math_honor_division = false, xla_cpu_fast_math_honor_functions = false, xla_cpu_fast_math_honor_infs = false, xla_cpu_fast_math_honor_nans = false, xla_cpu_generate_unique_c_style_kernel_entry_points = false, xla_cpu_max_isa = "", xla_cpu_parallel_codegen_split_count = zero(Int32), xla_cpu_prefer_vector_width = zero(Int32), xla_cpu_use_fusion_emitters = false, xla_cpu_use_xnnpack = false, xla_enable_fast_math = false, xla_gpu_experimental_thunk_buffer_debug_filter = nothing, xla_dump_hlo_unoptimized_snapshots = false, xla_enable_enzyme_comms_opt = false, xla_gpu_algorithm_denylist_path = "", xla_gpu_all_gather_combine_threshold_bytes = zero(Int64), xla_gpu_all_reduce_blueconnect_num_devices_per_host = zero(Int32), xla_gpu_all_reduce_combine_threshold_bytes = zero(Int64), xla_gpu_analytical_latency_estimator_options = Dict{String,String}(), xla_gpu_async_dot = false, xla_gpu_auto_spmd_partitioning_memory_budget_gb = zero(Int32), xla_gpu_auto_spmd_partitioning_memory_budget_ratio = zero(Float32), xla_gpu_autotune_gemm_rtol = zero(Float32), xla_gpu_autotune_level = zero(Int32), xla_gpu_autotune_max_solutions = zero(Int64), xla_gpu_collect_cost_model_stats = false, xla_gpu_collective_inflation_factor = zero(Int32), xla_gpu_collective_permute_combine_threshold_bytes = zero(Int64), xla_gpu_collective_permute_decomposer_threshold = zero(Int64), xla_gpu_collectives_use_persistent_cliques = false, xla_gpu_command_buffer_scheduling_mode = var"DebugOptions.CommandBufferSchedulingMode".SERIALIZE, xla_gpu_command_buffer_unroll_loops = false, xla_gpu_copy_insertion_use_region_analysis = false, xla_gpu_crash_on_verification_failures = false, xla_gpu_cublas_fallback = false, xla_gpu_cuda_data_dir = "", xla_gpu_cudnn_gemm_fusion_level = zero(Int32), xla_gpu_cudnn_gemm_max_plans = zero(Int32), xla_gpu_default_to_alg_dot_bf16_bf16_f32 = false, xla_gpu_deterministic_ops = false, xla_gpu_disable_async_collectives = Vector{var"DebugOptions.CollectiveOpType".T}(), xla_gpu_disable_gpuasm_optimizations = false, xla_gpu_dot_merger_threshold_mb = zero(Int32), xla_gpu_dump_autotune_logs_to = "", xla_gpu_dump_autotune_results_to = "", xla_gpu_dump_autotuned_gemm_fusions = false, xla_gpu_dump_llvmir = false, xla_gpu_enable_all_gather_combine_by_dim = false, xla_gpu_enable_analytical_latency_estimator = false, xla_gpu_enable_analytical_sol_latency_estimator = false, xla_gpu_enable_approx_costly_collectives = false, xla_gpu_enable_command_buffer = Vector{var"DebugOptions.CommandBufferCmdType".T}(), xla_gpu_enable_cub_radix_sort = false, xla_gpu_enable_cublaslt = false, xla_gpu_enable_cudnn_int8x32_convolution_reordering = false, xla_gpu_enable_cudnn_layer_norm = false, xla_gpu_enable_dynamic_slice_fusion = false, xla_gpu_enable_fast_min_max = false, xla_gpu_enable_highest_priority_async_stream = false, xla_gpu_enable_host_memory_offloading = false, xla_gpu_enable_latency_hiding_scheduler = false, xla_gpu_enable_libnvptxcompiler = false, xla_gpu_enable_llvm_module_compilation_parallelism = false, xla_gpu_enable_nccl_clique_optimization = false, xla_gpu_enable_nccl_comm_splitting = false, xla_gpu_enable_nccl_user_buffers = false, xla_gpu_enable_pipelined_all_gather = false, xla_gpu_enable_pipelined_all_reduce = false, xla_gpu_enable_pipelined_collectives = false, xla_gpu_enable_pipelined_host_offloading = false, xla_gpu_enable_pipelined_p2p = false, xla_gpu_enable_pipelined_reduce_scatter = false, xla_gpu_enable_reassociation_for_converted_ar = false, xla_gpu_enable_reduce_scatter_combine_by_dim = false, xla_gpu_enable_reduction_epilogue_fusion = false, xla_gpu_enable_scatter_determinism_expander = false, xla_gpu_enable_shared_constants = false, xla_gpu_enable_split_k_autotuning = false, xla_gpu_enable_triton_gemm = false, xla_gpu_enable_while_loop_double_buffering = false, xla_gpu_enable_while_loop_reduce_scatter_code_motion = false, xla_gpu_enable_while_loop_unrolling = var"DebugOptions.WhileLoopUnrolling".WHILE_LOOP_UNROLLING_NO_UNROLL, xla_gpu_exclude_nondeterministic_ops = false, xla_gpu_executable_embed_debug_info = false, xla_gpu_executable_terminate_timeout_seconds = zero(Int32), xla_gpu_executable_warn_stuck_timeout_seconds = zero(Int32), xla_gpu_exhaustive_tiling_search = false, xla_gpu_experimental_allow_unroll_factor_eight = false, xla_gpu_experimental_aot_compiled_thunks = false, xla_gpu_experimental_autotune_backends = Vector{var"DebugOptions.AutotuneBackend".T}(), xla_gpu_experimental_autotune_cache_mode = var"DebugOptions.AutotuneCacheMode".AUTOTUNE_CACHE_MODE_UNSPECIFIED, xla_gpu_experimental_autotuner_cache_dir = "", xla_gpu_experimental_collective_cse_distance_threshold = zero(Int64), xla_gpu_experimental_collective_perf_table_path = "", xla_gpu_experimental_disable_binary_libraries = false, xla_gpu_experimental_dump_fdo_profiles = false, xla_gpu_experimental_dump_gpu_executable = false, xla_gpu_experimental_enable_alltoall_windowed_einsum = false, xla_gpu_experimental_enable_buffer_saver_on_thunks = false, xla_gpu_experimental_enable_checksum_tracing_on_thunks = false, xla_gpu_experimental_enable_fusion_autotuner = false, xla_gpu_experimental_enable_fusion_block_level_rewriter = false, xla_gpu_experimental_enable_heuristic_collective_combining = false, xla_gpu_experimental_enable_nccl_symmetric_buffers = false, xla_gpu_experimental_enable_nvshmem = false, xla_gpu_experimental_enable_split_k_rewrite = false, xla_gpu_experimental_enable_subchannel_dequantisation_fusion = false, xla_gpu_experimental_enable_triton_heroless_priority_fusion = false, xla_gpu_experimental_enable_triton_warp_specialization = false, xla_gpu_experimental_pack_dot_operands_along_k_dimension = false, xla_gpu_experimental_parallel_collective_overlap_limit = zero(Int32), xla_gpu_experimental_pipeline_parallelism_opt_level = var"DebugOptions.PipelineParallelismOptLevel".PIPELINE_PARALLELISM_OPT_LEVEL_DISABLE, xla_gpu_experimental_stream_annotation = false, xla_gpu_experimental_use_autotuner_pass = false, xla_gpu_experimental_use_ragged_dot_fusion = false, xla_gpu_fail_ptx_compilation_on_register_spilling = false, xla_gpu_filter_kernels_spilling_registers_on_autotuning = false, xla_gpu_first_collective_call_terminate_timeout_seconds = zero(Int32), xla_gpu_first_collective_call_warn_stuck_timeout_seconds = zero(Int32), xla_gpu_force_compilation_parallelism = zero(Int32), xla_gpu_force_conv_nchw = false, xla_gpu_force_conv_nhwc = false, xla_gpu_ftz = false, xla_gpu_fused_attention_use_cudnn_rng = false, xla_gpu_gemm_autotuner_override_file = "", xla_gpu_gemm_rewrite_size_threshold = zero(Int64), xla_gpu_generate_debug_info = false, xla_gpu_generate_line_info = false, xla_gpu_graph_enable_concurrent_region = false, xla_gpu_graph_min_graph_size = zero(Int32), xla_gpu_kernel_cache_file = "", xla_gpu_libnvjitlink_mode = var"DebugOptions.LibNvJitLinkMode".LIB_NV_JIT_LINK_MODE_AUTO, xla_gpu_llvm_ir_file = Vector{String}(), xla_gpu_llvm_verification_level = zero(Int32), xla_gpu_load_autotune_results_from = "", xla_gpu_memory_limit_slop_factor = zero(Int32), xla_gpu_mock_custom_calls = false, xla_gpu_multi_streamed_windowed_einsum = false, xla_gpu_nccl_async_execution = false, xla_gpu_nccl_blocking_communicators = false, xla_gpu_nccl_collective_max_nchannels = zero(Int64), xla_gpu_nccl_init_max_rank_per_root_ratio = zero(Int64), xla_gpu_nccl_p2p_max_nchannels = zero(Int64), xla_gpu_nccl_terminate_on_error = false, xla_gpu_nccl_termination_timeout_seconds = zero(Int64), xla_gpu_operand_bytes_threshold_for_windowed_einsum = zero(Int64), xla_gpu_override_gemm_autotuner = "", xla_gpu_per_fusion_autotune_cache_dir = "", xla_gpu_pgle_accuracy_checker = var"DebugOptions.PGLEStrictnessLevel".PGLE_STRICTNESS_LEVEL_OFF, xla_gpu_pgle_profile_file_or_directory_path = "", xla_gpu_ptx_file = Vector{String}(), xla_gpu_reduce_scatter_combine_threshold_bytes = zero(Int64), xla_gpu_redzone_padding_bytes = zero(Int64), xla_gpu_require_complete_aot_autotune_results = false, xla_gpu_require_exclusive_lock = false, xla_gpu_shape_checks = var"DebugOptions.ShapeChecks".IGNORE, xla_gpu_shard_autotuning = false, xla_gpu_strict_conv_algorithm_picker = false, xla_gpu_target_config_filename = "", xla_gpu_temp_buffer_use_separate_color = false, xla_gpu_threshold_for_windowed_einsum_mib = zero(Int64), xla_gpu_triton_gemm_any = false, xla_gpu_unsafe_fallback_to_driver_on_ptxas_not_found = false, xla_gpu_unsupported_enable_all_reduce_decomposer = false, xla_gpu_unsupported_enable_ragged_all_to_all_decomposer = false, xla_gpu_unsupported_enable_ragged_all_to_all_multi_host_decomposer = false, xla_gpu_unsupported_enable_triton_gemm = false, xla_gpu_unsupported_enable_triton_multi_output_fusion = false, xla_gpu_unsupported_override_fast_interconnect_slice_size = zero(Int64), xla_gpu_unsupported_use_all_reduce_one_shot_kernel = false, xla_gpu_unsupported_use_ragged_all_to_all_one_shot_kernel = false, xla_gpu_use_embeded_device_lib = false, xla_gpu_use_inprocess_lld = false, xla_gpu_use_memcpy_local_p2p = false, xla_gpu_use_runtime_fusion = false, xla_gpu_verify_triton_fusion_numerics = false, xla_hlo_graph_addresses = false, xla_hlo_profile = false, xla_disable_hlo_passes = Vector{String}(), xla_enable_hlo_passes_only = Vector{String}(), xla_disable_all_hlo_passes = false, xla_backend_optimization_level = zero(Int32), xla_embed_ir_in_executable = false, xla_eliminate_hlo_implicit_broadcast = false, xla_cpu_multi_thread_eigen = false, xla_llvm_enable_alias_scope_metadata = false, xla_llvm_enable_noalias_metadata = false, xla_llvm_enable_invariant_load_metadata = false, xla_llvm_disable_expensive_passes = false, xla_test_all_output_layouts = false, xla_test_all_input_layouts = false, xla_hlo_graph_sharding_color = false, xla_cpu_use_onednn = false, xla_allow_excess_precision = false, xla_force_host_platform_device_count = zero(Int32), xla_hlo_evaluator_use_fast_path = false, xla_allow_scalar_index_dynamic_ops = false, xla_step_marker_location = var"DebugOptions.StepMarkerLocation".STEP_MARK_AT_ENTRY, xla_dump_to = "", xla_flags_reset = false, xla_dump_hlo_module_re = "", xla_dump_hlo_pass_re = "", xla_dump_emitter_re = "", xla_dump_hlo_as_text = false, xla_dump_hlo_as_proto = false, xla_dump_hlo_as_dot = false, xla_dump_hlo_as_url = false, xla_dump_hlo_as_html = false, xla_dump_fusion_visualization = false, xla_dump_hlo_snapshots = false, xla_dump_include_timestamp = false, xla_dump_max_hlo_modules = zero(Int32), xla_dump_module_metadata = false, xla_dump_compress_protos = false, xla_dump_hlo_as_long_text = false, xla_dump_enable_mlir_pretty_form = false, xla_dump_full_hlo_config = false, xla_tpu_detect_nan = false, xla_tpu_detect_inf = false, xla_cpu_enable_xprof_traceme = false, xla_multiheap_size_constraint_per_heap = zero(Int32), xla_detailed_logging = false, xla_enable_dumping = false, xla_llvm_force_inline_before_split = false, xla_dump_disable_metadata = false, xla_dump_hlo_pipeline_re = "", xla_cpu_use_acl = false, xla_cpu_strict_dot_conv_math = false, xla_dump_latency_hiding_schedule = false, xla_partitioning_algorithm = var"DebugOptions.PartitioningAlgorithm".PARTITIONING_ALGORITHM_NOOP, xla_debug_buffer_assignment_show_max = zero(Int64), xla_detect_unstable_reductions = var"DebugOptions.DetectionMode".DETECTION_MODE_NONE, xla_detect_unstable_reductions_post_optimizations = var"DebugOptions.DetectionMode".DETECTION_MODE_NONE, xla_gpu_detect_nan = var"DebugOptions.DetectionMode".DETECTION_MODE_NONE, xla_gpu_detect_inf = var"DebugOptions.DetectionMode".DETECTION_MODE_NONE, xla_dump_large_constants = false, xla_reduce_window_rewrite_base_length = zero(Int64), xla_cmd_buffer_trace_cache_size = zero(Int64), xla_syntax_sugar_async_ops = false, xla_enable_command_buffers_during_profiling = false, xla_ignore_channel_id = false, xla_pjrt_allow_auto_layout_in_hlo = false, xla_test_add_command_buffer_mode = false, xla_gpu_experimental_matmul_perf_table_path = "", xla_early_exit_with_layouts = false, xla_gpu_experimental_scaled_dot_with_triton = false, xla_gpu_experimental_use_raft_select_k = false, xla_backend_extra_options = Dict{String,String}()) -PB.field_numbers(::Type{DebugOptions}) = (;xla_allow_h2h_copy_when_automatic_host_compute_offload_disabled = 439, xla_disable_automatic_host_compute_offload = 408, xla_enable_scoped_logging_timers = 436, xla_hlo_pass_fix_detect_cycles = 370, xla_keep_shardings_after_spmd = 419, xla_unsupported_crash_on_hlo_pass_fix_max_iterations = 363, xla_unsupported_crash_on_hlo_pass_noop_change = 379, xla_unsupported_crash_on_hlo_pass_silent_hlo_change = 380, xla_cpu_collective_call_terminate_timeout_seconds = 417, xla_cpu_collective_call_warn_stuck_seconds = 418, xla_cpu_collective_timeout_seconds = 438, xla_cpu_copy_insertion_use_region_analysis = 337, xla_cpu_emitter_verification_level = 395, xla_cpu_enable_concurrency_optimized_scheduler = 307, xla_cpu_enable_fast_math = 99, xla_cpu_enable_fast_min_max = 140, xla_cpu_enable_platform_dependent_math = 425, xla_cpu_experimental_onednn_custom_call = 412, xla_cpu_experimental_onednn_fusion_type = 399, xla_cpu_experimental_xnn_fusion_type = 400, xla_cpu_experimental_xnn_graph_fusion_mode = 365, xla_cpu_experimental_ynn_fusion_type = 422, xla_cpu_fast_math_honor_division = 126, xla_cpu_fast_math_honor_functions = 129, xla_cpu_fast_math_honor_infs = 121, xla_cpu_fast_math_honor_nans = 120, xla_cpu_generate_unique_c_style_kernel_entry_points = 372, xla_cpu_max_isa = 333, xla_cpu_parallel_codegen_split_count = 323, xla_cpu_prefer_vector_width = 308, xla_cpu_use_fusion_emitters = 376, xla_cpu_use_xnnpack = 359, xla_enable_fast_math = 335, xla_gpu_experimental_thunk_buffer_debug_filter = 424, xla_dump_hlo_unoptimized_snapshots = 405, xla_enable_enzyme_comms_opt = 429, xla_gpu_algorithm_denylist_path = 128, xla_gpu_all_gather_combine_threshold_bytes = 212, xla_gpu_all_reduce_blueconnect_num_devices_per_host = 159, xla_gpu_all_reduce_combine_threshold_bytes = 157, xla_gpu_analytical_latency_estimator_options = 357, xla_gpu_async_dot = 321, xla_gpu_auto_spmd_partitioning_memory_budget_gb = 224, xla_gpu_auto_spmd_partitioning_memory_budget_ratio = 225, xla_gpu_autotune_gemm_rtol = 316, xla_gpu_autotune_level = 123, xla_gpu_autotune_max_solutions = 288, xla_gpu_collect_cost_model_stats = 240, xla_gpu_collective_inflation_factor = 205, xla_gpu_collective_permute_combine_threshold_bytes = 378, xla_gpu_collective_permute_decomposer_threshold = 237, xla_gpu_collectives_use_persistent_cliques = 354, xla_gpu_command_buffer_scheduling_mode = 404, xla_gpu_command_buffer_unroll_loops = 411, xla_gpu_copy_insertion_use_region_analysis = 236, xla_gpu_crash_on_verification_failures = 101, xla_gpu_cublas_fallback = 247, xla_gpu_cuda_data_dir = 61, xla_gpu_cudnn_gemm_fusion_level = 285, xla_gpu_cudnn_gemm_max_plans = 318, xla_gpu_default_to_alg_dot_bf16_bf16_f32 = 441, xla_gpu_deterministic_ops = 148, xla_gpu_disable_async_collectives = 289, xla_gpu_disable_gpuasm_optimizations = 103, xla_gpu_dot_merger_threshold_mb = 331, xla_gpu_dump_autotune_logs_to = 292, xla_gpu_dump_autotune_results_to = 222, xla_gpu_dump_autotuned_gemm_fusions = 232, xla_gpu_dump_llvmir = 155, xla_gpu_enable_all_gather_combine_by_dim = 254, xla_gpu_enable_analytical_latency_estimator = 255, xla_gpu_enable_analytical_sol_latency_estimator = 356, xla_gpu_enable_approx_costly_collectives = 305, xla_gpu_enable_command_buffer = 258, xla_gpu_enable_cub_radix_sort = 259, xla_gpu_enable_cublaslt = 166, xla_gpu_enable_cudnn_int8x32_convolution_reordering = 189, xla_gpu_enable_cudnn_layer_norm = 262, xla_gpu_enable_dynamic_slice_fusion = 105, xla_gpu_enable_fast_min_max = 100, xla_gpu_enable_highest_priority_async_stream = 216, xla_gpu_enable_host_memory_offloading = 296, xla_gpu_enable_latency_hiding_scheduler = 186, xla_gpu_enable_libnvptxcompiler = 269, xla_gpu_enable_llvm_module_compilation_parallelism = 268, xla_gpu_enable_nccl_clique_optimization = 244, xla_gpu_enable_nccl_comm_splitting = 272, xla_gpu_enable_nccl_user_buffers = 267, xla_gpu_enable_pipelined_all_gather = 227, xla_gpu_enable_pipelined_all_reduce = 217, xla_gpu_enable_pipelined_collectives = 239, xla_gpu_enable_pipelined_host_offloading = 440, xla_gpu_enable_pipelined_p2p = 246, xla_gpu_enable_pipelined_reduce_scatter = 231, xla_gpu_enable_reassociation_for_converted_ar = 209, xla_gpu_enable_reduce_scatter_combine_by_dim = 257, xla_gpu_enable_reduction_epilogue_fusion = 243, xla_gpu_enable_scatter_determinism_expander = 345, xla_gpu_enable_shared_constants = 165, xla_gpu_enable_split_k_autotuning = 241, xla_gpu_enable_triton_gemm = 188, xla_gpu_enable_while_loop_double_buffering = 248, xla_gpu_enable_while_loop_reduce_scatter_code_motion = 203, xla_gpu_enable_while_loop_unrolling = 294, xla_gpu_exclude_nondeterministic_ops = 297, xla_gpu_executable_embed_debug_info = 437, xla_gpu_executable_terminate_timeout_seconds = 328, xla_gpu_executable_warn_stuck_timeout_seconds = 327, xla_gpu_exhaustive_tiling_search = 219, xla_gpu_experimental_allow_unroll_factor_eight = 430, xla_gpu_experimental_aot_compiled_thunks = 435, xla_gpu_experimental_autotune_backends = 442, xla_gpu_experimental_autotune_cache_mode = 324, xla_gpu_experimental_autotuner_cache_dir = 407, xla_gpu_experimental_collective_cse_distance_threshold = 374, xla_gpu_experimental_collective_perf_table_path = 377, xla_gpu_experimental_disable_binary_libraries = 329, xla_gpu_experimental_dump_fdo_profiles = 338, xla_gpu_experimental_dump_gpu_executable = 427, xla_gpu_experimental_enable_alltoall_windowed_einsum = 360, xla_gpu_experimental_enable_buffer_saver_on_thunks = 431, xla_gpu_experimental_enable_checksum_tracing_on_thunks = 414, xla_gpu_experimental_enable_fusion_autotuner = 409, xla_gpu_experimental_enable_fusion_block_level_rewriter = 334, xla_gpu_experimental_enable_heuristic_collective_combining = 366, xla_gpu_experimental_enable_nccl_symmetric_buffers = 406, xla_gpu_experimental_enable_nvshmem = 388, xla_gpu_experimental_enable_split_k_rewrite = 386, xla_gpu_experimental_enable_subchannel_dequantisation_fusion = 368, xla_gpu_experimental_enable_triton_heroless_priority_fusion = 340, xla_gpu_experimental_enable_triton_warp_specialization = 421, xla_gpu_experimental_pack_dot_operands_along_k_dimension = 362, xla_gpu_experimental_parallel_collective_overlap_limit = 336, xla_gpu_experimental_pipeline_parallelism_opt_level = 351, xla_gpu_experimental_stream_annotation = 342, xla_gpu_experimental_use_autotuner_pass = 396, xla_gpu_experimental_use_ragged_dot_fusion = 401, xla_gpu_fail_ptx_compilation_on_register_spilling = 353, xla_gpu_filter_kernels_spilling_registers_on_autotuning = 250, xla_gpu_first_collective_call_terminate_timeout_seconds = 392, xla_gpu_first_collective_call_warn_stuck_timeout_seconds = 391, xla_gpu_force_compilation_parallelism = 147, xla_gpu_force_conv_nchw = 125, xla_gpu_force_conv_nhwc = 146, xla_gpu_ftz = 62, xla_gpu_fused_attention_use_cudnn_rng = 235, xla_gpu_gemm_autotuner_override_file = 434, xla_gpu_gemm_rewrite_size_threshold = 283, xla_gpu_generate_debug_info = 348, xla_gpu_generate_line_info = 349, xla_gpu_graph_enable_concurrent_region = 215, xla_gpu_graph_min_graph_size = 208, xla_gpu_kernel_cache_file = 306, xla_gpu_libnvjitlink_mode = 343, xla_gpu_llvm_ir_file = 150, xla_gpu_llvm_verification_level = 256, xla_gpu_load_autotune_results_from = 223, xla_gpu_memory_limit_slop_factor = 260, xla_gpu_mock_custom_calls = 245, xla_gpu_multi_streamed_windowed_einsum = 280, xla_gpu_nccl_async_execution = 393, xla_gpu_nccl_blocking_communicators = 390, xla_gpu_nccl_collective_max_nchannels = 273, xla_gpu_nccl_init_max_rank_per_root_ratio = 277, xla_gpu_nccl_p2p_max_nchannels = 274, xla_gpu_nccl_terminate_on_error = 301, xla_gpu_nccl_termination_timeout_seconds = 163, xla_gpu_operand_bytes_threshold_for_windowed_einsum = 339, xla_gpu_override_gemm_autotuner = 295, xla_gpu_per_fusion_autotune_cache_dir = 310, xla_gpu_pgle_accuracy_checker = 341, xla_gpu_pgle_profile_file_or_directory_path = 210, xla_gpu_ptx_file = 127, xla_gpu_reduce_scatter_combine_threshold_bytes = 213, xla_gpu_redzone_padding_bytes = 228, xla_gpu_require_complete_aot_autotune_results = 284, xla_gpu_require_exclusive_lock = 347, xla_gpu_shape_checks = 170, xla_gpu_shard_autotuning = 304, xla_gpu_strict_conv_algorithm_picker = 156, xla_gpu_target_config_filename = 261, xla_gpu_temp_buffer_use_separate_color = 312, xla_gpu_threshold_for_windowed_einsum_mib = 265, xla_gpu_triton_gemm_any = 190, xla_gpu_unsafe_fallback_to_driver_on_ptxas_not_found = 138, xla_gpu_unsupported_enable_all_reduce_decomposer = 384, xla_gpu_unsupported_enable_ragged_all_to_all_decomposer = 350, xla_gpu_unsupported_enable_ragged_all_to_all_multi_host_decomposer = 415, xla_gpu_unsupported_enable_triton_gemm = 322, xla_gpu_unsupported_enable_triton_multi_output_fusion = 382, xla_gpu_unsupported_override_fast_interconnect_slice_size = 416, xla_gpu_unsupported_use_all_reduce_one_shot_kernel = 387, xla_gpu_unsupported_use_ragged_all_to_all_one_shot_kernel = 375, xla_gpu_use_embeded_device_lib = 420, xla_gpu_use_inprocess_lld = 389, xla_gpu_use_memcpy_local_p2p = 287, xla_gpu_use_runtime_fusion = 181, xla_gpu_verify_triton_fusion_numerics = 291, xla_hlo_graph_addresses = 2, xla_hlo_profile = 9, xla_disable_hlo_passes = 30, xla_enable_hlo_passes_only = 124, xla_disable_all_hlo_passes = 104, xla_backend_optimization_level = 31, xla_embed_ir_in_executable = 33, xla_eliminate_hlo_implicit_broadcast = 35, xla_cpu_multi_thread_eigen = 60, xla_llvm_enable_alias_scope_metadata = 70, xla_llvm_enable_noalias_metadata = 71, xla_llvm_enable_invariant_load_metadata = 72, xla_llvm_disable_expensive_passes = 73, xla_test_all_output_layouts = 90, xla_test_all_input_layouts = 91, xla_hlo_graph_sharding_color = 92, xla_cpu_use_onednn = 97, xla_allow_excess_precision = 122, xla_force_host_platform_device_count = 102, xla_hlo_evaluator_use_fast_path = 106, xla_allow_scalar_index_dynamic_ops = 107, xla_step_marker_location = 108, xla_dump_to = 109, xla_flags_reset = 364, xla_dump_hlo_module_re = 110, xla_dump_hlo_pass_re = 111, xla_dump_emitter_re = 433, xla_dump_hlo_as_text = 112, xla_dump_hlo_as_proto = 113, xla_dump_hlo_as_dot = 114, xla_dump_hlo_as_url = 115, xla_dump_hlo_as_html = 116, xla_dump_fusion_visualization = 149, xla_dump_hlo_snapshots = 118, xla_dump_include_timestamp = 131, xla_dump_max_hlo_modules = 132, xla_dump_module_metadata = 144, xla_dump_compress_protos = 151, xla_dump_hlo_as_long_text = 164, xla_dump_enable_mlir_pretty_form = 185, xla_dump_full_hlo_config = 381, xla_tpu_detect_nan = 135, xla_tpu_detect_inf = 136, xla_cpu_enable_xprof_traceme = 137, xla_multiheap_size_constraint_per_heap = 142, xla_detailed_logging = 252, xla_enable_dumping = 253, xla_llvm_force_inline_before_split = 300, xla_dump_disable_metadata = 153, xla_dump_hlo_pipeline_re = 154, xla_cpu_use_acl = 174, xla_cpu_strict_dot_conv_math = 175, xla_dump_latency_hiding_schedule = 182, xla_partitioning_algorithm = 187, xla_debug_buffer_assignment_show_max = 251, xla_detect_unstable_reductions = 403, xla_detect_unstable_reductions_post_optimizations = 432, xla_gpu_detect_nan = 426, xla_gpu_detect_inf = 428, xla_dump_large_constants = 290, xla_reduce_window_rewrite_base_length = 293, xla_cmd_buffer_trace_cache_size = 311, xla_syntax_sugar_async_ops = 315, xla_enable_command_buffers_during_profiling = 317, xla_ignore_channel_id = 330, xla_pjrt_allow_auto_layout_in_hlo = 344, xla_test_add_command_buffer_mode = 373, xla_gpu_experimental_matmul_perf_table_path = 383, xla_early_exit_with_layouts = 397, xla_gpu_experimental_scaled_dot_with_triton = 410, xla_gpu_experimental_use_raft_select_k = 413, xla_backend_extra_options = 500) +PB.default_values(::Type{DebugOptions}) = (;xla_allow_h2h_copy_when_automatic_host_compute_offload_disabled = false, xla_disable_automatic_host_compute_offload = false, xla_enable_scoped_logging_timers = false, xla_hlo_pass_fix_detect_cycles = false, xla_keep_shardings_after_spmd = false, xla_unsupported_crash_on_hlo_pass_fix_max_iterations = false, xla_unsupported_crash_on_hlo_pass_noop_change = false, xla_unsupported_crash_on_hlo_pass_silent_hlo_change = false, xla_cpu_collective_call_terminate_timeout_seconds = zero(Int32), xla_cpu_collective_call_warn_stuck_seconds = zero(Int32), xla_cpu_collective_timeout_seconds = zero(Int32), xla_cpu_copy_insertion_use_region_analysis = false, xla_cpu_emitter_verification_level = zero(Int32), xla_cpu_enable_concurrency_optimized_scheduler = false, xla_cpu_enable_fast_math = false, xla_cpu_enable_fast_min_max = false, xla_cpu_enable_platform_dependent_math = false, xla_cpu_experimental_onednn_custom_call = false, xla_cpu_experimental_onednn_fusion_type = Vector{var"DebugOptions.LibraryFusionType".T}(), xla_cpu_experimental_xnn_fusion_type = Vector{var"DebugOptions.LibraryFusionType".T}(), xla_cpu_experimental_xnn_graph_fusion_mode = var"DebugOptions.XnnGraphFusionMode".XNN_GRAPH_FUSION_MODE_DISABLED, xla_cpu_experimental_ynn_fusion_type = Vector{var"DebugOptions.LibraryFusionType".T}(), xla_cpu_fast_math_honor_division = false, xla_cpu_fast_math_honor_functions = false, xla_cpu_fast_math_honor_infs = false, xla_cpu_fast_math_honor_nans = false, xla_cpu_generate_unique_c_style_kernel_entry_points = false, xla_cpu_max_isa = "", xla_cpu_parallel_codegen_split_count = zero(Int32), xla_cpu_prefer_vector_width = zero(Int32), xla_cpu_use_fusion_emitters = false, xla_cpu_use_xnnpack = false, xla_enable_fast_math = false, xla_gpu_experimental_thunk_buffer_debug_filter = nothing, xla_dump_hlo_unoptimized_snapshots = false, xla_enable_enzyme_comms_opt = false, xla_gpu_algorithm_denylist_path = "", xla_gpu_all_gather_combine_threshold_bytes = zero(Int64), xla_gpu_all_reduce_blueconnect_num_devices_per_host = zero(Int32), xla_gpu_all_reduce_combine_threshold_bytes = zero(Int64), xla_gpu_analytical_latency_estimator_options = Dict{String,String}(), xla_gpu_async_dot = false, xla_gpu_auto_spmd_partitioning_memory_budget_gb = zero(Int32), xla_gpu_auto_spmd_partitioning_memory_budget_ratio = zero(Float32), xla_gpu_autotune_gemm_rtol = zero(Float32), xla_gpu_autotune_level = zero(Int32), xla_gpu_autotune_max_solutions = zero(Int64), xla_gpu_collect_cost_model_stats = false, xla_gpu_collective_inflation_factor = zero(Int32), xla_gpu_collective_permute_combine_threshold_bytes = zero(Int64), xla_gpu_collective_permute_decomposer_threshold = zero(Int64), xla_gpu_collectives_use_persistent_cliques = false, xla_gpu_command_buffer_scheduling_mode = var"DebugOptions.CommandBufferSchedulingMode".SERIALIZE, xla_gpu_command_buffer_unroll_loops = false, xla_gpu_copy_insertion_use_region_analysis = false, xla_gpu_crash_on_verification_failures = false, xla_gpu_cublas_fallback = false, xla_gpu_cuda_data_dir = "", xla_gpu_cudnn_gemm_fusion_level = zero(Int32), xla_gpu_cudnn_gemm_max_plans = zero(Int32), xla_gpu_deterministic_ops = false, xla_gpu_disable_async_collectives = Vector{var"DebugOptions.CollectiveOpType".T}(), xla_gpu_disable_gpuasm_optimizations = false, xla_gpu_dot_merger_threshold_mb = zero(Int32), xla_gpu_dump_autotune_logs_to = "", xla_gpu_dump_autotune_results_to = "", xla_gpu_dump_autotuned_gemm_fusions = false, xla_gpu_dump_llvmir = false, xla_gpu_enable_all_gather_combine_by_dim = false, xla_gpu_enable_analytical_latency_estimator = false, xla_gpu_enable_analytical_sol_latency_estimator = false, xla_gpu_enable_approx_costly_collectives = false, xla_gpu_enable_command_buffer = Vector{var"DebugOptions.CommandBufferCmdType".T}(), xla_gpu_enable_cub_radix_sort = false, xla_gpu_enable_cublaslt = false, xla_gpu_enable_cudnn_int8x32_convolution_reordering = false, xla_gpu_enable_cudnn_layer_norm = false, xla_gpu_enable_dynamic_slice_fusion = false, xla_gpu_enable_fast_min_max = false, xla_gpu_enable_highest_priority_async_stream = false, xla_gpu_enable_host_memory_offloading = false, xla_gpu_enable_latency_hiding_scheduler = false, xla_gpu_enable_libnvptxcompiler = false, xla_gpu_enable_llvm_module_compilation_parallelism = false, xla_gpu_enable_nccl_clique_optimization = false, xla_gpu_enable_nccl_comm_splitting = false, xla_gpu_enable_nccl_user_buffers = false, xla_gpu_enable_pipelined_all_gather = false, xla_gpu_enable_pipelined_all_reduce = false, xla_gpu_enable_pipelined_collectives = false, xla_gpu_enable_pipelined_p2p = false, xla_gpu_enable_pipelined_reduce_scatter = false, xla_gpu_enable_reassociation_for_converted_ar = false, xla_gpu_enable_reduce_scatter_combine_by_dim = false, xla_gpu_enable_reduction_epilogue_fusion = false, xla_gpu_enable_scatter_determinism_expander = false, xla_gpu_enable_shared_constants = false, xla_gpu_enable_split_k_autotuning = false, xla_gpu_enable_triton_gemm = false, xla_gpu_enable_while_loop_double_buffering = false, xla_gpu_enable_while_loop_reduce_scatter_code_motion = false, xla_gpu_enable_while_loop_unrolling = var"DebugOptions.WhileLoopUnrolling".WHILE_LOOP_UNROLLING_NO_UNROLL, xla_gpu_exclude_nondeterministic_ops = false, xla_gpu_executable_embed_debug_info = false, xla_gpu_executable_terminate_timeout_seconds = zero(Int32), xla_gpu_executable_warn_stuck_timeout_seconds = zero(Int32), xla_gpu_exhaustive_tiling_search = false, xla_gpu_experimental_allow_unroll_factor_eight = false, xla_gpu_experimental_aot_compiled_thunks = false, xla_gpu_experimental_autotune_cache_mode = var"DebugOptions.AutotuneCacheMode".AUTOTUNE_CACHE_MODE_UNSPECIFIED, xla_gpu_experimental_autotuner_cache_dir = "", xla_gpu_experimental_collective_cse_distance_threshold = zero(Int64), xla_gpu_experimental_collective_perf_table_path = "", xla_gpu_experimental_disable_binary_libraries = false, xla_gpu_experimental_dump_fdo_profiles = false, xla_gpu_experimental_dump_gpu_executable = false, xla_gpu_experimental_enable_alltoall_windowed_einsum = false, xla_gpu_experimental_enable_buffer_saver_on_thunks = false, xla_gpu_experimental_enable_checksum_tracing_on_thunks = false, xla_gpu_experimental_enable_fusion_autotuner = false, xla_gpu_experimental_enable_fusion_block_level_rewriter = false, xla_gpu_experimental_enable_heuristic_collective_combining = false, xla_gpu_experimental_enable_nccl_symmetric_buffers = false, xla_gpu_experimental_enable_nvshmem = false, xla_gpu_experimental_enable_split_k_rewrite = false, xla_gpu_experimental_enable_subchannel_dequantisation_fusion = false, xla_gpu_experimental_enable_triton_heroless_priority_fusion = false, xla_gpu_experimental_enable_triton_warp_specialization = false, xla_gpu_experimental_pack_dot_operands_along_k_dimension = false, xla_gpu_experimental_parallel_collective_overlap_limit = zero(Int32), xla_gpu_experimental_pipeline_parallelism_opt_level = var"DebugOptions.PipelineParallelismOptLevel".PIPELINE_PARALLELISM_OPT_LEVEL_DISABLE, xla_gpu_experimental_stream_annotation = false, xla_gpu_experimental_use_autotuner_pass = false, xla_gpu_experimental_use_ragged_dot_fusion = false, xla_gpu_fail_ptx_compilation_on_register_spilling = false, xla_gpu_filter_kernels_spilling_registers_on_autotuning = false, xla_gpu_first_collective_call_terminate_timeout_seconds = zero(Int32), xla_gpu_first_collective_call_warn_stuck_timeout_seconds = zero(Int32), xla_gpu_force_compilation_parallelism = zero(Int32), xla_gpu_force_conv_nchw = false, xla_gpu_force_conv_nhwc = false, xla_gpu_ftz = false, xla_gpu_fused_attention_use_cudnn_rng = false, xla_gpu_gemm_autotuner_override_file = "", xla_gpu_gemm_rewrite_size_threshold = zero(Int64), xla_gpu_generate_debug_info = false, xla_gpu_generate_line_info = false, xla_gpu_graph_enable_concurrent_region = false, xla_gpu_graph_min_graph_size = zero(Int32), xla_gpu_kernel_cache_file = "", xla_gpu_libnvjitlink_mode = var"DebugOptions.LibNvJitLinkMode".LIB_NV_JIT_LINK_MODE_AUTO, xla_gpu_llvm_ir_file = Vector{String}(), xla_gpu_llvm_verification_level = zero(Int32), xla_gpu_load_autotune_results_from = "", xla_gpu_memory_limit_slop_factor = zero(Int32), xla_gpu_mock_custom_calls = false, xla_gpu_multi_streamed_windowed_einsum = false, xla_gpu_nccl_async_execution = false, xla_gpu_nccl_blocking_communicators = false, xla_gpu_nccl_collective_max_nchannels = zero(Int64), xla_gpu_nccl_init_max_rank_per_root_ratio = zero(Int64), xla_gpu_nccl_p2p_max_nchannels = zero(Int64), xla_gpu_nccl_terminate_on_error = false, xla_gpu_nccl_termination_timeout_seconds = zero(Int64), xla_gpu_operand_bytes_threshold_for_windowed_einsum = zero(Int64), xla_gpu_override_gemm_autotuner = "", xla_gpu_per_fusion_autotune_cache_dir = "", xla_gpu_pgle_accuracy_checker = var"DebugOptions.PGLEStrictnessLevel".PGLE_STRICTNESS_LEVEL_OFF, xla_gpu_pgle_profile_file_or_directory_path = "", xla_gpu_ptx_file = Vector{String}(), xla_gpu_reduce_scatter_combine_threshold_bytes = zero(Int64), xla_gpu_redzone_padding_bytes = zero(Int64), xla_gpu_require_complete_aot_autotune_results = false, xla_gpu_require_exclusive_lock = false, xla_gpu_shape_checks = var"DebugOptions.ShapeChecks".IGNORE, xla_gpu_shard_autotuning = false, xla_gpu_strict_conv_algorithm_picker = false, xla_gpu_target_config_filename = "", xla_gpu_temp_buffer_use_separate_color = false, xla_gpu_threshold_for_windowed_einsum_mib = zero(Int64), xla_gpu_triton_gemm_any = false, xla_gpu_unsafe_fallback_to_driver_on_ptxas_not_found = false, xla_gpu_unsupported_enable_all_reduce_decomposer = false, xla_gpu_unsupported_enable_ragged_all_to_all_decomposer = false, xla_gpu_unsupported_enable_ragged_all_to_all_multi_host_decomposer = false, xla_gpu_unsupported_enable_triton_gemm = false, xla_gpu_unsupported_enable_triton_multi_output_fusion = false, xla_gpu_unsupported_override_fast_interconnect_slice_size = zero(Int64), xla_gpu_unsupported_use_all_reduce_one_shot_kernel = false, xla_gpu_unsupported_use_ragged_all_to_all_one_shot_kernel = false, xla_gpu_use_embeded_device_lib = false, xla_gpu_use_inprocess_lld = false, xla_gpu_use_memcpy_local_p2p = false, xla_gpu_use_runtime_fusion = false, xla_gpu_verify_triton_fusion_numerics = false, xla_hlo_graph_addresses = false, xla_hlo_profile = false, xla_disable_hlo_passes = Vector{String}(), xla_enable_hlo_passes_only = Vector{String}(), xla_disable_all_hlo_passes = false, xla_backend_optimization_level = zero(Int32), xla_embed_ir_in_executable = false, xla_eliminate_hlo_implicit_broadcast = false, xla_cpu_multi_thread_eigen = false, xla_llvm_enable_alias_scope_metadata = false, xla_llvm_enable_noalias_metadata = false, xla_llvm_enable_invariant_load_metadata = false, xla_llvm_disable_expensive_passes = false, xla_test_all_output_layouts = false, xla_test_all_input_layouts = false, xla_hlo_graph_sharding_color = false, xla_cpu_use_onednn = false, xla_allow_excess_precision = false, xla_force_host_platform_device_count = zero(Int32), xla_hlo_evaluator_use_fast_path = false, xla_allow_scalar_index_dynamic_ops = false, xla_step_marker_location = var"DebugOptions.StepMarkerLocation".STEP_MARK_AT_ENTRY, xla_dump_to = "", xla_flags_reset = false, xla_dump_hlo_module_re = "", xla_dump_hlo_pass_re = "", xla_dump_emitter_re = "", xla_dump_hlo_as_text = false, xla_dump_hlo_as_proto = false, xla_dump_hlo_as_dot = false, xla_dump_hlo_as_url = false, xla_dump_hlo_as_html = false, xla_dump_fusion_visualization = false, xla_dump_hlo_snapshots = false, xla_dump_include_timestamp = false, xla_dump_max_hlo_modules = zero(Int32), xla_dump_module_metadata = false, xla_dump_compress_protos = false, xla_dump_hlo_as_long_text = false, xla_dump_enable_mlir_pretty_form = false, xla_dump_full_hlo_config = false, xla_tpu_detect_nan = false, xla_tpu_detect_inf = false, xla_cpu_enable_xprof_traceme = false, xla_multiheap_size_constraint_per_heap = zero(Int32), xla_detailed_logging = false, xla_enable_dumping = false, xla_llvm_force_inline_before_split = false, xla_dump_disable_metadata = false, xla_dump_hlo_pipeline_re = "", xla_cpu_use_acl = false, xla_cpu_strict_dot_conv_math = false, xla_dump_latency_hiding_schedule = false, xla_partitioning_algorithm = var"DebugOptions.PartitioningAlgorithm".PARTITIONING_ALGORITHM_NOOP, xla_debug_buffer_assignment_show_max = zero(Int64), xla_detect_unstable_reductions = var"DebugOptions.DetectionMode".DETECTION_MODE_NONE, xla_detect_unstable_reductions_post_optimizations = var"DebugOptions.DetectionMode".DETECTION_MODE_NONE, xla_gpu_detect_nan = var"DebugOptions.DetectionMode".DETECTION_MODE_NONE, xla_gpu_detect_inf = var"DebugOptions.DetectionMode".DETECTION_MODE_NONE, xla_dump_large_constants = false, xla_reduce_window_rewrite_base_length = zero(Int64), xla_cmd_buffer_trace_cache_size = zero(Int64), xla_syntax_sugar_async_ops = false, xla_enable_command_buffers_during_profiling = false, xla_ignore_channel_id = false, xla_pjrt_allow_auto_layout_in_hlo = false, xla_test_add_command_buffer_mode = false, xla_gpu_experimental_matmul_perf_table_path = "", xla_early_exit_with_layouts = false, xla_gpu_experimental_scaled_dot_with_triton = false, xla_gpu_experimental_use_raft_select_k = false, xla_backend_extra_options = Dict{String,String}()) +PB.field_numbers(::Type{DebugOptions}) = (;xla_allow_h2h_copy_when_automatic_host_compute_offload_disabled = 439, xla_disable_automatic_host_compute_offload = 408, xla_enable_scoped_logging_timers = 436, xla_hlo_pass_fix_detect_cycles = 370, xla_keep_shardings_after_spmd = 419, xla_unsupported_crash_on_hlo_pass_fix_max_iterations = 363, xla_unsupported_crash_on_hlo_pass_noop_change = 379, xla_unsupported_crash_on_hlo_pass_silent_hlo_change = 380, xla_cpu_collective_call_terminate_timeout_seconds = 417, xla_cpu_collective_call_warn_stuck_seconds = 418, xla_cpu_collective_timeout_seconds = 438, xla_cpu_copy_insertion_use_region_analysis = 337, xla_cpu_emitter_verification_level = 395, xla_cpu_enable_concurrency_optimized_scheduler = 307, xla_cpu_enable_fast_math = 99, xla_cpu_enable_fast_min_max = 140, xla_cpu_enable_platform_dependent_math = 425, xla_cpu_experimental_onednn_custom_call = 412, xla_cpu_experimental_onednn_fusion_type = 399, xla_cpu_experimental_xnn_fusion_type = 400, xla_cpu_experimental_xnn_graph_fusion_mode = 365, xla_cpu_experimental_ynn_fusion_type = 422, xla_cpu_fast_math_honor_division = 126, xla_cpu_fast_math_honor_functions = 129, xla_cpu_fast_math_honor_infs = 121, xla_cpu_fast_math_honor_nans = 120, xla_cpu_generate_unique_c_style_kernel_entry_points = 372, xla_cpu_max_isa = 333, xla_cpu_parallel_codegen_split_count = 323, xla_cpu_prefer_vector_width = 308, xla_cpu_use_fusion_emitters = 376, xla_cpu_use_xnnpack = 359, xla_enable_fast_math = 335, xla_gpu_experimental_thunk_buffer_debug_filter = 424, xla_dump_hlo_unoptimized_snapshots = 405, xla_enable_enzyme_comms_opt = 429, xla_gpu_algorithm_denylist_path = 128, xla_gpu_all_gather_combine_threshold_bytes = 212, xla_gpu_all_reduce_blueconnect_num_devices_per_host = 159, xla_gpu_all_reduce_combine_threshold_bytes = 157, xla_gpu_analytical_latency_estimator_options = 357, xla_gpu_async_dot = 321, xla_gpu_auto_spmd_partitioning_memory_budget_gb = 224, xla_gpu_auto_spmd_partitioning_memory_budget_ratio = 225, xla_gpu_autotune_gemm_rtol = 316, xla_gpu_autotune_level = 123, xla_gpu_autotune_max_solutions = 288, xla_gpu_collect_cost_model_stats = 240, xla_gpu_collective_inflation_factor = 205, xla_gpu_collective_permute_combine_threshold_bytes = 378, xla_gpu_collective_permute_decomposer_threshold = 237, xla_gpu_collectives_use_persistent_cliques = 354, xla_gpu_command_buffer_scheduling_mode = 404, xla_gpu_command_buffer_unroll_loops = 411, xla_gpu_copy_insertion_use_region_analysis = 236, xla_gpu_crash_on_verification_failures = 101, xla_gpu_cublas_fallback = 247, xla_gpu_cuda_data_dir = 61, xla_gpu_cudnn_gemm_fusion_level = 285, xla_gpu_cudnn_gemm_max_plans = 318, xla_gpu_deterministic_ops = 148, xla_gpu_disable_async_collectives = 289, xla_gpu_disable_gpuasm_optimizations = 103, xla_gpu_dot_merger_threshold_mb = 331, xla_gpu_dump_autotune_logs_to = 292, xla_gpu_dump_autotune_results_to = 222, xla_gpu_dump_autotuned_gemm_fusions = 232, xla_gpu_dump_llvmir = 155, xla_gpu_enable_all_gather_combine_by_dim = 254, xla_gpu_enable_analytical_latency_estimator = 255, xla_gpu_enable_analytical_sol_latency_estimator = 356, xla_gpu_enable_approx_costly_collectives = 305, xla_gpu_enable_command_buffer = 258, xla_gpu_enable_cub_radix_sort = 259, xla_gpu_enable_cublaslt = 166, xla_gpu_enable_cudnn_int8x32_convolution_reordering = 189, xla_gpu_enable_cudnn_layer_norm = 262, xla_gpu_enable_dynamic_slice_fusion = 105, xla_gpu_enable_fast_min_max = 100, xla_gpu_enable_highest_priority_async_stream = 216, xla_gpu_enable_host_memory_offloading = 296, xla_gpu_enable_latency_hiding_scheduler = 186, xla_gpu_enable_libnvptxcompiler = 269, xla_gpu_enable_llvm_module_compilation_parallelism = 268, xla_gpu_enable_nccl_clique_optimization = 244, xla_gpu_enable_nccl_comm_splitting = 272, xla_gpu_enable_nccl_user_buffers = 267, xla_gpu_enable_pipelined_all_gather = 227, xla_gpu_enable_pipelined_all_reduce = 217, xla_gpu_enable_pipelined_collectives = 239, xla_gpu_enable_pipelined_p2p = 246, xla_gpu_enable_pipelined_reduce_scatter = 231, xla_gpu_enable_reassociation_for_converted_ar = 209, xla_gpu_enable_reduce_scatter_combine_by_dim = 257, xla_gpu_enable_reduction_epilogue_fusion = 243, xla_gpu_enable_scatter_determinism_expander = 345, xla_gpu_enable_shared_constants = 165, xla_gpu_enable_split_k_autotuning = 241, xla_gpu_enable_triton_gemm = 188, xla_gpu_enable_while_loop_double_buffering = 248, xla_gpu_enable_while_loop_reduce_scatter_code_motion = 203, xla_gpu_enable_while_loop_unrolling = 294, xla_gpu_exclude_nondeterministic_ops = 297, xla_gpu_executable_embed_debug_info = 437, xla_gpu_executable_terminate_timeout_seconds = 328, xla_gpu_executable_warn_stuck_timeout_seconds = 327, xla_gpu_exhaustive_tiling_search = 219, xla_gpu_experimental_allow_unroll_factor_eight = 430, xla_gpu_experimental_aot_compiled_thunks = 435, xla_gpu_experimental_autotune_cache_mode = 324, xla_gpu_experimental_autotuner_cache_dir = 407, xla_gpu_experimental_collective_cse_distance_threshold = 374, xla_gpu_experimental_collective_perf_table_path = 377, xla_gpu_experimental_disable_binary_libraries = 329, xla_gpu_experimental_dump_fdo_profiles = 338, xla_gpu_experimental_dump_gpu_executable = 427, xla_gpu_experimental_enable_alltoall_windowed_einsum = 360, xla_gpu_experimental_enable_buffer_saver_on_thunks = 431, xla_gpu_experimental_enable_checksum_tracing_on_thunks = 414, xla_gpu_experimental_enable_fusion_autotuner = 409, xla_gpu_experimental_enable_fusion_block_level_rewriter = 334, xla_gpu_experimental_enable_heuristic_collective_combining = 366, xla_gpu_experimental_enable_nccl_symmetric_buffers = 406, xla_gpu_experimental_enable_nvshmem = 388, xla_gpu_experimental_enable_split_k_rewrite = 386, xla_gpu_experimental_enable_subchannel_dequantisation_fusion = 368, xla_gpu_experimental_enable_triton_heroless_priority_fusion = 340, xla_gpu_experimental_enable_triton_warp_specialization = 421, xla_gpu_experimental_pack_dot_operands_along_k_dimension = 362, xla_gpu_experimental_parallel_collective_overlap_limit = 336, xla_gpu_experimental_pipeline_parallelism_opt_level = 351, xla_gpu_experimental_stream_annotation = 342, xla_gpu_experimental_use_autotuner_pass = 396, xla_gpu_experimental_use_ragged_dot_fusion = 401, xla_gpu_fail_ptx_compilation_on_register_spilling = 353, xla_gpu_filter_kernels_spilling_registers_on_autotuning = 250, xla_gpu_first_collective_call_terminate_timeout_seconds = 392, xla_gpu_first_collective_call_warn_stuck_timeout_seconds = 391, xla_gpu_force_compilation_parallelism = 147, xla_gpu_force_conv_nchw = 125, xla_gpu_force_conv_nhwc = 146, xla_gpu_ftz = 62, xla_gpu_fused_attention_use_cudnn_rng = 235, xla_gpu_gemm_autotuner_override_file = 434, xla_gpu_gemm_rewrite_size_threshold = 283, xla_gpu_generate_debug_info = 348, xla_gpu_generate_line_info = 349, xla_gpu_graph_enable_concurrent_region = 215, xla_gpu_graph_min_graph_size = 208, xla_gpu_kernel_cache_file = 306, xla_gpu_libnvjitlink_mode = 343, xla_gpu_llvm_ir_file = 150, xla_gpu_llvm_verification_level = 256, xla_gpu_load_autotune_results_from = 223, xla_gpu_memory_limit_slop_factor = 260, xla_gpu_mock_custom_calls = 245, xla_gpu_multi_streamed_windowed_einsum = 280, xla_gpu_nccl_async_execution = 393, xla_gpu_nccl_blocking_communicators = 390, xla_gpu_nccl_collective_max_nchannels = 273, xla_gpu_nccl_init_max_rank_per_root_ratio = 277, xla_gpu_nccl_p2p_max_nchannels = 274, xla_gpu_nccl_terminate_on_error = 301, xla_gpu_nccl_termination_timeout_seconds = 163, xla_gpu_operand_bytes_threshold_for_windowed_einsum = 339, xla_gpu_override_gemm_autotuner = 295, xla_gpu_per_fusion_autotune_cache_dir = 310, xla_gpu_pgle_accuracy_checker = 341, xla_gpu_pgle_profile_file_or_directory_path = 210, xla_gpu_ptx_file = 127, xla_gpu_reduce_scatter_combine_threshold_bytes = 213, xla_gpu_redzone_padding_bytes = 228, xla_gpu_require_complete_aot_autotune_results = 284, xla_gpu_require_exclusive_lock = 347, xla_gpu_shape_checks = 170, xla_gpu_shard_autotuning = 304, xla_gpu_strict_conv_algorithm_picker = 156, xla_gpu_target_config_filename = 261, xla_gpu_temp_buffer_use_separate_color = 312, xla_gpu_threshold_for_windowed_einsum_mib = 265, xla_gpu_triton_gemm_any = 190, xla_gpu_unsafe_fallback_to_driver_on_ptxas_not_found = 138, xla_gpu_unsupported_enable_all_reduce_decomposer = 384, xla_gpu_unsupported_enable_ragged_all_to_all_decomposer = 350, xla_gpu_unsupported_enable_ragged_all_to_all_multi_host_decomposer = 415, xla_gpu_unsupported_enable_triton_gemm = 322, xla_gpu_unsupported_enable_triton_multi_output_fusion = 382, xla_gpu_unsupported_override_fast_interconnect_slice_size = 416, xla_gpu_unsupported_use_all_reduce_one_shot_kernel = 387, xla_gpu_unsupported_use_ragged_all_to_all_one_shot_kernel = 375, xla_gpu_use_embeded_device_lib = 420, xla_gpu_use_inprocess_lld = 389, xla_gpu_use_memcpy_local_p2p = 287, xla_gpu_use_runtime_fusion = 181, xla_gpu_verify_triton_fusion_numerics = 291, xla_hlo_graph_addresses = 2, xla_hlo_profile = 9, xla_disable_hlo_passes = 30, xla_enable_hlo_passes_only = 124, xla_disable_all_hlo_passes = 104, xla_backend_optimization_level = 31, xla_embed_ir_in_executable = 33, xla_eliminate_hlo_implicit_broadcast = 35, xla_cpu_multi_thread_eigen = 60, xla_llvm_enable_alias_scope_metadata = 70, xla_llvm_enable_noalias_metadata = 71, xla_llvm_enable_invariant_load_metadata = 72, xla_llvm_disable_expensive_passes = 73, xla_test_all_output_layouts = 90, xla_test_all_input_layouts = 91, xla_hlo_graph_sharding_color = 92, xla_cpu_use_onednn = 97, xla_allow_excess_precision = 122, xla_force_host_platform_device_count = 102, xla_hlo_evaluator_use_fast_path = 106, xla_allow_scalar_index_dynamic_ops = 107, xla_step_marker_location = 108, xla_dump_to = 109, xla_flags_reset = 364, xla_dump_hlo_module_re = 110, xla_dump_hlo_pass_re = 111, xla_dump_emitter_re = 433, xla_dump_hlo_as_text = 112, xla_dump_hlo_as_proto = 113, xla_dump_hlo_as_dot = 114, xla_dump_hlo_as_url = 115, xla_dump_hlo_as_html = 116, xla_dump_fusion_visualization = 149, xla_dump_hlo_snapshots = 118, xla_dump_include_timestamp = 131, xla_dump_max_hlo_modules = 132, xla_dump_module_metadata = 144, xla_dump_compress_protos = 151, xla_dump_hlo_as_long_text = 164, xla_dump_enable_mlir_pretty_form = 185, xla_dump_full_hlo_config = 381, xla_tpu_detect_nan = 135, xla_tpu_detect_inf = 136, xla_cpu_enable_xprof_traceme = 137, xla_multiheap_size_constraint_per_heap = 142, xla_detailed_logging = 252, xla_enable_dumping = 253, xla_llvm_force_inline_before_split = 300, xla_dump_disable_metadata = 153, xla_dump_hlo_pipeline_re = 154, xla_cpu_use_acl = 174, xla_cpu_strict_dot_conv_math = 175, xla_dump_latency_hiding_schedule = 182, xla_partitioning_algorithm = 187, xla_debug_buffer_assignment_show_max = 251, xla_detect_unstable_reductions = 403, xla_detect_unstable_reductions_post_optimizations = 432, xla_gpu_detect_nan = 426, xla_gpu_detect_inf = 428, xla_dump_large_constants = 290, xla_reduce_window_rewrite_base_length = 293, xla_cmd_buffer_trace_cache_size = 311, xla_syntax_sugar_async_ops = 315, xla_enable_command_buffers_during_profiling = 317, xla_ignore_channel_id = 330, xla_pjrt_allow_auto_layout_in_hlo = 344, xla_test_add_command_buffer_mode = 373, xla_gpu_experimental_matmul_perf_table_path = 383, xla_early_exit_with_layouts = 397, xla_gpu_experimental_scaled_dot_with_triton = 410, xla_gpu_experimental_use_raft_select_k = 413, xla_backend_extra_options = 500) function PB.decode(d::PB.AbstractProtoDecoder, ::Type{<:DebugOptions}) xla_allow_h2h_copy_when_automatic_host_compute_offload_disabled = false @@ -1023,7 +1017,6 @@ function PB.decode(d::PB.AbstractProtoDecoder, ::Type{<:DebugOptions}) xla_gpu_cuda_data_dir = "" xla_gpu_cudnn_gemm_fusion_level = zero(Int32) xla_gpu_cudnn_gemm_max_plans = zero(Int32) - xla_gpu_default_to_alg_dot_bf16_bf16_f32 = false xla_gpu_deterministic_ops = false xla_gpu_disable_async_collectives = PB.BufferedVector{var"DebugOptions.CollectiveOpType".T}() xla_gpu_disable_gpuasm_optimizations = false @@ -1054,7 +1047,6 @@ function PB.decode(d::PB.AbstractProtoDecoder, ::Type{<:DebugOptions}) xla_gpu_enable_pipelined_all_gather = false xla_gpu_enable_pipelined_all_reduce = false xla_gpu_enable_pipelined_collectives = false - xla_gpu_enable_pipelined_host_offloading = false xla_gpu_enable_pipelined_p2p = false xla_gpu_enable_pipelined_reduce_scatter = false xla_gpu_enable_reassociation_for_converted_ar = false @@ -1074,7 +1066,6 @@ function PB.decode(d::PB.AbstractProtoDecoder, ::Type{<:DebugOptions}) xla_gpu_exhaustive_tiling_search = false xla_gpu_experimental_allow_unroll_factor_eight = false xla_gpu_experimental_aot_compiled_thunks = false - xla_gpu_experimental_autotune_backends = PB.BufferedVector{var"DebugOptions.AutotuneBackend".T}() xla_gpu_experimental_autotune_cache_mode = var"DebugOptions.AutotuneCacheMode".AUTOTUNE_CACHE_MODE_UNSPECIFIED xla_gpu_experimental_autotuner_cache_dir = "" xla_gpu_experimental_collective_cse_distance_threshold = zero(Int64) @@ -1355,8 +1346,6 @@ function PB.decode(d::PB.AbstractProtoDecoder, ::Type{<:DebugOptions}) xla_gpu_cudnn_gemm_fusion_level = PB.decode(d, Int32) elseif field_number == 318 xla_gpu_cudnn_gemm_max_plans = PB.decode(d, Int32) - elseif field_number == 441 - xla_gpu_default_to_alg_dot_bf16_bf16_f32 = PB.decode(d, Bool) elseif field_number == 148 xla_gpu_deterministic_ops = PB.decode(d, Bool) elseif field_number == 289 @@ -1417,8 +1406,6 @@ function PB.decode(d::PB.AbstractProtoDecoder, ::Type{<:DebugOptions}) xla_gpu_enable_pipelined_all_reduce = PB.decode(d, Bool) elseif field_number == 239 xla_gpu_enable_pipelined_collectives = PB.decode(d, Bool) - elseif field_number == 440 - xla_gpu_enable_pipelined_host_offloading = PB.decode(d, Bool) elseif field_number == 246 xla_gpu_enable_pipelined_p2p = PB.decode(d, Bool) elseif field_number == 231 @@ -1457,8 +1444,6 @@ function PB.decode(d::PB.AbstractProtoDecoder, ::Type{<:DebugOptions}) xla_gpu_experimental_allow_unroll_factor_eight = PB.decode(d, Bool) elseif field_number == 435 xla_gpu_experimental_aot_compiled_thunks = PB.decode(d, Bool) - elseif field_number == 442 - PB.decode!(d, wire_type, xla_gpu_experimental_autotune_backends) elseif field_number == 324 xla_gpu_experimental_autotune_cache_mode = PB.decode(d, var"DebugOptions.AutotuneCacheMode".T) elseif field_number == 407 @@ -1779,7 +1764,7 @@ function PB.decode(d::PB.AbstractProtoDecoder, ::Type{<:DebugOptions}) Base.skip(d, wire_type) end end - return DebugOptions(xla_allow_h2h_copy_when_automatic_host_compute_offload_disabled, xla_disable_automatic_host_compute_offload, xla_enable_scoped_logging_timers, xla_hlo_pass_fix_detect_cycles, xla_keep_shardings_after_spmd, xla_unsupported_crash_on_hlo_pass_fix_max_iterations, xla_unsupported_crash_on_hlo_pass_noop_change, xla_unsupported_crash_on_hlo_pass_silent_hlo_change, xla_cpu_collective_call_terminate_timeout_seconds, xla_cpu_collective_call_warn_stuck_seconds, xla_cpu_collective_timeout_seconds, xla_cpu_copy_insertion_use_region_analysis, xla_cpu_emitter_verification_level, xla_cpu_enable_concurrency_optimized_scheduler, xla_cpu_enable_fast_math, xla_cpu_enable_fast_min_max, xla_cpu_enable_platform_dependent_math, xla_cpu_experimental_onednn_custom_call, xla_cpu_experimental_onednn_fusion_type[], xla_cpu_experimental_xnn_fusion_type[], xla_cpu_experimental_xnn_graph_fusion_mode, xla_cpu_experimental_ynn_fusion_type[], xla_cpu_fast_math_honor_division, xla_cpu_fast_math_honor_functions, xla_cpu_fast_math_honor_infs, xla_cpu_fast_math_honor_nans, xla_cpu_generate_unique_c_style_kernel_entry_points, xla_cpu_max_isa, xla_cpu_parallel_codegen_split_count, xla_cpu_prefer_vector_width, xla_cpu_use_fusion_emitters, xla_cpu_use_xnnpack, xla_enable_fast_math, xla_gpu_experimental_thunk_buffer_debug_filter[], xla_dump_hlo_unoptimized_snapshots, xla_enable_enzyme_comms_opt, xla_gpu_algorithm_denylist_path, xla_gpu_all_gather_combine_threshold_bytes, xla_gpu_all_reduce_blueconnect_num_devices_per_host, xla_gpu_all_reduce_combine_threshold_bytes, xla_gpu_analytical_latency_estimator_options, xla_gpu_async_dot, xla_gpu_auto_spmd_partitioning_memory_budget_gb, xla_gpu_auto_spmd_partitioning_memory_budget_ratio, xla_gpu_autotune_gemm_rtol, xla_gpu_autotune_level, xla_gpu_autotune_max_solutions, xla_gpu_collect_cost_model_stats, xla_gpu_collective_inflation_factor, xla_gpu_collective_permute_combine_threshold_bytes, xla_gpu_collective_permute_decomposer_threshold, xla_gpu_collectives_use_persistent_cliques, xla_gpu_command_buffer_scheduling_mode, xla_gpu_command_buffer_unroll_loops, xla_gpu_copy_insertion_use_region_analysis, xla_gpu_crash_on_verification_failures, xla_gpu_cublas_fallback, xla_gpu_cuda_data_dir, xla_gpu_cudnn_gemm_fusion_level, xla_gpu_cudnn_gemm_max_plans, xla_gpu_default_to_alg_dot_bf16_bf16_f32, xla_gpu_deterministic_ops, xla_gpu_disable_async_collectives[], xla_gpu_disable_gpuasm_optimizations, xla_gpu_dot_merger_threshold_mb, xla_gpu_dump_autotune_logs_to, xla_gpu_dump_autotune_results_to, xla_gpu_dump_autotuned_gemm_fusions, xla_gpu_dump_llvmir, xla_gpu_enable_all_gather_combine_by_dim, xla_gpu_enable_analytical_latency_estimator, xla_gpu_enable_analytical_sol_latency_estimator, xla_gpu_enable_approx_costly_collectives, xla_gpu_enable_command_buffer[], xla_gpu_enable_cub_radix_sort, xla_gpu_enable_cublaslt, xla_gpu_enable_cudnn_int8x32_convolution_reordering, xla_gpu_enable_cudnn_layer_norm, xla_gpu_enable_dynamic_slice_fusion, xla_gpu_enable_fast_min_max, xla_gpu_enable_highest_priority_async_stream, xla_gpu_enable_host_memory_offloading, xla_gpu_enable_latency_hiding_scheduler, xla_gpu_enable_libnvptxcompiler, xla_gpu_enable_llvm_module_compilation_parallelism, xla_gpu_enable_nccl_clique_optimization, xla_gpu_enable_nccl_comm_splitting, xla_gpu_enable_nccl_user_buffers, xla_gpu_enable_pipelined_all_gather, xla_gpu_enable_pipelined_all_reduce, xla_gpu_enable_pipelined_collectives, xla_gpu_enable_pipelined_host_offloading, xla_gpu_enable_pipelined_p2p, xla_gpu_enable_pipelined_reduce_scatter, xla_gpu_enable_reassociation_for_converted_ar, xla_gpu_enable_reduce_scatter_combine_by_dim, xla_gpu_enable_reduction_epilogue_fusion, xla_gpu_enable_scatter_determinism_expander, xla_gpu_enable_shared_constants, xla_gpu_enable_split_k_autotuning, xla_gpu_enable_triton_gemm, xla_gpu_enable_while_loop_double_buffering, xla_gpu_enable_while_loop_reduce_scatter_code_motion, xla_gpu_enable_while_loop_unrolling, xla_gpu_exclude_nondeterministic_ops, xla_gpu_executable_embed_debug_info, xla_gpu_executable_terminate_timeout_seconds, xla_gpu_executable_warn_stuck_timeout_seconds, xla_gpu_exhaustive_tiling_search, xla_gpu_experimental_allow_unroll_factor_eight, xla_gpu_experimental_aot_compiled_thunks, xla_gpu_experimental_autotune_backends[], xla_gpu_experimental_autotune_cache_mode, xla_gpu_experimental_autotuner_cache_dir, xla_gpu_experimental_collective_cse_distance_threshold, xla_gpu_experimental_collective_perf_table_path, xla_gpu_experimental_disable_binary_libraries, xla_gpu_experimental_dump_fdo_profiles, xla_gpu_experimental_dump_gpu_executable, xla_gpu_experimental_enable_alltoall_windowed_einsum, xla_gpu_experimental_enable_buffer_saver_on_thunks, xla_gpu_experimental_enable_checksum_tracing_on_thunks, xla_gpu_experimental_enable_fusion_autotuner, xla_gpu_experimental_enable_fusion_block_level_rewriter, xla_gpu_experimental_enable_heuristic_collective_combining, xla_gpu_experimental_enable_nccl_symmetric_buffers, xla_gpu_experimental_enable_nvshmem, xla_gpu_experimental_enable_split_k_rewrite, xla_gpu_experimental_enable_subchannel_dequantisation_fusion, xla_gpu_experimental_enable_triton_heroless_priority_fusion, xla_gpu_experimental_enable_triton_warp_specialization, xla_gpu_experimental_pack_dot_operands_along_k_dimension, xla_gpu_experimental_parallel_collective_overlap_limit, xla_gpu_experimental_pipeline_parallelism_opt_level, xla_gpu_experimental_stream_annotation, xla_gpu_experimental_use_autotuner_pass, xla_gpu_experimental_use_ragged_dot_fusion, xla_gpu_fail_ptx_compilation_on_register_spilling, xla_gpu_filter_kernels_spilling_registers_on_autotuning, xla_gpu_first_collective_call_terminate_timeout_seconds, xla_gpu_first_collective_call_warn_stuck_timeout_seconds, xla_gpu_force_compilation_parallelism, xla_gpu_force_conv_nchw, xla_gpu_force_conv_nhwc, xla_gpu_ftz, xla_gpu_fused_attention_use_cudnn_rng, xla_gpu_gemm_autotuner_override_file, xla_gpu_gemm_rewrite_size_threshold, xla_gpu_generate_debug_info, xla_gpu_generate_line_info, xla_gpu_graph_enable_concurrent_region, xla_gpu_graph_min_graph_size, xla_gpu_kernel_cache_file, xla_gpu_libnvjitlink_mode, xla_gpu_llvm_ir_file[], xla_gpu_llvm_verification_level, xla_gpu_load_autotune_results_from, xla_gpu_memory_limit_slop_factor, xla_gpu_mock_custom_calls, xla_gpu_multi_streamed_windowed_einsum, xla_gpu_nccl_async_execution, xla_gpu_nccl_blocking_communicators, xla_gpu_nccl_collective_max_nchannels, xla_gpu_nccl_init_max_rank_per_root_ratio, xla_gpu_nccl_p2p_max_nchannels, xla_gpu_nccl_terminate_on_error, xla_gpu_nccl_termination_timeout_seconds, xla_gpu_operand_bytes_threshold_for_windowed_einsum, xla_gpu_override_gemm_autotuner, xla_gpu_per_fusion_autotune_cache_dir, xla_gpu_pgle_accuracy_checker, xla_gpu_pgle_profile_file_or_directory_path, xla_gpu_ptx_file[], xla_gpu_reduce_scatter_combine_threshold_bytes, xla_gpu_redzone_padding_bytes, xla_gpu_require_complete_aot_autotune_results, xla_gpu_require_exclusive_lock, xla_gpu_shape_checks, xla_gpu_shard_autotuning, xla_gpu_strict_conv_algorithm_picker, xla_gpu_target_config_filename, xla_gpu_temp_buffer_use_separate_color, xla_gpu_threshold_for_windowed_einsum_mib, xla_gpu_triton_gemm_any, xla_gpu_unsafe_fallback_to_driver_on_ptxas_not_found, xla_gpu_unsupported_enable_all_reduce_decomposer, xla_gpu_unsupported_enable_ragged_all_to_all_decomposer, xla_gpu_unsupported_enable_ragged_all_to_all_multi_host_decomposer, xla_gpu_unsupported_enable_triton_gemm, xla_gpu_unsupported_enable_triton_multi_output_fusion, xla_gpu_unsupported_override_fast_interconnect_slice_size, xla_gpu_unsupported_use_all_reduce_one_shot_kernel, xla_gpu_unsupported_use_ragged_all_to_all_one_shot_kernel, xla_gpu_use_embeded_device_lib, xla_gpu_use_inprocess_lld, xla_gpu_use_memcpy_local_p2p, xla_gpu_use_runtime_fusion, xla_gpu_verify_triton_fusion_numerics, xla_hlo_graph_addresses, xla_hlo_profile, xla_disable_hlo_passes[], xla_enable_hlo_passes_only[], xla_disable_all_hlo_passes, xla_backend_optimization_level, xla_embed_ir_in_executable, xla_eliminate_hlo_implicit_broadcast, xla_cpu_multi_thread_eigen, xla_llvm_enable_alias_scope_metadata, xla_llvm_enable_noalias_metadata, xla_llvm_enable_invariant_load_metadata, xla_llvm_disable_expensive_passes, xla_test_all_output_layouts, xla_test_all_input_layouts, xla_hlo_graph_sharding_color, xla_cpu_use_onednn, xla_allow_excess_precision, xla_force_host_platform_device_count, xla_hlo_evaluator_use_fast_path, xla_allow_scalar_index_dynamic_ops, xla_step_marker_location, xla_dump_to, xla_flags_reset, xla_dump_hlo_module_re, xla_dump_hlo_pass_re, xla_dump_emitter_re, xla_dump_hlo_as_text, xla_dump_hlo_as_proto, xla_dump_hlo_as_dot, xla_dump_hlo_as_url, xla_dump_hlo_as_html, xla_dump_fusion_visualization, xla_dump_hlo_snapshots, xla_dump_include_timestamp, xla_dump_max_hlo_modules, xla_dump_module_metadata, xla_dump_compress_protos, xla_dump_hlo_as_long_text, xla_dump_enable_mlir_pretty_form, xla_dump_full_hlo_config, xla_tpu_detect_nan, xla_tpu_detect_inf, xla_cpu_enable_xprof_traceme, xla_multiheap_size_constraint_per_heap, xla_detailed_logging, xla_enable_dumping, xla_llvm_force_inline_before_split, xla_dump_disable_metadata, xla_dump_hlo_pipeline_re, xla_cpu_use_acl, xla_cpu_strict_dot_conv_math, xla_dump_latency_hiding_schedule, xla_partitioning_algorithm, xla_debug_buffer_assignment_show_max, xla_detect_unstable_reductions, xla_detect_unstable_reductions_post_optimizations, xla_gpu_detect_nan, xla_gpu_detect_inf, xla_dump_large_constants, xla_reduce_window_rewrite_base_length, xla_cmd_buffer_trace_cache_size, xla_syntax_sugar_async_ops, xla_enable_command_buffers_during_profiling, xla_ignore_channel_id, xla_pjrt_allow_auto_layout_in_hlo, xla_test_add_command_buffer_mode, xla_gpu_experimental_matmul_perf_table_path, xla_early_exit_with_layouts, xla_gpu_experimental_scaled_dot_with_triton, xla_gpu_experimental_use_raft_select_k, xla_backend_extra_options) + return DebugOptions(xla_allow_h2h_copy_when_automatic_host_compute_offload_disabled, xla_disable_automatic_host_compute_offload, xla_enable_scoped_logging_timers, xla_hlo_pass_fix_detect_cycles, xla_keep_shardings_after_spmd, xla_unsupported_crash_on_hlo_pass_fix_max_iterations, xla_unsupported_crash_on_hlo_pass_noop_change, xla_unsupported_crash_on_hlo_pass_silent_hlo_change, xla_cpu_collective_call_terminate_timeout_seconds, xla_cpu_collective_call_warn_stuck_seconds, xla_cpu_collective_timeout_seconds, xla_cpu_copy_insertion_use_region_analysis, xla_cpu_emitter_verification_level, xla_cpu_enable_concurrency_optimized_scheduler, xla_cpu_enable_fast_math, xla_cpu_enable_fast_min_max, xla_cpu_enable_platform_dependent_math, xla_cpu_experimental_onednn_custom_call, xla_cpu_experimental_onednn_fusion_type[], xla_cpu_experimental_xnn_fusion_type[], xla_cpu_experimental_xnn_graph_fusion_mode, xla_cpu_experimental_ynn_fusion_type[], xla_cpu_fast_math_honor_division, xla_cpu_fast_math_honor_functions, xla_cpu_fast_math_honor_infs, xla_cpu_fast_math_honor_nans, xla_cpu_generate_unique_c_style_kernel_entry_points, xla_cpu_max_isa, xla_cpu_parallel_codegen_split_count, xla_cpu_prefer_vector_width, xla_cpu_use_fusion_emitters, xla_cpu_use_xnnpack, xla_enable_fast_math, xla_gpu_experimental_thunk_buffer_debug_filter[], xla_dump_hlo_unoptimized_snapshots, xla_enable_enzyme_comms_opt, xla_gpu_algorithm_denylist_path, xla_gpu_all_gather_combine_threshold_bytes, xla_gpu_all_reduce_blueconnect_num_devices_per_host, xla_gpu_all_reduce_combine_threshold_bytes, xla_gpu_analytical_latency_estimator_options, xla_gpu_async_dot, xla_gpu_auto_spmd_partitioning_memory_budget_gb, xla_gpu_auto_spmd_partitioning_memory_budget_ratio, xla_gpu_autotune_gemm_rtol, xla_gpu_autotune_level, xla_gpu_autotune_max_solutions, xla_gpu_collect_cost_model_stats, xla_gpu_collective_inflation_factor, xla_gpu_collective_permute_combine_threshold_bytes, xla_gpu_collective_permute_decomposer_threshold, xla_gpu_collectives_use_persistent_cliques, xla_gpu_command_buffer_scheduling_mode, xla_gpu_command_buffer_unroll_loops, xla_gpu_copy_insertion_use_region_analysis, xla_gpu_crash_on_verification_failures, xla_gpu_cublas_fallback, xla_gpu_cuda_data_dir, xla_gpu_cudnn_gemm_fusion_level, xla_gpu_cudnn_gemm_max_plans, xla_gpu_deterministic_ops, xla_gpu_disable_async_collectives[], xla_gpu_disable_gpuasm_optimizations, xla_gpu_dot_merger_threshold_mb, xla_gpu_dump_autotune_logs_to, xla_gpu_dump_autotune_results_to, xla_gpu_dump_autotuned_gemm_fusions, xla_gpu_dump_llvmir, xla_gpu_enable_all_gather_combine_by_dim, xla_gpu_enable_analytical_latency_estimator, xla_gpu_enable_analytical_sol_latency_estimator, xla_gpu_enable_approx_costly_collectives, xla_gpu_enable_command_buffer[], xla_gpu_enable_cub_radix_sort, xla_gpu_enable_cublaslt, xla_gpu_enable_cudnn_int8x32_convolution_reordering, xla_gpu_enable_cudnn_layer_norm, xla_gpu_enable_dynamic_slice_fusion, xla_gpu_enable_fast_min_max, xla_gpu_enable_highest_priority_async_stream, xla_gpu_enable_host_memory_offloading, xla_gpu_enable_latency_hiding_scheduler, xla_gpu_enable_libnvptxcompiler, xla_gpu_enable_llvm_module_compilation_parallelism, xla_gpu_enable_nccl_clique_optimization, xla_gpu_enable_nccl_comm_splitting, xla_gpu_enable_nccl_user_buffers, xla_gpu_enable_pipelined_all_gather, xla_gpu_enable_pipelined_all_reduce, xla_gpu_enable_pipelined_collectives, xla_gpu_enable_pipelined_p2p, xla_gpu_enable_pipelined_reduce_scatter, xla_gpu_enable_reassociation_for_converted_ar, xla_gpu_enable_reduce_scatter_combine_by_dim, xla_gpu_enable_reduction_epilogue_fusion, xla_gpu_enable_scatter_determinism_expander, xla_gpu_enable_shared_constants, xla_gpu_enable_split_k_autotuning, xla_gpu_enable_triton_gemm, xla_gpu_enable_while_loop_double_buffering, xla_gpu_enable_while_loop_reduce_scatter_code_motion, xla_gpu_enable_while_loop_unrolling, xla_gpu_exclude_nondeterministic_ops, xla_gpu_executable_embed_debug_info, xla_gpu_executable_terminate_timeout_seconds, xla_gpu_executable_warn_stuck_timeout_seconds, xla_gpu_exhaustive_tiling_search, xla_gpu_experimental_allow_unroll_factor_eight, xla_gpu_experimental_aot_compiled_thunks, xla_gpu_experimental_autotune_cache_mode, xla_gpu_experimental_autotuner_cache_dir, xla_gpu_experimental_collective_cse_distance_threshold, xla_gpu_experimental_collective_perf_table_path, xla_gpu_experimental_disable_binary_libraries, xla_gpu_experimental_dump_fdo_profiles, xla_gpu_experimental_dump_gpu_executable, xla_gpu_experimental_enable_alltoall_windowed_einsum, xla_gpu_experimental_enable_buffer_saver_on_thunks, xla_gpu_experimental_enable_checksum_tracing_on_thunks, xla_gpu_experimental_enable_fusion_autotuner, xla_gpu_experimental_enable_fusion_block_level_rewriter, xla_gpu_experimental_enable_heuristic_collective_combining, xla_gpu_experimental_enable_nccl_symmetric_buffers, xla_gpu_experimental_enable_nvshmem, xla_gpu_experimental_enable_split_k_rewrite, xla_gpu_experimental_enable_subchannel_dequantisation_fusion, xla_gpu_experimental_enable_triton_heroless_priority_fusion, xla_gpu_experimental_enable_triton_warp_specialization, xla_gpu_experimental_pack_dot_operands_along_k_dimension, xla_gpu_experimental_parallel_collective_overlap_limit, xla_gpu_experimental_pipeline_parallelism_opt_level, xla_gpu_experimental_stream_annotation, xla_gpu_experimental_use_autotuner_pass, xla_gpu_experimental_use_ragged_dot_fusion, xla_gpu_fail_ptx_compilation_on_register_spilling, xla_gpu_filter_kernels_spilling_registers_on_autotuning, xla_gpu_first_collective_call_terminate_timeout_seconds, xla_gpu_first_collective_call_warn_stuck_timeout_seconds, xla_gpu_force_compilation_parallelism, xla_gpu_force_conv_nchw, xla_gpu_force_conv_nhwc, xla_gpu_ftz, xla_gpu_fused_attention_use_cudnn_rng, xla_gpu_gemm_autotuner_override_file, xla_gpu_gemm_rewrite_size_threshold, xla_gpu_generate_debug_info, xla_gpu_generate_line_info, xla_gpu_graph_enable_concurrent_region, xla_gpu_graph_min_graph_size, xla_gpu_kernel_cache_file, xla_gpu_libnvjitlink_mode, xla_gpu_llvm_ir_file[], xla_gpu_llvm_verification_level, xla_gpu_load_autotune_results_from, xla_gpu_memory_limit_slop_factor, xla_gpu_mock_custom_calls, xla_gpu_multi_streamed_windowed_einsum, xla_gpu_nccl_async_execution, xla_gpu_nccl_blocking_communicators, xla_gpu_nccl_collective_max_nchannels, xla_gpu_nccl_init_max_rank_per_root_ratio, xla_gpu_nccl_p2p_max_nchannels, xla_gpu_nccl_terminate_on_error, xla_gpu_nccl_termination_timeout_seconds, xla_gpu_operand_bytes_threshold_for_windowed_einsum, xla_gpu_override_gemm_autotuner, xla_gpu_per_fusion_autotune_cache_dir, xla_gpu_pgle_accuracy_checker, xla_gpu_pgle_profile_file_or_directory_path, xla_gpu_ptx_file[], xla_gpu_reduce_scatter_combine_threshold_bytes, xla_gpu_redzone_padding_bytes, xla_gpu_require_complete_aot_autotune_results, xla_gpu_require_exclusive_lock, xla_gpu_shape_checks, xla_gpu_shard_autotuning, xla_gpu_strict_conv_algorithm_picker, xla_gpu_target_config_filename, xla_gpu_temp_buffer_use_separate_color, xla_gpu_threshold_for_windowed_einsum_mib, xla_gpu_triton_gemm_any, xla_gpu_unsafe_fallback_to_driver_on_ptxas_not_found, xla_gpu_unsupported_enable_all_reduce_decomposer, xla_gpu_unsupported_enable_ragged_all_to_all_decomposer, xla_gpu_unsupported_enable_ragged_all_to_all_multi_host_decomposer, xla_gpu_unsupported_enable_triton_gemm, xla_gpu_unsupported_enable_triton_multi_output_fusion, xla_gpu_unsupported_override_fast_interconnect_slice_size, xla_gpu_unsupported_use_all_reduce_one_shot_kernel, xla_gpu_unsupported_use_ragged_all_to_all_one_shot_kernel, xla_gpu_use_embeded_device_lib, xla_gpu_use_inprocess_lld, xla_gpu_use_memcpy_local_p2p, xla_gpu_use_runtime_fusion, xla_gpu_verify_triton_fusion_numerics, xla_hlo_graph_addresses, xla_hlo_profile, xla_disable_hlo_passes[], xla_enable_hlo_passes_only[], xla_disable_all_hlo_passes, xla_backend_optimization_level, xla_embed_ir_in_executable, xla_eliminate_hlo_implicit_broadcast, xla_cpu_multi_thread_eigen, xla_llvm_enable_alias_scope_metadata, xla_llvm_enable_noalias_metadata, xla_llvm_enable_invariant_load_metadata, xla_llvm_disable_expensive_passes, xla_test_all_output_layouts, xla_test_all_input_layouts, xla_hlo_graph_sharding_color, xla_cpu_use_onednn, xla_allow_excess_precision, xla_force_host_platform_device_count, xla_hlo_evaluator_use_fast_path, xla_allow_scalar_index_dynamic_ops, xla_step_marker_location, xla_dump_to, xla_flags_reset, xla_dump_hlo_module_re, xla_dump_hlo_pass_re, xla_dump_emitter_re, xla_dump_hlo_as_text, xla_dump_hlo_as_proto, xla_dump_hlo_as_dot, xla_dump_hlo_as_url, xla_dump_hlo_as_html, xla_dump_fusion_visualization, xla_dump_hlo_snapshots, xla_dump_include_timestamp, xla_dump_max_hlo_modules, xla_dump_module_metadata, xla_dump_compress_protos, xla_dump_hlo_as_long_text, xla_dump_enable_mlir_pretty_form, xla_dump_full_hlo_config, xla_tpu_detect_nan, xla_tpu_detect_inf, xla_cpu_enable_xprof_traceme, xla_multiheap_size_constraint_per_heap, xla_detailed_logging, xla_enable_dumping, xla_llvm_force_inline_before_split, xla_dump_disable_metadata, xla_dump_hlo_pipeline_re, xla_cpu_use_acl, xla_cpu_strict_dot_conv_math, xla_dump_latency_hiding_schedule, xla_partitioning_algorithm, xla_debug_buffer_assignment_show_max, xla_detect_unstable_reductions, xla_detect_unstable_reductions_post_optimizations, xla_gpu_detect_nan, xla_gpu_detect_inf, xla_dump_large_constants, xla_reduce_window_rewrite_base_length, xla_cmd_buffer_trace_cache_size, xla_syntax_sugar_async_ops, xla_enable_command_buffers_during_profiling, xla_ignore_channel_id, xla_pjrt_allow_auto_layout_in_hlo, xla_test_add_command_buffer_mode, xla_gpu_experimental_matmul_perf_table_path, xla_early_exit_with_layouts, xla_gpu_experimental_scaled_dot_with_triton, xla_gpu_experimental_use_raft_select_k, xla_backend_extra_options) end function PB.encode(e::PB.AbstractProtoEncoder, x::DebugOptions) @@ -1844,7 +1829,6 @@ function PB.encode(e::PB.AbstractProtoEncoder, x::DebugOptions) !isempty(x.xla_gpu_cuda_data_dir) && PB.encode(e, 61, x.xla_gpu_cuda_data_dir) x.xla_gpu_cudnn_gemm_fusion_level != zero(Int32) && PB.encode(e, 285, x.xla_gpu_cudnn_gemm_fusion_level) x.xla_gpu_cudnn_gemm_max_plans != zero(Int32) && PB.encode(e, 318, x.xla_gpu_cudnn_gemm_max_plans) - x.xla_gpu_default_to_alg_dot_bf16_bf16_f32 != false && PB.encode(e, 441, x.xla_gpu_default_to_alg_dot_bf16_bf16_f32) x.xla_gpu_deterministic_ops != false && PB.encode(e, 148, x.xla_gpu_deterministic_ops) !isempty(x.xla_gpu_disable_async_collectives) && PB.encode(e, 289, x.xla_gpu_disable_async_collectives) x.xla_gpu_disable_gpuasm_optimizations != false && PB.encode(e, 103, x.xla_gpu_disable_gpuasm_optimizations) @@ -1875,7 +1859,6 @@ function PB.encode(e::PB.AbstractProtoEncoder, x::DebugOptions) x.xla_gpu_enable_pipelined_all_gather != false && PB.encode(e, 227, x.xla_gpu_enable_pipelined_all_gather) x.xla_gpu_enable_pipelined_all_reduce != false && PB.encode(e, 217, x.xla_gpu_enable_pipelined_all_reduce) x.xla_gpu_enable_pipelined_collectives != false && PB.encode(e, 239, x.xla_gpu_enable_pipelined_collectives) - x.xla_gpu_enable_pipelined_host_offloading != false && PB.encode(e, 440, x.xla_gpu_enable_pipelined_host_offloading) x.xla_gpu_enable_pipelined_p2p != false && PB.encode(e, 246, x.xla_gpu_enable_pipelined_p2p) x.xla_gpu_enable_pipelined_reduce_scatter != false && PB.encode(e, 231, x.xla_gpu_enable_pipelined_reduce_scatter) x.xla_gpu_enable_reassociation_for_converted_ar != false && PB.encode(e, 209, x.xla_gpu_enable_reassociation_for_converted_ar) @@ -1895,7 +1878,6 @@ function PB.encode(e::PB.AbstractProtoEncoder, x::DebugOptions) x.xla_gpu_exhaustive_tiling_search != false && PB.encode(e, 219, x.xla_gpu_exhaustive_tiling_search) x.xla_gpu_experimental_allow_unroll_factor_eight != false && PB.encode(e, 430, x.xla_gpu_experimental_allow_unroll_factor_eight) x.xla_gpu_experimental_aot_compiled_thunks != false && PB.encode(e, 435, x.xla_gpu_experimental_aot_compiled_thunks) - !isempty(x.xla_gpu_experimental_autotune_backends) && PB.encode(e, 442, x.xla_gpu_experimental_autotune_backends) x.xla_gpu_experimental_autotune_cache_mode != var"DebugOptions.AutotuneCacheMode".AUTOTUNE_CACHE_MODE_UNSPECIFIED && PB.encode(e, 324, x.xla_gpu_experimental_autotune_cache_mode) !isempty(x.xla_gpu_experimental_autotuner_cache_dir) && PB.encode(e, 407, x.xla_gpu_experimental_autotuner_cache_dir) x.xla_gpu_experimental_collective_cse_distance_threshold != zero(Int64) && PB.encode(e, 374, x.xla_gpu_experimental_collective_cse_distance_threshold) @@ -2118,7 +2100,6 @@ function PB._encoded_size(x::DebugOptions) !isempty(x.xla_gpu_cuda_data_dir) && (encoded_size += PB._encoded_size(x.xla_gpu_cuda_data_dir, 61)) x.xla_gpu_cudnn_gemm_fusion_level != zero(Int32) && (encoded_size += PB._encoded_size(x.xla_gpu_cudnn_gemm_fusion_level, 285)) x.xla_gpu_cudnn_gemm_max_plans != zero(Int32) && (encoded_size += PB._encoded_size(x.xla_gpu_cudnn_gemm_max_plans, 318)) - x.xla_gpu_default_to_alg_dot_bf16_bf16_f32 != false && (encoded_size += PB._encoded_size(x.xla_gpu_default_to_alg_dot_bf16_bf16_f32, 441)) x.xla_gpu_deterministic_ops != false && (encoded_size += PB._encoded_size(x.xla_gpu_deterministic_ops, 148)) !isempty(x.xla_gpu_disable_async_collectives) && (encoded_size += PB._encoded_size(x.xla_gpu_disable_async_collectives, 289)) x.xla_gpu_disable_gpuasm_optimizations != false && (encoded_size += PB._encoded_size(x.xla_gpu_disable_gpuasm_optimizations, 103)) @@ -2149,7 +2130,6 @@ function PB._encoded_size(x::DebugOptions) x.xla_gpu_enable_pipelined_all_gather != false && (encoded_size += PB._encoded_size(x.xla_gpu_enable_pipelined_all_gather, 227)) x.xla_gpu_enable_pipelined_all_reduce != false && (encoded_size += PB._encoded_size(x.xla_gpu_enable_pipelined_all_reduce, 217)) x.xla_gpu_enable_pipelined_collectives != false && (encoded_size += PB._encoded_size(x.xla_gpu_enable_pipelined_collectives, 239)) - x.xla_gpu_enable_pipelined_host_offloading != false && (encoded_size += PB._encoded_size(x.xla_gpu_enable_pipelined_host_offloading, 440)) x.xla_gpu_enable_pipelined_p2p != false && (encoded_size += PB._encoded_size(x.xla_gpu_enable_pipelined_p2p, 246)) x.xla_gpu_enable_pipelined_reduce_scatter != false && (encoded_size += PB._encoded_size(x.xla_gpu_enable_pipelined_reduce_scatter, 231)) x.xla_gpu_enable_reassociation_for_converted_ar != false && (encoded_size += PB._encoded_size(x.xla_gpu_enable_reassociation_for_converted_ar, 209)) @@ -2169,7 +2149,6 @@ function PB._encoded_size(x::DebugOptions) x.xla_gpu_exhaustive_tiling_search != false && (encoded_size += PB._encoded_size(x.xla_gpu_exhaustive_tiling_search, 219)) x.xla_gpu_experimental_allow_unroll_factor_eight != false && (encoded_size += PB._encoded_size(x.xla_gpu_experimental_allow_unroll_factor_eight, 430)) x.xla_gpu_experimental_aot_compiled_thunks != false && (encoded_size += PB._encoded_size(x.xla_gpu_experimental_aot_compiled_thunks, 435)) - !isempty(x.xla_gpu_experimental_autotune_backends) && (encoded_size += PB._encoded_size(x.xla_gpu_experimental_autotune_backends, 442)) x.xla_gpu_experimental_autotune_cache_mode != var"DebugOptions.AutotuneCacheMode".AUTOTUNE_CACHE_MODE_UNSPECIFIED && (encoded_size += PB._encoded_size(x.xla_gpu_experimental_autotune_cache_mode, 324)) !isempty(x.xla_gpu_experimental_autotuner_cache_dir) && (encoded_size += PB._encoded_size(x.xla_gpu_experimental_autotuner_cache_dir, 407)) x.xla_gpu_experimental_collective_cse_distance_threshold != zero(Int64) && (encoded_size += PB._encoded_size(x.xla_gpu_experimental_collective_cse_distance_threshold, 374)) @@ -2331,7 +2310,7 @@ function PB._encoded_size(x::DebugOptions) return encoded_size end -struct HloModuleConfigProto +mutable struct HloModuleConfigProto entry_computation_layout::Union{Nothing,ProgramShapeProto} seed::UInt64 launch_id::Int32 @@ -2602,7 +2581,7 @@ function PB._encoded_size(x::HloModuleConfigProto) return encoded_size end -struct ExecutionOptions +mutable struct ExecutionOptions shape_with_output_layout::Union{Nothing,ShapeProto} seed::UInt64 debug_options::Union{Nothing,DebugOptions} @@ -2783,7 +2762,7 @@ function PB._encoded_size(x::ExecutionOptions) return encoded_size end -struct HloModuleProtoWithConfig +mutable struct HloModuleProtoWithConfig hlo_module::Union{Nothing,HloModuleProto} config::Union{Nothing,HloModuleConfigProto} end diff --git a/src/proto/xla_autotuning/autotune_results_pb.jl b/src/proto/xla_autotuning/autotune_results_pb.jl index 653e78ccd2..70feec955e 100644 --- a/src/proto/xla_autotuning/autotune_results_pb.jl +++ b/src/proto/xla_autotuning/autotune_results_pb.jl @@ -5,7 +5,7 @@ using ProtoBuf.EnumX: @enumx export AutotuningLogs, var"AutotuneResults.Entry", AutotuneResults -struct AutotuningLogs +mutable struct AutotuningLogs logs::Vector{AutotuningLog} end PB.default_values(::Type{AutotuningLogs}) = (;logs = Vector{AutotuningLog}()) @@ -35,7 +35,7 @@ function PB._encoded_size(x::AutotuningLogs) return encoded_size end -struct var"AutotuneResults.Entry" +mutable struct var"AutotuneResults.Entry" device::String hlo::String result::Union{Nothing,AutotuneResult} @@ -83,7 +83,7 @@ function PB._encoded_size(x::var"AutotuneResults.Entry") return encoded_size end -struct AutotuneResults +mutable struct AutotuneResults version::Int32 results::Vector{var"AutotuneResults.Entry"} end diff --git a/src/proto/xla_autotuning/autotuning_pb.jl b/src/proto/xla_autotuning/autotuning_pb.jl index 57eeeafe8d..dcdcd193e9 100644 --- a/src/proto/xla_autotuning/autotuning_pb.jl +++ b/src/proto/xla_autotuning/autotuning_pb.jl @@ -9,7 +9,7 @@ export var"AutotuneResult.CudaConvPlanKey", ComputeCapability, TritonGemmConfigs export var"AutotuneResult.FailureResult", AutotuneResult, AutotuningLog -struct var"AutotuneResult.TritonGemmKey" +mutable struct var"AutotuneResult.TritonGemmKey" block_m::Int64 block_n::Int64 block_k::Int64 @@ -87,7 +87,7 @@ function PB._encoded_size(x::var"AutotuneResult.TritonGemmKey") return encoded_size end -struct var"AutotuneResult.BackendConfigKey" +mutable struct var"AutotuneResult.BackendConfigKey" name::String config::Union{Nothing,google.protobuf.var"#Any"} end @@ -123,7 +123,7 @@ function PB._encoded_size(x::var"AutotuneResult.BackendConfigKey") return encoded_size end -struct var"AutotuneResult.ConvKey" +mutable struct var"AutotuneResult.ConvKey" algorithm::Int64 tensor_ops_enabled::Bool end @@ -159,7 +159,7 @@ function PB._encoded_size(x::var"AutotuneResult.ConvKey") return encoded_size end -struct CudnnVersion +mutable struct CudnnVersion major::Int32 minor::Int32 patch::Int32 @@ -203,43 +203,37 @@ end @enumx var"AutotuneResult.FailureKind" UNKNOWN=0 REDZONE_MODIFIED=1 WRONG_RESULT=2 DISQUALIFIED=3 -struct var"AutotuneResult.GemmKey" +mutable struct var"AutotuneResult.GemmKey" algorithm::Int64 - autotune_workspace_size::Int64 end -PB.default_values(::Type{var"AutotuneResult.GemmKey"}) = (;algorithm = zero(Int64), autotune_workspace_size = zero(Int64)) -PB.field_numbers(::Type{var"AutotuneResult.GemmKey"}) = (;algorithm = 1, autotune_workspace_size = 2) +PB.default_values(::Type{var"AutotuneResult.GemmKey"}) = (;algorithm = zero(Int64)) +PB.field_numbers(::Type{var"AutotuneResult.GemmKey"}) = (;algorithm = 1) function PB.decode(d::PB.AbstractProtoDecoder, ::Type{<:var"AutotuneResult.GemmKey"}) algorithm = zero(Int64) - autotune_workspace_size = zero(Int64) while !PB.message_done(d) field_number, wire_type = PB.decode_tag(d) if field_number == 1 algorithm = PB.decode(d, Int64) - elseif field_number == 2 - autotune_workspace_size = PB.decode(d, Int64) else Base.skip(d, wire_type) end end - return var"AutotuneResult.GemmKey"(algorithm, autotune_workspace_size) + return var"AutotuneResult.GemmKey"(algorithm) end function PB.encode(e::PB.AbstractProtoEncoder, x::var"AutotuneResult.GemmKey") initpos = position(e.io) x.algorithm != zero(Int64) && PB.encode(e, 1, x.algorithm) - x.autotune_workspace_size != zero(Int64) && PB.encode(e, 2, x.autotune_workspace_size) return position(e.io) - initpos end function PB._encoded_size(x::var"AutotuneResult.GemmKey") encoded_size = 0 x.algorithm != zero(Int64) && (encoded_size += PB._encoded_size(x.algorithm, 1)) - x.autotune_workspace_size != zero(Int64) && (encoded_size += PB._encoded_size(x.autotune_workspace_size, 2)) return encoded_size end -struct var"AutotuneResult.CustomKernelFusionKey" +mutable struct var"AutotuneResult.CustomKernelFusionKey" kernel_index::Int64 end PB.default_values(::Type{var"AutotuneResult.CustomKernelFusionKey"}) = (;kernel_index = zero(Int64)) @@ -269,7 +263,7 @@ function PB._encoded_size(x::var"AutotuneResult.CustomKernelFusionKey") return encoded_size end -struct var"AutotuneResult.CudaConvPlanKey" +mutable struct var"AutotuneResult.CudaConvPlanKey" exec_plan_id::String end PB.default_values(::Type{var"AutotuneResult.CudaConvPlanKey"}) = (;exec_plan_id = "") @@ -299,7 +293,7 @@ function PB._encoded_size(x::var"AutotuneResult.CudaConvPlanKey") return encoded_size end -struct ComputeCapability +mutable struct ComputeCapability major::Int32 minor::Int32 end @@ -335,7 +329,7 @@ function PB._encoded_size(x::ComputeCapability) return encoded_size end -struct TritonGemmConfigsProto +mutable struct TritonGemmConfigsProto config::Vector{var"AutotuneResult.TritonGemmKey"} end PB.default_values(::Type{TritonGemmConfigsProto}) = (;config = Vector{var"AutotuneResult.TritonGemmKey"}()) @@ -365,7 +359,7 @@ function PB._encoded_size(x::TritonGemmConfigsProto) return encoded_size end -struct var"AutotuneResult.FailureResult" +mutable struct var"AutotuneResult.FailureResult" kind::var"AutotuneResult.FailureKind".T msg::String key::Union{Nothing,OneOf{<:Union{var"AutotuneResult.ConvKey",var"AutotuneResult.GemmKey",var"AutotuneResult.CudaConvPlanKey",xla_tsl_dnn.AlgorithmProto}}} @@ -440,7 +434,7 @@ function PB._encoded_size(x::var"AutotuneResult.FailureResult") return encoded_size end -struct AutotuneResult +mutable struct AutotuneResult scratch_bytes::Int64 run_time::Union{Nothing,google.protobuf.Duration} failure::Union{Nothing,var"AutotuneResult.FailureResult"} @@ -533,7 +527,7 @@ function PB._encoded_size(x::AutotuneResult) return encoded_size end -struct AutotuningLog +mutable struct AutotuningLog instr::Union{Nothing,google.protobuf.var"#Any"} results::Vector{AutotuneResult} cudnn_version::Union{Nothing,CudnnVersion} diff --git a/src/proto/xla_tsl_dnn/dnn_pb.jl b/src/proto/xla_tsl_dnn/dnn_pb.jl index aa361c237d..61414b51b8 100644 --- a/src/proto/xla_tsl_dnn/dnn_pb.jl +++ b/src/proto/xla_tsl_dnn/dnn_pb.jl @@ -28,7 +28,7 @@ export AlgorithmConfigProto @enumx FilterLayout kOutputInputYX=0 kOutputYXInput=1 kOutputInputYX4=2 kOutputInputYX32=5 kOutputInputYX32_CudnnReordered=6 kInputYXOutput=3 kYXInputOutput=4 -struct ConvolutionDescriptorProto +mutable struct ConvolutionDescriptorProto paddings::Vector{Int64} strides::Vector{Int64} dilations::Vector{Int64} @@ -94,7 +94,7 @@ function PB._encoded_size(x::ConvolutionDescriptorProto) return encoded_size end -struct AlgorithmProto +mutable struct AlgorithmProto algo_id::Int64 math_type::var"AlgorithmProto.MathType".T tuning_knobs::Dict{Int64,Int64} @@ -149,7 +149,7 @@ function PB._encoded_size(x::AlgorithmProto) return encoded_size end -struct TensorDescriptorProto +mutable struct TensorDescriptorProto dimensions::Vector{Int64} data_type::var"#DataType".T layout_oneof::Union{Nothing,OneOf{<:Union{DataLayout.T,FilterLayout.T}}} @@ -206,7 +206,7 @@ function PB._encoded_size(x::TensorDescriptorProto) return encoded_size end -struct AlgorithmConfigProto +mutable struct AlgorithmConfigProto optional_algorithm::Union{Nothing,OneOf{AlgorithmProto}} optional_algorithm_no_scratch::Union{Nothing,OneOf{AlgorithmProto}} optional_scratch_size::Union{Nothing,OneOf{Int64}} diff --git a/src/xla/CompileOptions.jl b/src/xla/CompileOptions.jl index d7e188b5b3..c3469a20bc 100644 --- a/src/xla/CompileOptions.jl +++ b/src/xla/CompileOptions.jl @@ -1,29 +1,13 @@ -const DEFAULT_XLA_DEBUG_OPTIONS = Ref{Union{Nothing,Reactant.Proto.xla.DebugOptions}}( - nothing -) -const DEFAULT_XLA_COMPILE_OPTIONS = Ref{ - Union{Nothing,Reactant.Proto.xla.CompileOptionsProto} -}( - nothing -) - function get_default_debug_options() - if !isnothing(DEFAULT_XLA_DEBUG_OPTIONS[]) - return DEFAULT_XLA_DEBUG_OPTIONS[]::Reactant.Proto.xla.DebugOptions - end size = Ref{Csize_t}(0) data = @ccall MLIR.API.mlir_c.ReactantGetDebugOptions(size::Ptr{Csize_t})::Ptr{UInt8} bytes = unsafe_wrap(Array, data, (size[],); own=false) proto = Reactant.ProtoUtils.proto_from_bytes(Reactant.Proto.xla.DebugOptions, bytes) @ccall free(data::Ptr{UInt8})::Cvoid - DEFAULT_XLA_DEBUG_OPTIONS[] = proto return proto end function get_default_compile_options() - if !isnothing(DEFAULT_XLA_COMPILE_OPTIONS[]) - return DEFAULT_XLA_COMPILE_OPTIONS[]::Reactant.Proto.xla.CompileOptionsProto - end size = Ref{Csize_t}(0) data = @ccall MLIR.API.mlir_c.ReactantGetCompileOptions(size::Ptr{Csize_t})::Ptr{UInt8} bytes = unsafe_wrap(Array, data, (size[],); own=false) @@ -31,40 +15,46 @@ function get_default_compile_options() Reactant.Proto.xla.CompileOptionsProto, bytes ) @ccall free(data::Ptr{UInt8})::Cvoid - DEFAULT_XLA_COMPILE_OPTIONS[] = proto return proto end function get_debug_options(; kwargs...) debug_options = get_default_debug_options() - # default overrides. can we changed by the user by passing in kwargs - @set! debug_options.xla_gpu_cuda_data_dir = CUDA_DATA_DIR[] - @set! debug_options.xla_enable_enzyme_comms_opt = true - @set! debug_options.xla_gpu_experimental_use_raft_select_k = true + # default overrides. can be changed by the user by passing in kwargs + debug_options.xla_gpu_cuda_data_dir = CUDA_DATA_DIR[] + debug_options.xla_enable_enzyme_comms_opt = true + debug_options.xla_gpu_experimental_use_raft_select_k = true if Reactant.PersistentCompileCache.kernel_cache_enabled() - @set! debug_options.xla_gpu_kernel_cache_file = Reactant.PersistentCompileCache.get_kernel_cache_path() - @set! debug_options.xla_gpu_enable_llvm_module_compilation_parallelism = true + debug_options.xla_gpu_kernel_cache_file = Reactant.PersistentCompileCache.get_kernel_cache_path() + debug_options.xla_gpu_enable_llvm_module_compilation_parallelism = true end if Reactant.PersistentCompileCache.autotune_cache_enabled() - @set! debug_options.xla_gpu_per_fusion_autotune_cache_dir = Reactant.PersistentCompileCache.get_autotune_cache_directory() + debug_options.xla_gpu_per_fusion_autotune_cache_dir = Reactant.PersistentCompileCache.get_autotune_cache_directory() if Reactant.Distributed.local_rank() <= 0 - @set! debug_options.xla_gpu_experimental_autotune_cache_mode = + debug_options.xla_gpu_experimental_autotune_cache_mode = Reactant.Proto.xla.var"DebugOptions.AutotuneCacheMode".AUTOTUNE_CACHE_MODE_UPDATE else - @set! debug_options.xla_gpu_experimental_autotune_cache_mode = + debug_options.xla_gpu_experimental_autotune_cache_mode = Reactant.Proto.xla.var"DebugOptions.AutotuneCacheMode".AUTOTUNE_CACHE_MODE_READ end end for (key, value) in pairs(kwargs) - debug_options = Setfield.set(debug_options, Setfield.PropertyLens{key}(), value) + setproperty!(debug_options, key, value) end return debug_options end +struct CompileOptionsWithoutProto + device_id::Int64 + global_device_ids::Vector{Int64} + use_shardy_partitioner::Bool + use_spmd_partitioning::Bool +end + function make_compile_options(; device_id::Int64, num_replicas::Int64=1, @@ -74,16 +64,35 @@ function make_compile_options(; xla_executable_build_options=(;), xla_compile_options=(;), ) + if ( + isempty(xla_debug_options) && + ( + isempty(xla_executable_build_options) || ( + length(xla_executable_build_options) == 2 && + haskey(xla_executable_build_options, :use_shardy_partitioner) && + haskey(xla_executable_build_options, :use_spmd_partitioning) + ) + ) && + isempty(xla_compile_options) + ) + return CompileOptionsWithoutProto( + device_id, + mesh_ids === nothing ? Int64[] : mesh_ids, + get(xla_executable_build_options, :use_shardy_partitioner, false), + get(xla_executable_build_options, :use_spmd_partitioning, false), + ) + end + compile_options = get_default_compile_options() executable_build_options = compile_options.executable_build_options - @set! executable_build_options.debug_options = get_debug_options(; xla_debug_options...) - @set! executable_build_options.num_replicas = num_replicas - @set! executable_build_options.num_partitions = num_partitions + executable_build_options.debug_options = get_debug_options(; xla_debug_options...) + executable_build_options.num_replicas = num_replicas + executable_build_options.num_partitions = num_partitions - # default overrides. can we changed by the user by passing in kwargs - @set! executable_build_options.allow_spmd_sharding_propagation_to_parameters = [false] - @set! executable_build_options.allow_spmd_sharding_propagation_to_output = [false] + # default overrides. can be changed by the user by passing in kwargs + executable_build_options.allow_spmd_sharding_propagation_to_parameters = [false] + executable_build_options.allow_spmd_sharding_propagation_to_output = [false] if device_id < 0 @assert mesh_ids !== nothing @@ -94,12 +103,12 @@ function make_compile_options(; mesh_ids[(i - 1) * num_partitions + j] for i in 1:num_replicas ]) for j in 1:num_partitions ] - @set! executable_build_options.device_assignment = Reactant.Proto.xla.DeviceAssignmentProto( + executable_build_options.device_assignment = Reactant.Proto.xla.DeviceAssignmentProto( Int32(num_replicas), Int32(num_partitions), computation_devices ) else - @set! executable_build_options.device_ordinal = device_id - @set! executable_build_options.device_assignment = Reactant.Proto.xla.DeviceAssignmentProto( + executable_build_options.device_ordinal = device_id + executable_build_options.device_assignment = Reactant.Proto.xla.DeviceAssignmentProto( Int32(1), Int32(1), [Reactant.Proto.xla.var"DeviceAssignmentProto.ComputationDevice"([device_id])], @@ -107,15 +116,13 @@ function make_compile_options(; end for (key, val) in pairs(xla_executable_build_options) - executable_build_options = Setfield.set( - executable_build_options, Setfield.PropertyLens{key}(), val - ) + setproperty!(executable_build_options, key, val) end - @set! compile_options.executable_build_options = executable_build_options + compile_options.executable_build_options = executable_build_options for (key, val) in pairs(xla_compile_options) - compile_options = Setfield.set(compile_options, Setfield.PropertyLens{key}(), val) + setproperty!(compile_options, key, val) end return compile_options diff --git a/src/xla/IFRT/Array.jl b/src/xla/IFRT/Array.jl index d3d6af708e..f3eef62fcd 100644 --- a/src/xla/IFRT/Array.jl +++ b/src/xla/IFRT/Array.jl @@ -370,8 +370,8 @@ function replicate_array_to_all_devices(array::Array, sharding, mesh, size_arr) exec = XLA.compile( XLA.client(array), - mod; - compile_options, + mod, + compile_options; num_outputs=1, # unused num_parameters=1, # unused is_sharded=true, diff --git a/src/xla/IFRT/LoadedExecutable.jl b/src/xla/IFRT/LoadedExecutable.jl index 45bfeaddb4..6162a67852 100644 --- a/src/xla/IFRT/LoadedExecutable.jl +++ b/src/xla/IFRT/LoadedExecutable.jl @@ -73,8 +73,8 @@ end function XLA.compile( client::Client, - mod::MLIR.IR.Module; - compile_options::Reactant.Proto.xla.CompileOptionsProto, + mod::MLIR.IR.Module, + compile_options::Reactant.Proto.xla.CompileOptionsProto; num_parameters::Int64, num_outputs::Int64, is_sharded::Bool, @@ -97,6 +97,42 @@ function XLA.compile( ) end +function XLA.compile( + client::Client, + mod::MLIR.IR.Module, + compile_options::Reactant.XLA.CompileOptionsWithoutProto; + num_parameters::Int64, + num_outputs::Int64, + is_sharded::Bool, + num_replicas::Int64, + num_partitions::Int64, +) + GC.@preserve client mod begin + exec = MLIR.IR.try_compile_dump_mlir(mod) do + @ccall MLIR.API.mlir_c.ifrt_compile( + client.client::Ptr{Cvoid}, + mod.module_::MLIR.API.MlirModule, + compile_options.device_id::Clong, + compile_options.global_device_ids::Ptr{Clong}, + length(compile_options.global_device_ids)::Clong, + XLA.CUDA_DATA_DIR[]::Cstring, + compile_options.use_shardy_partitioner::Bool, + num_replicas::Int64, + num_partitions::Int64, + compile_options.use_spmd_partitioning::Bool, + Reactant.PersistentCompileCache.kernel_cache_enabled()::Bool, + Reactant.PersistentCompileCache.get_kernel_cache_path()::Cstring, + Reactant.PersistentCompileCache.autotune_cache_enabled()::Bool, + Reactant.PersistentCompileCache.get_autotune_cache_directory()::Cstring, + Reactant.Distributed.local_rank()::Cint, + )::Ptr{Cvoid} + end + end + return LoadedExecutable( + exec, num_outputs, num_parameters, is_sharded, num_replicas, num_partitions + ) +end + @inline function XLA.execute( exec::LoadedExecutable, inputs::NTuple{N,Ptr{Cvoid}}, diff --git a/src/xla/PJRT/LoadedExecutable.jl b/src/xla/PJRT/LoadedExecutable.jl index 14add5b4e6..7066c3b4d5 100644 --- a/src/xla/PJRT/LoadedExecutable.jl +++ b/src/xla/PJRT/LoadedExecutable.jl @@ -67,8 +67,8 @@ end function XLA.compile( client::Client, - mod::MLIR.IR.Module; - compile_options::Reactant.Proto.xla.CompileOptionsProto, + mod::MLIR.IR.Module, + compile_options::Reactant.Proto.xla.CompileOptionsProto; num_parameters::Int64, num_outputs::Int64, is_sharded::Bool, @@ -91,6 +91,42 @@ function XLA.compile( ) end +function XLA.compile( + client::Client, + mod::MLIR.IR.Module, + compile_options::Reactant.XLA.CompileOptionsWithoutProto; + num_parameters::Int64, + num_outputs::Int64, + is_sharded::Bool, + num_replicas::Int64, + num_partitions::Int64, +) + GC.@preserve client mod begin + exec = MLIR.IR.try_compile_dump_mlir(mod) do + @ccall MLIR.API.mlir_c.ClientCompile( + client.client::Ptr{Cvoid}, + mod.module_::MLIR.API.MlirModule, + compile_options.device_id::Clong, + compile_options.global_device_ids::Ptr{Clong}, + length(compile_options.global_device_ids)::Clong, + XLA.CUDA_DATA_DIR[]::Cstring, + compile_options.use_shardy_partitioner::Bool, + num_replicas::Int64, + num_partitions::Int64, + compile_options.use_spmd_partitioning::Bool, + Reactant.PersistentCompileCache.kernel_cache_enabled()::Bool, + Reactant.PersistentCompileCache.get_kernel_cache_path()::Cstring, + Reactant.PersistentCompileCache.autotune_cache_enabled()::Bool, + Reactant.PersistentCompileCache.get_autotune_cache_directory()::Cstring, + Reactant.Distributed.local_rank()::Cint, + )::Ptr{Cvoid} + end + end + return LoadedExecutable( + exec, num_outputs, num_parameters, is_sharded, num_replicas, num_partitions + ) +end + function execute_ir(N, M, n_outs, with_device::Bool, nmesh_ids::Int64) ptr = @static if VERSION < v"1.12" sizeof(Int) == sizeof(Int64) ? "i64" : "i32" diff --git a/src/xla/XLA.jl b/src/xla/XLA.jl index 8cff7d44c2..6e56c020bc 100644 --- a/src/xla/XLA.jl +++ b/src/xla/XLA.jl @@ -9,8 +9,6 @@ using Enzyme: Compiler using Preferences: load_preference using UUIDs: UUID -using Setfield: Setfield, @set! - const XLA_REACTANT_GPU_MEM_FRACTION = Ref{Float64}(0.75) const XLA_REACTANT_GPU_PREALLOCATE = Ref{Bool}(true) const REACTANT_XLA_RUNTIME = load_preference( @@ -28,6 +26,8 @@ function LLVMclopts(opts...) )::Cvoid end +include("CompileOptions.jl") + include("Distributed.jl") include("Client.jl") include("Device.jl") @@ -48,8 +48,6 @@ include("PJRT/PJRT.jl") include("IFRT/IFRT.jl") -include("CompileOptions.jl") - abstract type AbstractBackendState end for runtime in (:PJRT, :IFRT)