Can you please organize this report a little? It seems that halfway through you switched to SDXL? Or were you just saying that you used SDXL-like parameters for Z-Image? Which error message belongs to which run? Generally, we cannot test new features on devices like the 1080 because it's hard to find someone who still has one (on Discord, among people willing to test). To disable the recently added advanced features, switch back to float8 (W8) and disable "Compile transformer blocks".
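As a side note, you can check what your card actually supports by querying its CUDA compute capability; the snippet below is only an illustrative check (the 7.5 cutoff for int8 tensor-core matmuls is an assumption for this sketch, not something OneTrainer tests for):

```python
import torch

# Minimal sketch: print the compute capability of the active GPU.
# A GTX 1080 Ti reports (6, 1); the int8 W8A8 and FP8 paths generally
# expect newer architectures (the 7.5 threshold here is an assumed example).
major, minor = torch.cuda.get_device_capability(0)
print(f"compute capability: {major}.{minor}")
if (major, minor) < (7, 5):
    print("Pascal-era GPU: int8/FP8 matmul kernels may fail to compile or run.")
```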
Hi, I tried to start a Z-Image LoRA training on my old GPU, a 1080 Ti.
With the default settings there seems to be a bug, or int8 W8A8 is simply not compatible with my GPU: it takes a long time to start and then throws errors.
Fetching 5 files: 100%|██████████| 5/5 [00:00<?, ?it/s]
Quantizing model weights: 100%|██████████| 547/547 [00:51<00:00, 10.54it/s]
Quantizing model weights: 100%|██████████| 697/697 [01:22<00:00, 8.45it/s]
Selected layers: 210
Deselected layers: 66
Note: Enable Debug mode to see the full list of layer names
epoch: 0%| | 0/100 [00:00<?, ?it/s]
enumerating sample paths: 100%|██████████| 1/1 [00:00<00:00, 64.01it/s]
C:\Users\carlo\AppData\Roaming\StabilityMatrix\Packages\OneTrainer\modules\util\triton_mm_8bit.py:87:35: error: 'arith.extf' op operand #0 must be floating-point-like, but got 'tensor<128x128xi8, #ttg.dot_op<{opIdx = 0, parent = #ttg.blocked<{sizePerThread = [4, 4], threadsPerWarp = [1, 32], warpsPerCTA = [4, 1], order = [1, 0]}>}>>'
accumulator = tl.dot(a, b, accumulator, out_dtype=tl.float32 if FLOAT else tl.int32)
^
module {
tt.func public @__mm_kernel(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: !tt.ptr {tt.divisibility = 16 : i32}, %arg3: i32 {tt.divisibility = 16 : i32}, %arg4: i32 {tt.divisibility = 16 : i32}, %arg5: i32 {tt.divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32}, %arg7: i32 {tt.divisibility = 16 : i32}, %arg8: i32 {tt.divisibility = 16 : i32}, %arg9: i32) attributes {noinline = false} {
%c127_i32 = arith.constant 127 : i32
%cst = arith.constant dense<0> : tensor<128x128xi32>
%cst_0 = arith.constant dense<0> : tensor<128x128xi8>
%c1_i32 = arith.constant 1 : i32
%cst_1 = arith.constant dense<128> : tensor<128x128xi32>
%c128_i32 = arith.constant 128 : i32
%true = arith.constant true
%c0_i32 = arith.constant 0 : i32
%0 = tt.get_program_id x : i32
%1 = tt.get_program_id y : i32
%2 = arith.cmpi sge, %1, %c0_i32 : i32
llvm.intr.assume %2 : i1
%3 = arith.cmpi sge, %0, %c0_i32 : i32
llvm.intr.assume %3 : i1
%4 = arith.cmpi sgt, %arg6, %c0_i32 : i32
llvm.intr.assume %4 : i1
llvm.intr.assume %true : i1
llvm.intr.assume %true : i1
%5 = arith.cmpi sgt, %arg7, %c0_i32 : i32
llvm.intr.assume %5 : i1
%6 = arith.cmpi sgt, %arg8, %c0_i32 : i32
llvm.intr.assume %6 : i1
llvm.intr.assume %true : i1
%7 = arith.muli %1, %c128_i32 : i32
%8 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32>
%9 = tt.splat %7 : i32 -> tensor<128xi32>
%10 = arith.addi %9, %8 : tensor<128xi32>
%11 = tt.splat %arg3 : i32 -> tensor<128xi32>
%12 = arith.remsi %10, %11 : tensor<128xi32>
%13 = arith.muli %0, %c128_i32 : i32
%14 = tt.splat %13 : i32 -> tensor<128xi32>
%15 = arith.addi %14, %8 : tensor<128xi32>
%16 = tt.splat %arg4 : i32 -> tensor<128xi32>
%17 = arith.remsi %15, %16 : tensor<128xi32>
%18 = tt.expand_dims %12 {axis = 1 : i32} : tensor<128xi32> -> tensor<128x1xi32>
%19 = tt.splat %arg6 : i32 -> tensor<128x1xi32>
%20 = arith.muli %18, %19 : tensor<128x1xi32>
%21 = tt.expand_dims %8 {axis = 0 : i32} : tensor<128xi32> -> tensor<1x128xi32>
%22 = tt.broadcast %20 : tensor<128x1xi32> -> tensor<128x128xi32>
%23 = tt.broadcast %21 : tensor<1x128xi32> -> tensor<128x128xi32>
%24 = arith.addi %22, %23 : tensor<128x128xi32>
%25 = tt.splat %arg0 : !tt.ptr -> tensor<128x128x!tt.ptr>
%26 = tt.addptr %25, %24 : tensor<128x128x!tt.ptr>, tensor<128x128xi32>
%27 = tt.expand_dims %8 {axis = 1 : i32} : tensor<128xi32> -> tensor<128x1xi32>
%28 = tt.splat %arg7 : i32 -> tensor<128x1xi32>
%29 = arith.muli %27, %28 : tensor<128x1xi32>
%30 = tt.expand_dims %17 {axis = 0 : i32} : tensor<128xi32> -> tensor<1x128xi32>
%31 = tt.broadcast %29 : tensor<128x1xi32> -> tensor<128x128xi32>
%32 = tt.broadcast %30 : tensor<1x128xi32> -> tensor<128x128xi32>
%33 = arith.addi %31, %32 : tensor<128x128xi32>
%34 = tt.splat %arg1 : !tt.ptr -> tensor<128x128x!tt.ptr>
%35 = tt.addptr %34, %33 : tensor<128x128x!tt.ptr>, tensor<128x128xi32>
%36 = arith.addi %arg5, %c127_i32 : i32
%37 = arith.divsi %36, %c128_i32 : i32
%38:3 = scf.for %arg10 = %c0_i32 to %37 step %c1_i32 iter_args(%arg11 = %cst, %arg12 = %26, %arg13 = %35) -> (tensor<128x128xi32>, tensor<128x128x!tt.ptr>, tensor<128x128x!tt.ptr>) : i32 {
%55 = tt.splat %arg3 : i32 -> tensor<128x1xi32>
%56 = arith.cmpi slt, %18, %55 : tensor<128x1xi32>
%57 = arith.muli %arg10, %c128_i32 : i32
%58 = arith.subi %arg5, %57 : i32
%59 = tt.splat %58 : i32 -> tensor<1x128xi32>
%60 = arith.cmpi slt, %21, %59 : tensor<1x128xi32>
%61 = tt.broadcast %56 : tensor<128x1xi1> -> tensor<128x128xi1>
%62 = tt.broadcast %60 : tensor<1x128xi1> -> tensor<128x128xi1>
%63 = arith.andi %61, %62 : tensor<128x128xi1>
%64 = tt.splat %arg4 : i32 -> tensor<1x128xi32>
%65 = arith.cmpi slt, %30, %64 : tensor<1x128xi32>
%66 = tt.splat %58 : i32 -> tensor<128x1xi32>
%67 = arith.cmpi slt, %27, %66 : tensor<128x1xi32>
%68 = tt.broadcast %65 : tensor<1x128xi1> -> tensor<128x128xi1>
%69 = tt.broadcast %67 : tensor<128x1xi1> -> tensor<128x128xi1>
%70 = arith.andi %68, %69 : tensor<128x128xi1>
%71 = tt.load %arg12, %63, %cst_0 : tensor<128x128x!tt.ptr>
%72 = tt.load %arg13, %70, %cst_0 : tensor<128x128x!tt.ptr>
%73 = tt.dot %71, %72, %arg11, inputPrecision = tf32 : tensor<128x128xi8> * tensor<128x128xi8> -> tensor<128x128xi32>
%74 = tt.addptr %arg12, %cst_1 : tensor<128x128x!tt.ptr>, tensor<128x128xi32>
%75 = arith.muli %arg7, %c128_i32 : i32
%76 = tt.splat %75 : i32 -> tensor<128x128xi32>
%77 = tt.addptr %arg13, %76 : tensor<128x128x!tt.ptr>, tensor<128x128xi32>
scf.yield %73, %74, %77 : tensor<128x128xi32>, tensor<128x128x!tt.ptr>, tensor<128x128x!tt.ptr>
}
%39 = tt.expand_dims %10 {axis = 1 : i32} : tensor<128xi32> -> tensor<128x1xi32>
%40 = tt.splat %arg8 : i32 -> tensor<128x1xi32>
%41 = arith.muli %40, %39 : tensor<128x1xi32>
%42 = tt.splat %arg2 : !tt.ptr -> tensor<128x1x!tt.ptr>
%43 = tt.addptr %42, %41 : tensor<128x1x!tt.ptr>, tensor<128x1xi32>
%44 = tt.expand_dims %15 {axis = 0 : i32} : tensor<128xi32> -> tensor<1x128xi32>
%45 = tt.broadcast %43 : tensor<128x1x!tt.ptr> -> tensor<128x128x!tt.ptr>
%46 = tt.broadcast %44 : tensor<1x128xi32> -> tensor<128x128xi32>
%47 = tt.addptr %45, %46 : tensor<128x128x!tt.ptr>, tensor<128x128xi32>
%48 = tt.splat %arg3 : i32 -> tensor<128x1xi32>
%49 = arith.cmpi slt, %39, %48 : tensor<128x1xi32>
%50 = tt.splat %arg4 : i32 -> tensor<1x128xi32>
%51 = arith.cmpi slt, %44, %50 : tensor<1x128xi32>
%52 = tt.broadcast %49 : tensor<128x1xi1> -> tensor<128x128xi1>
%53 = tt.broadcast %51 : tensor<1x128xi1> -> tensor<128x128xi1>
%54 = arith.andi %52, %53 : tensor<128x128xi1>
tt.store %47, %38#0, %54 : tensor<128x128x!tt.ptr>
tt.return
}
}
{-#
external_resources: {
mlir_reproducer: {
pipeline: "builtin.module(convert-triton-to-tritongpu{enable-source-remat=false num-ctas=1 num-warps=4 target=cuda:61 threads-per-warp=32}, tritongpu-coalesce, triton-nvidia-gpu-plan-cta, tritongpu-remove-layout-conversions, tritongpu-optimize-thread-locality, tritongpu-accelerate-matmul, tritongpu-remove-layout-conversions, tritongpu-optimize-dot-operands{hoist-layout-conversion=false}, triton-nvidia-optimize-descriptor-encoding, triton-loop-aware-cse, triton-licm, canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true}, triton-loop-aware-cse, tritongpu-prefetch, tritongpu-optimize-dot-operands{hoist-layout-conversion=false}, tritongpu-coalesce-async-copy, triton-nvidia-optimize-tmem-layouts, tritongpu-remove-layout-conversions, triton-nvidia-interleave-tmem, tritongpu-reduce-data-duplication, tritongpu-reorder-instructions, triton-loop-aware-cse, symbol-dce, sccp, canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true})",
disable_threading: false,
verify_each: true
}
}
#-}
C:\Users\carlo\AppData\Roaming\StabilityMatrix\Packages\OneTrainer\modules\util\triton_mm_8bit.py:50:0: error: Failures have been detected while processing an MLIR pass pipeline
C:\Users\carlo\AppData\Roaming\StabilityMatrix\Packages\OneTrainer\modules\util\triton_mm_8bit.py:50:0: note: Pipeline failed while executing [
TritonGPUAccelerateMatmul on 'builtin.module' operation]: reproducer generated at std::errs, please share the reproducer above with Triton project.
step: 0%| | 0/600 [41:42<?, ?it/s]
epoch: 0%| | 0/100 [41:46<?, ?it/s]
Traceback (most recent call last):
File "C:\Users\carlo\AppData\Roaming\StabilityMatrix\Packages\OneTrainer\modules\ui\TrainUI.py", line 755, in _training_thread_function
trainer.train()
File "C:\Users\carlo\AppData\Roaming\StabilityMatrix\Packages\OneTrainer\modules\trainer\GenericTrainer.py", line 748, in train
scaler.scale(loss).backward()
File "C:\Users\carlo\AppData\Roaming\StabilityMatrix\Packages\OneTrainer\venv\lib\site-packages\torch_tensor.py", line 647, in backward
torch.autograd.backward(
File "C:\Users\carlo\AppData\Roaming\StabilityMatrix\Packages\OneTrainer\venv\lib\site-packages\torch\autograd_init.py", line 354, in backward
_engine_run_backward(
File "C:\Users\carlo\AppData\Roaming\StabilityMatrix\Packages\OneTrainer\venv\lib\site-packages\torch\autograd\graph.py", line 829, in _engine_run_backward
return Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
File "C:\Users\carlo\AppData\Roaming\StabilityMatrix\Packages\OneTrainer\venv\lib\site-packages\torch\autograd\function.py", line 311, in apply
return user_fn(self, *args)
File "C:\Users\carlo\AppData\Roaming\StabilityMatrix\Packages\OneTrainer\modules\module\quantized\LinearW8A8.py", line 62, in backward
return int8_backward_axiswise(output, weight, weight_scale), None, None, None, None
File "C:\Users\carlo\AppData\Roaming\StabilityMatrix\Packages\OneTrainer\venv\lib\site-packages\torch\utils_contextlib.py", line 120, in decorate_context
return func(*args, **kwargs)
File "C:\Users\carlo\AppData\Roaming\StabilityMatrix\Packages\OneTrainer\modules\module\quantized\LinearW8A8.py", line 40, in int8_backward_axiswise
mm_res = triton_mm_8bit(output_8.contiguous(), weight)
File "C:\Users\carlo\AppData\Roaming\StabilityMatrix\Packages\OneTrainer\modules\util\triton_mm_8bit.py", line 112, in mm_8bit
__mm_kernel[grid](
File "C:\Users\carlo\AppData\Roaming\StabilityMatrix\Packages\OneTrainer\venv\lib\site-packages\triton\runtime\jit.py", line 390, in
return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)
File "C:\Users\carlo\AppData\Roaming\StabilityMatrix\Packages\OneTrainer\venv\lib\site-packages\triton\runtime\autotuner.py", line 239, in run
benchmark()
File "C:\Users\carlo\AppData\Roaming\StabilityMatrix\Packages\OneTrainer\venv\lib\site-packages\triton\runtime\autotuner.py", line 228, in benchmark
timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}
File "C:\Users\carlo\AppData\Roaming\StabilityMatrix\Packages\OneTrainer\venv\lib\site-packages\triton\runtime\autotuner.py", line 228, in
timings = {config: self._bench(*args, config=config, **kwargs) for config in pruned_configs}
File "C:\Users\carlo\AppData\Roaming\StabilityMatrix\Packages\OneTrainer\venv\lib\site-packages\triton\runtime\autotuner.py", line 160, in _bench
return self.do_bench(kernel_call, quantiles=(0.5, 0.2, 0.8))
File "C:\Users\carlo\AppData\Roaming\StabilityMatrix\Packages\OneTrainer\venv\lib\site-packages\triton\testing.py", line 149, in do_bench
fn()
File "C:\Users\carlo\AppData\Roaming\StabilityMatrix\Packages\OneTrainer\venv\lib\site-packages\triton\runtime\autotuner.py", line 146, in kernel_call
self.fn.run(
File "C:\Users\carlo\AppData\Roaming\StabilityMatrix\Packages\OneTrainer\venv\lib\site-packages\triton\runtime\jit.py", line 594, in run
kernel = self.compile(src, target=target, options=options.dict)
File "C:\Users\carlo\AppData\Roaming\StabilityMatrix\Packages\OneTrainer\venv\lib\site-packages\triton\compiler\compiler.py", line 359, in compile
next_module = compile_ir(module, metadata)
File "C:\Users\carlo\AppData\Roaming\StabilityMatrix\Packages\OneTrainer\venv\lib\site-packages\triton\backends\nvidia\compiler.py", line 465, in
stages["ttgir"] = lambda src, metadata: self.make_ttgir(src, metadata, options, capability)
File "C:\Users\carlo\AppData\Roaming\StabilityMatrix\Packages\OneTrainer\venv\lib\site-packages\triton\backends\nvidia\compiler.py", line 308, in make_ttgir
pm.run(mod)
RuntimeError: PassManager::run failed
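(For reference: the failing pass suggests the int8 tl.dot path in triton_mm_8bit.py is simply not supported on a compute capability 6.1 card, where Triton falls back to a float dot and then rejects the i8 operands. A possible workaround, only as a hedged sketch and not OneTrainer's actual kernel, would be to upcast the int8 operands before the matmul, e.g. tl.dot(a.to(tl.float32), b.to(tl.float32)) inside the kernel, or at the Python level something like:)

```python
import torch

def mm_8bit_fallback(a_int8: torch.Tensor, b_int8: torch.Tensor) -> torch.Tensor:
    # Hedged sketch, not OneTrainer's triton_mm_8bit: on GPUs without int8
    # tensor cores, do the 8-bit matmul by upcasting to float32, letting
    # cuBLAS handle it, then rounding back to an int32 accumulator.
    return (a_int8.to(torch.float32) @ b_int8.to(torch.float32)).round_().to(torch.int32)
```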
I can't train in bfloat16; it's not compatible and tries to use 47 GB of VRAM.
On SDXL I use float8 weights and train in float16, so I reused that setting, but it takes up to 2 hours before the first step starts and then it fails.
I also tried setting the quantization layers to "full" (as that was the SDXL default).
I use prodigy_adv with the factored optimizer.
Selected layers: 210
Deselected layers: 66
Note: Enable Debug mode to see the full list of layer names
epoch: 0%| | 0/100 [00:00<?, ?it/s]
enumerating sample paths: 100%|██████████| 1/1 [00:00<?, ?it/s]
step: 0%| | 0/600 [2:11:48<?, ?it/s]
epoch: 0%| | 0/100 [2:11:53<?, ?it/s]
Traceback (most recent call last):
File "C:\Users\carlo\AppData\Roaming\StabilityMatrix\Packages\OneTrainer\modules\ui\TrainUI.py", line 755, in __training_thread_function
trainer.train()
File "C:\Users\carlo\AppData\Roaming\StabilityMatrix\Packages\OneTrainer\modules\trainer\GenericTrainer.py", line 788, in train
raise RuntimeError("Training loss became NaN. This may be due to invalid parameters, precision issues, or a bug in the loss computation.")
RuntimeError: Training loss became NaN. This may be due to invalid parameters, precision issues, or a bug in the loss computation.
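(For reference, the float8-weights/float16-training combination is where NaNs usually creep in; a minimal sketch of an fp16 step with loss scaling and the kind of NaN guard that raised above, using placeholder names rather than OneTrainer's actual GenericTrainer code, looks roughly like this:)

```python
import torch

# Minimal sketch, not OneTrainer's trainer: a float16 training step with
# GradScaler loss scaling and an explicit NaN/Inf guard like the one above.
scaler = torch.cuda.amp.GradScaler()

def train_step(model, batch, optimizer):
    with torch.autocast(device_type="cuda", dtype=torch.float16):
        loss = model(batch)                      # placeholder forward returning a scalar loss
    if not torch.isfinite(loss):
        raise RuntimeError("Training loss became NaN/Inf.")
    scaler.scale(loss).backward()                # scaled backward, as in the first traceback
    scaler.step(optimizer)
    scaler.update()
    optimizer.zero_grad(set_to_none=True)
```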
I then set everything to float32 and, training in float16, I got this:
Fetching 5 files: 100%|██████████| 5/5 [00:00<00:00, 320.08it/s]
Selected layers: 210
Deselected layers: 66
Note: Enable Debug mode to see the full list of layer names
epoch: 0%| | 0/100 [00:00<?, ?it/s]
enumerating sample paths: 0%| | 0/1 [00:00<?, ?it/s]
enumerating sample paths: 100%|██████████| 1/1 [00:00<00:00, 2.67it/s]
step: 0%| | 0/600 [00:17<?, ?it/s]
epoch: 0%| | 0/100 [12:14<?, ?it/s]
Traceback (most recent call last):
File "C:\Users\carlo\AppData\Roaming\StabilityMatrix\Packages\OneTrainer\modules\ui\TrainUI.py", line 755, in __training_thread_function
trainer.train()
File "C:\Users\carlo\AppData\Roaming\StabilityMatrix\Packages\OneTrainer\modules\trainer\GenericTrainer.py", line 742, in train
model_output_data = self.model_setup.predict(self.model, batch, self.config, train_progress)
File "C:\Users\carlo\AppData\Roaming\StabilityMatrix\Packages\OneTrainer\modules\modelSetup\BaseZImageSetup.py", line 132, in predict
output_list = model.transformer(
File "C:\Users\carlo\AppData\Roaming\StabilityMatrix\Packages\OneTrainer\venv\lib\site-packages\torch\nn\modules\module.py", line 1773, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "C:\Users\carlo\AppData\Roaming\StabilityMatrix\Packages\OneTrainer\venv\lib\site-packages\torch\nn\modules\module.py", line 1784, in _call_impl
return forward_call(*args, **kwargs)
File "C:\Users\carlo\AppData\Roaming\StabilityMatrix\Packages\OneTrainer\venv\lib\site-packages\diffusers\models\transformers\transformer_z_image.py", line 571, in forward
x[torch.cat(x_inner_pad_mask)] = self.x_pad_token
RuntimeError: Index put requires the source and destination dtypes match, got Half for the destination and Float for the source.
I also tried float4, but that crashes OneTrainer.
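(For reference on that last traceback: the message means the pad token tensor is still float32 while the hidden states are float16. A small self-contained reproduction and the obvious cast, purely as a sketch and not a confirmed diffusers fix, is below:)

```python
import torch

# Hedged sketch reproducing the dtype clash from the traceback above and the
# obvious fix: cast the source to the destination's dtype before the masked
# assignment (index_put).
x = torch.zeros(4, 8, dtype=torch.float16)        # destination in half precision
pad_token = torch.randn(8, dtype=torch.float32)   # source in full precision
mask = torch.tensor([True, False, True, False])

# x[mask] = pad_token                              # raises the same dtype error
x[mask] = pad_token.to(dtype=x.dtype)              # works: source now matches Half
```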