@@ -468,19 +468,13 @@ update_branches(
468468 >>,
469469 <<DirectBranch /binary , Nops /binary >>;
470470 true ->
471- % Keep far branch sequence: auipc + lw + jalr + data
472- % RISC-V far branch is always 16 bytes
473- case Size of
474- 16 ->
475- % 16-byte sequence: auipc + lw + jalr + data
476- I1 = jit_riscv32_asm :auipc (TempReg , 0 ),
477- I2 = jit_riscv32_asm :lw (TempReg , TempReg , 8 ),
478- I3 = jit_riscv32_asm :jalr (zero , TempReg , 0 ),
479- % Calculate absolute target address
480- TargetAddress = LabelOffset ,
481- I4 = <<TargetAddress :32 /little >>,
482- <<I1 /binary , I2 /binary , I3 /binary , I4 /binary >>
483- end
471+ % Keep far branch sequence: auipc + jalr (PC-relative, 8 bytes)
472+ % Split the relative offset into upper 20 bits and lower 12 bits
473+ Hi20 = (Rel + 16#800 ) bsr 12 ,
474+ Lo12 = Rel - (Hi20 bsl 12 ),
475+ I1 = jit_riscv32_asm :auipc (TempReg , Hi20 ),
476+ I2 = jit_riscv32_asm :jalr (zero , TempReg , Lo12 ),
477+ <<I1 /binary , I2 /binary >>
484478 end ;
485479 jump_table_auipc_jalr ->
486480 % Calculate PC-relative offset from AUIPC instruction to target
@@ -679,7 +673,8 @@ return_if_not_equal_to_ctx(
679673 end ,
680674 I3 = jit_riscv32_asm :ret (),
681675 % Branch if equal (skip the return)
682- I1 = jit_riscv32_asm :beq (Reg , ? CTX_REG , byte_size (I2 ) + byte_size (I3 )),
676+ % Offset must account for the beq instruction itself (4 bytes) plus I2 and I3
677+ I1 = jit_riscv32_asm :beq (Reg , ? CTX_REG , 4 + byte_size (I2 ) + byte_size (I3 )),
683678 Stream1 = StreamModule :append (Stream0 , <<I1 /binary , I2 /binary , I3 /binary >>),
684679 {AvailableRegs1 , UsedRegs1 } = free_reg (
685680 AvailableRegs0 , UsedRegs0 , Reg
@@ -754,38 +749,43 @@ branch_to_offset_code(_State, Offset, TargetOffset) when
754749 Rel = TargetOffset - Offset ,
755750 jit_riscv32_asm :j (Rel );
756751branch_to_offset_code (
757- # state {available_regs = [TempReg | _ ]}, _Offset , TargetOffset
752+ # state {available_regs = [TempReg | _ ]}, Offset , TargetOffset
758753) ->
759- % Far branch: use auipc + lw + jalr sequence (RISC-V)
760- % This creates a PC-relative load sequence - always 16 bytes (4-byte aligned)
754+ % Far branch: use auipc + jalr sequence for PC-relative addressing
755+ % This computes: PC + Immediate and jumps to it
761756
762- % TempReg = PC
763- I1 = jit_riscv32_asm :auipc (TempReg , 0 ),
764- % TempReg = *(PC+8)
765- I2 = jit_riscv32_asm :lw (TempReg , TempReg , 8 ),
766- % Jump to TempReg
767- I3 = jit_riscv32_asm :jalr (zero , TempReg , 0 ),
768- % The literal value is the absolute target offset
769- I4 = <<TargetOffset :32 /little >>,
770- <<I1 /binary , I2 /binary , I3 /binary , I4 /binary >>.
757+ Rel = TargetOffset - Offset ,
758+ % Split the relative offset into upper 20 bits and lower 12 bits
759+ % RISC-V PC-relative addressing: target = PC + (imm20 << 12) + sign_extend(imm12)
760+ % Since jalr's imm12 is sign-extended, if bit 11 of Rel is set,
761+ % we need to add 0x800 before splitting to compensate
762+ Hi20 = (Rel + 16#800 ) bsr 12 ,
763+ Lo12Unsigned = Rel band 16#FFF ,
764+ % Convert to signed 12-bit value: if bit 11 is set, subtract 4096
765+ Lo12 = if
766+ Lo12Unsigned >= 16#800 -> Lo12Unsigned - 16#1000 ;
767+ true -> Lo12Unsigned
768+ end ,
769+
770+ % TempReg = PC + (Hi20 << 12)
771+ I1 = jit_riscv32_asm :auipc (TempReg , Hi20 ),
772+ % Jump to TempReg + sign_extend(Lo12)
773+ I2 = jit_riscv32_asm :jalr (zero , TempReg , Lo12 ),
774+ <<I1 /binary , I2 /binary >>.
771775
772776branch_to_label_code (State , Offset , Label , {Label , LabelOffset }) ->
773777 CodeBlock = branch_to_offset_code (State , Offset , LabelOffset ),
774778 {State , CodeBlock };
775779branch_to_label_code (
776780 # state {available_regs = [TempReg | _ ], branches = Branches } = State0 , Offset , Label , false
777781) ->
778- % RISC-V: Far branch sequence - always 16 bytes (4-byte aligned )
782+ % RISC-V: Far branch sequence using PC-relative auipc + jalr (8 bytes )
779783
780- % Load PC into temp
784+ % Placeholder: auipc TempReg, 0
781785 I1 = jit_riscv32_asm :auipc (TempReg , 0 ),
782- % Load offset from PC+8
783- I2 = jit_riscv32_asm :lw (TempReg , TempReg , 8 ),
784- % Jump to address
785- I3 = jit_riscv32_asm :jalr (zero , TempReg , 0 ),
786- % Placeholder offset
787- I4 = <<0 :32 /little >>,
788- CodeBlock = <<I1 /binary , I2 /binary , I3 /binary , I4 /binary >>,
786+ % Placeholder: jalr zero, TempReg, 0
787+ I2 = jit_riscv32_asm :jalr (zero , TempReg , 0 ),
788+ CodeBlock = <<I1 /binary , I2 /binary >>,
789789 SequenceSize = byte_size (CodeBlock ),
790790 % Add relocation entry
791791 Reloc = {Label , Offset , {far_branch , SequenceSize , TempReg }},
@@ -795,17 +795,13 @@ branch_to_label_code(
795795 # state {available_regs = [], branches = Branches } = State0 , Offset , Label , false
796796) ->
797797 % RISC-V: Use t6 as scratch (caller-saved, safe to clobber)
798- % Same sequence as when we have available regs - always 16 bytes (4-byte aligned )
798+ % Far branch sequence using PC-relative auipc + jalr (8 bytes )
799799
800- % Load PC into t6
800+ % Placeholder: auipc t6, 0
801801 I1 = jit_riscv32_asm :auipc (t6 , 0 ),
802- % Load offset from PC+8
803- I2 = jit_riscv32_asm :lw (t6 , t6 , 8 ),
804- % Jump to address
805- I3 = jit_riscv32_asm :jalr (zero , t6 , 0 ),
806- % Placeholder offset
807- I4 = <<0 :32 /little >>,
808- CodeBlock = <<I1 /binary , I2 /binary , I3 /binary , I4 /binary >>,
802+ % Placeholder: jalr zero, t6, 0
803+ I2 = jit_riscv32_asm :jalr (zero , t6 , 0 ),
804+ CodeBlock = <<I1 /binary , I2 /binary >>,
809805 SequenceSize = byte_size (CodeBlock ),
810806 % Add relocation entry
811807 Reloc = {Label , Offset , {far_branch , SequenceSize , t6 }},
@@ -1528,9 +1524,17 @@ call_func_ptr(
15281524 % Calculate stack offset: find register index in SavedRegs * 4 bytes
15291525 ResultReg = element (2 , FuncPtrTuple ),
15301526 RegIndex = index_of (ResultReg , SavedRegs ),
1531- StoreResultStackOffset = RegIndex * 4 ,
1532- StoreResult = jit_riscv32_asm :sw (sp , a0 , StoreResultStackOffset ),
1533- {StreamModule :append (Stream5 , StoreResult ), [ResultReg | UsedRegs1 ]};
1527+ case RegIndex >= 0 of
1528+ true ->
1529+ StoreResultStackOffset = RegIndex * 4 ,
1530+ StoreResult = jit_riscv32_asm :sw (sp , a0 , StoreResultStackOffset ),
1531+ {StreamModule :append (Stream5 , StoreResult ), [ResultReg | UsedRegs1 ]};
1532+ false ->
1533+ % FuncPtrReg was not in SavedRegs, use an available register
1534+ [ResultReg1 | _ ] = AvailableRegs1 -- SavedRegs ,
1535+ MoveResult = jit_riscv32_asm :mv (ResultReg1 , a0 ),
1536+ {StreamModule :append (Stream5 , MoveResult ), [ResultReg1 | UsedRegs1 ]}
1537+ end ;
15341538 _ ->
15351539 % Use any free register that is not in SavedRegs
15361540 [ResultReg | _ ] = AvailableRegs1 -- SavedRegs ,
@@ -1632,8 +1636,8 @@ parameter_regs0([], _, Acc) ->
16321636 lists :reverse (Acc );
16331637parameter_regs0 ([{avm_int64_t , _ } | T ], [a0 , a1 | Rest ], Acc ) ->
16341638 parameter_regs0 (T , Rest , [a1 , a0 | Acc ]);
1635- parameter_regs0 ([{avm_int64_t , _ } | T ], [a1 , a2 , a3 | Rest ], Acc ) ->
1636- parameter_regs0 (T , Rest , [a3 , a2 | Acc ]);
1639+ parameter_regs0 ([{avm_int64_t , _ } | T ], [a1 , a2 | Rest ], Acc ) ->
1640+ parameter_regs0 (T , Rest , [a2 , a1 | Acc ]);
16371641parameter_regs0 ([{avm_int64_t , _ } | T ], [a2 , a3 | Rest ], Acc ) ->
16381642 parameter_regs0 (T , Rest , [a3 , a2 | Acc ]);
16391643parameter_regs0 ([_Other | T ], [Reg | Rest ], Acc ) ->
@@ -2637,7 +2641,9 @@ decrement_reductions_and_maybe_schedule_next(
26372641 I4 = jit_riscv32_asm :bne (Temp , zero , 0 ),
26382642 % Set continuation to the next instruction
26392643 ADROffset = BNEOffset + byte_size (I4 ),
2640- I5 = pc_relative_address (Temp , 0 ),
2644+ % Use 8-byte placeholder (2 words of 0xFFFFFFFF) for pc_relative_address
2645+ % This ensures we can always rewrite with either auipc alone (4 bytes) or auipc+addi (8 bytes)
2646+ I5 = <<16#FFFFFFFF :32 /little , 16#FFFFFFFF :32 /little >>,
26412647 I6 = jit_riscv32_asm :sw (? JITSTATE_REG , Temp , ? JITSTATE_CONTINUATION_OFFSET ),
26422648 % Append the instructions to the stream
26432649 Stream2 = StreamModule :append (Stream1 , <<I4 /binary , I5 /binary , I6 /binary >>),
@@ -2647,7 +2653,16 @@ decrement_reductions_and_maybe_schedule_next(
26472653 # state {stream = Stream3 } = State2 ,
26482654 NewOffset = StreamModule :offset (Stream3 ),
26492655 NewI4 = jit_riscv32_asm :bne (Temp , zero , NewOffset - BNEOffset ),
2650- NewI5 = pc_relative_address (Temp , NewOffset - ADROffset ),
2656+ NewI5Offset = NewOffset - ADROffset ,
2657+ % Generate the new pc_relative_address instruction, padding with NOP if needed
2658+ NewI5 = case pc_relative_address (Temp , NewI5Offset ) of
2659+ I when byte_size (I ) =:= 4 ->
2660+ % Only auipc, pad with NOP
2661+ <<I /binary , (jit_riscv32_asm :nop ())/binary >>;
2662+ I when byte_size (I ) =:= 8 ->
2663+ % auipc + addi, no padding needed
2664+ I
2665+ end ,
26512666 Stream4 = StreamModule :replace (
26522667 Stream3 , BNEOffset , <<NewI4 /binary , NewI5 /binary >>
26532668 ),
@@ -2753,17 +2768,12 @@ set_cp(#state{available_regs = [TempReg | AvailT], used_regs = UsedRegs} = State
27532768 % Reserve space for offset load instruction
27542769 % li can generate 1 instruction (4 bytes) for small immediates (< 2048)
27552770 % or 2 instructions (8 bytes) for large immediates
2756- % Since we use (offset bsl 2), threshold is when offset >= 512 bytes
2757- % To be safe, use same threshold as AArch64 relative to instruction encoding limits
2758- {I2 , I3 } =
2759- if
2760- Offset >= 512 ->
2761- % Need 2 instructions (lui + addi) for large offsets
2762- {jit_riscv32_asm :nop (), jit_riscv32_asm :nop ()};
2763- true ->
2764- % Need 1 instruction (addi) for small offsets
2765- {jit_riscv32_asm :nop (), <<>>}
2766- end ,
2771+ % Since we don't know the final CP value yet (it depends on code size),
2772+ % we must always reserve 2 instructions (8 bytes) to be safe
2773+ % The final CP value is (final_offset << 2), and final_offset is unknown
2774+ % Use 0xFFFFFFFF placeholders for flash compatibility (can only flip 1->0)
2775+ I2 = <<16#FFFFFFFF :32 /little >>,
2776+ I3 = <<16#FFFFFFFF :32 /little >>,
27672777 MOVOffset = Offset + byte_size (I1 ),
27682778 % OR the module index with the offset (loaded in temp register)
27692779 I4 = jit_riscv32_asm :or_ (Reg , TempReg ),
@@ -2783,8 +2793,15 @@ rewrite_cp_offset(
27832793 TempReg
27842794) ->
27852795 NewOffset = StreamModule :offset (Stream0 ) - CodeOffset ,
2786- NewMoveInstr = jit_riscv32_asm :li (TempReg , NewOffset bsl 2 ),
2787- Stream1 = StreamModule :replace (Stream0 , RewriteOffset , NewMoveInstr ),
2796+ CPValue = NewOffset bsl 2 ,
2797+ NewMoveInstr = jit_riscv32_asm :li (TempReg , CPValue ),
2798+ % We reserved 8 bytes (2 instructions) for the CP value
2799+ % If li generates only 4 bytes, pad with a NOP so the whole reserved 8-byte slot is overwritten
2800+ PaddedInstr = case byte_size (NewMoveInstr ) of
2801+ 4 -> <<NewMoveInstr /binary , (jit_riscv32_asm :nop ())/binary >>;
2802+ 8 -> NewMoveInstr
2803+ end ,
2804+ Stream1 = StreamModule :replace (Stream0 , RewriteOffset , PaddedInstr ),
27882805 State0 # state {stream = Stream1 }.
27892806
27902807set_bs (
0 commit comments