Skip to content

Commit 6e697bc

Browse files
committed
CHB: support for ternary assignment aggregate
1 parent d909124 commit 6e697bc

File tree

8 files changed

+748
-361
lines changed

8 files changed

+748
-361
lines changed

CodeHawk/CH/xprlib/xsimplify.ml

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -175,14 +175,17 @@ and reduce_neg (m: bool) (e1: xpr_t): (bool * xpr_t) =
175175
| _ -> default ()
176176

177177

178-
(* Note that for values other than zero the result depends on word size.*)
178+
(* Note that, in general, for values other than zero the result depends on word
179+
size. Here we assume that the word is at least 8 bits wide. *)
179180
and reduce_bitwise_not (m: bool) (e1: xpr_t): (bool * xpr_t) =
180181
let default () = (m, XOp (XBNot, [e1])) in
181182
match e1 with
182183
| XConst (IntConst num) when num#equal numerical_zero ->
183184
(true, XConst (IntConst numerical_one#neg))
184185
| XConst (IntConst num) when num#equal numerical_one ->
185186
(true, XConst (IntConst (mkNumerical 2)#neg))
187+
| XConst (IntConst n) when n#lt numerical_e8 ->
188+
(true, XConst (IntConst (n#neg#sub numerical_one)))
186189
| _ -> default ()
187190

188191

CodeHawk/CHB/bchlibarm32/bCHARMInstructionAggregate.ml

Lines changed: 99 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
------------------------------------------------------------------------------
55
The MIT License (MIT)
66
7-
Copyright (c) 2022-2024 Aarno Labs, LLC
7+
Copyright (c) 2022-2025 Aarno Labs, LLC
88
99
Permission is hereby granted, free of charge, to any person obtaining a copy
1010
of this software and associated documentation files (the "Software"), to deal
@@ -66,6 +66,13 @@ let arm_aggregate_kind_to_string (k: arm_aggregate_kind_t) =
6666
| ARMPredicateAssignment (inverse, op) ->
6767
let inv = if inverse then " (inverse)" else "" in
6868
"predicate assignment to " ^ op#toString ^ inv
69+
| ARMTernaryAssignment (dst, op1, op2) ->
70+
"ternary assignment: "
71+
^ dst#toString
72+
^ " = cond ? "
73+
^ op1#toString
74+
^ " : "
75+
^ op2#toString
6976
| BXCall (_, i2) -> "BXCall at " ^ i2#get_address#to_hex_string
7077

7178

@@ -140,6 +147,11 @@ object (self)
140147
| ARMPredicateAssignment _ -> true
141148
| _ -> false
142149

150+
method is_ternary_assign =
151+
match self#kind with
152+
| ARMTernaryAssignment _ -> true
153+
| _ -> false
154+
143155
method write_xml (_node: xml_element_int) = ()
144156

145157
method toCHIF (_faddr: doubleword_int) = []
@@ -258,6 +270,21 @@ let make_predassign_aggregate
258270
~anchor:mov2
259271

260272

273+
let make_ternassign_aggregate
274+
(mov1: arm_assembly_instruction_int)
275+
(mov2: arm_assembly_instruction_int)
276+
(dstop: arm_operand_int)
277+
(n1: numerical_t)
278+
(n2: numerical_t): arm_instruction_aggregate_int =
279+
let kind = ARMTernaryAssignment(dstop, n1, n2) in
280+
make_arm_instruction_aggregate
281+
~kind
282+
~instrs:[mov1; mov2]
283+
~entry:mov1
284+
~exitinstr:mov2
285+
~anchor:mov2
286+
287+
261288
let disassemble_arm_instructions
262289
(ch: pushback_stream_int) (iaddr: doubleword_int) (n: int) =
263290
for _i = 1 to n do
@@ -293,6 +320,8 @@ let identify_jumptable
293320
| Add (_, ACCNotUnsignedHigher, rd, rn, _, false)
294321
when rd#is_pc_register && rn#is_pc_register ->
295322
create_addls_pc_jumptable ch instr
323+
| Add (_, ACCAlways, rd, _, _, false) when rd#is_pc_register ->
324+
create_arm_add_pc_adr_jumptable ch instr
296325
| BranchExchange (ACCAlways, regop) when regop#is_register ->
297326
create_arm_bx_jumptable ch instr
298327
| _ -> None
@@ -494,6 +523,67 @@ let identify_predicate_assignment
494523
| _ -> None
495524

496525

526+
(* format of ternary assignment (in ARM): assigns one value or another depending
527+
on the result of a test:
528+
529+
MOVNE Rx, imm1
530+
MOVEQ Rx, imm2
531+
532+
or
533+
534+
MVNEQ Rx, imm1
535+
MOVNE Rx, imm2
536+
*)
537+
let identify_ternary_assignment
538+
(_ch: pushback_stream_int)
539+
(instr: arm_assembly_instruction_int):
540+
(arm_assembly_instruction_int
541+
* arm_assembly_instruction_int
542+
* arm_operand_int
543+
* numerical_t
544+
* numerical_t) option =
545+
let negval (n: numerical_t) =
546+
match Xsimplify.simplify_xpr (XOp (XBNot, [Xprt.num_constant_expr n])) with
547+
| XConst (IntConst nn) -> Some nn
548+
| _ -> None in
549+
match instr#get_opcode with
550+
| Move (false, c2, rd, imm2, _, _)
551+
| BitwiseNot (false, c2, rd, imm2, _)
552+
when imm2#is_immediate && (has_inverse_cc c2) ->
553+
let optn2 =
554+
let n2 = imm2#to_numerical in
555+
match instr#get_opcode with
556+
| Move _ -> Some n2
557+
| BitwiseNot _ -> negval n2
558+
| _ -> None in
559+
(match optn2 with
560+
| Some n2 ->
561+
let rdreg = rd#get_register in
562+
let addr = instr#get_address in
563+
let movinstr_r = get_arm_assembly_instruction (addr#add_int (-4)) in
564+
(match TR.to_option movinstr_r with
565+
| Some movinstr ->
566+
(match movinstr#get_opcode with
567+
| Move (false, c1, rd, imm1, _, _)
568+
when imm1#is_immediate
569+
&& (rd#get_register = rdreg)
570+
&& (has_inverse_cc c1)
571+
&& ((Option.get (get_inverse_cc c1)) = c2) ->
572+
Some (movinstr, instr, rd, imm1#to_numerical, n2)
573+
| BitwiseNot (false, c1, rd, imm1, _)
574+
when imm1#is_immediate
575+
&& (rd#get_register = rdreg)
576+
&& (has_inverse_cc c1)
577+
&& ((Option.get (get_inverse_cc c1)) = c2) ->
578+
(match (negval imm1#to_numerical) with
579+
| Some n1 -> Some (movinstr, instr, rd, n1, n2)
580+
| _ -> None)
581+
| _ -> None)
582+
| _ -> None)
583+
| _ -> None)
584+
| _ -> None
585+
586+
497587
let identify_arm_aggregate
498588
(ch: pushback_stream_int)
499589
(instr: arm_assembly_instruction_int):
@@ -558,4 +648,12 @@ let identify_arm_aggregate
558648
| Some (inverse, mov1, mov2, dstop) ->
559649
Some (make_predassign_aggregate inverse mov1 mov2 dstop)
560650
| _ -> None in
651+
let result =
652+
match result with
653+
| Some _ -> result
654+
| _ ->
655+
match identify_ternary_assignment ch instr with
656+
| Some (mov1, mov2, dstop, n1, n2) ->
657+
Some (make_ternassign_aggregate mov1 mov2 dstop n1 n2)
658+
| _ -> None in
561659
result

CodeHawk/CHB/bchlibarm32/bCHARMInstructionAggregate.mli

Lines changed: 168 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,21 +1,21 @@
11
(* =============================================================================
2-
CodeHawk Binary Analyzer
2+
CodeHawk Binary Analyzer
33
Author: Henny Sipma
44
------------------------------------------------------------------------------
55
The MIT License (MIT)
6-
7-
Copyright (c) 2022-2024 Aarno Labs, LLC
6+
7+
Copyright (c) 2022-2025 Aarno Labs, LLC
88
99
Permission is hereby granted, free of charge, to any person obtaining a copy
1010
of this software and associated documentation files (the "Software"), to deal
1111
in the Software without restriction, including without limitation the rights
1212
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
1313
copies of the Software, and to permit persons to whom the Software is
1414
furnished to do so, subject to the following conditions:
15-
15+
1616
The above copyright notice and this permission notice shall be included in all
1717
copies or substantial portions of the Software.
18-
18+
1919
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
2020
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
2121
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
@@ -25,18 +25,169 @@
2525
SOFTWARE.
2626
============================================================================= *)
2727

28-
(** Sequence of consecutive assembly instructions that represents a semantic unit.
29-
30-
Examples:
31-
- switch statement constructed by
32-
- table branch instructions (TBB/TBH), or
33-
- load from table into pc
34-
- indirect jump from table
35-
36-
- question expression constructed by Thumb-2 if-then instruction
37-
38-
In translation to CHIF, semantics are obtained from the anchor, all other
39-
instructions belonging to the aggregate are ignored.
28+
29+
(** {1 Instruction aggregates: Overview} *)
30+
31+
(** {2 Description}
32+
33+
An instruction aggregate is a (usually, but not always, contiguous)
34+
sequence of two or more instructions that is treated as a single
35+
semantic unit. The semantics for the entire sequence is assigned to
36+
one of the instructions (often the last one in the sequence) and all
37+
other instructions are considere no-ops by all subsequent analyses.
38+
39+
Instruction aggregates cover a variety of constructs ranging from
40+
predicate and ternary assignments to jump tables.
41+
42+
{2 Motivation}
43+
44+
The rationale behind combining a sequence of instructions into an
45+
aggregate is that the collective action of the individual instructions
46+
combined is not easily apparent from considering their semantics in
47+
isolation. The instructions usually collaborate in a very specific
48+
way to achieve a particular result, with operands from different instructions
49+
playing playing a specific role. This is especially true of the jump
50+
table aggregates. For some of the other aggregate kinds the semantics
51+
of the individual instructions combined would be similar, but analysis
52+
of the aggregate is more efficient and precise. For example, this is
53+
the case with the predicate assignment:
54+
{[
55+
<test>
56+
MOVcc Rx, #1
57+
MOVcc' Rx, #0 where cc' is (not cc)
58+
]}
59+
The two MOV instructions can be combined into one assignment of the
60+
boolean <test-cc>:
61+
{[
62+
Rx := <test-cc>
63+
]}
64+
Translating and analyzing the two instructions in isolation produces
65+
two joins, and potentially three reaching definitions for a subsequent
66+
use of Rx. In contrast, the two instructions combined into one
67+
predicate assignment produces zero joins, and only a single reaching
68+
definition for a subsequent use of Rx, a considerable improvement.
69+
70+
{2 Identification}
71+
72+
Instruction aggregates are identified during disassembly by pattern
73+
matching. Identification is entirely syntactic, so only the information
74+
present in the instructions themselves, such as opcode and operands,
75+
are used; not the possible values that those operands may have, unless
76+
they are immediates.
77+
78+
{2 Representation}
79+
80+
The aggregate is represented by objects of the class type
81+
[arm_instruction_aggregate_int]. These objects contain the kind of
82+
aggregate, a list of the contributing instructions, and information
83+
about its entry, exit, and anchor address. The anchor is the
84+
instruction to which the full semantics of the aggregate is assigned.
85+
86+
Cross references to instances are recorded in the [arm_assembly_instruction_int]
87+
instances of the instructions that make up the aggregate to enable
88+
triggering proper translation and analysis. The collection of all
89+
aggregates themselves is maintained in the singleton object of type
90+
[arm_assembly_instructions_int].
91+
92+
Most aggregates are contained within a single basic block and do not
93+
affect control flow. The exceptions are jump tables, whose components
94+
are directly incorporated into the CFG during disassembly and require
95+
no further semantic handling.
96+
97+
{2 Translation into CHIF}
98+
99+
The semantics of the aggregate is explicated in the translation into
100+
CHIF. In general, instructions are individually translated into CHIF
101+
(in the function [translate_arm_instruction]). In the case of aggregates
102+
CHIF is generated for the full semantics at the anchor instruction,
103+
while all other other instructions in the aggregate are treated as
104+
no-ops. In the case of jump tables the full semantics is already
105+
incorporated in the CFG during disassembly and all instructions in
106+
the aggregate are treated as no-ops.
107+
108+
{2 Transfer to front end}
109+
110+
The kind and structure of aggregates and analysis results involving
111+
their components are communicated to the (python) front end in the
112+
instruction results data contained in the function opcode dictionary
113+
(see bCHFnARMDictionary). The format varies with the kind of aggregate,
114+
as each kind has different types of values that characterize its
115+
operation. In the (python) front end these values are made accessible
116+
to the various instructions in the [InstrXData] objects created for
117+
each instruction. The documentation of each aggregate kind below and
118+
elsewhere presents more details on the actual format for each of these.
119+
120+
{2 Tests}
121+
122+
Currenly only two of the kinds of aggregates have associated unit tests:
123+
- bCHARMJumpTableTest
124+
- bCHThumbITSequenceTest
125+
These tests contain instances of code fragments encountered in actual
126+
binaries and give some insight in their use.
127+
128+
129+
{1 Predicate Assignments}
130+
131+
{2 Description}
132+
133+
A predicate assignment aggregate consists of two assignment instructions
134+
that combine into a single assignment of a boolean predicate. The pattern
135+
for the aggregate is two adjacent MOV instructions of the form:
136+
{[
137+
<test>
138+
MOVcc Rx, #1
139+
MOVcc' Rx, #0 where cc' is (not cc)
140+
]}
141+
or
142+
{[
143+
<test>
144+
MOVcc' Rx, #0
145+
MOVcc Rx, #1 where cc is not (cc')
146+
]}
147+
In all executions exactly one of these two MOV instructions is executed.
148+
If the joint condition <test-cc> is true Rx is assigned 1, if it is false
149+
Rx is assigned 0, and thus the postcondition of these two instructions
150+
is essentially the assignment of the boolean <test-cc>.
151+
152+
{3 Example}
153+
154+
{[
155+
<CMP R0, #5>
156+
MOVNE R1, #0
157+
MOVEQ R1, #1
158+
]}
159+
is translated into
160+
{[ R1 := (R0 == 5) ]}
161+
162+
{2 Representation}
163+
164+
The [arm_instruction_aggregate] instance of the predicate assignment
165+
records the address of the second instruction as the anchor, the
166+
destination operand, and whether the <test-cc> predicate is to be
167+
inverted, where cc is the condition code for the anchor instruction.
168+
The predicate is to be inverted if the anchor instruction assigns 0.
169+
170+
{2 Translation to CHIF}
171+
172+
The first MOV instruction is translated into a no-op. The second
173+
MOV instruction is translated into the assignment
174+
{[ Rx := [not] predicate ]}
175+
with the single defined variable Rx, and with variables used all
176+
variables referenced in the predicate.
177+
178+
If a predicate cannot be constructed or derived, the destination
179+
variable Rx is abstracted.
180+
181+
{2 Transfer to front end}
182+
183+
The first instruction is tagged as 'subsumed' by the second
184+
instruction, and thus to be treated as a no-op. The second
185+
instruction is tagged with 'agg:predassign' with a list of
186+
dependents (the first instruction in this case). It lists as
187+
a variable the destination operand, along with its (high)
188+
uses. If a predicate was construction, it lists
189+
(the possibly inverse of) the predicate in its original and
190+
rewritten form, along with its reaching definitions.
40191
*)
41192

42193
(* bchlib *)

0 commit comments

Comments
 (0)