Skip to content

Commit fb99ef5

Browse files
Merge pull request #986 from mandiant/feature-981
add Address abstraction
2 parents 2ceed78 + be2dffe commit fb99ef5

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

67 files changed

+3484
-2310
lines changed

CHANGELOG.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,13 +14,16 @@
1414
- add unmanaged call characteristic for dotnet files #1023 @mike-hunhoff
1515
- add mixed mode characteristic feature extraction for dotnet files #1024 @mike-hunhoff
1616
- emit class and namespace features for dotnet files #1030 @mike-hunhoff
17+
- render: support Addresses that aren't simple integers, like .NET token+offset #981 @williballenthin
1718

1819
### Breaking Changes
1920

2021
- instruction scope and operand feature are new and are not backwards compatible with older versions of capa
2122
- Python 3.7 is now the minimum supported Python version #866 @williballenthin
2223
- remove /x32 and /x64 flavors of number and operand features #932 @williballenthin
2324
- the tool now accepts multiple paths to rules, and JSON doc updated accordingly @williballenthin
25+
- extractors must use handles to identify functions/basic blocks/instructions #981 @williballenthin
26+
- the freeze file format schema was updated, including format version bump to v2 #986 @williballenthin
2427

2528
### New Rules (7)
2629

capa/engine.py

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
import capa.perf
1414
import capa.features.common
1515
from capa.features.common import Result, Feature
16+
from capa.features.address import Address
1617

1718
if TYPE_CHECKING:
1819
# circular import, otherwise
@@ -26,7 +27,7 @@
2627
# to collect the locations of a feature, do: `features[Number(0x10)]`
2728
#
2829
# aliased here so that the type can be documented and xref'd.
29-
FeatureSet = Dict[Feature, Set[int]]
30+
FeatureSet = Dict[Feature, Set[Address]]
3031

3132

3233
class Statement:
@@ -257,10 +258,10 @@ def evaluate(self, ctx, **kwargs):
257258
# inspect(match_details)
258259
#
259260
# aliased here so that the type can be documented and xref'd.
260-
MatchResults = Mapping[str, List[Tuple[int, Result]]]
261+
MatchResults = Mapping[str, List[Tuple[Address, Result]]]
261262

262263

263-
def index_rule_matches(features: FeatureSet, rule: "capa.rules.Rule", locations: Iterable[int]):
264+
def index_rule_matches(features: FeatureSet, rule: "capa.rules.Rule", locations: Iterable[Address]):
264265
"""
265266
record into the given featureset that the given rule matched at the given locations.
266267
@@ -277,7 +278,7 @@ def index_rule_matches(features: FeatureSet, rule: "capa.rules.Rule", locations:
277278
namespace, _, _ = namespace.rpartition("/")
278279

279280

280-
def match(rules: List["capa.rules.Rule"], features: FeatureSet, va: int) -> Tuple[FeatureSet, MatchResults]:
281+
def match(rules: List["capa.rules.Rule"], features: FeatureSet, addr: Address) -> Tuple[FeatureSet, MatchResults]:
281282
"""
282283
match the given rules against the given features,
283284
returning an updated set of features and the matches.
@@ -315,10 +316,10 @@ def match(rules: List["capa.rules.Rule"], features: FeatureSet, va: int) -> Tupl
315316
# sanity check
316317
assert bool(res) is True
317318

318-
results[rule.name].append((va, res))
319+
results[rule.name].append((addr, res))
319320
# we need to update the current `features`
320321
# because subsequent iterations of this loop may use newly added features,
321322
# such as rule or namespace matches.
322-
index_rule_matches(features, rule, [va])
323+
index_rule_matches(features, rule, [addr])
323324

324325
return (features, results)

capa/features/address.py

Lines changed: 110 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,110 @@
1+
import abc
2+
3+
from dncil.clr.token import Token
4+
5+
6+
class Address(abc.ABC):
7+
@abc.abstractmethod
8+
def __eq__(self, other):
9+
...
10+
11+
@abc.abstractmethod
12+
def __lt__(self, other):
13+
# implement < so that addresses can be sorted from low to high
14+
...
15+
16+
@abc.abstractmethod
17+
def __hash__(self):
18+
# implement hash so that addresses can be used in sets and dicts
19+
...
20+
21+
@abc.abstractmethod
22+
def __repr__(self):
23+
# implement repr to help during debugging
24+
...
25+
26+
27+
class AbsoluteVirtualAddress(int, Address):
28+
"""an absolute memory address"""
29+
30+
def __new__(cls, v):
31+
assert v >= 0
32+
return int.__new__(cls, v)
33+
34+
def __repr__(self):
35+
return f"absolute(0x{self:x})"
36+
37+
38+
class RelativeVirtualAddress(int, Address):
39+
"""a memory address relative to a base address"""
40+
41+
def __repr__(self):
42+
return f"relative(0x{self:x})"
43+
44+
45+
class FileOffsetAddress(int, Address):
46+
"""an address relative to the start of a file"""
47+
48+
def __new__(cls, v):
49+
assert v >= 0
50+
return int.__new__(cls, v)
51+
52+
def __repr__(self):
53+
return f"file(0x{self:x})"
54+
55+
56+
class DNTokenAddress(Address):
57+
"""a .NET token"""
58+
59+
def __init__(self, token: Token):
60+
self.token = token
61+
62+
def __eq__(self, other):
63+
return self.token.value == other.token.value
64+
65+
def __lt__(self, other):
66+
return self.token.value < other.token.value
67+
68+
def __hash__(self):
69+
return hash(self.token.value)
70+
71+
def __repr__(self):
72+
return f"token(0x{self.token.value:x})"
73+
74+
75+
class DNTokenOffsetAddress(Address):
76+
"""an offset into an object specified by a .NET token"""
77+
78+
def __init__(self, token: Token, offset: int):
79+
assert offset >= 0
80+
self.token = token
81+
self.offset = offset
82+
83+
def __eq__(self, other):
84+
return (self.token.value, self.offset) == (other.token.value, other.offset)
85+
86+
def __lt__(self, other):
87+
return (self.token.value, self.offset) < (other.token.value, other.offset)
88+
89+
def __hash__(self):
90+
return hash((self.token.value, self.offset))
91+
92+
def __repr__(self):
93+
return f"token(0x{self.token.value:x})+(0x{self.offset:x})"
94+
95+
96+
class _NoAddress(Address):
97+
def __eq__(self, other):
98+
return True
99+
100+
def __lt__(self, other):
101+
return False
102+
103+
def __hash__(self):
104+
return hash(0)
105+
106+
def __repr__(self):
107+
return "no address"
108+
109+
110+
NO_ADDRESS = _NoAddress()

capa/features/basicblock.py

Lines changed: 2 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -10,18 +10,11 @@
1010

1111

1212
class BasicBlock(Feature):
13-
def __init__(self):
14-
super(BasicBlock, self).__init__(None)
13+
def __init__(self, description=None):
14+
super(BasicBlock, self).__init__(None, description=description)
1515

1616
def __str__(self):
1717
return "basic block"
1818

1919
def get_value_str(self):
2020
return ""
21-
22-
def freeze_serialize(self):
23-
return (self.__class__.__name__, [])
24-
25-
@classmethod
26-
def freeze_deserialize(cls, args):
27-
return cls()

capa/features/common.py

Lines changed: 24 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
import codecs
1212
import logging
1313
import collections
14-
from typing import TYPE_CHECKING, Set, Dict, List, Union
14+
from typing import TYPE_CHECKING, Set, Dict, List, Union, Optional, Sequence
1515

1616
if TYPE_CHECKING:
1717
# circular import, otherwise
@@ -20,6 +20,7 @@
2020
import capa.perf
2121
import capa.features
2222
import capa.features.extractors.elf
23+
from capa.features.address import Address
2324

2425
logger = logging.getLogger(__name__)
2526
MAX_BYTES_FEATURE_SIZE = 0x100
@@ -70,20 +71,13 @@ def __init__(
7071
success: bool,
7172
statement: Union["capa.engine.Statement", "Feature"],
7273
children: List["Result"],
73-
locations=None,
74+
locations: Optional[Set[Address]] = None,
7475
):
75-
"""
76-
args:
77-
success (bool)
78-
statement (capa.engine.Statement or capa.features.Feature)
79-
children (list[Result])
80-
locations (iterable[VA])
81-
"""
8276
super(Result, self).__init__()
8377
self.success = success
8478
self.statement = statement
8579
self.children = children
86-
self.locations = locations if locations is not None else ()
80+
self.locations = locations if locations is not None else set()
8781

8882
def __eq__(self, other):
8983
if isinstance(other, bool):
@@ -98,7 +92,7 @@ def __nonzero__(self):
9892

9993

10094
class Feature(abc.ABC):
101-
def __init__(self, value: Union[str, int, bytes], description=None):
95+
def __init__(self, value: Union[str, int, float, bytes], description=None):
10296
"""
10397
Args:
10498
value (any): the value of the feature, such as the number or string.
@@ -116,6 +110,15 @@ def __hash__(self):
116110
def __eq__(self, other):
117111
return self.name == other.name and self.value == other.value
118112

113+
def __lt__(self, other):
114+
# TODO: this is a huge hack! it orders features by comparing the JSON serialization of their freeze representations.
115+
import capa.features.freeze.features
116+
117+
return (
118+
capa.features.freeze.features.feature_from_capa(self).json()
119+
< capa.features.freeze.features.feature_from_capa(other).json()
120+
)
121+
119122
def get_value_str(self) -> str:
120123
"""
121124
render the value of this feature, for use by `__str__` and friends.
@@ -137,27 +140,10 @@ def __str__(self):
137140
def __repr__(self):
138141
return str(self)
139142

140-
def evaluate(self, ctx: Dict["Feature", Set[int]], **kwargs) -> Result:
143+
def evaluate(self, ctx: Dict["Feature", Set[Address]], **kwargs) -> Result:
141144
capa.perf.counters["evaluate.feature"] += 1
142145
capa.perf.counters["evaluate.feature." + self.name] += 1
143-
return Result(self in ctx, self, [], locations=ctx.get(self, []))
144-
145-
def freeze_serialize(self):
146-
return (self.__class__.__name__, [self.value])
147-
148-
@classmethod
149-
def freeze_deserialize(cls, args):
150-
# as you can see below in code,
151-
# if the last argument is a dictionary,
152-
# consider it to be kwargs passed to the feature constructor.
153-
if len(args) == 1:
154-
return cls(*args)
155-
elif isinstance(args[-1], dict):
156-
kwargs = args[-1]
157-
args = args[:-1]
158-
return cls(*args, **kwargs)
159-
else:
160-
return cls(*args)
146+
return Result(self in ctx, self, [], locations=ctx.get(self, set()))
161147

162148

163149
class MatchedRule(Feature):
@@ -230,7 +216,7 @@ def evaluate(self, ctx, short_circuit=True):
230216
# instead, return a new instance that has a reference to both the substring and the matched values.
231217
return Result(True, _MatchedSubstring(self, matches), [], locations=locations)
232218
else:
233-
return Result(False, _MatchedSubstring(self, None), [])
219+
return Result(False, _MatchedSubstring(self, {}), [])
234220

235221
def __str__(self):
236222
return "substring(%s)" % self.value
@@ -244,11 +230,11 @@ class _MatchedSubstring(Substring):
244230
note: this type should only ever be constructed by `Substring.evaluate()`. it is not part of the public API.
245231
"""
246232

247-
def __init__(self, substring: Substring, matches):
233+
def __init__(self, substring: Substring, matches: Dict[str, Set[Address]]):
248234
"""
249235
args:
250-
substring (Substring): the substring feature that matches.
251-
match (Dict[string, List[int]]|None): mapping from matching string to its locations.
236+
substring: the substring feature that matches.
237+
matches: mapping from matching string to its locations.
252238
"""
253239
super(_MatchedSubstring, self).__init__(str(substring.value), description=substring.description)
254240
# we want this to collide with the name of `Substring` above,
@@ -327,7 +313,7 @@ def evaluate(self, ctx, short_circuit=True):
327313
# see #262.
328314
return Result(True, _MatchedRegex(self, matches), [], locations=locations)
329315
else:
330-
return Result(False, _MatchedRegex(self, None), [])
316+
return Result(False, _MatchedRegex(self, {}), [])
331317

332318
def __str__(self):
333319
return "regex(string =~ %s)" % self.value
@@ -341,11 +327,11 @@ class _MatchedRegex(Regex):
341327
note: this type should only ever be constructed by `Regex.evaluate()`. it is not part of the public API.
342328
"""
343329

344-
def __init__(self, regex: Regex, matches):
330+
def __init__(self, regex: Regex, matches: Dict[str, Set[Address]]):
345331
"""
346332
args:
347-
regex (Regex): the regex feature that matches.
348-
match (Dict[string, List[int]]|None): mapping from matching string to its locations.
333+
regex: the regex feature that matches.
334+
matches: mapping from matching string to its locations.
349335
"""
350336
super(_MatchedRegex, self).__init__(str(regex.value), description=regex.description)
351337
# we want this to collide with the name of `Regex` above,
@@ -389,13 +375,6 @@ def evaluate(self, ctx, **kwargs):
389375
def get_value_str(self):
390376
return hex_string(bytes_to_str(self.value))
391377

392-
def freeze_serialize(self):
393-
return (self.__class__.__name__, [bytes_to_str(self.value).upper()])
394-
395-
@classmethod
396-
def freeze_deserialize(cls, args):
397-
return cls(*[codecs.decode(x, "hex") for x in args])
398-
399378

400379
# other candidates here: https://docs.microsoft.com/en-us/windows/win32/debug/pe-format#machine-types
401380
ARCH_I386 = "i386"

0 commit comments

Comments
 (0)