import contextlib
import inspect
from collections import deque
from dataclasses import dataclass
from functools import wraps
from types import FunctionType, MethodType
from typing import TYPE_CHECKING, Any, Callable

import torch
from accelerate.hooks import remove_hook_from_module
from compressed_tensors.offload import disable_onloading
from compressed_tensors.utils import patch_attr
from compressed_tensors.utils.match import match_named_modules
from loguru import logger
from torch.fx import Graph, GraphModule, Node
from torch.fx.graph import PythonCode
from torch.fx.proxy import Argument
from torch.nn import Module
from transformers import PreTrainedModel
from transformers.configuration_utils import PretrainedConfig

from llmcompressor.modifiers import Modifier
from llmcompressor.modifiers.utils.hooks import HooksMixin
from llmcompressor.pipelines.sequential.transformers_helpers import HFTracer
from llmcompressor.utils.helpers import calibration_forward_context
from llmcompressor.utils.pytorch.module import get_no_split_params

from .ast_helpers import append_autowrap_source_on_fail, autowrap_forwards

if TYPE_CHECKING:
    from llmcompressor.args.dataset_arguments import DatasetArguments

__all__ = [
    "trace_subgraphs",
    "Subgraph",
    "get_sequential_targets",
    "handle_sequential_oom",
]


@dataclass
class Subgraph:
    """
    Dataclass specifying an executable subgraph of a model graph

    :param graph: subgraph of model graph
    :param input_names: argument names of the compiled forward function
    :param consumed_names: argument names which are not used by any subsequent
        subgraphs and can therefore be deleted from the intermediates cache
    """

    graph: Graph
    input_names: set[str]
    consumed_names: set[str]
    _code: PythonCode | None = None

    def forward(self, *args, **kwargs) -> dict[str, Any]:
        """
        Execute the operations within the subgraph

        :param \\*args: argument inputs to subgraph forward function
        :param \\**kwargs: keyword inputs to subgraph forward function
        :return: keyword outputs of subgraph forward function (non-consumed variables)
        """
        if self._code is None:
            self._code = self.graph.python_code("self")
            exec(self._code.src, self._code.globals)

        forward_fn = self._code.globals.get("forward")

        with append_autowrap_source_on_fail():
            return forward_fn(*args, **kwargs)

    def submodules(self, model: Module, recurse: bool = False) -> set[Module]:
        nodes = self.graph.find_nodes(op="call_module")
        modules = set(model.get_submodule(node.target) for node in nodes)

        if recurse:
            modules = set(m for module in modules for m in module.modules())

        return modules
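

# A minimal usage sketch (hypothetical, commented out so it does not run on
# import): a Subgraph behaves like a compiled slice of the model's forward
# pass. The root model is passed as the first positional argument (it fills the
# generated function's `self` parameter), and the remaining inputs are fetched
# by name. The names `cache`, `model`, and `subgraph` are illustrative only.
#
#   inputs = {name: cache[name] for name in subgraph.input_names}
#   outputs = subgraph.forward(model, **inputs)  # dict of produced values
#   cache.update(outputs)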


def trace_subgraphs(
    model: PreTrainedModel,
    sample_input: dict[str, Any],
    sequential_targets: list[str],
    ignore: list[str],
) -> list[Subgraph]:
    """
    Trace a model to produce subgraphs, where each sequential target belongs to
    exactly one subgraph and where executing each subgraph in order is equivalent
    to executing the original model

    :param model: model being traced
    :param sample_input: inputs whose values will change during execution but whose
        __len__, __bool__, and __contains__ values are assumed constant across batches
    :param sequential_targets: list of patterns matching sequential targets
    :param ignore: function and method names to skip during tracing
    :return: a list of Subgraphs in order of execution
    """
    # find modules
    targets = set(
        module for _, module in match_named_modules(model, sequential_targets)
    )
    ancestors = get_sequential_ancestors(model, targets)
    offloaded = set()  # TODO: cleanup logic

    # initialize arguments
    tracer = SequentialTracer(ancestors, offloaded)
    concrete_args = populate_concrete_args(model, sample_input)

    with contextlib.ExitStack() as stack:
        # calibration context
        stack.enter_context(calibration_forward_context(model))
        stack.enter_context(HooksMixin.disable_hooks())

        # flags useful for tracing
        stack.enter_context(patch_attr(model.config, "_attn_implementation", "eager"))
        stack.enter_context(patch_attr(torch.compiler, "_is_compiling_flag", True))

        # autowrap forwards
        stack.enter_context(autowrap_forwards(ancestors, ignore))

        # avoid bug where pytorch cannot handle wrapped root functions
        unwrapped = inspect.unwrap(model.forward).__get__(model)
        stack.enter_context(patch_attr(model, "forward", unwrapped))
        stack.enter_context(patch_attr(type(model), "forward", unwrapped.__func__))
        assert isinstance(model.forward, MethodType)
        assert isinstance(type(model).forward, FunctionType)

        # avoid device movement during tracing
        stack.enter_context(disable_onloading())

        with append_autowrap_source_on_fail():
            graph = GraphModule(
                model,
                tracer.trace(
                    model,
                    dummy_inputs=sample_input,
                    concrete_args=concrete_args,
                    complete_concrete_args_with_inputs_not_in_dummy_inputs=False,
                    # bug in trace throws an error for variadic
                    # args and kwargs in function signature
                ),
            )

    # copy metadata
    graph.config = model.config
    graph.class_for_deserialization = model.__class__
    graph.device = model.device

    # perform subgraph partition
    partitions = topological_partition(graph, targets)
    subgraphs = partition_graph(model, partitions)
    trace_consumed_names(subgraphs)

    # As currently implemented, `topological_partition` generates an extra subgraph
    # at the end which does not contain a target. This adds a little more runtime,
    # and could be folded into the preceding subgraph in the future
    if len(subgraphs) != len(targets) + 1:
        logger.warning(
            f"Expected {len(targets) + 1} subgraphs, but traced {len(subgraphs)}. "
            "This is likely due to having wrapped code which calls sequential targets"
        )

    return subgraphs
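

# Usage sketch (hypothetical, commented out): trace a decoder model into
# per-layer subgraphs and execute them sequentially, evicting intermediate
# values once no later subgraph needs them. The target pattern and variable
# names below are illustrative assumptions, not fixed API values.
#
#   subgraphs = trace_subgraphs(
#       model, sample_input, sequential_targets=["LlamaDecoderLayer"], ignore=[]
#   )
#   intermediates = dict(sample_input)
#   for subgraph in subgraphs:
#       inputs = {name: intermediates[name] for name in subgraph.input_names}
#       intermediates.update(subgraph.forward(model, **inputs))
#       for name in subgraph.consumed_names:
#           del intermediates[name]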


class SequentialTracer(HFTracer):
    """
    Tracer specialized for the given model. The resulting tracer will not trace
    inside of sequential targets, nor inside any modules which are not call-graph
    ancestors of sequential targets

    Tracing within sequential targets is unnecessary, and tracing within offloaded
    modules may result in meta tensors being added to the model graph

    :param ancestors: modules which are ancestors of sequential targets
    :param offloaded: modules which have offloaded params and should not be traced
    """

    def __init__(self, ancestors: set[Module], offloaded: set[Module]):
        self.ancestors = ancestors
        self.offloaded = offloaded

        # skip any mask creation functions not already caught by the autowrapper
        super().__init__(autowrap_functions=_get_autowrap_functions())

        # check unlikely case that ancestors have direct params which are offloaded
        offloaded_ancestors = offloaded & ancestors
        for ancestor in offloaded_ancestors:
            remove_hook_from_module(ancestor, recurse=False)
            self.offloaded.remove(ancestor)
            logger.warning(
                f"Direct parameters attached to {ancestor.__class__.__name__} have "
                "been onloaded in order to ensure safe graph capture and execution"
            )

    def create_arg(self, a: Any) -> Argument:
        # special extension allows models which depend on config values to be traced
        if isinstance(a, PretrainedConfig):
            kwargs = {k: self.create_arg(v) for k, v in a.to_dict().items()}
            return self.create_node("call_function", a.__class__, (), kwargs)
        else:
            return super().create_arg(a)

    def is_leaf_module(self, module: Module, module_qualified_name: str) -> bool:
        # do not trace non-ancestors or modules with offloaded params
        return module not in self.ancestors or module in self.offloaded


def populate_concrete_args(model: Module, sample_input: dict) -> dict:
    """
    Creates concrete args which, unlike the equivalent function provided by
    transformers.utils.fx, creates default values for variadic arguments, which are
    needed by some models.

    :param model: model being traced
    :param sample_input: values used to symbolically trace the model. All arguments
        to the model.forward function which are not in the sample_input are
        considered concrete args
    :return: dictionary mapping concrete argument names to their default values
    """
    sig = inspect.signature(model.forward)

    concrete_args = {}
    for parameter in sig.parameters.values():
        if parameter.name in sample_input:
            continue

        if parameter.kind == inspect.Parameter.VAR_POSITIONAL:
            value = list()
        elif parameter.kind == inspect.Parameter.VAR_KEYWORD:
            value = dict()
        elif parameter.name == "use_cache":
            value = False
        else:
            value = parameter.default

        concrete_args[parameter.name] = value

    return concrete_args
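

# Worked example (hypothetical signature): tracing a model whose forward is
#   def forward(self, input_ids, attention_mask=None, use_cache=True, **kwargs)
# with sample_input = {"input_ids": ...} would produce
#   {"attention_mask": None, "use_cache": False, "kwargs": {}}
# since `input_ids` is supplied symbolically, `use_cache` is forced off, and
# the variadic `**kwargs` receives an empty default rather than being dropped.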


def find_target_nodes(graph: GraphModule, targets: set[Module]) -> set[Node]:
    """
    Find all nodes whose execution is equivalent to executing the target modules.
    Note that these nodes are guaranteed to be treated as leaf nodes by
    SequentialTracer

    :param graph: graph containing target nodes
    :param targets: modules whose nodes are being searched for
    :return: set of all nodes which call the target modules
    """
    return set(
        node
        for node in graph.graph.nodes
        if node.op == "call_module" and graph.get_submodule(node.target) in targets
    )


def topological_partition(graph: GraphModule, targets: set[Module]) -> list[list[Node]]:
    """
    Partition the graph into partitions such that each `target` belongs to exactly
    one partition and executing each partition depends only on intermediate values
    produced by executing the partitions before it.

    :param graph: graph being partitioned
    :param targets: target modules which will be assigned to disjoint partitions
    :return: list of partitions, where each partition is a list of nodes belonging
        to that partition
    """
    assert graph_is_well_formed(graph.graph)
    target_nodes = find_target_nodes(graph, targets)

    partitions: list[list[Node]] = [[]]
    remaining_indegrees = {
        node: len([n for n in node.all_input_nodes if n.op != "get_attr"])
        for node in graph.graph.nodes
    }
    partition_index = 0  # global counter

    # start with the graph input nodes,
    # but delay the `get_attr` nodes as long as possible
    queue = deque(
        node
        for node in graph.graph.nodes
        if remaining_indegrees[node] == 0 and node.op != "get_attr"
    )
    while len(queue) > 0:
        node = queue.popleft()

        # assign to partition
        partitions[partition_index].append(node)

        # guarantee targets are assigned to disjoint partitions
        if node in target_nodes:
            partition_index += 1
            partitions.append([])

        # only enqueue a user once its last remaining input has been produced,
        # which guarantees that the node is assigned to the maximal partition
        for user in node.users:
            remaining_indegrees[user] -= 1
            if remaining_indegrees[user] == 0:
                queue.append(user)

    # an ideal implementation would involve implicitly consolidating partition
    # indices so that each node is assigned to the maximum partition possible (in
    # order to delay execution as long as possible), but saving these nodes for last
    # covers the most common and costly case (get_attr)
    for node in graph.graph.find_nodes(op="get_attr"):
        user_partitions = []
        for user in node.users:
            for index in range(len(partitions)):
                if user in partitions[index]:
                    user_partitions.append(index)
                    break

        # workaround: skip `get_attr` nodes which have no users
        if len(user_partitions):
            partition_index = min(user_partitions)
            partitions[partition_index].insert(0, node)

    return partitions
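

# Illustration (hypothetical graph): with nodes emb -> layer0 -> layer1 -> head
# and targets {layer0, layer1}, the traversal above yields
#   [[emb, layer0], [layer1], [head]]
# that is, len(targets) + 1 partitions, where each target closes its own
# partition and the nodes after the last target fall into a final,
# target-free partition.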


def partition_graph(model: Module, partitions: list[list[Node]]) -> list[Subgraph]:
    """
    Convert each partition into a Subgraph. Each Subgraph returns a dictionary
    mapping output node names to their computed values. Note that the
    `consumed_names` attribute of each Subgraph remains empty, to be later populated
    by `trace_consumed_names`

    :param model: model which owns the produced Subgraphs
    :param partitions: list of partitions, where each partition is a list of nodes
        belonging to that partition
    :return: list of subgraphs in order of execution
    """
    subgraphs = []

    # create subgraphs
    for partition_nodes in partitions:
        # create a new graph for the partition
        graph = Graph(model)
        node_map = {}

        # add placeholders for inputs not in this subgraph. use set to deduplicate
        new_input_nodes = {
            input_node
            for node in partition_nodes
            for input_node in node.all_input_nodes
            if input_node not in partition_nodes and input_node.op
        }
        for input_node in new_input_nodes:
            node_map[input_node] = graph.placeholder(input_node.name)

        # add the nodes to subgraph
        for node in partition_nodes:
            node_map[node] = graph.node_copy(node, lambda n: node_map[n])

        # add an output node to collect all subgraph outputs into a dictionary
        if len(graph.find_nodes(op="output")) <= 0:
            output_dict = {
                node.name: node_map[node]
                for node in partition_nodes
                if any(user not in partition_nodes for user in node.users.keys())
            }
            graph.output(output_dict)

        # save the subgraph for this partition
        graph.lint()
        input_names = set(
            node.name for node in graph.nodes if node.op == "placeholder"
        )
        subgraphs.append(
            Subgraph(
                graph=graph,
                input_names=input_names,
                consumed_names=set(),  # populated later
            )
        )

        assert graph_is_well_formed(graph)

    return subgraphs


def trace_consumed_names(subgraphs: list[Subgraph]):
    """
    Populate the `consumed_names` attribute of each Subgraph according to when
    inputs are last used in order to vacate the `intermediates` cache and save
    memory

    :param subgraphs: list of subgraphs with empty `consumed_names` attributes
    """
    all_input_names = set().union(*(subgraph.input_names for subgraph in subgraphs))
    for input_name in all_input_names:
        for subgraph in reversed(subgraphs):
            if input_name in subgraph.input_names:
                subgraph.consumed_names.add(input_name)
                break
        else:
            raise ValueError(f"Could not find input name {input_name} in subgraphs")


def graph_is_well_formed(graph: Graph) -> bool:
    """
    A graph is well formed if and only if
    `nodeA in nodeB.users <=> nodeB in nodeA.all_input_nodes`

    :param graph: graph being checked
    :return: True if the graph is well formed, False otherwise
    """
    for node in graph.nodes:
        for user in node.users:
            if node not in user.all_input_nodes:
                return False

        for input_node in node.all_input_nodes:
            if node not in input_node.users:
                return False

        if len(node.users) != len(set(node.users)) or len(node.all_input_nodes) != len(
            set(node.all_input_nodes)
        ):
            return False

    return True


def get_sequential_targets(
    modifiers: list[Modifier], model: PreTrainedModel, args: "DatasetArguments"
) -> list[str]:
    """
    Infer sequential targets from modifiers list and dataset args

    :param modifiers: list of modifiers being applied during calibration
    :param model: model being calibrated
    :param args: dataset arguments passed by user
    :return: list of sequential targets
    """
    modifier_targets = [
        (modifier, modifier.sequential_targets)
        for modifier in modifiers
        if getattr(modifier, "sequential_targets", None) is not None
    ]

    # deprecation warning
    if len(modifier_targets) >= 1:
        logger.warning(
            "Passing sequential targets through modifiers is deprecated, "
            "please use `oneshot(sequential_targets=...)`"
        )

    # cannot infer from multiple modifiers
    if len(modifier_targets) >= 2:
        types = [type(modifier) for modifier, _ in modifier_targets]
        raise ValueError(
            "Cannot infer sequential targets from multiple sequential modifiers "
            f"({types})"
        )

    # resolve single modifier
    if len(modifier_targets) == 1:
        if args.sequential_targets is not None:
            raise ValueError(
                f"Got sequential targets from both {type(modifier_targets[0][0])} "
                "and dataset arguments `sequential_targets`"
            )
        sequential_targets = modifier_targets[0][1]

    # if no modifiers, use data args
    else:
        sequential_targets = args.sequential_targets  # may be `None`

    # validate and infer
    match sequential_targets:
        case None:
            return get_no_split_params(model)
        case str():
            return [sequential_targets]
        case _:
            return sequential_targets
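

# Resolution sketch (illustrative values, not real module names): with no
# sequential modifiers attached,
#   args.sequential_targets = None           -> get_no_split_params(model)
#   args.sequential_targets = "DecoderLayer" -> ["DecoderLayer"]
#   args.sequential_targets = ["A", "B"]     -> ["A", "B"]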


def add_line_numbers(text: str) -> str:
    """Prefix each line of `text` with its 1-based line number"""
    lines = text.splitlines()
    numbered_lines = [f"{i + 1} {line}" for i, line in enumerate(lines)]
    return "\n".join(numbered_lines)


def get_sequential_ancestors(model: Module, targets: set[Module]) -> set[Module]:
    """
    Find modules which are call graph ancestors of the given sequential targets

    :param model: model containing sequential targets
    :param targets: sequential targets to find ancestors of
    :return: call graph ancestors of sequential targets
    """
    ancestors = set()

    def is_ancestor(module: Module) -> bool:
        if module in ancestors or module in targets:
            return True

        # eagerly compute the list in order to avoid short-circuiting and
        # therefore missing ancestors
        _is_ancestor = any([is_ancestor(child) for child in module.children()])
        if _is_ancestor:
            ancestors.add(module)

        return _is_ancestor

    is_ancestor(model)
    return ancestors
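

# Example (typical HF layout, names illustrative): if the targets are the
# decoder layers `model.model.layers[i]`, the ancestors are exactly
# {model, model.model, model.model.layers}: the modules whose forwards must be
# traced through to reach the targets. The layers themselves and unrelated
# leaves (e.g. lm_head) are excluded.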


def _get_autowrap_functions() -> tuple[Callable[[Any], Any], ...]:
    try:
        from transformers.masking_utils import LAYER_PATTERN_TO_MASK_FUNCTION_MAPPING

        return tuple(LAYER_PATTERN_TO_MASK_FUNCTION_MAPPING.values())
    except ImportError:
        return tuple()


def handle_sequential_oom(func):
    """Catch OOMs and suggest changing sequential targets"""

    @wraps(func)
    def wrapper(*args, **kwargs):
        try:
            return func(*args, **kwargs)
        except torch.cuda.OutOfMemoryError as e:
            raise torch.cuda.OutOfMemoryError(
                "Sequential pipeline ran out of memory. "
                "Please consider choosing a smaller module "
                "for the `sequential_targets` argument, e.g. 'Linear'"
            ) from e

    return wrapper
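

# Usage sketch (hypothetical function name): decorate a pipeline entrypoint so
# that CUDA OOMs surface with an actionable suggestion instead of a bare
# traceback.
#
#   @handle_sequential_oom
#   def run_sequential_pipeline(model, dataloader, dataset_args):
#       ...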