diff --git a/examples/quantization_w8a8_fp8/fp8_block_example.py b/examples/quantization_w8a8_fp8/fp8_block_example.py index b5d6ca1f9..68f13cf93 100644 --- a/examples/quantization_w8a8_fp8/fp8_block_example.py +++ b/examples/quantization_w8a8_fp8/fp8_block_example.py @@ -15,9 +15,7 @@ # In this case, we: # * quantize the weights to fp8 with per channel via ptq # * quantize the activations to fp8 with dynamic per token -recipe = QuantizationModifier( - targets="Linear", scheme="FP8_BLOCK", ignore=["lm_head"] -) +recipe = QuantizationModifier(targets="Linear", scheme="FP8_BLOCK", ignore=["lm_head"]) # Apply quantization. oneshot(model=model, recipe=recipe) diff --git a/src/llmcompressor/pipelines/sequential/helpers.py b/src/llmcompressor/pipelines/sequential/helpers.py index 51e7c3a74..51e86cf32 100644 --- a/src/llmcompressor/pipelines/sequential/helpers.py +++ b/src/llmcompressor/pipelines/sequential/helpers.py @@ -277,14 +277,14 @@ def topological_partition(graph: GraphModule, targets: Set[Module]) -> List[List while len(queue) > 0: node = queue.popleft() - # assign to partition - partitions[partition_index].append(node) - # guarantee targets are assigned to disjoint partitions - if node in target_nodes: + if node in target_nodes and len(partitions[partition_index]) > 0: partition_index += 1 partitions.append([]) + # assign to partition + partitions[partition_index].append(node) + # recurse on last indegree only in order to guarantee that # the node is assigned to maximal partition for user in node.users: