Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
9a14505
WIP
daniil-lyakhov May 21, 2024
8352917
WIP transformation is ok, performance is bad
daniil-lyakhov May 22, 2024
566bff2
Performant model modified by hands!
daniil-lyakhov May 23, 2024
4f77678
Init graph converter
daniil-lyakhov May 23, 2024
fdd87f2
WIP min_max support
daniil-lyakhov May 27, 2024
4b842b3
WIP statistic collection and quantizer params calculation
daniil-lyakhov May 27, 2024
2c2921a
WIP FQ insertion
daniil-lyakhov May 27, 2024
857a255
BN fused, conv and bias separated
daniil-lyakhov May 29, 2024
8934a06
WIP resnet18 accuracy check
daniil-lyakhov May 29, 2024
ceb9c4b
Default quantization file
daniil-lyakhov May 31, 2024
59f003d
WIP model performs in torch.compile(backend="openvino")
daniil-lyakhov May 31, 2024
5a0d546
WIP
daniil-lyakhov May 31, 2024
35ec4b0
Resnet18 example acc and performance aligned nncf/x86 inductor
daniil-lyakhov Jun 3, 2024
3b1e7f0
Asymmetric activations are forced
daniil-lyakhov Jun 3, 2024
a40c281
Init Torch.fx BiasCorrection
daniil-lyakhov Jun 3, 2024
4ded876
Attempt to quantize an anomalib model
daniil-lyakhov Jun 4, 2024
9eaa0a4
YOLO_v8 conversion attempt
daniil-lyakhov Jun 4, 2024
d45a55f
Yolo v8 quantization
daniil-lyakhov Jun 17, 2024
16b3126
ssd_vgg300, ssdlite mobilenetv3, yolov8 repro
daniil-lyakhov Jun 21, 2024
7cedba5
WIP
daniil-lyakhov Jun 21, 2024
8831330
Transformers quantization WIP
daniil-lyakhov Jun 21, 2024
85eb529
YOLO v8: check_export_not_strict
daniil-lyakhov Jun 24, 2024
4641f8d
WIP
daniil-lyakhov Jun 24, 2024
37c2f10
Sanity test and code cleaning
daniil-lyakhov Jun 25, 2024
24e96b1
Rebase partly
daniil-lyakhov Jun 25, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
456 changes: 456 additions & 0 deletions aa_torch_fx.py

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion examples/llm_compression/openvino/tiny_llama/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,12 +11,12 @@
import time
from functools import partial

import datasets
import numpy as np
import openvino as ov
from optimum.intel.openvino import OVModelForCausalLM
from transformers import AutoTokenizer

import datasets
import nncf


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,12 +17,12 @@

import numpy as np
import openvino as ov
from datasets import load_dataset
from optimum.intel import OVModelForCausalLM
from transformers import AutoTokenizer
from whowhatbench import Evaluator

import nncf
from datasets import load_dataset
from nncf.common.logging import nncf_logger

DataItem = TypeVar("DataItem")
Expand Down
282 changes: 245 additions & 37 deletions examples/post_training_quantization/openvino/yolov8/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,36 +8,102 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os

os.environ["TORCHINDUCTOR_FREEZING"] = "1"

import re
import subprocess
import time
from copy import deepcopy
from pathlib import Path
from typing import Any, Dict, Tuple
from typing import Dict, Tuple

import numpy as np
import openvino as ov
import openvino.torch # noqa
import torch
from torch._export import capture_pre_autograd_graph
from torch.export import Dim # noqa
from torch.fx.passes.graph_drawer import FxGraphDrawer
from tqdm import tqdm
from ultralytics.cfg import get_cfg
from ultralytics.data.converter import coco80_to_coco91_class
from ultralytics.data.utils import check_det_dataset
from ultralytics.engine.validator import BaseValidator as Validator
from ultralytics.models.yolo import YOLO
from ultralytics.utils import DATASETS_DIR
from ultralytics.utils import DEFAULT_CFG
from ultralytics.utils.metrics import ConfusionMatrix
from ultralytics.utils.torch_utils import de_parallel

import nncf

ROOT = Path(__file__).parent.resolve()


def validate(
def measure_time(model, example_inputs, num_iters=500):
    """Benchmark the average inference latency of a torch model.

    Runs one warm-up inference (to trigger lazy compilation / caching for
    ``torch.compile`` models), then times ``num_iters`` forward passes
    under ``torch.no_grad()``.

    :param model: Callable model; invoked as ``model(*example_inputs)``.
    :param example_inputs: Tuple of positional inputs for the model.
    :param num_iters: Number of timed iterations.
    :return: Average latency per inference, in milliseconds.
    """
    with torch.no_grad():
        model(*example_inputs)  # warm-up, excluded from timing
        total_time = 0.0
        for _ in range(num_iters):
            # perf_counter is monotonic and higher-resolution than time.time
            start_time = time.perf_counter()
            model(*example_inputs)
            total_time += time.perf_counter() - start_time
    return (total_time / num_iters) * 1000


def measure_time_ov(model, example_inputs, num_iters=1000):
    """Benchmark the average inference latency of an OpenVINO model on CPU.

    Compiles the model, runs one warm-up inference, then times ``num_iters``
    synchronous inference requests.

    :param model: ``ov.Model`` to compile and benchmark.
    :param example_inputs: Input(s) accepted by ``InferRequest.infer``.
    :param num_iters: Number of timed iterations.
    :return: Average latency per inference, in milliseconds.
    """
    core = ov.Core()
    compiled_model = core.compile_model(model, "CPU")
    infer_request = compiled_model.create_infer_request()
    infer_request.infer(example_inputs)  # warm-up, excluded from timing
    total_time = 0.0
    for _ in range(num_iters):
        # perf_counter is monotonic and higher-resolution than time.time
        start_time = time.perf_counter()
        infer_request.infer(example_inputs)
        total_time += time.perf_counter() - start_time
    return (total_time / num_iters) * 1000


def validate_fx(
    model: torch.nn.Module,
    data_loader: torch.utils.data.DataLoader,
    validator: "Validator",
    num_samples: Optional[int] = None,
) -> Tuple[Dict, int, int]:
    """Validate a torch (FX / compiled) model with an ultralytics validator.

    :param model: Torch model called directly on the preprocessed image batch.
    :param data_loader: Yields ultralytics detection batches (dicts with "img").
    :param validator: Ultralytics validator; accumulates metrics across calls.
        NOTE(review): validator state (seen/jdict/stats/confusion_matrix) is
        intentionally not reset here — callers reuse one validator per model.
    :param num_samples: If given, stop after this many batches.
    :return: Tuple of (stats dict, images seen, total object instances).
    """
    for batch_i, batch in enumerate(data_loader):
        if num_samples is not None and batch_i == num_samples:
            break
        batch = validator.preprocess(batch)
        preds = model(batch["img"])
        preds = validator.postprocess(preds)
        validator.update_metrics(preds, batch)
    stats = validator.get_stats()
    return stats, validator.seen, validator.nt_per_class.sum()


def print_statistics_short(stats: Dict[str, float]) -> None:
    """Print a one-row precision/recall/mAP summary table.

    :param stats: Mapping with the ``"metrics/..."`` keys produced by
        ``validator.get_stats()``.
    """
    mp, mr, map50, mean_ap = (
        stats["metrics/precision(B)"],
        stats["metrics/recall(B)"],
        stats["metrics/mAP50(B)"],
        stats["metrics/mAP50-95(B)"],
    )
    # Header strings restored from scraper-garbled "[email protected]" artifacts.
    s = ("%20s" + "%12s" * 4) % ("Class", "Precision", "Recall", "mAP@.5", "mAP@.5:.95")
    print(s)
    pf = "%20s" + "%12.3g" * 4  # print format
    print(pf % ("all", mp, mr, map50, mean_ap))


def validate_ov(
model: ov.Model, data_loader: torch.utils.data.DataLoader, validator: Validator, num_samples: int = None
) -> Tuple[Dict, int, int]:
# validator.seen = 0
# validator.jdict = []
# validator.stats = []
# validator.confusion_matrix = ConfusionMatrix(nc=validator.nc)
model.reshape({0: [1, 3, -1, -1]})
compiled_model = ov.compile_model(model)
output_layer = compiled_model.output(0)
Expand Down Expand Up @@ -65,21 +131,19 @@ def print_statistics(stats: np.ndarray, total_images: int, total_objects: int) -
print(pf % ("all", total_images, total_objects, mp, mr, map50, mean_ap))


def prepare_validation(model: YOLO, args: Any) -> Tuple[Validator, torch.utils.data.DataLoader]:
validator = model.smart_load("validator")(args)
validator.data = check_det_dataset(args.data)
dataset = validator.data["val"]
print(f"{dataset}")
def prepare_validation(model: YOLO, data: str) -> Tuple[Validator, torch.utils.data.DataLoader]:
# custom = {"rect": True, "batch": 1} # method defaults
# rect: false forces to resize all input pictures to one size
custom = {"rect": False, "batch": 1} # method defaults
args = {**model.overrides, **custom, "mode": "val"} # highest priority args on the right

data_loader = validator.get_dataloader(f"{DATASETS_DIR}/coco128", 1)
validator = model._smart_load("validator")(args=args, _callbacks=model.callbacks)
stride = 32 # default stride
validator.stride = stride # used in get_dataloader() for padding
validator.data = check_det_dataset(data)
validator.init_metrics(de_parallel(model))

validator = model.smart_load("validator")(args)

validator.is_coco = True
validator.class_map = coco80_to_coco91_class()
validator.names = model.model.names
validator.metrics.names = validator.names
validator.nc = model.model.model[-1].nc
data_loader = validator.get_dataloader(validator.data.get(validator.args.split), validator.args.batch)

return validator, data_loader

Expand All @@ -104,7 +168,9 @@ def prepare_openvino_model(model: YOLO, model_name: str) -> Tuple[ov.Model, Path
return ov.Core().read_model(ir_model_path), ir_model_path


def quantize(model: ov.Model, data_loader: torch.utils.data.DataLoader, validator: Validator) -> ov.Model:
def quantize(
model: ov.Model, data_loader: torch.utils.data.DataLoader, validator: Validator, original_model
) -> ov.Model:
def transform_fn(data_item: Dict):
"""
Quantization transform function. Extracts and preprocess input data from dataloader
Expand Down Expand Up @@ -136,44 +202,186 @@ def transform_fn(data_item: Dict):
return quantized_model


NNCF_QUANTIZATION = False


def quantize_impl(exported_model, val_loader, validator):
    """Quantize a captured FX graph and return a compiled int8 model.

    Two paths, selected by the module-level NNCF_QUANTIZATION flag:
    - True:  nncf.quantize + torch.compile(backend="openvino")
    - False: torch.ao PT2E quantization (X86InductorQuantizer) + inductor

    Both paths dump an SVG of the quantized graph next to this file.

    :param exported_model: FX GraphModule from capture_pre_autograd_graph.
    :param val_loader: Dataloader used for calibration.
    :param validator: Ultralytics validator; used only for preprocessing.
    :return: torch.compile-wrapped quantized model.
    """
    def transform_fn(x):
        # Extract the preprocessed image tensor from an ultralytics batch dict.
        batch = validator.preprocess(x)
        return batch["img"]

    calibration_dataset = nncf.Dataset(val_loader, transform_fn)
    dir_name = str(Path(__file__).parent)
    if NNCF_QUANTIZATION:
        # NOTE(review): the ignored scope appears hand-tuned for YOLOv8's
        # detection head (post-processing subgraph) — confirm node names
        # still match if the model/export version changes.
        converted_model = nncf.quantize(
            exported_model,
            calibration_dataset,
            ignored_scope=nncf.IgnoredScope(
                types=["mul", "sub", "sigmoid"],
                subgraphs=[
                    nncf.Subgraph(
                        inputs=["cat_13", "cat_14", "cat_15"],
                        outputs=["output"],
                    )
                ],
            ),
        )
        # Dump the quantized graph for visual inspection.
        g = FxGraphDrawer(converted_model, "yolo_nncf_fx_int8")
        g.get_dot_graph().write_svg(dir_name + "/yolo_nncf_fx_int8.svg")

        quantized_model = torch.compile(converted_model, backend="openvino")
        return quantized_model
    else:
        # Native PT2E quantization flow; imported lazily so the nncf path
        # does not depend on torch.ao internals.
        from torch.ao.quantization.quantize_pt2e import convert_pt2e
        from torch.ao.quantization.quantize_pt2e import prepare_pt2e
        from torch.ao.quantization.quantizer.x86_inductor_quantizer import X86InductorQuantizer
        from torch.ao.quantization.quantizer.x86_inductor_quantizer import get_default_x86_inductor_quantization_config

        quantizer = X86InductorQuantizer()
        quantizer.set_global(get_default_x86_inductor_quantization_config())

        prepared_model = prepare_pt2e(exported_model, quantizer)

        # Calibrate on at most 300 batches.
        for idx, batch in tqdm(enumerate(calibration_dataset.get_inference_data())):
            if idx >= 300:
                break
            prepared_model(batch)

        converted_model = convert_pt2e(prepared_model)

        # Dump the prepared (observer-instrumented) graph for inspection.
        g = FxGraphDrawer(prepared_model, "yolo_torch_fx_int8")
        g.get_dot_graph().write_svg(dir_name + "/yolo_torch_fx_int8.svg")
        import torch._inductor.config as config

        # cpp_wrapper reduces inductor's per-call Python overhead.
        config.cpp_wrapper = True

        quantized_model = torch.compile(converted_model)
        return quantized_model


TORCH_FX = True
MODEL_NAME = "yolov8n"


def main():
MODEL_NAME = "yolov8n"

model = YOLO(f"{ROOT}/{MODEL_NAME}.pt")
args = get_cfg(cfg=DEFAULT_CFG)
args.data = "coco128.yaml"

# args = get_cfg(cfg=DEFAULT_CFG)
# args.data = "coco128.yaml"
# Prepare validation dataset and helper
validator, data_loader = prepare_validation(model, args)

validator, data_loader = prepare_validation(model, "coco128.yaml")

# Convert to OpenVINO model
batch = next(iter(data_loader))
batch = validator.preprocess(batch)

if TORCH_FX:
fp_stats, total_images, total_objects = validate_fx(model.model, tqdm(data_loader), validator)
print("Floating-point Torch model validation results:")
print_statistics(fp_stats, total_images, total_objects)

if NNCF_QUANTIZATION:
fp32_compiled_model = torch.compile(model.model, backend="openvino")
else:
fp32_compiled_model = torch.compile(model.model)
fp32_stats, total_images, total_objects = validate_fx(fp32_compiled_model, tqdm(data_loader), validator)
print("FP32 FX model validation results:")
print_statistics(fp32_stats, total_images, total_objects)

print("Start quantization...")
# Rebuild model to reset ultralytics cache
model = YOLO(f"{ROOT}/{MODEL_NAME}.pt")
with torch.no_grad():
model.model.eval()
model.model(batch["img"])
# dynamic_shapes = ((None, None, Dim("H", min=1, max=29802), Dim("W", min=1, max=29802)),)
dynamic_shapes = ((None, None, None, None),)
exported_model = capture_pre_autograd_graph(
model.model, args=(batch["img"],), dynamic_shapes=dynamic_shapes
)
quantized_model = quantize_impl(deepcopy(exported_model), data_loader, validator)

int8_stats, total_images, total_objects = validate_fx(quantized_model, tqdm(data_loader), validator)
print("INT8 FX model validation results:")
print_statistics(int8_stats, total_images, total_objects)

print("Start FX fp32 model benchmarking...")
fp32_latency = measure_time(fp32_compiled_model, (batch["img"],))
print(f"fp32 FX latency: {fp32_latency}")

print("Start FX int8 model benchmarking...")
int8_latency = measure_time(quantized_model, (batch["img"],))
print(f"FX int8 latency: {int8_latency}")
print(f"Speed up: {fp32_latency / int8_latency}")
return

ov_model, ov_model_path = prepare_openvino_model(model, MODEL_NAME)

# Quantize mode in OpenVINO representation
quantized_model = quantize(ov_model, data_loader, validator)
quantized_model = quantize(ov_model, data_loader, validator, model)
quantized_model_path = Path(f"{ROOT}/{MODEL_NAME}_openvino_model/{MODEL_NAME}_quantized.xml")
ov.save_model(quantized_model, str(quantized_model_path), compress_to_fp16=False)

args = get_cfg(cfg=DEFAULT_CFG)
args.data = "coco128.yaml"
# Validate FP32 model
fp_stats, total_images, total_objects = validate(ov_model, tqdm(data_loader), validator)
fp_stats, total_images, total_objects = validate_ov(ov_model, tqdm(data_loader), validator)
print("Floating-point model validation results:")
print_statistics(fp_stats, total_images, total_objects)

# Validate quantized model
q_stats, total_images, total_objects = validate(quantized_model, tqdm(data_loader), validator)
q_stats, total_images, total_objects = validate_ov(quantized_model, tqdm(data_loader), validator)
print("Quantized model validation results:")
print_statistics(q_stats, total_images, total_objects)

# Benchmark performance of FP32 model
fp_model_perf = benchmark_performance(ov_model_path, args)
print(f"Floating-point model performance: {fp_model_perf} FPS")

# Benchmark performance of quantized model
quantized_model_perf = benchmark_performance(quantized_model_path, args)
print(f"Quantized model performance: {quantized_model_perf} FPS")
fps = True
latency = True
fp_model_perf = -1
quantized_model_perf = -1
if fps:
# Benchmark performance of FP32 model
fp_model_perf = benchmark_performance(ov_model_path, args)
print(f"Floating-point model performance: {fp_model_perf} FPS")

# Benchmark performance of quantized model
quantized_model_perf = benchmark_performance(quantized_model_path, args)
print(f"Quantized model performance: {quantized_model_perf} FPS")
if latency:
fp_model_latency = measure_time_ov(ov_model, batch["img"])
print(f"FP32 OV model latency: {fp_model_latency}")
int8_model_latency = measure_time_ov(quantized_model, batch["img"])
print(f"INT8 OV model latency: {int8_model_latency}")

return fp_stats["metrics/mAP50-95(B)"], q_stats["metrics/mAP50-95(B)"], fp_model_perf, quantized_model_perf


def main_export_not_strict():
    """Experimental flow: export YOLOv8 with torch.export(strict=False),
    re-capture with capture_pre_autograd_graph, then validate FP32 and
    quantized variants on coco128.
    """
    model = YOLO(f"{ROOT}/{MODEL_NAME}.pt")

    # Prepare validation dataset and helper
    validator, data_loader = prepare_validation(model, "coco128.yaml")

    batch = next(iter(data_loader))
    batch = validator.preprocess(batch)

    # Warm-up call so ultralytics finishes lazy model setup before export.
    model.model(batch["img"])
    ex_model = torch.export.export(model.model, args=(batch["img"],), strict=False)
    ex_model = capture_pre_autograd_graph(ex_model.module(), args=(batch["img"],))
    ex_model = torch.compile(ex_model)

    fp_stats, total_images, total_objects = validate_fx(ex_model, tqdm(data_loader), validator)
    print("Floating-point ex strict=False")
    print_statistics(fp_stats, total_images, total_objects)

    quantized_model = quantize_impl(deepcopy(ex_model), data_loader, validator)
    int8_stats, total_images, total_objects = validate_fx(quantized_model, tqdm(data_loader), validator)
    print("Int8 ex strict=False")
    print_statistics(int8_stats, total_images, total_objects)
    # NOTE(review): no quantizers were inserted on this path, so the metrics
    # match FP32 — confirm whether that is the intended behavior.


if __name__ == "__main__":
    # Alternative entry point: strict=False export experiment.
    # main_export_not_strict()
    main()
Loading