55 commits
d608bcd
Support splitted graph in filter_constant_nodes
daniil-lyakhov Nov 8, 2023
629ac9c
ONNX concat layer attributes
daniil-lyakhov Nov 10, 2023
9e25c86
OV concat layer attributes
daniil-lyakhov Nov 14, 2023
fe10221
Merge branch 'dl/conv_layer_attrs_update' into dl/quantization/passes…
daniil-lyakhov Nov 14, 2023
13ca0a6
Merge branch 'dl/conv_layer_attrs_update' into dl/quantization/passes…
daniil-lyakhov Nov 15, 2023
6c4d598
Torch cat layer attributes update
daniil-lyakhov Nov 15, 2023
776670b
Fixed output text in the ptq onnx example (#2272)
alexsu52 Nov 15, 2023
81ad932
Constant filtering and shape of removal passes are updated
daniil-lyakhov Nov 16, 2023
4a244f3
Fix common pre-commit
daniil-lyakhov Nov 16, 2023
6bc2f86
Move backend-specific logic to input nodes search
daniil-lyakhov Nov 16, 2023
03e3faa
Merge branch 'dl/conv_layer_attrs_update' into dl/quantization/passes…
daniil-lyakhov Nov 16, 2023
ed67ed8
Constant metatypes property removed from minmax
daniil-lyakhov Nov 16, 2023
f3da495
Docstrings
daniil-lyakhov Nov 16, 2023
0eab0d8
Release notes update (#2270)
nikita-malininn Nov 17, 2023
21f618c
Update OV version (#2274)
nikita-malininn Nov 17, 2023
60f60ff
Microfix
daniil-lyakhov Nov 17, 2023
0c7a8d5
Update Torch to ONNX export in conformance (#2269)
Nov 17, 2023
9898876
[OV] Support for ScaledDotProductAttention operation (#2268)
alexsu52 Nov 17, 2023
b6910ea
[OV] remove workaround (#2262)
Nov 17, 2023
2e48ffb
Add a paragraph on bad file names to the style guide (#1439)
vshampor Nov 20, 2023
e7885f0
Comments
daniil-lyakhov Nov 20, 2023
077d361
Add additional arguments for test_quantize_conformance.py (#2256)
AlexanderDokuchaev Nov 21, 2023
53c0ff9
Torch disconnected nodes
daniil-lyakhov Nov 21, 2023
52a2e1a
Redundant layer attributes are removed
daniil-lyakhov Nov 21, 2023
638878d
Dump weight compression params to IR (#2280)
ljaljushkin Nov 22, 2023
8a45acb
Enabled nf4 tests after switching to openvino==2023.2 (#2281)
ljaljushkin Nov 22, 2023
5eee3bc
Compress first and last layer more correctly (#2282)
ljaljushkin Nov 22, 2023
8dc09e8
get_inputs -> get_start_nodes_for_activation_path_tracing
daniil-lyakhov Nov 22, 2023
9479779
OVWeightUpdateCommand copies a constant (#2277)
andrey-churkin Nov 23, 2023
a9a39db
[PTQ][OV] Drop old, set new references (#2287)
nikita-malininn Nov 24, 2023
285066c
Move SDL-related tests to their own directories (#2285)
vshampor Nov 24, 2023
08cd0f7
mentioned all embeddings in weight compression docs (#2289)
ljaljushkin Nov 24, 2023
cb7f3c1
Not quantize AdaptiveMaxPool (#2284)
AlexanderDokuchaev Nov 28, 2023
2362c02
Update MLFlow version (#2291)
nikita-malininn Nov 28, 2023
ee78ab6
[Fix] Patching nncf_model_inputs (#2278)
alexsu52 Nov 28, 2023
c58bcce
Updating examples (#2293)
alexsu52 Nov 29, 2023
3706639
Add model-hub tests to check wrap_model (#2275)
AlexanderDokuchaev Nov 29, 2023
e17d268
Fix do_copy in strip function (#2296)
AlexanderDokuchaev Nov 29, 2023
4c4aeac
[PTQ][OV] Update OV references for 23.3 (#2295)
nikita-malininn Nov 30, 2023
db786a8
[TORCH] Unwrap and wrap torch.return_type before and after posthooks …
daniil-lyakhov Nov 30, 2023
9c40a66
Merge remote-tracking branch 'origin/develop' into dl/quantization/pa…
daniil-lyakhov Nov 30, 2023
ce061bb
Update e2e tests to use convert_model (#2152)
AlexanderDokuchaev Nov 30, 2023
242aacd
Set codecov target, flags and reach graph visualization (#2245)
vshampor Nov 30, 2023
21dbece
Comments
daniil-lyakhov Dec 5, 2023
01e3c2f
[TF] Added a warning when training without pre-trained weights (#2047)
Dec 6, 2023
f5ce785
[PTQ][OV] Weights layout in layer attributes is introduced (#2082)
daniil-lyakhov Dec 6, 2023
fa67e00
Patch missed magic functions (#2298)
AlexanderDokuchaev Dec 6, 2023
a6e4928
Fix dump of patterns (#2301)
AlexanderDokuchaev Dec 6, 2023
4266ed6
test_find_groups_of_quantizers_to_rank is presented
daniil-lyakhov Dec 6, 2023
6d08f52
Extend weight compression with INT8 symmetric scheme (#2288)
l-bat Dec 7, 2023
90441c6
Update reference for test_quantize_conformance (#2303)
AlexanderDokuchaev Dec 7, 2023
f08d0ba
Some fixes (#2306)
AlexanderDokuchaev Dec 7, 2023
7af953a
Update MLFlow version (#2305)
nikita-malininn Dec 7, 2023
c4c1c91
Comments
daniil-lyakhov Dec 8, 2023
127ae4e
Merge remote-tracking branch 'origin/develop' into dl/quantization/pa…
daniil-lyakhov Dec 8, 2023
21 changes: 21 additions & 0 deletions .github/workflows/model_hub.yml
@@ -0,0 +1,21 @@
name: Model Hub

on:
  workflow_dispatch:

jobs:
  torch:
    runs-on: ubuntu-20.04-16-cores
    defaults:
      run:
        shell: bash
    steps:
      - uses: actions/checkout@v3
      - uses: actions/setup-python@v3
        with:
          python-version: 3.8.10
      - name: Install NNCF and test requirements
        run: make install-models-hub-torch

      - name: Run models-hub-torch test scope
        run: make test-models-hub-torch
15 changes: 14 additions & 1 deletion Makefile
@@ -50,6 +50,7 @@ test-examples-onnx:
install-openvino-test:
pip install -U pip
pip install -e .[openvino]
pip install tensorflow==2.12.0
pip install -r tests/openvino/requirements.txt
pip install -r tests/cross_fw/install/requirements.txt
pip install -r tests/cross_fw/examples/requirements.txt
@@ -113,8 +114,17 @@ install-torch-dev: install-torch-test install-pre-commit
pip install -r examples/post_training_quantization/torch/mobilenet_v2/requirements.txt
pip install -r examples/post_training_quantization/torch/ssd300_vgg16/requirements.txt

install-models-hub-torch:
pip install -U pip
pip install -e .
pip install -r tests/torch/models_hub_test/requirements.txt
# Install wheel to run pip with --no-build-isolation
pip install wheel
pip install --no-build-isolation -r tests/torch/models_hub_test/requirements_secondary.txt


test-torch:
pytest ${COVERAGE_ARGS} tests/torch -m "not weekly and not nightly" --junitxml ${JUNITXML_PATH} $(DATA_ARG)
pytest ${COVERAGE_ARGS} tests/torch -m "not weekly and not nightly and not models_hub" --junitxml ${JUNITXML_PATH} $(DATA_ARG)

test-torch-nightly:
pytest ${COVERAGE_ARGS} tests/torch -m nightly --junitxml ${JUNITXML_PATH} $(DATA_ARG)
@@ -138,6 +148,9 @@ test-examples-torch:
--backend torch \
--junitxml ${JUNITXML_PATH}

test-models-hub-torch:
pytest tests/torch/models_hub_test --junitxml ${JUNITXML_PATH}

###############################################################################
# Common part
install-common-test:
47 changes: 47 additions & 0 deletions ReleaseNotes.md
@@ -1,5 +1,52 @@
# Release Notes

## New in Release 2.7.0

Post-training Quantization:

- Features:
- (OpenVINO) Added support for data-free 4-bit weights compression through NF4 and INT4 data types (`compress_weights(…)` pipeline).
- (OpenVINO) Added support for [IF operation](https://docs.openvino.ai/latest/openvino_docs_ops_infrastructure_If_8.html) quantization.
- (OpenVINO) Added `dump_intermediate_model` parameter support for AccuracyAwareAlgorithm (`quantize_with_accuracy_control(…)` pipeline).
- (OpenVINO) Added support for SmoothQuant and ChannelAlignment algorithms for HyperparameterTuner algorithm (`quantize_with_tune_hyperparams(…)` pipeline).
- (PyTorch) Post-training Quantization is now supported with `quantize(…)` pipeline and the common implementation of quantization algorithms. Deprecated `create_compressed_model()` method for Post-training Quantization.
- Added new types (AvgPool, GroupNorm, LayerNorm) to the ignored scope for `ModelType.Transformer` scheme.
- `QuantizationPreset.Mixed` was set as the default for `ModelType.Transformer` scheme.
- Fixes:
- (OpenVINO, ONNX, PyTorch) Aligned/added patterns between backends (SE block, MVN layer, multiple activations, etc.) to restore performance/metrics.
- Fixed patterns for `ModelType.Transformer` to align with the [quantization scheme](https://docs.openvino.ai/latest/openvino_docs_OV_UG_lpt.html).
- Improvements:
- Improved UX with the new progress bar for pipeline, new exceptions, and .dot graph visualization updates.
- (OpenVINO) Optimized WeightsCompression algorithm (`compress_weights(…)` pipeline) execution time for LLM's quantization, added ignored scope support.
- (OpenVINO) Optimized AccuracyAwareQuantization algorithm execution time with multi-threaded approach while calculating ranking score (`quantize_with_accuracy_control(…)` pipeline).
- (OpenVINO) Added [extract_ov_subgraph tool](tools/extract_ov_subgraph.py) for large IR subgraph extraction.
- (ONNX) Optimized quantization pipeline (up to 1.15x speed up).
- Tutorials:
- [Post-Training Optimization of BLIP Model](https://github.com/openvinotoolkit/openvino_notebooks/tree/main/notebooks/233-blip-visual-language-processing)
- [Post-Training Optimization of DeepFloyd IF Model](https://github.com/openvinotoolkit/openvino_notebooks/tree/main/notebooks/238-deepfloyd-if)
- [Post-Training Optimization of Grammatical Error Correction Model](https://github.com/openvinotoolkit/openvino_notebooks/tree/main/notebooks/214-grammar-correction)
- [Post-Training Optimization of Dolly 2.0 Model](https://github.com/openvinotoolkit/openvino_notebooks/tree/main/notebooks/240-dolly-2-instruction-following)
- [Post-Training Optimization of Massively Multilingual Speech Model](https://github.com/openvinotoolkit/openvino_notebooks/tree/main/notebooks/255-mms-massively-multilingual-speech)
- [Post-Training Optimization of OneFormer Model](https://github.com/openvinotoolkit/openvino_notebooks/tree/main/notebooks/249-oneformer-segmentation)
- [Post-Training Optimization of InstructPix2Pix Model](https://github.com/openvinotoolkit/openvino_notebooks/tree/main/notebooks/231-instruct-pix2pix-image-editing)
- [Post-Training Optimization of LLaVA Model](https://github.com/openvinotoolkit/openvino_notebooks/tree/main/notebooks/257-llava-multimodal-chatbot)
- [Post-Training Optimization of Latent Consistency Model](https://github.com/openvinotoolkit/openvino_notebooks/tree/main/notebooks/263-latent-consistency-models-image-generation)
- [Post-Training Optimization of Distil-Whisper Model](https://github.com/openvinotoolkit/openvino_notebooks/tree/main/notebooks/267-distil-whisper-asr)
- [Post-Training Optimization of FastSAM Model](https://github.com/openvinotoolkit/openvino_notebooks/tree/main/notebooks/261-fast-segment-anything)
- Known issues:
- (ONNX) `quantize(...)` method can generate inaccurate int8 results for models with the BatchNormalization layer that contains biases. To get the best accuracy, use the `do_constant_folding=True` option during export from PyTorch to ONNX.
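
For illustration, a hypothetical export call using this option could look like the sketch below; `model`, `example_input`, the output file name and the opset version are placeholders, not values taken from this release.

```python
import torch

# `model` and `example_input` stand in for your own PyTorch module and a sample input tensor.
torch.onnx.export(
    model,
    example_input,
    "model.onnx",
    do_constant_folding=True,  # fold BatchNormalization constants to avoid the inaccuracy described above
    opset_version=13,          # any opset supported by your model
)
```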

Compression-aware training:

- Fixes:
- (PyTorch) Fixed Hessian trace calculation to solve [#2155](https://github.com/openvinotoolkit/nncf/issues/2155) issue.
- Requirements:
- Updated PyTorch version (2.1.0).
- Updated numpy version (<1.27).
- Deprecations/Removals:
- (PyTorch) Removed legacy external quantizer storage names.
- (PyTorch) Removed torch < 2.0 version support.

## New in Release 2.6.0

Post-training Quantization:
Expand Down
67 changes: 62 additions & 5 deletions codecov.yml
@@ -5,8 +5,7 @@ ignore:

codecov:
notify:
after_n_builds: 2
wait_for_ci: no
wait_for_ci: true
max_report_age: off

coverage:
@@ -15,6 +14,7 @@ coverage:
default:
branches:
- develop
target: 90%
informational: true
only_pulls: true
paths:
@@ -23,15 +23,72 @@
default:
branches:
- develop
target: 90%
informational: true
only_pulls: true
paths:
- "nncf/onnx"
- "nncf/common" # extend this once we collect coverage reports for more than just onnx and common part of precommit
- "nncf"

comment:
layout: "diff, flags, files"
layout: "reach, diff, files, flags, components"
require_changes: false

require_head: false
require_base: false

flag_management:
# Flag coverage percentage seems to show the "percentage of lines under the flag path covered as reported ONLY
# by the upload with the corresponding flag", so e.g. for COMMON the flag coverage percentage will report the
# percentage of common code tested ONLY by the common tests, and e.g. not by backend-specific precommit parts
# (which also run common code and are therefore indirectly providing coverage). Ideally each flag-specific path
# would be described below with the corresponding flag and provide valuable information on whether the test code base
# is written efficiently, e.g. that the backend-specific tests predominantly validate backend-specific code and the
# common tests completely cover the common code on their own. However, if we set all flags with paths here, then the
# total repo coverage percentage will sink, because codecov currently reports the overall coverage based on the union
# of the "flag" coverages - not the "component" coverages (see below) - and currently NNCF's precommit tests are
# biased toward validating common code via backend-specific tests. In the future the tests will be gradually
# refactored to have more "locality" in what each precommit section tests.
individual_flags:
- name: COMMON
paths:
- nncf/common
- nncf/quantization

component_management:
# In contrast to the "flag" coverage above, the "component" display seems to calculate percentage based on the
# coverage information from ALL uploads for the code in the specified path. With this, the "component" coverage
# percentage is a better representation of what sub-paths in the NNCF code base are covered with at least one test,
# without distinction of whether the test was run in the common or a backend-specific part of the precommit.
individual_components:
- component_id: common
name: common
paths:
- nncf/common
- "!nncf/**/torch_*.py"
- "!nncf/**/tensorflow_*.py"
- "!nncf/**/onnx_*.py"
- "!nncf/**/openvino_*.py"
- component_id: torch
name: torch
paths:
- nncf/torch
- nncf/**/torch_*.py
- component_id: tensorflow
name: tensorflow
paths:
- nncf/tensorflow
- nncf/**/tensorflow_*.py
- component_id: onnx
name: onnx
paths:
- nncf/onnx
- nncf/**/onnx_*.py
- component_id: openvino
name: openvino
paths:
- nncf/openvino
- nncf/**/openvino_*.py
- component_id: quantization
name: ptq
paths:
- nncf/quantization
3 changes: 2 additions & 1 deletion docs/Installation.md
@@ -69,7 +69,8 @@ as well as the supported versions of Python:

| NNCF | OpenVINO | PyTorch | ONNX | TensorFlow | Python |
|-----------|------------|----------|----------|------------|--------|
| `develop` | `2023.1.0` | `2.1` | `1.13.1` | `2.12.0` | `3.8` |
| `develop` | `2023.2.0` | `2.1` | `1.13.1` | `2.12.0` | `3.8` |
| `2.7.0` | `2023.2.0` | `2.1` | `1.13.1` | `2.12.0` | `3.8` |
| `2.6.0` | `2023.1.0` | `2.0.1` | `1.13.1` | `2.12.0` | `3.8` |
| `2.5.0` | `2023.0.0` | `1.13.1` | `1.13.1` | `2.11.1` | `3.8` |
| `2.4.0` | `2022.1.0` | `1.12.1` | `1.12.0` | `2.8.2` | `3.8` |
34 changes: 21 additions & 13 deletions docs/compression_algorithms/CompressWeights.md
@@ -8,22 +8,30 @@ The Weights Compression algorithm is aimed at compressing the weights of the mod

#### Supported modes

By default, weights are compressed to 8-bit integer data type - "INT8" mode.
By default, weights are compressed asymmetrically to 8-bit integer data type - "INT8_ASYM" mode.
OpenVINO backend also supports 3 modes of mixed precision weight quantization with a 4-bit data type as a primary precision - INT4_SYM, INT4_ASYM and NF4. The primary precision in the INT4_SYM mode is an unsigned 4-bit integer and weights are quantized to it [symmetrically](https://github.com/openvinotoolkit/nncf/blob/develop/docs/compression_algorithms/Quantization.md#symmetric-quantization) with a fixed zero point equal to 8. In the INT4_ASYM mode the data type is also an unsigned 4-bit integer, but weights are quantized to it [asymmetrically](https://github.com/openvinotoolkit/nncf/blob/develop/docs/compression_algorithms/Quantization.md#asymmetric-quantization) with a typical non-fixed zero point. In the NF4 mode the [nf4](https://arxiv.org/pdf/2305.14314v1.pdf) data type without a zero point is used.
All 4-bit modes support grouped quantization, where a small group of weights (e.g. 128) in the channel dimension shares quantization parameters (scale).
First embedding and last linear layers are always compressed to 8-bit integer data type.
Percent of the rest layers compressed to 4-bit can be configured by "ratio" parameter. E.g. ratio=0.9 means 90% of layers compressed to the corresponding 4-bit data type and the rest to 8-bit integer data type.
All embeddings and last linear layers are always compressed to 8-bit integer data type.
The percentage of the remaining layers compressed to 4-bit can be configured by the "ratio" parameter. E.g. ratio=0.9 means that 90% of the layers are compressed to the corresponding 4-bit data type and the rest to the 8-bit asymmetric integer data type.
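
For intuition, below is a minimal NumPy sketch of what group-wise symmetric 4-bit quantization with a shared per-group scale could look like. It only illustrates the idea described above, not the actual NNCF implementation; the function name and the group size are made up for the example.

```python
import numpy as np

def int4_sym_quantize_grouped(weights: np.ndarray, group_size: int = 128) -> np.ndarray:
    # Split the channel dimension into groups; each group shares a single scale.
    w = weights.reshape(-1, group_size)
    # Symmetric scheme: map values onto an unsigned 4-bit grid [0, 15] with a fixed zero point of 8.
    scale = np.max(np.abs(w), axis=1, keepdims=True) / 7
    q = np.clip(np.round(w / scale) + 8, 0, 15)
    # Dequantize back to float to inspect the approximation error introduced by compression.
    return ((q - 8) * scale).reshape(weights.shape)

weights = np.random.randn(16, 256).astype(np.float32)
restored = int4_sym_quantize_grouped(weights, group_size=128)
print("max abs error:", np.abs(weights - restored).max())
```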

#### User guide

- Compress weights to 8-bit integer data type.
- Compress weights asymmetrically to 8-bit integer data type.

```python
from nncf import compress_weights
compressed_model = compress_weights(model)
```

- Compress weights symmetrically to 4-bit integer data type with group size = 128, except first embedding and last linear layers - they are compressed to 8-bit integer data type.
- Compress weights symmetrically to 8-bit integer data type.

```python
from nncf import compress_weights
from nncf import CompressWeightsMode
compressed_model = compress_weights(model, mode=CompressWeightsMode.INT8_SYM)
```

- Compress weights symmetrically to 4-bit integer data type with group size = 128, except embeddings and last linear layers - they are compressed asymmetrically to 8-bit integer data type.

```python
from nncf import compress_weights
@@ -36,7 +44,7 @@ compressed_model = compress_weights(model, mode=CompressWeightsMode.INT4_SYM)
If the accuracy or perplexity is still not satisfactory, there are 2 more hyper-parameters to tune: `group_size` and `ratio`.
A lower group size and a smaller ratio of 4-bit layers usually improve accuracy at the cost of inference speed.
Below is an example of how to compress the weights of 90% of layers to 4-bit integer asymmetrically with the group size 64, and
the rest of layers to 8-bit integer data type. The same parametrization is applicable for `INT4_SYM` mode.
the rest of layers to 8-bit asymmetric integer data type. The same parametrization is applicable for `INT4_SYM` mode.

```python
from nncf import compress_weights
@@ -45,7 +53,7 @@ compressed_model = compress_weights(model, mode=CompressWeightsMode.INT4_ASYM, g
```
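
Written out in full, a call matching that description could look like the following sketch; the `group_size` and `ratio` keyword arguments follow the parametrization described above.

```python
from nncf import compress_weights
from nncf import CompressWeightsMode

# 90% of eligible layers in 4-bit asymmetric precision with groups of 64 weights,
# the remaining layers in 8-bit asymmetric integer precision.
compressed_model = compress_weights(model, mode=CompressWeightsMode.INT4_ASYM, group_size=64, ratio=0.9)
```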

- `NF4` mode can be considered for improving accuracy, but currently models quantized to nf4 should not be faster than models
quantized to 8-bit integer. Here's the example how to compress weights to nf4 data type with group size = 128.
quantized to 8-bit asymmetric integer. Here's an example of how to compress weights to nf4 data type with group size = 128.
Different `group_size` and `ratio` are also supported.

```python
@@ -79,7 +87,7 @@ Here is the perplexity and model size before and after weight compression for di
</tr>
<tr>
<td class="tg-0pky">databricks/dolly-v2-3b</td>
<td class="tg-0pky">int8</td>
<td class="tg-0pky">int8_asym</td>
<td class="tg-0pky">5.07</td>
<td class="tg-0pky">0.05</td>
<td class="tg-0pky">2.6</td>
@@ -107,7 +115,7 @@ Here is the perplexity and model size before and after weight compression for di
</tr>
<tr>
<td class="tg-0pky">facebook/opt-6.7b</td>
<td class="tg-0pky">int8</td>
<td class="tg-0pky">int8_asym</td>
<td class="tg-0pky">4.27</td>
<td class="tg-0pky">0.01</td>
<td class="tg-0pky">6.2</td>
@@ -135,7 +143,7 @@ Here is the perplexity and model size before and after weight compression for di
</tr>
<tr>
<td class="tg-0pky">meta-llama/Llama-2-7b-chat-hf</td>
<td class="tg-0pky">int8</td>
<td class="tg-0pky">int8_asym</td>
<td class="tg-0pky">3.29</td>
<td class="tg-0pky">0.01</td>
<td class="tg-0pky">6.3</td>
@@ -163,7 +171,7 @@ Here is the perplexity and model size before and after weight compression for di
</tr>
<tr>
<td class="tg-0pky">togethercomputer/RedPajama-INCITE-7B-Instruct</td>
<td class="tg-0pky">int8</td>
<td class="tg-0pky">int8_asym</td>
<td class="tg-0pky">4.17</td>
<td class="tg-0pky">0.02</td>
<td class="tg-0pky">6.4</td>
@@ -191,7 +199,7 @@ Here is the perplexity and model size before and after weight compression for di
</tr>
<tr>
<td class="tg-0pky">meta-llama/Llama-2-13b-chat-hf</td>
<td class="tg-0pky">int8</td>
<td class="tg-0pky">int8_asym</td>
<td class="tg-0pky">2.91</td>
<td class="tg-0pky">0</td>
<td class="tg-0pky">12.1</td>
@@ -218,7 +226,7 @@ Here is the perplexity and model size before and after weight compression for di
- The algorithm is supported for OpenVINO and PyTorch models.
- The compression applies in-place.
- The compressed model is not trainable.
- INT4_SYM, INT4_ASYM and NF4 modes, grouped quantization and mixed precision selection is available for OpenVINO backend only.
- INT8_SYM, INT4_SYM, INT4_ASYM and NF4 modes, grouped quantization and mixed precision selection are available for the OpenVINO backend only.
- NF4 support is experimental - models quantized to nf4 should not be faster than models quantized to 8-bit integer.

#### Additional resources
33 changes: 33 additions & 0 deletions docs/styleguide/PyGuide.md
@@ -775,6 +775,39 @@ Always use a `.py` filename extension. Never use dashes.
Python filenames must have a `.py` extension and must not contain dashes (`-`).
This allows them to be imported and unit tested.

Avoid having `.py` files with names such as `utils` or `helpers` that are a "swiss army knife" containing many unrelated pieces of code used across the code base.
Instead, group your new code into dedicated files/modules that are named explicitly according to the purpose of the code.

Bad:

*utils.py*

```python3
def log_current_time(log_stream: LogStream):
    ...

def convert_checkpoint(ckpt: CheckpointType) -> AnotherCheckpointType:
    ...
```

Good:

*logger.py*

```python3
def log_current_time(log_stream: LogStream):
    ...
```

*checkpointing/converter.py*

```python3
class CheckpointConverter:
    # ...
    def convert(self, ckpt: CheckpointType) -> AnotherCheckpointType:
        pass
```

<a id="s4.8-main"></a>
<a id="4.8-main"></a>
<a id="main"></a>
4 changes: 2 additions & 2 deletions examples/post_training_quantization/onnx/mobilenet_v2/main.py
@@ -140,11 +140,11 @@ def transform_fn(data_item):
print("[4/7] Benchmark INT8 model:")
int8_fps = run_benchmark(int8_model_path, shape=[1, 3, 224, 224], verbose=True)

print("[5/7] Validate OpenVINO FP32 model:")
print("[5/7] Validate ONNX FP32 model in OpenVINO:")
fp32_top1 = validate(fp32_model_path, val_loader)
print(f"Accuracy @ top1: {fp32_top1:.3f}")

print("[6/7] Validate OpenVINO INT8 model:")
print("[6/7] Validate ONNX INT8 model in OpenVINO:")
int8_top1 = validate(int8_model_path, val_loader)
print(f"Accuracy @ top1: {int8_top1:.3f}")

@@ -4,4 +4,4 @@ scikit-learn
fastdownload
onnx~=1.13.1
onnxruntime~=1.14.1
openvino-dev==2023.1
openvino-dev==2023.2
@@ -39,8 +39,7 @@

DATASET_INFO = download.DownloadInfo(
name="mvtec_capsule",
url="https://www.mydrive.ch/shares/38536/3830184030e49fe74747669442f0f282/"
"download/420937454-1629951595/capsule.tar.xz",
url="https://huggingface.co/datasets/alexsu52/mvtec_capsule/resolve/main/capsule.tar.xz",
hash="380afc46701c99cb7b9a928edbe16eb5",
)
DATASET_PATH = HOME_PATH / ".cache/nncf/datasets/mvtec_capsule"
@@ -1,2 +1,2 @@
anomalib==0.6.0
openvino-dev==2023.1
openvino-dev==2023.2