2 changes: 1 addition & 1 deletion .azure-pipelines/model-test-3x.yml
@@ -11,7 +11,7 @@ pr:
 - neural_compressor/common
 - neural_compressor/torch
 - neural_compressor/transformers
-- examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only
+- examples/deprecated/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only
 - setup.py
 - requirements_pt.txt
 - .azure-pipelines/scripts/models
2 changes: 1 addition & 1 deletion .azure-pipelines/model-test.yml
@@ -14,7 +14,7 @@ pr:
 - .azure-pipelines/model-test.yml
 - .azure-pipelines/template/docker-template.yml
 - .azure-pipelines/scripts/models
-- examples/tensorflow/oob_models/quantization/ptq
+- examples/deprecated/tensorflow/oob_models/quantization/ptq
 - .azure-pipelines/model-test.yml
 - .azure-pipelines/scripts/fwk_version.sh
 - .azure-pipelines/scripts/install_nc.sh
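Reviewer note: both pipeline triggers above now watch `examples/deprecated/...` paths. A quick, hypothetical sanity check (paths copied from the two hunks, run from the repository root) that the retargeted trigger paths still match real directories:

```sh
# Hypothetical check: PR-path filters only fire if these directories exist.
for p in \
  examples/deprecated/pytorch/nlp/huggingface_models/language-modeling/quantization/weight_only \
  examples/deprecated/tensorflow/oob_models/quantization/ptq; do
  [ -d "$p" ] || echo "missing trigger path: $p"
done
```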
4 changes: 2 additions & 2 deletions .azure-pipelines/scripts/models/env_setup.sh
@@ -51,13 +51,13 @@ SCRIPTS_PATH="/neural-compressor/.azure-pipelines/scripts/models"
 log_dir="/neural-compressor/.azure-pipelines/scripts/models"
 if [[ "${inc_new_api}" == "3x"* ]]; then
     pip install cmake==3.31.6
-    WORK_SOURCE_DIR="/neural-compressor/examples/3.x_api/${framework}"
+    WORK_SOURCE_DIR="/neural-compressor/examples/${framework}"
     git clone https://github.com/intel/intel-extension-for-transformers.git /itrex
     cd /itrex
     pip install -r requirements.txt
     pip install -v .
 else
-    WORK_SOURCE_DIR="/neural-compressor/examples/${framework}"
+    WORK_SOURCE_DIR="/neural-compressor/examples/deprecated/${framework}"
 fi

 $BOLD_YELLOW && echo "processing ${framework}-${fwk_ver}-${model}" && $RESET
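Reviewer note: after this change, 3.x runs read examples directly from `examples/<framework>` and legacy runs fall back to `examples/deprecated/<framework>`. A minimal, self-contained sketch of the branch with illustrative variable values:

```sh
#!/usr/bin/env bash
# Minimal sketch of the directory selection above; values are hypothetical.
inc_new_api="3x_pt"   # any value starting with "3x" selects the promoted layout
framework="pytorch"

if [[ "${inc_new_api}" == "3x"* ]]; then
    WORK_SOURCE_DIR="/neural-compressor/examples/${framework}"            # promoted 3.x examples
else
    WORK_SOURCE_DIR="/neural-compressor/examples/deprecated/${framework}" # legacy 2.x examples
fi
echo "${WORK_SOURCE_DIR}"   # -> /neural-compressor/examples/pytorch
```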
4 changes: 2 additions & 2 deletions .azure-pipelines/scripts/models/run_model_trigger_common.sh
@@ -58,9 +58,9 @@ function check_results() {
 log_dir="/neural-compressor/.azure-pipelines/scripts/models"
 SCRIPTS_PATH="/neural-compressor/.azure-pipelines/scripts/models"
 if [[ "${inc_new_api}" == "3x"* ]]; then
-    WORK_SOURCE_DIR="/neural-compressor/examples/3.x_api/${framework}"
-else
     WORK_SOURCE_DIR="/neural-compressor/examples/${framework}"
+else
+    WORK_SOURCE_DIR="/neural-compressor/examples/deprecated/${framework}"
 fi
 $BOLD_YELLOW && echo "processing ${framework}-${fwk_ver}-${model}" && $RESET

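Reviewer note: `env_setup.sh` and `run_model_trigger_common.sh` must agree on this layout. A hedged way to confirm nothing in CI or docs still references the retired `examples/3.x_api` prefix, run from the repository root:

```sh
# Hypothetical verification: any hit below is a straggler still pointing
# at the removed examples/3.x_api layout.
grep -rn "examples/3.x_api" .azure-pipelines/ docs/ README.md || echo "no stale references"
```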
2 changes: 1 addition & 1 deletion README.md
@@ -22,7 +22,7 @@ In particular, the tool provides the key features, typical examples, and open co
 * Support a wide range of Intel hardware such as [Intel Gaudi Al Accelerators](https://www.intel.com/content/www/us/en/products/details/processors/ai-accelerators/gaudi-overview.html), [Intel Core Ultra Processors](https://www.intel.com/content/www/us/en/products/details/processors/core-ultra.html), [Intel Xeon Scalable Processors](https://www.intel.com/content/www/us/en/products/details/processors/xeon/scalable.html), [Intel Xeon CPU Max Series](https://www.intel.com/content/www/us/en/products/details/processors/xeon/max-series.html), [Intel Data Center GPU Flex Series](https://www.intel.com/content/www/us/en/products/details/discrete-gpus/data-center-gpu/flex-series.html), and [Intel Data Center GPU Max Series](https://www.intel.com/content/www/us/en/products/details/discrete-gpus/data-center-gpu/max-series.html) with extensive testing;
 support AMD CPU, ARM CPU, and NVidia GPU through ONNX Runtime with limited testing; support NVidia GPU for some WOQ algorithms like AutoRound and HQQ.

-* Validate popular LLMs such as [LLama2](/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/llm), [Falcon](/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/llm), [GPT-J](/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/llm), [Bloom](/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/llm), [OPT](/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/llm), and more than 10,000 broad models such as [Stable Diffusion](/examples/pytorch/nlp/huggingface_models/text-to-image/quantization), [BERT-Large](/examples/pytorch/nlp/huggingface_models/text-classification/quantization/ptq_static/fx), and [ResNet50](/examples/pytorch/image_recognition/torchvision_models/quantization/ptq/cpu/fx) from popular model hubs such as [Hugging Face](https://huggingface.co/), [Torch Vision](https://pytorch.org/vision/stable/index.html), and [ONNX Model Zoo](https://github.com/onnx/models#models), with automatic [accuracy-driven](/docs/source/design.md#workflow) quantization strategies
+* Validate popular LLMs such as [LLama2](/examples/deprecated/pytorch/nlp/huggingface_models/language-modeling/quantization/llm), [Falcon](/examples/deprecated/pytorch/nlp/huggingface_models/language-modeling/quantization/llm), [GPT-J](/examples/deprecated/pytorch/nlp/huggingface_models/language-modeling/quantization/llm), [Bloom](/examples/deprecated/pytorch/nlp/huggingface_models/language-modeling/quantization/llm), [OPT](/examples/deprecated/pytorch/nlp/huggingface_models/language-modeling/quantization/llm), and more than 10,000 broad models such as [Stable Diffusion](/examples/deprecated/pytorch/nlp/huggingface_models/text-to-image/quantization), [BERT-Large](/examples/deprecated/pytorch/nlp/huggingface_models/text-classification/quantization/ptq_static/fx), and [ResNet50](/examples/deprecated/pytorch/image_recognition/torchvision_models/quantization/ptq/cpu/fx) from popular model hubs such as [Hugging Face](https://huggingface.co/), [Torch Vision](https://pytorch.org/vision/stable/index.html), and [ONNX Model Zoo](https://github.com/onnx/models#models), with automatic [accuracy-driven](/docs/source/design.md#workflow) quantization strategies

 * Collaborate with cloud marketplaces such as [Google Cloud Platform](https://console.cloud.google.com/marketplace/product/bitnami-launchpad/inc-tensorflow-intel?project=verdant-sensor-286207), [Amazon Web Services](https://aws.amazon.com/marketplace/pp/prodview-yjyh2xmggbmga#pdp-support), and [Azure](https://azuremarketplace.microsoft.com/en-us/marketplace/apps/bitnami.inc-tensorflow-intel), software platforms such as [Alibaba Cloud](https://www.intel.com/content/www/us/en/developer/articles/technical/quantize-ai-by-oneapi-analytics-on-alibaba-cloud.html), [Tencent TACO](https://new.qq.com/rain/a/20221202A00B9S00) and [Microsoft Olive](https://github.com/microsoft/Olive), and open AI ecosystem such as [Hugging Face](https://huggingface.co/blog/intel), [PyTorch](https://pytorch.org/tutorials/recipes/intel_neural_compressor_for_pytorch.html), [ONNX](https://github.com/onnx/models#models), [ONNX Runtime](https://github.com/microsoft/onnxruntime), and [Lightning AI](https://github.com/Lightning-AI/lightning/blob/master/docs/source-pytorch/advanced/post_training_quantization.rst)

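Reviewer note: this README hunk and every docs hunk below apply the same two mechanical rules: legacy `examples/<framework>/...` links gain a `deprecated/` segment, and `examples/3.x_api/...` links lose the `3.x_api/` segment. If the rewrite were scripted, the order of the two substitutions matters; a hypothetical one-shot version (GNU sed assumed, framework alternation abbreviated for illustration):

```sh
# Hypothetical bulk rewrite. Rule 1 must run before rule 2, otherwise the
# freshly promoted 3.x paths would immediately be re-tagged as deprecated.
find README.md docs -name '*.md' -print0 | xargs -0 sed -i \
  -e 's#examples/\(pytorch\|tensorflow\|onnxrt\|helloworld\)#examples/deprecated/\1#g' \
  -e 's#examples/3\.x_api/#examples/#g'
```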
2 changes: 1 addition & 1 deletion docs/source/3x/PT_FP8Quant.md
@@ -91,7 +91,7 @@ During runtime, Intel Neural Compressor will detect hardware automatically and t

 ## Get Start with FP8 Quantization
 [Demo Usage](https://github.com/intel/neural-compressor?tab=readme-ov-file#getting-started)
-[Computer vision example](../../../examples/3.x_api/pytorch/cv/fp8_quant)
+[Computer vision example](../../../examples/pytorch/cv/fp8_quant)

 ## Optimum-habana LLM example
 ### Overview
2 changes: 1 addition & 1 deletion docs/source/3x/PT_MXQuant.md
@@ -95,7 +95,7 @@ user_model = convert(model=user_model)

 ## Examples

-- PyTorch [huggingface models](/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/mx_quant)
+- PyTorch [huggingface models](/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/mx_quant)


 ## Reference
2 changes: 1 addition & 1 deletion docs/source/benchmark.md
@@ -57,4 +57,4 @@ fit(model="./int8.pb", conf=conf, b_dataloader=eval_dataloader)

 ## Examples

-Refer to the [Benchmark example](../../examples/helloworld/tf_example5).
+Refer to the [Benchmark example](../../examples/deprecated/helloworld/tf_example5).
6 changes: 3 additions & 3 deletions docs/source/distillation.md
@@ -107,7 +107,7 @@ model = training_func_for_nc(model)
 eval_func(model)
 ```

-For Intermediate Layer Knowledge Distillation or Self Distillation, the only difference to above launcher code is that `distil_loss_conf` should be set accordingly as shown below. More detailed settings can be found in this [example](../../examples/pytorch/nlp/huggingface_models/text-classification/optimization_pipeline/distillation_for_quantization/fx/run_glue_no_trainer.py#L510) for Intermediate Layer Knowledge Distillation and this [example](../../examples/pytorch/image_recognition/torchvision_models/self_distillation/eager/main.py#L344) for Self Distillation.
+For Intermediate Layer Knowledge Distillation or Self Distillation, the only difference to above launcher code is that `distil_loss_conf` should be set accordingly as shown below. More detailed settings can be found in this [example](../../examples/deprecated/pytorch/nlp/huggingface_models/text-classification/optimization_pipeline/distillation_for_quantization/fx/run_glue_no_trainer.py#L510) for Intermediate Layer Knowledge Distillation and this [example](../../examples/deprecated/pytorch/image_recognition/torchvision_models/self_distillation/eager/main.py#L344) for Self Distillation.

 ```python
 from neural_compressor.config import (
@@ -122,8 +122,8 @@ distil_loss_conf = IntermediateLayersKnowledgeDistillationLossConfig(layer_mappi
 distil_loss_conf = SelfKnowledgeDistillationLossConfig(layer_mappings=layer_mappings)
 ```
 ## Examples
-[Distillation PyTorch Examples](../../examples/README.md#distillation-1)
+[Distillation PyTorch Examples](../../examples/deprecated/README.md#distillation-1)
 <br>
-[Distillation TensorFlow Examples](../../examples/README.md#distillation)
+[Distillation TensorFlow Examples](../../examples/deprecated/README.md#distillation)
 <br>
 [Distillation Examples Results](./validated_model_list.md#validated-knowledge-distillation-examples)
10 changes: 5 additions & 5 deletions docs/source/mixed_precision.md
@@ -160,8 +160,8 @@ converted_model.save("./path/to/save/")

 ## Examples

-- Quick started with [helloworld example](/examples/helloworld/tf_example3)
-- PyTorch [ResNet18](/examples/pytorch/image_recognition/torchvision_models/mixed_precision/resnet18)
-- IPEX [DistilBERT base](/examples/pytorch/nlp/huggingface_models/question-answering/mixed_precision/ipex)
-- Tensorflow [ResNet50](/examples/tensorflow/image_recognition/tensorflow_models/resnet50_v1/mixed_precision)
-- ONNX Runtime [Bert base](/examples/onnxrt/nlp/huggingface_model/text_classification/mix_precision)
+- Quick started with [helloworld example](/examples/deprecated/helloworld/tf_example3)
+- PyTorch [ResNet18](/examples/deprecated/pytorch/image_recognition/torchvision_models/mixed_precision/resnet18)
+- IPEX [DistilBERT base](/examples/deprecated/pytorch/nlp/huggingface_models/question-answering/mixed_precision/ipex)
+- Tensorflow [ResNet50](/examples/deprecated/tensorflow/image_recognition/tensorflow_models/resnet50_v1/mixed_precision)
+- ONNX Runtime [Bert base](/examples/deprecated/onnxrt/nlp/huggingface_model/text_classification/mix_precision)
2 changes: 1 addition & 1 deletion docs/source/mx_quantization.md
@@ -118,7 +118,7 @@ user_model = quantize(model=user_model, quant_config=quant_config)

 ## Examples

-- PyTorch [huggingface models](/examples/3.x_api/pytorch/nlp/huggingface_models/language-modeling/quantization/mx)
+- PyTorch [huggingface models](/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/mx)


 ## Reference
2 changes: 1 addition & 1 deletion docs/source/orchestration.md
@@ -90,4 +90,4 @@ model.save('./path/to/save')

 ## Examples

-[Orchestration Examples](../../examples/README.md#orchestration)
+[Orchestration Examples](../../examples/deprecated/README.md#orchestration)
14 changes: 7 additions & 7 deletions docs/source/pruning.md
@@ -91,7 +91,7 @@ Pruning patterns defines the rules of pruned weights' arrangements in space. Int
 An advantage of channel pruning is that in some particular structure(feed forward parts in Transformers etc.), pruned channels can be removed permanently from original weights without influencing other dense channels. Via this process, we can decrease these weights' size and obtain direct improvements of inference speed, without using hardware related optimization tools like [Intel Extension for Transformers](https://github.com/intel/intel-extension-for-transformers).


-We name this process as <span id="click">**Model Auto Slim**(experimental feature)</span> and currently we have validated that this process can significantly improve some popular transformer model's inference speed. Currently this method is under development and only supports some particular structures. Please refer more details of such method in this [model slim example](../../examples/pytorch/nlp/huggingface_models/question-answering/model_slim/).
+We name this process as <span id="click">**Model Auto Slim**(experimental feature)</span> and currently we have validated that this process can significantly improve some popular transformer model's inference speed. Currently this method is under development and only supports some particular structures. Please refer more details of such method in this [model slim example](../../examples/deprecated/pytorch/nlp/huggingface_models/question-answering/model_slim/).

 - Unstructured Pruning

@@ -478,25 +478,25 @@ The pruning technique is validated on typical models across various domains (in

 - Text Classification

-Sparsity is implemented in different pruning patterns of MRPC and SST-2 tasks [Text-classification examples](../../examples/pytorch/nlp/huggingface_models/text-classification/pruning/eager).
+Sparsity is implemented in different pruning patterns of MRPC and SST-2 tasks [Text-classification examples](../../examples/deprecated/pytorch/nlp/huggingface_models/text-classification/pruning/eager).

 - Question Answering

-Multiple examples of sparse models were obtained on the SQuAD-v1.1 dataset [Question-answering examples](../../examples/pytorch/nlp/huggingface_models/question-answering/pruning/eager).
+Multiple examples of sparse models were obtained on the SQuAD-v1.1 dataset [Question-answering examples](../../examples/deprecated/pytorch/nlp/huggingface_models/question-answering/pruning/eager).

 - Language Translation (Experimental, sparsity 0.8, pattern 4x1, BLEU 25.63(dense) vs 24.35(sparse))

-Pruning Flan-T5-small model on English-Romanian translation task [Translation examples](../../examples/pytorch/nlp/huggingface_models/translation/pruning/eager).
+Pruning Flan-T5-small model on English-Romanian translation task [Translation examples](../../examples/deprecated/pytorch/nlp/huggingface_models/translation/pruning/eager).

 - Object Detection (Experimental, sparsity 0.8, pattern 4x1, mAP 0.404(dense) vs 0.381(sparse))

-Pruning on YOLOv5 model using coco dataset [Object-detection examples](../../examples/pytorch/object_detection/yolo_v5/pruning/eager).
+Pruning on YOLOv5 model using coco dataset [Object-detection examples](../../examples/deprecated/pytorch/object_detection/yolo_v5/pruning/eager).

 - Image Recognition (Experimental, sparsity 0.75, pattern 2x1, top1 acc 0.801(dense) vs 0.7895(sparse))

-Pruning on ResNet50 model using ImageNet dataset [Image-recognition examples](../../examples/pytorch/image_recognition/ResNet50/pruning/eager/).
+Pruning on ResNet50 model using ImageNet dataset [Image-recognition examples](../../examples/deprecated/pytorch/image_recognition/ResNet50/pruning/eager/).

-Please refer to [pruning examples](../../examples/README.md#Pruning-1) for more information.
+Please refer to [pruning examples](../../examples/deprecated/README.md#Pruning-1) for more information.

 ## Sparse Model Deployment

2 changes: 1 addition & 1 deletion docs/source/quantization.md
@@ -543,4 +543,4 @@ conf = PostTrainingQuantConfig(backend="itex", device="gpu")
 ## Examples

 User could refer to [examples](https://github.com/intel/neural-compressor/blob/master/examples/README.md) on how to quantize a new model.
-If user wants to quantize an onnx model with npu, please refer to this [example](../../examples/onnxrt/image_recognition/onnx_model_zoo/shufflenet/quantization/ptq_static/README.md). If user wants to quantize a pytorch model with Intel GPU, please refer to this [example](../../examples/pytorch/nlp/huggingface_models/question-answering/quantization/ptq_static/ipex/README.md).
+If user wants to quantize an onnx model with npu, please refer to this [example](../../examples/deprecated/onnxrt/image_recognition/onnx_model_zoo/shufflenet/quantization/ptq_static/README.md). If user wants to quantize a pytorch model with Intel GPU, please refer to this [example](../../examples/deprecated/pytorch/nlp/huggingface_models/question-answering/quantization/ptq_static/ipex/README.md).
2 changes: 1 addition & 1 deletion docs/source/quantization_layer_wise.md
@@ -95,4 +95,4 @@ q_model = quantization.fit(fp32_model_path, conf, calib_dataloader=dataloader)
 q_model.save(int8_model_path)
 ```

-Refer to [ONNX Runtime llama-2 LWQ example](../../examples/onnxrt/nlp/huggingface_model/text_generation/llama/quantization/weight_only)
+Refer to [ONNX Runtime llama-2 LWQ example](../../examples/deprecated/onnxrt/nlp/huggingface_model/text_generation/llama/quantization/weight_only)
2 changes: 1 addition & 1 deletion docs/source/quantization_weight_only.md
@@ -177,7 +177,7 @@ q_model = quantization.fit(model, conf, eval_func=eval_func, calib_dataloader=da
 q_model.save("saved_results")
 ```

-Refer to this [link](../../examples/onnxrt/nlp/huggingface_model/text_generation/llama/quantization/weight_only) for an example of WOQ algorithms tuning on ONNX Llama models.
+Refer to this [link](../../examples/deprecated/onnxrt/nlp/huggingface_model/text_generation/llama/quantization/weight_only) for an example of WOQ algorithms tuning on ONNX Llama models.


2 changes: 1 addition & 1 deletion docs/source/smooth_quant.md
@@ -375,7 +375,7 @@ A list of models that achieved a <1% accuracy drop is shown below.
 | databricks/dolly-v2-3b* | 0.6297 | 0.6247 | alpha=0.5, Ipex 2.1 |
 | tiiuae/falcon-7b-instruct | 0.6437 | 0.6392 | alpha=0.7, Pytorch |

-The results listed below are achieved using IPEX optimize_transformers in model initialization for better performance. Please refer to the step-by-step [instruction](../../examples/pytorch/nlp/huggingface_models/language-modeling/quantization/llm/ipex/README.md) for details.
+The results listed below are achieved using IPEX optimize_transformers in model initialization for better performance. Please refer to the step-by-step [instruction](../../examples/deprecated/pytorch/nlp/huggingface_models/language-modeling/quantization/llm/ipex/README.md) for details.
 | Model/Last token accuracy | FP32 Accuracy | INT8 (w/ SmoothQuant) | Notes |
 |:----------:|:------:|:------:|-----------------------------------|
 | LLaMa-2-7b-hf* | 0.7392 | 0.7332 | alpha=Auto, Ipex 2.1 |
2 changes: 1 addition & 1 deletion docs/source/tuning_strategies.md
@@ -495,7 +495,7 @@ To use Distributed Tuning, the number of processes should be specified to be gre
 ```shell
 mpirun -np <number_of_processes> <RUN_CMD>
 ```
-An example of distributed tuning can be reached at [ptq_static_mrpc](../../examples/pytorch/nlp/huggingface_models/text-classification/quantization/ptq_static/fx).
+An example of distributed tuning can be reached at [ptq_static_mrpc](../../examples/deprecated/pytorch/nlp/huggingface_models/text-classification/quantization/ptq_static/fx).


 ## Customize a New Tuning Strategy
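Reviewer note: for context on the `mpirun` line quoted in this hunk, a concrete but hypothetical invocation might look like the following, with the script name and flags standing in for `<RUN_CMD>` (they are not taken from this PR):

```sh
# Hedged example: distributed tuning across 4 processes; run_glue.py and its
# flags are illustrative placeholders for the example's actual launcher.
mpirun -np 4 python run_glue.py --model_name_or_path bert-base-cased --task_name mrpc --tune
```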