diff --git a/cv/classification/README.md b/cv/classification/README.md
index 4102038..97b926d 100644
--- a/cv/classification/README.md
+++ b/cv/classification/README.md
@@ -105,5 +105,18 @@ Bash script `infer.sh` is used to infer the trained model.
 sh infer.sh
 ```
 
+### Multi-Device Support (Experimental)
 
+This branch introduces preliminary support for running on different device types. By default, the training script now automatically selects the best available device, in the following order of priority:
+1. CUDA (GPU)
+2. NPU (if oneflow_npu is installed)
+3. CPU (fallback)
+
+To run on a specific device explicitly (e.g., NPU), you can override the default by adding the following argument to your `train.sh` command:
+
+```bash
+--device=npu
+```
+
+> Note: The `label_smoothing` feature is currently not supported in this branch. If your configuration file (e.g., `configs/default_settings.yaml`) enables `label_smoothing`, please disable it (set it to 0.0) to avoid errors.
 
diff --git a/cv/classification/main.py b/cv/classification/main.py
index b780d7e..6841b78 100755
--- a/cv/classification/main.py
+++ b/cv/classification/main.py
@@ -7,6 +7,7 @@
 import argparse
 import datetime
 import numpy as np
+import importlib.util
 import oneflow as flow
 import oneflow.backends.cudnn as cudnn
 
@@ -39,6 +40,15 @@ def build_model(config):
     return model
 
 
+def detect_device():
+    """Return the preferred default device: "cuda", then "npu", else "cpu"."""
+    if flow.cuda.is_available():
+        return "cuda"
+    # The oneflow_npu package is only located here, not imported.
+    npu_spec = importlib.util.find_spec("oneflow_npu")
+    return "cpu" if npu_spec is None else "npu"
+
+
 def parse_option():
     parser = argparse.ArgumentParser(
         "Flowvision image classification training and evaluation script", add_help=False
@@ -122,10 +132,17 @@ def parse_option():
         required=False,
         help="local rank for DistributedDataParallel",
     )
+    parser.add_argument(
+        "--device",
+        type=str,
+        default=detect_device(),
+        help="Specify the device to run the model on. Options: 'cuda', 'cpu', or 'npu'.",
+    )
 
     args, unparsed = parser.parse_known_args()
 
     config = get_config(args)
+    config["DEVICE"] = args.device.lower()
 
     return args, config
 
@@ -141,7 +158,7 @@ def main(config):
 
     logger.info(f"Creating model:{config.MODEL.ARCH}")
     model = build_model(config)
-    model.cuda()
+    model.to(config.DEVICE)
 
     optimizer = build_optimizer(config, model)
     model = flow.nn.parallel.DistributedDataParallel(model, broadcast_buffers=False, use_bucket=False)
@@ -255,8 +272,8 @@ def train_one_epoch(
     start = time.time()
     end = time.time()
     for idx, (samples, targets) in enumerate(data_loader):
-        samples = samples.cuda()
-        targets = targets.cuda()
+        samples = samples.to(config.DEVICE)
+        targets = targets.to(config.DEVICE).to(flow.int32)
 
         if mixup_fn is not None:
             samples, targets = mixup_fn(samples, targets)
@@ -324,8 +341,8 @@ def validate(config, data_loader, model):
 
     end = time.time()
     for idx, (images, target) in enumerate(data_loader):
-        images = images.cuda()
-        target = target.cuda()
+        images = images.to(config.DEVICE)
+        target = target.to(config.DEVICE).to(flow.int32)
 
         # compute output
         output = model(images)
@@ -370,7 +387,7 @@ def throughput(data_loader, model, logger):
     model.eval()
 
     for idx, (images, _) in enumerate(data_loader):
-        images = images.cuda()
+        images = images.to(config.DEVICE)
         batch_size = images.shape[0]
         for i in range(50):
             model(images)
@@ -453,4 +470,7 @@ def throughput(data_loader, model, logger):
     # print config
     logger.info(config.dump())
 
+    if config.DEVICE == "npu":
+        import oneflow_npu
+
     main(config)