diff --git a/cv/classification/README.md b/cv/classification/README.md index 4102038..97b926d 100644 --- a/cv/classification/README.md +++ b/cv/classification/README.md @@ -105,5 +105,18 @@ Bash script `infer.sh` is used to infer the trained model. sh infer.sh ``` +### Multi-Device Support (Experimental) +This branch introduces preliminary support for running on different device types. By default, the training script now automatically selects the best available device in the following order of priority: +1. CUDA (GPU) +2. NPU (if oneflow_npu is installed) +3. CPU (fallback) + +If you want to explicitly run on a specific device (e.g., NPU), you can still override the default by adding the following argument to your train.sh command: + +```bash +--device=npu +``` + +> Note: The label_smoothing feature is currently not supported in this branch. If your configuration file (e.g., configs/default_settings.yaml) includes label_smoothing, please disable it (set it to 0.0) to avoid errors. diff --git a/cv/classification/main.py b/cv/classification/main.py index b780d7e..6841b78 100755 --- a/cv/classification/main.py +++ b/cv/classification/main.py @@ -7,6 +7,7 @@ import argparse import datetime import numpy as np +import importlib.util import oneflow as flow import oneflow.backends.cudnn as cudnn @@ -39,6 +40,15 @@ def build_model(config): return model +def detect_device(): + if flow.cuda.is_available(): + return "cuda" + elif importlib.util.find_spec("oneflow_npu") is not None: + return "npu" + else: + return "cpu" + + def parse_option(): parser = argparse.ArgumentParser( "Flowvision image classification training and evaluation script", add_help=False @@ -122,10 +132,17 @@ def parse_option(): required=False, help="local rank for DistributedDataParallel", ) + parser.add_argument( + "--device", + type=str, + default=detect_device(), + help="Specify the device to run the model on. 
Options: 'cuda', 'cpu', or 'npu'.", + ) args, unparsed = parser.parse_known_args() config = get_config(args) + config["DEVICE"] = args.device.lower() return args, config @@ -141,7 +158,7 @@ def main(config): logger.info(f"Creating model:{config.MODEL.ARCH}") model = build_model(config) - model.cuda() + model.to(config.DEVICE) optimizer = build_optimizer(config, model) model = flow.nn.parallel.DistributedDataParallel(model, broadcast_buffers=False, use_bucket=False) @@ -255,8 +272,8 @@ def train_one_epoch( start = time.time() end = time.time() for idx, (samples, targets) in enumerate(data_loader): - samples = samples.cuda() - targets = targets.cuda() + samples = samples.to(config.DEVICE) + targets = targets.to(config.DEVICE).to(flow.int32) if mixup_fn is not None: samples, targets = mixup_fn(samples, targets) @@ -324,8 +341,8 @@ def validate(config, data_loader, model): end = time.time() for idx, (images, target) in enumerate(data_loader): - images = images.cuda() - target = target.cuda() + images = images.to(config.DEVICE) + target = target.to(config.DEVICE).to(flow.int32) # compute output output = model(images) @@ -370,7 +387,7 @@ def throughput(data_loader, model, logger): model.eval() for idx, (images, _) in enumerate(data_loader): - images = images.cuda() + images = images.to(config.DEVICE) batch_size = images.shape[0] for i in range(50): model(images) @@ -453,4 +470,7 @@ def throughput(data_loader, model, logger): # print config logger.info(config.dump()) + if config.DEVICE == "npu": + import oneflow_npu + main(config)