diff --git a/Dockerfile b/Dockerfile
new file mode 100644
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,39 @@
+# Change the CUDA and cuDNN versions in the base image tag to match your environment
+FROM nvidia/cuda:11.8.0-cudnn8-devel-ubuntu22.04
+
+ENV DEBIAN_FRONTEND=noninteractive
+
+# Install libraries and dependencies
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    software-properties-common \
+    ffmpeg \
+    libsm6 \
+    libxext6 \
+    git \
+    curl \
+    unzip \
+    wget \
+    tar \
+    build-essential \
+    libopenmpi-dev \
+    libcairo2-dev \
+    pkg-config \
+    cmake \
+    libpoppler-cpp-dev \
+    poppler-utils \
+    && add-apt-repository ppa:deadsnakes/ppa \
+    && apt-get update \
+    && apt-get install -y --no-install-recommends python3.9 python3.9-distutils python3.9-dev python3-pip \
+    && apt-get clean \
+    && rm -rf /var/lib/apt/lists/*
+
+# Point python/python3 at Python 3.9 before any pip call so packages are installed for 3.9
+RUN ln -sf /usr/bin/python3.9 /usr/bin/python \
+    && ln -sf /usr/bin/python3.9 /usr/bin/python3 \
+    && python -m pip install --no-cache-dir --upgrade setuptools huggingface-hub
+
+COPY requirements.txt .
+
+RUN python -m pip install --no-cache-dir -r requirements.txt
+
+CMD [ "bash" ]
diff --git a/doclayout_yolo/engine/trainer.py b/doclayout_yolo/engine/trainer.py
--- a/doclayout_yolo/engine/trainer.py
+++ b/doclayout_yolo/engine/trainer.py
@@ -19,6 +19,7 @@
 import torch
+import torch.amp
 from torch import distributed as dist
 from torch import nn, optim
 
 from doclayout_yolo.cfg import get_cfg, get_save_dir
 from doclayout_yolo.data.utils import check_cls_dataset, check_det_dataset
@@ -226,7 +227,7 @@ def _setup_ddp(self, world_size):
         torch.cuda.set_device(RANK)
         self.device = torch.device("cuda", RANK)
         # LOGGER.info(f'DDP info: RANK {RANK}, WORLD_SIZE {world_size}, DEVICE {self.device}')
-        os.environ["NCCL_BLOCKING_WAIT"] = "1"  # set to enforce timeout
+        os.environ["TORCH_NCCL_BLOCKING_WAIT"] = "1"  # set to enforce timeout
         dist.init_process_group(
             backend="nccl" if dist.is_nccl_available() else "gloo",
             timeout=timedelta(seconds=32400),  # 3 hours
@@ -377,7 +378,7 @@ def _do_train(self, world_size=1):
                             x["momentum"] = np.interp(ni, xi, [self.args.warmup_momentum, self.args.momentum])
 
                 # Forward
-                with torch.cuda.amp.autocast(self.amp):
+                with torch.amp.autocast(device_type="cuda", enabled=self.amp):  # https://pytorch.org/docs/stable/amp.html
                     batch = self.preprocess_batch(batch)
                     self.loss, self.loss_items = self.model(batch)
                     if RANK != -1:
diff --git a/doclayout_yolo/nn/modules/g2l_crm.py b/doclayout_yolo/nn/modules/g2l_crm.py
--- a/doclayout_yolo/nn/modules/g2l_crm.py
+++ b/doclayout_yolo/nn/modules/g2l_crm.py
@@ -33,10 +33,16 @@ def __init__(self, c, dilation, k, fuse="sum", shortcut=True):
 
     def dilated_conv(self, x, dilation):
         act = self.dcv.act
-        bn = self.dcv.bn
         weight = self.dcv.conv.weight
         padding = dilation * (self.k//2)
-        return act(bn(F.conv2d(x, weight, stride=1, padding=padding, dilation=dilation)))
+        # Always forward the conv bias: it is None for a bias-free conv, so that path is unchanged.
+        # NOTE(review): presumably `bn` is only missing after Conv+BN fusion, where the folded shift is in conv.bias.
+        bias = self.dcv.conv.bias
+        x = F.conv2d(x, weight, bias, stride=1, padding=padding, dilation=dilation)
+        bn = getattr(self.dcv, "bn", None)
+        if bn is not None:
+            x = bn(x)
+        return act(x)
 
     def forward(self, x):
         """'forward()' applies the YOLO FPN to input data."""
diff --git a/doclayout_yolo/utils/autobatch.py b/doclayout_yolo/utils/autobatch.py
index 998c803..14b13b4 100644
--- a/doclayout_yolo/utils/autobatch.py
+++ b/doclayout_yolo/utils/autobatch.py
@@ -5,6 +5,7 @@
 
 import numpy as np
 import torch
+import torch.amp
 
 from doclayout_yolo.utils import DEFAULT_CFG, LOGGER, colorstr
 from doclayout_yolo.utils.torch_utils import profile
@@ -22,8 +23,7 @@ def check_train_batch_size(model, imgsz=640, amp=True):
     Returns:
         (int): Optimal batch size computed using the autobatch() function.
     """
-
-    with torch.cuda.amp.autocast(amp):
+    with torch.amp.autocast(device_type="cuda", enabled=amp):
         return autobatch(deepcopy(model).train(), imgsz)  # compute optimal batch size
 
 
diff --git a/doclayout_yolo/utils/checks.py b/doclayout_yolo/utils/checks.py
--- a/doclayout_yolo/utils/checks.py
+++ b/doclayout_yolo/utils/checks.py
@@ -13,11 +13,12 @@
 from importlib import metadata
 from pathlib import Path
 from typing import Optional
 
 import cv2
 import numpy as np
 import requests
 import torch
+import torch.amp
 from matplotlib import font_manager
 
 from doclayout_yolo.utils import (
@@ -638,7 +639,7 @@ def check_amp(model):
     def amp_allclose(m, im):
         """All close FP32 vs AMP results."""
         a = m(im, device=device, verbose=False)[0].boxes.data  # FP32 inference
-        with torch.cuda.amp.autocast(True):
+        with torch.amp.autocast(device_type="cuda", enabled=True):
             b = m(im, device=device, verbose=False)[0].boxes.data  # AMP inference
         del m
         return a.shape == b.shape and torch.allclose(a, b.float(), atol=0.5)  # close to 0.5 absolute tolerance
diff --git a/doclayout_yolo/utils/loss.py b/doclayout_yolo/utils/loss.py
index 2c65ebf..262d119 100644
--- a/doclayout_yolo/utils/loss.py
+++ b/doclayout_yolo/utils/loss.py
@@ -1,6 +1,7 @@
 # Ultralytics YOLO 🚀, AGPL-3.0 license
 
 import torch
+import torch.amp
 import torch.nn as nn
 import torch.nn.functional as F
 
@@ -30,7 +31,7 @@ def __init__(self):
     def forward(pred_score, gt_score, label, alpha=0.75, gamma=2.0):
         """Computes varfocal loss."""
         weight = alpha * pred_score.sigmoid().pow(gamma) * (1 - label) + gt_score * label
-        with torch.cuda.amp.autocast(enabled=False):
+        with torch.amp.autocast(device_type="cuda", enabled=False):
             loss = (
                 (F.binary_cross_entropy_with_logits(pred_score.float(), gt_score.float(), reduction="none") * weight)
                 .mean(1)
diff --git a/doclayout_yolo/utils/torch_utils.py b/doclayout_yolo/utils/torch_utils.py
--- a/doclayout_yolo/utils/torch_utils.py
+++ b/doclayout_yolo/utils/torch_utils.py
@@ -462,6 +462,19 @@ def update_attr(self, model, include=(), exclude=("process_group", "reducer")):
         if self.enabled:
             copy_attr(self.ema, model, include, exclude)
 
+
+def convert_to_fp16(model):
+    """
+    Convert a model to FP16 (half precision) in place.
+
+    Args:
+        model (nn.Module): Model whose floating-point parameters and buffers are cast to half precision.
+
+    Returns:
+        (nn.Module): The same model instance, now in FP16.
+    """
+    return model.half()
+
 
 def strip_optimizer(f: Union[str, Path] = "best.pt", s: str = "") -> None:
     """
@@ -483,21 +496,22 @@ def strip_optimizer(f: Union[str, Path] = "best.pt", s: str = "") -> None:
             strip_optimizer(f)
         ```
     """
-    x = torch.load(f, map_location=torch.device("cpu"))
+    # weights_only=False: the checkpoint holds pickled model objects, not just tensors, so only load trusted files
+    x = torch.load(f, map_location=torch.device("cpu"), weights_only=False)
     if "model" not in x:
         LOGGER.info(f"Skipping {f}, not a valid Ultralytics model.")
         return
 
     if hasattr(x["model"], "args"):
         x["model"].args = dict(x["model"].args)  # convert from IterableSimpleNamespace to dict
     args = {**DEFAULT_CFG_DICT, **x["train_args"]} if "train_args" in x else None  # combine args
     if x.get("ema"):
         x["model"] = x["ema"]  # replace model with ema
     for k in "optimizer", "best_fitness", "ema", "updates":  # keys
         x[k] = None
     x["epoch"] = -1
-    x["model"].half()  # to FP16
+    x["model"] = convert_to_fp16(x["model"])  # to FP16
     for p in x["model"].parameters():
         p.requires_grad = False
     x["train_args"] = {k: v for k, v in args.items() if k in DEFAULT_CFG_KEYS}  # strip non-default keys
     # x['model'].args = x['train_args']
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..4e0b700
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,83 @@
+albucore==0.0.23
+albumentations==2.0.5
+annotated-types==0.7.0
+blinker==1.4
+certifi==2025.1.31
+charset-normalizer==3.4.1
+contourpy==1.3.0
+cryptography==3.4.8
+cycler==0.12.1
+dbus-python==1.2.18
+distro==1.7.0
+eval_type_backport==0.2.2
+filelock==3.18.0
+fonttools==4.56.0
+fsspec==2025.3.0
+httplib2==0.20.2
+huggingface-hub==0.29.3
+idna==3.10
+importlib-metadata==4.6.4
+importlib_resources==6.5.2
+jeepney==0.7.1
+Jinja2==3.1.6
+keyring==23.5.0
+kiwisolver==1.4.7
+launchpadlib==1.10.16
+lazr.restfulclient==0.14.4
+lazr.uri==1.0.6
+MarkupSafe==3.0.2
+matplotlib==3.9.4
+more-itertools==8.10.0
+mpmath==1.3.0
+networkx==3.2.1
+numpy==2.0.2
+nvidia-cublas-cu12==12.4.5.8
+nvidia-cuda-cupti-cu12==12.4.127
+nvidia-cuda-nvrtc-cu12==12.4.127
+nvidia-cuda-runtime-cu12==12.4.127
+nvidia-cudnn-cu12==9.1.0.70
+nvidia-cufft-cu12==11.2.1.3
+nvidia-curand-cu12==10.3.5.147
+nvidia-cusolver-cu12==11.6.1.9
+nvidia-cusparse-cu12==12.3.1.170
+nvidia-cusparselt-cu12==0.6.2
+nvidia-nccl-cu12==2.21.5
+nvidia-nvjitlink-cu12==12.4.127
+nvidia-nvtx-cu12==12.4.127
+oauthlib==3.2.0
+opencv-python==4.11.0.86
+opencv-python-headless==4.11.0.86
+packaging==24.2
+pandas==2.2.3
+pillow==11.1.0
+psutil==7.0.0
+py-cpuinfo==9.0.0
+pydantic==2.11.0
+pydantic_core==2.33.0
+PyGObject==3.42.1
+PyJWT==2.3.0
+pyparsing==2.4.7
+python-apt==2.4.0+ubuntu4
+python-dateutil==2.9.0.post0
+pytz==2025.2
+PyYAML==6.0.2
+requests==2.32.3
+scipy==1.13.1
+seaborn==0.13.2
+SecretStorage==3.3.1
+simsimd==6.2.1
+six==1.16.0
+stringzilla==3.12.3
+sympy==1.13.1
+thop==0.1.1.post2209072238
+torch==2.6.0
+torchvision==0.21.0
+tqdm==4.67.1
+triton==3.2.0
+typing-inspection==0.4.0
+typing_extensions==4.13.0
+tzdata==2025.2
+urllib3==2.3.0
+#pycairo
+wadllib==1.3.6
+zipp==3.21.0
\ No newline at end of file