3 changes: 2 additions & 1 deletion dl_lib/engine/defaults.py
@@ -13,6 +13,7 @@
import logging
import os
from collections import OrderedDict
from getpass import getuser

import torch
from torch.nn.parallel import DistributedDataParallel
@@ -66,7 +67,7 @@ def default_argument_parser():
# PyTorch still may leave orphan processes in multi-gpu training.
# Therefore we use a deterministic way to obtain port,
# so that users are aware of orphan processes by seeing the port occupied.
port = 2 ** 15 + 2 ** 14 + hash(os.getuid()) % 2 ** 14
port = 2 ** 15 + 2 ** 14 + hash(getuser()) % 2 ** 14
Owner commented:

Suggested change
port = 2 ** 15 + 2 ** 14 + hash(getuser()) % 2 ** 14
port = 2 ** 15 + 2 ** 14 + hash(os.getuid() if sys.platform != "win32" else 1) % 2 ** 14
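
Editorial note (not part of the review thread): Python salts str hashes per interpreter process, so hash(getuser()) typically yields a different port on each run and the port is no longer deterministic for a given user, which is what the comment above relies on. Hashing the integer uid stays stable because hash() of a small non-negative int is the int itself. A minimal sketch of the suggested approach, assuming sys is imported in defaults.py; default_dist_port is a hypothetical helper name, not code from the PR:

# Sketch only -- default_dist_port is an illustrative helper, not part of the PR.
import os
import sys

def default_dist_port():
    # Windows has no os.getuid(), so fall back to a fixed value there.
    uid = os.getuid() if sys.platform != "win32" else 1
    return 2 ** 15 + 2 ** 14 + hash(uid) % 2 ** 14

print(default_dist_port())  # same port on every run for the same user

Running this twice prints the same port, whereas a getuser()-based variant generally would not.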

parser.add_argument("--dist-url", default="tcp://127.0.0.1:{}".format(port))
parser.add_argument(
"opts",
8 changes: 6 additions & 2 deletions dl_lib/layers/ROIAlign/ROIAlign_cuda.cu
@@ -307,6 +307,10 @@ __global__ void RoIAlignBackwardFeature(

namespace dl_lib {

int ceil_div(int a, int b){
return (a + b - 1) / b;
}

at::Tensor ROIAlign_forward_cuda(
const at::Tensor& input,
const at::Tensor& rois,
@@ -334,7 +338,7 @@ at::Tensor ROIAlign_forward_cuda(
auto output_size = num_rois * pooled_height * pooled_width * channels;
cudaStream_t stream = at::cuda::getCurrentCUDAStream();

dim3 grid(std::min(at::cuda::ATenCeilDiv(output_size, 512L), 4096L));
dim3 grid(std::min(at::cuda::ATenCeilDiv(static_cast<int64_t>(output_size), static_cast<int64_t>(512)), static_cast<int64_t>(4096)));
Owner commented:

It's better to break this long line of code.

dim3 block(512);

if (output.numel() == 0) {
@@ -390,7 +394,7 @@ at::Tensor ROIAlign_backward_cuda(

cudaStream_t stream = at::cuda::getCurrentCUDAStream();

dim3 grid(std::min(at::cuda::ATenCeilDiv(grad.numel(), 512L), 4096L));
dim3 grid(std::min(at::cuda::ATenCeilDiv(static_cast<int64_t>(grad.numel()), static_cast<int64_t>(512)), static_cast<int64_t>(4096)));
Owner commented:

Ditto (same here: please break this long line).

dim3 block(512);

// handle possibly empty gradients
30 changes: 25 additions & 5 deletions setup.py
@@ -4,6 +4,7 @@

import glob
import os
import platform

import torch
from setuptools import find_packages, setup
@@ -12,6 +13,7 @@
torch_ver = [int(x) for x in torch.__version__.split(".")[:2]]
assert torch_ver >= [1, 3], "Requires PyTorch >= 1.3"

os_name = platform.system()

def get_extensions():
this_dir = os.path.dirname(os.path.abspath(__file__))
@@ -39,6 +41,8 @@ def get_extensions():
"-D__CUDA_NO_HALF_CONVERSIONS__",
"-D__CUDA_NO_HALF2_OPERATORS__",
]
if "Windows" == os_name:
Owner commented:

Is sys.platform suitable for your case?
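
Editorial note (not part of the review thread): the two checks report the OS under different names, so either works as long as the comparison string matches. sys.platform is "win32" on Windows, "linux" on Linux and "darwin" on macOS, while platform.system() returns "Windows", "Linux" and "Darwin". A small sketch comparing the two, with is_windows as an illustrative helper name:

# Sketch only: comparing the two OS checks discussed in this PR.
import platform
import sys

def is_windows():
    # sys.platform needs no extra import beyond sys.
    return sys.platform == "win32"

assert is_windows() == (platform.system() == "Windows")

Either form would work in setup.py; the question above seems to be about which spelling to standardize on.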

extra_compile_args["nvcc"].append("-D _WIN64")

# It's better if pytorch can do this by default ..
CC = os.environ.get("CC", None)
@@ -61,13 +65,28 @@ def get_extensions():


cur_dir = os.getcwd()
with open("tools/dl_train", "w") as dl_lib_train:

if "Windows" == os_name:
dl_train_name = "tools/dl_train.bat"
dl_test_name = "tools/dl_test.bat"
head = f"set OMP_NUM_THREADS=1\n"
python_command = "python"
parameters = "%*"
elif "Linux" == os_name:
dl_train_name = "tools/dl_train"
dl_test_name = "tools/dl_test"
head = f"#!/bin/bash\n\nexport OMP_NUM_THREADS=1\n"
python_command = "python3"
parameters = "$@"
else:
raise Exception("Target OS not support")

with open(dl_train_name, "w") as dl_lib_train:
dl_lib_train.write(
head + f"python3 {os.path.join(cur_dir, 'tools', 'train_net.py')} $@")
with open("tools/dl_test", "w") as dl_lib_test:
head + f"{python_command} {os.path.join(cur_dir, 'tools', 'train_net.py')} {parameters}")
with open(dl_test_name, "w") as dl_lib_test:
dl_lib_test.write(
head + f"python3 {os.path.join(cur_dir, 'tools', 'test_net.py')} $@")
head + f"{python_command} {os.path.join(cur_dir, 'tools', 'test_net.py')} {parameters}")

setup(
name="dl_lib",
@@ -95,5 +114,6 @@ def get_extensions():
extras_require={"all": ["shapely", "psutil"]},
ext_modules=get_extensions(),
cmdclass={"build_ext": torch.utils.cpp_extension.BuildExtension},
scripts=["tools/dl_train", "tools/dl_test"],
scripts=["tools/dl_train", "tools/dl_test"] if 'Linux' == os_name
else ["tools/dl_train.bat", "tools/dl_test.bat"],
)
16 changes: 9 additions & 7 deletions tools/train_net.py
@@ -19,6 +19,7 @@
import os
import sys
sys.path.insert(0, '.') # noqa: E402
import platform

from colorama import Fore, Style

@@ -79,13 +80,14 @@ def main(args):
cfg, logger = default_setup(config, args)
model = build_model(cfg)
logger.info(f"Model structure: {model}")
file_sys = os.statvfs(cfg.OUTPUT_DIR)
free_space_Gb = (file_sys.f_bfree * file_sys.f_frsize) / 2**30
# We assume that a single dumped model is 700Mb
eval_space_Gb = (cfg.SOLVER.LR_SCHEDULER.MAX_ITER // cfg.SOLVER.CHECKPOINT_PERIOD) * 700 / 2**10
if eval_space_Gb > free_space_Gb:
logger.warning(f"{Fore.RED}Remaining space({free_space_Gb}GB) "
f"is less than ({eval_space_Gb}GB){Style.RESET_ALL}")
if "Linux" == platform.system():
Owner commented:

Suggested change
if "Linux" == platform.system():
if sys.platform == "linux":
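
Editorial note (not part of the review thread): os.statvfs exists only on Unix, which is why the free-space check is gated on the OS here. shutil.disk_usage reports free space on both Windows and Linux, so the same warning could in principle stay unconditional. A sketch under that assumption, with warn_if_low_space as an illustrative name and the 700 MB-per-checkpoint figure taken from the comment in the diff:

# Sketch only: a cross-platform variant of the free-space warning.
import shutil

def warn_if_low_space(output_dir, max_iter, checkpoint_period, logger):
    free_space_gb = shutil.disk_usage(output_dir).free / 2 ** 30
    # The diff assumes a single dumped model is roughly 700 MB.
    eval_space_gb = (max_iter // checkpoint_period) * 700 / 2 ** 10
    if eval_space_gb > free_space_gb:
        logger.warning(f"Remaining space ({free_space_gb:.1f} GB) is less than "
                       f"the estimated checkpoint footprint ({eval_space_gb:.1f} GB)")

This keeps the behaviour of the original block without any platform check.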

file_sys = os.statvfs(cfg.OUTPUT_DIR)
free_space_Gb = (file_sys.f_bfree * file_sys.f_frsize) / 2**30
# We assume that a single dumped model is 700Mb
eval_space_Gb = (cfg.SOLVER.LR_SCHEDULER.MAX_ITER // cfg.SOLVER.CHECKPOINT_PERIOD) * 700 / 2**10
if eval_space_Gb > free_space_Gb:
logger.warning(f"{Fore.RED}Remaining space({free_space_Gb}GB) "
f"is less than ({eval_space_Gb}GB){Style.RESET_ALL}")
if args.eval_only:
DetectionCheckpointer(model, save_dir=cfg.OUTPUT_DIR).resume_or_load(
cfg.MODEL.WEIGHTS, resume=args.resume