diff --git a/.gitignore b/.gitignore index 4f61b8cbb..e2e2fe2f5 100644 --- a/.gitignore +++ b/.gitignore @@ -24,6 +24,9 @@ wheels/ *.egg-info/ .installed.cfg *.egg +# /home/svision/experiments/recognition/insightface/recognition/arcface_torch/wandb +recognition/arcface_torch/wandb/ +recognition/volumes/ # PyInstaller # Usually these files are written by a python script from a template diff --git a/recognition/Dockerfile b/recognition/Dockerfile new file mode 100644 index 000000000..a06a80809 --- /dev/null +++ b/recognition/Dockerfile @@ -0,0 +1,47 @@ +FROM nvidia/cuda:12.8.0-cudnn-runtime-ubuntu22.04 + +ENV DEBIAN_FRONTEND=noninteractive \ + PYTHONDONTWRITEBYTECODE=1 \ + PYTHONUNBUFFERED=1 \ + PIP_NO_CACHE_DIR=1 + +# System deps (opencv runtime libs + basic build tools) +RUN apt-get update && apt-get install -y --no-install-recommends \ + python3 python3-pip python3-dev python3-venv \ + git curl ca-certificates \ + build-essential pkg-config \ + libglib2.0-0 libsm6 libxext6 libxrender1 \ + && rm -rf /var/lib/apt/lists/* + +RUN python3 -m pip install --upgrade pip setuptools wheel + +# ---- PyTorch (CUDA 12.8 / cu128) ---- +ARG TORCH_VERSION=2.10.0 +ARG TORCHVISION_VERSION=0.25.0 +ARG TORCHAUDIO_VERSION=2.10.0 + +# Use extra-index-url so deps can still come from PyPI +RUN python3 -m pip install \ + torch==${TORCH_VERSION} torchvision==${TORCHVISION_VERSION} torchaudio==${TORCHAUDIO_VERSION} \ + --extra-index-url https://download.pytorch.org/whl/cu128 + +# ---- Your packages ---- +RUN python3 -m pip install \ + "numpy<1.24" \ + tensorboard \ + easydict \ + mxnet \ + onnx \ + scikit-learn \ + tqdm \ + wandb \ + "opencv-python-headless<4.10" + +# Put code into the image +WORKDIR /workspace +COPY . 
/workspace + +# If the repo has a requirements.txt, uncomment: +# RUN python3 -m pip install -r requirements.txt + +CMD ["bash"] diff --git a/recognition/README.md b/recognition/README.md index c7ddef1b6..f2fd837b4 100644 --- a/recognition/README.md +++ b/recognition/README.md @@ -1,5 +1,7 @@ ## Face Recognition +This is a fork of the original InsightFace repo, which I modified and used to train face recognition models. +Many links and descriptions remain similar to the original repo; I've just added some utility scripts and files needed to create the training / testing environment.
diff --git a/recognition/arcface_torch/backbones/iresnet.py b/recognition/arcface_torch/backbones/iresnet.py index 6f2347c92..78b4b72c0 100644 --- a/recognition/arcface_torch/backbones/iresnet.py +++ b/recognition/arcface_torch/backbones/iresnet.py @@ -146,7 +146,7 @@ def _make_layer(self, block, planes, blocks, stride=1, dilate=False): return nn.Sequential(*layers) def forward(self, x): - with torch.cuda.amp.autocast(self.fp16): + with torch.amp.autocast('cuda', enabled=self.fp16): x = self.conv1(x) x = self.bn1(x) x = self.prelu(x) diff --git a/recognition/arcface_torch/configs/merged_ms1m_glint_r100.py b/recognition/arcface_torch/configs/merged_ms1m_glint_r100.py new file mode 100644 index 000000000..4c9545eb8 --- /dev/null +++ b/recognition/arcface_torch/configs/merged_ms1m_glint_r100.py @@ -0,0 +1,37 @@ +import os + +from easydict import EasyDict as edict + +config = edict() +config.margin_list = (1.0, 0.5, 0.0) +config.network = "r100" +config.resume = False +config.output = "/output/merged_ms1m_glint_r100" +config.embedding_size = 512 +config.sample_rate = 0.2 +config.fp16 = True +config.momentum = 0.9 +config.weight_decay = 1e-4 +config.batch_size = 320 +config.lr = 0.1 +config.verbose = 2000 +config.dali = False +config.num_workers = 24 # Try to change this according to the number of CPU cores, 12 is good for 16 cores, but if you have 32 cores, you can set it to 24 or 28 + +config.rec = "/datasets/merged_ms1m_glint" +config.num_classes = 453663 +config.num_image = 22271167 +config.num_epoch = 20 +config.warmup_epoch = 2 +config.val_targets = ['lfw', 'cfp_fp', "agedb_30"] +config.save_all_states = False# To save GPU memory, only save the state of backbone, not the state of partial fc and optimizer + +config.using_wandb = True +config.wandb_key = os.getenv("WANDB_API_KEY") +config.wandb_entity = os.getenv("WANDB_ENTITY") +config.wandb_project = os.getenv("WANDB_PROJECT") +config.wandb_log_all = True +config.wandb_resume = False +config.suffix_run_name = 
"merged_ms1m_glint_r100" +config.notes = "Training r100 on merged MS1MV3 + Glint360K dataset" + diff --git a/recognition/arcface_torch/configs/ms1mv2_r100.py b/recognition/arcface_torch/configs/ms1mv2_r100.py index 36773489c..24dd8171f 100644 --- a/recognition/arcface_torch/configs/ms1mv2_r100.py +++ b/recognition/arcface_torch/configs/ms1mv2_r100.py @@ -8,18 +8,18 @@ config.margin_list = (1.0, 0.5, 0.0) config.network = "r100" config.resume = False -config.output = None +config.output = "/output/ms1mv2_r100" config.embedding_size = 512 config.sample_rate = 1.0 config.fp16 = True config.momentum = 0.9 config.weight_decay = 5e-4 -config.batch_size = 128 +config.batch_size = 64 config.lr = 0.1 config.verbose = 2000 config.dali = False -config.rec = "/train_tmp/faces_emore" +config.rec = "/datasets/faces_emore" config.num_classes = 85742 config.num_image = 5822653 config.num_epoch = 20 diff --git a/recognition/arcface_torch/dataset.py b/recognition/arcface_torch/dataset.py index 6cdbdcf05..88dc80e5f 100644 --- a/recognition/arcface_torch/dataset.py +++ b/recognition/arcface_torch/dataset.py @@ -23,7 +23,7 @@ def get_dataloader( dali = False, dali_aug = False, seed = 2048, - num_workers = 2, + num_workers = 4, ) -> Iterable: rec = os.path.join(root_dir, 'train.rec') diff --git a/recognition/arcface_torch/lr_scheduler.py b/recognition/arcface_torch/lr_scheduler.py index 6f3cda31d..439415df8 100644 --- a/recognition/arcface_torch/lr_scheduler.py +++ b/recognition/arcface_torch/lr_scheduler.py @@ -4,8 +4,8 @@ import warnings class PolynomialLRWarmup(_LRScheduler): - def __init__(self, optimizer, warmup_iters, total_iters=5, power=1.0, last_epoch=-1, verbose=False): - super().__init__(optimizer, last_epoch=last_epoch, verbose=verbose) + def __init__(self, optimizer, warmup_iters, total_iters=5, power=1.0, last_epoch=-1, **kwargs): + super().__init__(optimizer, last_epoch=last_epoch) self.total_iters = total_iters self.power = power self.warmup_iters = warmup_iters diff 
--git a/recognition/arcface_torch/partial_fc_v2.py b/recognition/arcface_torch/partial_fc_v2.py index 0752554ca..58654c75f 100644 --- a/recognition/arcface_torch/partial_fc_v2.py +++ b/recognition/arcface_torch/partial_fc_v2.py @@ -154,7 +154,7 @@ def forward( else: weight = self.weight - with torch.cuda.amp.autocast(self.fp16): + with torch.amp.autocast('cuda', enabled=self.fp16): norm_embeddings = normalize(embeddings) norm_weight_activated = normalize(weight) logits = linear(norm_embeddings, norm_weight_activated) diff --git a/recognition/arcface_torch/svision_scripts/copy_files.py b/recognition/arcface_torch/svision_scripts/copy_files.py new file mode 100644 index 000000000..e145f4162 --- /dev/null +++ b/recognition/arcface_torch/svision_scripts/copy_files.py @@ -0,0 +1,94 @@ +import pysftp +import sys, os +import time +import logging +import glob + +from multiprocessing import Pool +from multiprocessing import cpu_count +import numpy as np +import multiprocessing + +logging.raiseExceptions=False +def chunk(l, n): + # loop over the list in n-sized chunks + for i in range(0, len(l), n): + # yield the current n-sized chunk to the calling function + yield l[i: i + n] + +def copy_func(payloads): + with pysftp.Connection(host=myHostname, username=myUsername, password=myPassword, cnopts=cnopts) as sftp: + print("Connection succesfully stablished ... ") + + cpu_amount = float(cpu_count()) + cpu_id = float(multiprocessing.current_process().name.split("-")[1]) + + outputPath = payloads["output_path"] + k=0 + for input_path in payloads["input_paths"]: + sftp.put(input_path,outputPath+input_path.split("/")[-1]) + + if k%1000==0: + print("LEFT {}".format(len(payloads["input_paths"])-k)) + k+=1 + +#ip adress computera +myHostname = "10.16.107.15" +#login computer +myUsername = "umai" +#pswd computer +myPassword = "passw0rd13!" 
+ +cnopts = pysftp.CnOpts() +cnopts.hostkeys = None + +folders = '/home/umai/' +remote_folder = '/photo/' + +folders_lst = ['ud_gr_photos'] + +print('Amount of folders:', len(folders_lst)) + +for folder_name in folders_lst: + #path of 13mln photo + path = folders + folder_name + '/' + print('Source:', path) + #type of photo + file_type = '*.ldr' + #end path of photo where it will be + remote_path = remote_folder + folder_name + '/' + if not os.path.exists(remote_path): + os.makedirs(remote_path) + print('Destination:', remote_path) + + pictures = sorted(glob.glob(path + file_type)) + print(len(pictures)) + + procs = cpu_count() + procIDs = list(range(0, procs)) + + PicturesPerProc = len(pictures) / float(procs) + PicturesPerProc = int(np.ceil(PicturesPerProc)) + + chunkedPaths = list(chunk(pictures, PicturesPerProc)) + + payloads = [] + for (i, imagePaths) in enumerate(chunkedPaths): + data = { + "input_paths": imagePaths, + "output_path": remote_path + } + payloads.append(data) + #print(payloads) + + start = time.time() + + pool = Pool(processes=procs) + pool.map(copy_func, payloads) + + print("[INFO] waiting for processes to finish...") + + pool.close() + pool.join() + + print(time.time()-start) diff --git a/recognition/arcface_torch/svision_scripts/copy_files_3.py b/recognition/arcface_torch/svision_scripts/copy_files_3.py new file mode 100644 index 000000000..1cae835d1 --- /dev/null +++ b/recognition/arcface_torch/svision_scripts/copy_files_3.py @@ -0,0 +1,103 @@ +#import pysftp +import sys, os +import time +import logging +import glob +import shutil + +from multiprocessing import Pool +from multiprocessing import cpu_count +# import numpy as np +import multiprocessing +import math + +logging.raiseExceptions=False +def chunk(l, n): + # loop over the list in n-sized chunks + for i in range(0, len(l), n): + # yield the current n-sized chunk to the calling function + yield l[i: i + n] + +def copy_func(payloads): + #with pysftp.Connection(host=myHostname, 
username=myUsername, password=myPassword, cnopts=cnopts) as sftp: + #print("Connection succesfully stablished ... ") + + cpu_amount = float(cpu_count()) + cpu_id = float(multiprocessing.current_process().name.split("-")[1]) + + outputPath = payloads["output_path"] + k=0 + time1 = time.time() + for input_path in payloads["input_paths"]: + dst_dir = os.path.join(outputPath, input_path.split("/")[-2]) + os.makedirs(dst_dir, exist_ok=True) + shutil.copy2(input_path, os.path.join(dst_dir, input_path.split("/")[-1])) + + if k%1000==0: + print("LEFT {}".format(len(payloads["input_paths"])-k)) + print(time.time()-time1) + k+=1 + +#ip adress computera +myHostname = "172.30.10.117" +#login computer +myUsername = "svision" +#pswd computer +myPassword = "1q2w3e" + +# cnopts = pysftp.CnOpts() +# cnopts.hostkeys = None + +#path of 13mln photo +path = '/data/datasets/recognition/merged_ms1m_glint/' +#path = '/media/tengrilab/NewHDD/FOTO_SSD_NEW_201-19/' +#list_of_folders = next(os.walk(path))[1] + +#pictures = [] +#i=0 +#for folder in list_of_folders: +# pictures = pictures + glob.glob(path+folder+'/*') +# print(len(pictures),pictures[-1]) + #i = i+1 + #if i==5: + # break +#type of photo +file_type = '*/*' +#end path of photo where it will be +remote_path = '/home/svision/datasets/merged_ms1m_glint_copy/' + +pictures = sorted(glob.glob(path + file_type)) +print(len(pictures)) + +procs = cpu_count() +procIDs = list(range(0, procs)) + +PicturesPerProc = len(pictures) / float(procs) +PicturesPerProc = int(math.ceil(PicturesPerProc)) +print(PicturesPerProc) + +chunkedPaths = list(chunk(pictures, PicturesPerProc)) + +payloads = [] +for (i, imagePaths) in enumerate(chunkedPaths): + data = { + "input_paths": imagePaths, + "output_path": remote_path + } + payloads.append(data) +#print(payloads) + +start = time.time() + +# with pysftp.Connection(host=myHostname, username=myUsername, password=myPassword, cnopts=cnopts) as sftp: +# print("Connection succesfully stablished ... 
") + +pool = Pool(processes=procs) +pool.map(copy_func, payloads) + +print("[INFO] waiting for processes to finish...") + +pool.close() +pool.join() + +print(time.time()-start) diff --git a/recognition/arcface_torch/svision_scripts/copy_files_to_ssd.py b/recognition/arcface_torch/svision_scripts/copy_files_to_ssd.py new file mode 100644 index 000000000..6a9c671a8 --- /dev/null +++ b/recognition/arcface_torch/svision_scripts/copy_files_to_ssd.py @@ -0,0 +1,102 @@ +#import pysftp +import sys, os +import time +import logging +import glob +import shutil +import pysftp + +from multiprocessing import Pool +from multiprocessing import cpu_count +import numpy as np +import multiprocessing + +logging.raiseExceptions=False +def chunk(l, n): + # loop over the list in n-sized chunks + for i in range(0, len(l), n): + # yield the current n-sized chunk to the calling function + yield l[i: i + n] + +def copy_func(payloads): + # with pysftp.Connection(host=myHostname, username=myUsername, password=myPassword, cnopts=cnopts) as sftp: + # print("Connection succesfully stablished ... 
") + + cpu_amount = float(cpu_count()) + cpu_id = float(multiprocessing.current_process().name.split("-")[1]) + + outputPath = payloads["output_path"] + k=0 + time1 = time.time() + for input_path in payloads["input_paths"]: + with sftp.cd(outputPath+input_path.split("/")[-2]+"/"): + sftp.put(input_path) + # shutil.copy2(input_path,outputPath+input_path.split("/")[-2]+"/"+input_path.split("/")[-1]) + + if k%1000==0: + print("LEFT {}".format(len(payloads["input_paths"])-k)) + print(time.time()-time1) + k+=1 + +#ip adress computera +myHostname = "172.30.10.117" +#login computer +myUsername = "svision" +#pswd computer +myPassword = "1q2w3e" + +cnopts = pysftp.CnOpts() +cnopts.hostkeys = None + +#path of 13mln photo +path = '/data/datasets/recognition/merged_ms1m_glint/' # /mnt/ssd1/merged_ms1m_glint_copy/ +#list_of_folders = next(os.walk(path))[1] + +#pictures = [] +#i=0 +#for folder in list_of_folders: +# pictures = pictures + glob.glob(path+folder+'/*') +# print(len(pictures),pictures[-1]) + #i = i+1 + #if i==5: + # break +#type of photo +file_type = '*' +#end path of photo where it will be +remote_path = '/home/svision/datasets/merged_ms1m_glint_copy/' + +pictures = sorted(glob.glob(path + file_type)) +print(len(pictures)) + +procs = cpu_count() +procIDs = list(range(0, procs)) + +PicturesPerProc = len(pictures) / float(procs) +PicturesPerProc = int(np.ceil(PicturesPerProc)) + +chunkedPaths = list(chunk(pictures, PicturesPerProc)) + + +payloads = [] +for (i, imagePaths) in enumerate(chunkedPaths): + data = { + "input_paths": imagePaths, + "output_path": remote_path + } + payloads.append(data) +#print(payloads) + +start = time.time() + +with pysftp.Connection(host=myHostname, username=myUsername, password=myPassword, cnopts=cnopts) as sftp: + print("Connection succesfully stablished ... 
") + + pool = Pool(processes=procs) + pool.map(copy_func, payloads) + + print("[INFO] waiting for processes to finish...") + + pool.close() + pool.join() + + print(time.time()-start) diff --git a/recognition/arcface_torch/svision_scripts/merge_datasets.py b/recognition/arcface_torch/svision_scripts/merge_datasets.py new file mode 100644 index 000000000..e69de29bb diff --git a/recognition/arcface_torch/svision_scripts/unpack_images.py b/recognition/arcface_torch/svision_scripts/unpack_images.py new file mode 100644 index 000000000..7fb00e48d --- /dev/null +++ b/recognition/arcface_torch/svision_scripts/unpack_images.py @@ -0,0 +1,57 @@ +import os +import cv2 +import mxnet as mx +from tqdm import tqdm + +dataset_dir = "/datasets/ms1m-retinaface-t1" +rec_path = os.path.join(dataset_dir, "train.rec") +idx_path = os.path.join(dataset_dir, "train.idx") +out_dir = os.path.join(dataset_dir, "ms1m-retinaface-t1-images") + +os.makedirs(out_dir, exist_ok=True) + +# Open RecordIO +imgrec = mx.recordio.MXIndexedRecordIO(idx_path, rec_path, 'r') + +# Read index 0 header (usually metadata) +s = imgrec.read_idx(0) +header, _ = mx.recordio.unpack(s) + +# In many face datasets: +# - header.label[0] = total images (or start idx) +# - header.label[1] = total identities (or end idx) +# But this varies by packing script, so we just scan indices safely. 
+ +# Try to use keys from idx file +keys = list(imgrec.keys) + +print(f"Total record keys found: {len(keys)}") + +for i in tqdm(keys): + if i == 0: + continue # metadata record in many datasets + try: + s = imgrec.read_idx(i) + if s is None: + continue + header, img = mx.recordio.unpack(s) + + # decode image bytes + img_np = mx.image.imdecode(img).asnumpy() + + # label can be float, list, tuple, or numpy array + label = header.label + import numbers + if isinstance(label, numbers.Number): + person_id = int(label) + else: + person_id = int(label[0]) + + person_dir = os.path.join(out_dir, f"{person_id:07d}") + os.makedirs(person_dir, exist_ok=True) + + img_path = os.path.join(person_dir, f"{i:08d}.jpg") + cv2.imwrite(img_path, cv2.cvtColor(img_np, cv2.COLOR_RGB2BGR)) + except Exception as e: + if "empty buffer" not in str(e): + print(f"Failed at index {i}: {e}") \ No newline at end of file diff --git a/recognition/arcface_torch/train_v2.py b/recognition/arcface_torch/train_v2.py index 9563c6533..670e3cee0 100755 --- a/recognition/arcface_torch/train_v2.py +++ b/recognition/arcface_torch/train_v2.py @@ -97,7 +97,7 @@ def main(args): backbone = torch.nn.parallel.DistributedDataParallel( module=backbone, broadcast_buffers=False, device_ids=[local_rank], bucket_cap_mb=16, - find_unused_parameters=True) + find_unused_parameters=False) backbone.register_comm_hook(None, fp16_compress_hook) backbone.train() @@ -171,7 +171,7 @@ def main(args): ) loss_am = AverageMeter() - amp = torch.cuda.amp.grad_scaler.GradScaler(growth_interval=100) + amp = torch.amp.GradScaler('cuda', growth_interval=100) for epoch in range(start_epoch, cfg.num_epoch): diff --git a/recognition/arcface_torch/utils/utils_callbacks.py b/recognition/arcface_torch/utils/utils_callbacks.py index d9368073f..a49066b75 100755 --- a/recognition/arcface_torch/utils/utils_callbacks.py +++ b/recognition/arcface_torch/utils/utils_callbacks.py @@ -19,7 +19,7 @@ def __init__(self, val_targets, rec_prefix, 
summary_writer=None, image_size=(112 self.highest_acc_list: List[float] = [0.0] * len(val_targets) self.ver_list: List[object] = [] self.ver_name_list: List[str] = [] - if self.rank is 0: + if self.rank == 0: self.init_dataset(val_targets=val_targets, data_dir=rec_prefix, image_size=image_size) self.summary_writer = summary_writer @@ -59,7 +59,7 @@ def init_dataset(self, val_targets, data_dir, image_size): self.ver_name_list.append(name) def __call__(self, num_update, backbone: torch.nn.Module): - if self.rank is 0 and num_update > 0: + if self.rank == 0 and num_update > 0: backbone.eval() self.ver_test(backbone, num_update) backbone.train() diff --git a/recognition/docker-compose.yml b/recognition/docker-compose.yml new file mode 100644 index 000000000..102f760df --- /dev/null +++ b/recognition/docker-compose.yml @@ -0,0 +1,73 @@ +services: + recognition: + build: . + image: insightface-recognition:latest + container_name: insightface-recognition + stdin_open: true + tty: true + working_dir: /workspace/arcface_torch + command: "torchrun --nproc_per_node=3 train_v2.py configs/merged_ms1m_glint_r100" + environment: + - OMP_NUM_THREADS=1 + - WANDB_API_KEY=${WANDB_API_KEY} + - WANDB_ENTITY=${WANDB_ENTITY} + - WANDB_PROJECT=${WANDB_PROJECT} + deploy: + resources: + reservations: + devices: + - driver: nvidia + count: all + capabilities: [gpu] + volumes: + - .:/workspace + - /data/datasets/recognition:/datasets + - /home/svision/experiments/recognition/output:/output + shm_size: '16g' + networks: + - svision_experiments + restart: unless-stopped + + minio: + container_name: devbox-dataholder-minio + image: minio/minio:RELEASE.2023-03-20T20-16-18Z + environment: + - MINIO_ACCESS_KEY=${MINIO_USER} + - MINIO_SECRET_KEY=${MINIO_PASS} + ports: + - "10006:9001" + - "${MINIO_PUBLIC_PORT}:${MINIO_PORT}" + volumes: + - ${DOCKER_VOLUME_DIRECTORY:-.}/volumes/minio:/minio_data + command: minio server /minio_data --console-address ":9001" + healthcheck: + test: ["CMD", "curl", "-f", 
"http://localhost:${MINIO_PORT}/minio/health/live"] + interval: 30s + timeout: 20s + retries: 3 + networks: + - svision_experiments + restart: unless-stopped + + minio-mc: + image: minio/mc + container_name: devbox-minio-mc + depends_on: + minio: + condition: service_healthy + entrypoint: > + /bin/sh -c " + sleep 5 && + mc alias set myminio http://minio:9000 ${MINIO_USER} ${MINIO_PASS} && + mc mb --ignore-existing myminio/face-dataset-bucket && + mc anonymous set download myminio/face-dataset-bucket && + echo 'MinIO bucket setup completed successfully' && + exit 0 + " + networks: + - svision_experiments + +networks: + svision_experiments: + name: svision_experiments_net + driver: bridge \ No newline at end of file