Handles batches for instance IoU

nicolas-chaulet · nicolas-chaulet · commit 8f4ec9e6c630 · 2020-06-25T08:39:42.000Z
diff --git a/cuda/include/metrics.h b/cuda/include/metrics.h
@@ -3,4 +3,4 @@
 
 at::Tensor instance_iou_cuda(at::Tensor instance_idx, at::Tensor instance_offsets,
                              at::Tensor gt_instances, at::Tensor gt_instance_sizes,
-                             long num_gt_instances);
+                             at::Tensor num_gt_instances, at::Tensor batch);
diff --git a/cuda/src/metrics.cpp b/cuda/src/metrics.cpp
@@ -2,33 +2,48 @@
 #include "compat.h"
 #include "utils.h"
 
-void instance_iou_kernel_wrapper(int nInstance, int nProposal, long* proposals_idx,
-                                 long* proposals_offset, long* instance_labels,
-                                 long* instance_pointnum, float* proposals_iou);
+void instance_iou_kernel_wrapper(long total_gt_instances, long max_gt_instances,
+                                 const long* nInstance, int nProposal, const long* proposals_idx,
+                                 const long* proposals_offset, const long* instance_labels,
+                                 const long* offset_num_gt_instances, const long* batch,
+                                 const long* instance_pointnum, float* proposals_iou);
 
 at::Tensor instance_iou_cuda(at::Tensor instance_idx, at::Tensor instance_offsets,
                              at::Tensor gt_instances, at::Tensor gt_instance_sizes,
-                             long num_gt_instances)
+                             at::Tensor num_gt_instances, at::Tensor batch)
 {
     CHECK_CONTIGUOUS(instance_idx);
     CHECK_CONTIGUOUS(instance_offsets);
     CHECK_CONTIGUOUS(gt_instances);
     CHECK_CONTIGUOUS(gt_instance_sizes);
+    CHECK_CONTIGUOUS(num_gt_instances);
+    CHECK_CONTIGUOUS(batch);
 
     CHECK_CUDA(instance_idx);
     CHECK_CUDA(instance_offsets);
     CHECK_CUDA(gt_instances);
     CHECK_CUDA(gt_instance_sizes);
 
+    cudaSetDevice(instance_idx.get_device());
     long num_proposed_instances = instance_offsets.size(0) - 1;
+    auto total_gt_instances = (int64_t*)malloc(sizeof(int64_t));
+    cudaMemcpy(total_gt_instances, num_gt_instances.sum().DATA_PTR<int64_t>(), sizeof(int64_t),
+               cudaMemcpyDeviceToHost);
+    auto max_gt_instances = (int64_t*)malloc(sizeof(int64_t));
+    cudaMemcpy(max_gt_instances, num_gt_instances.max().DATA_PTR<int64_t>(), sizeof(int64_t),
+               cudaMemcpyDeviceToHost);
+
     at::Tensor output =
-        torch::zeros({num_proposed_instances, num_gt_instances},
+        torch::zeros({num_proposed_instances, total_gt_instances[0]},
                      at::device(gt_instances.device()).dtype(at::ScalarType::Float));
 
-    instance_iou_kernel_wrapper(num_gt_instances, num_proposed_instances,
-                                instance_idx.DATA_PTR<long>(), instance_offsets.DATA_PTR<long>(),
-                                gt_instances.DATA_PTR<long>(), gt_instance_sizes.DATA_PTR<long>(),
-                                output.DATA_PTR<float>());
+    at::Tensor offset_num_gt_instances =
+        at::cat({at::zeros(1, num_gt_instances.options()), num_gt_instances.cumsum(0)}, 0);
+    instance_iou_kernel_wrapper(
+        total_gt_instances[0], max_gt_instances[0], num_gt_instances.DATA_PTR<long>(),
+        num_proposed_instances, instance_idx.DATA_PTR<long>(), instance_offsets.DATA_PTR<long>(),
+        gt_instances.DATA_PTR<long>(), offset_num_gt_instances.DATA_PTR<long>(),
+        batch.DATA_PTR<long>(), gt_instance_sizes.DATA_PTR<long>(), output.DATA_PTR<float>());
 
     return output;
 }
diff --git a/cuda/src/metrics_gpu.cu b/cuda/src/metrics_gpu.cu
@@ -6,18 +6,25 @@
 
 #define THREADS 512
 
-__global__ void instance_iou_cuda_kernel(int nInstance, int nProposal, long* proposals_idx,
-                                         long* proposals_offset, long* instance_labels,
-                                         long* instance_pointnum, float* proposals_iou)
+__global__ void instance_iou_cuda_kernel(
+    long total_gt_instances, const long* __restrict__ nInstance, int nProposal,
+    const long* __restrict__ proposals_idx, const long* __restrict__ proposals_offset,
+    const long* __restrict__ instance_labels, const long* __restrict__ offset_num_gt_instances,
+    const long* __restrict__ batch, const long* __restrict__ instance_pointnum,
+    float* proposals_iou)
 {
     for (int proposal_id = blockIdx.x; proposal_id < nProposal; proposal_id += gridDim.x)
     {
         int start = proposals_offset[proposal_id];
         int end = proposals_offset[proposal_id + 1];
+        int sampleIdx = batch[proposals_idx[start]];
+        int sampleNInstances = nInstance[sampleIdx];
+        int instanceOffset = offset_num_gt_instances[sampleIdx];
         int proposal_total = end - start;
-        for (int instance_id = threadIdx.x; instance_id < nInstance; instance_id += blockDim.x)
+        for (int instance_id = threadIdx.x; instance_id < sampleNInstances;
+             instance_id += blockDim.x)
         {
-            int instance_total = instance_pointnum[instance_id];
+            int instance_total = instance_pointnum[instanceOffset + instance_id];
             int intersection = 0;
             for (int i = start; i < end; i++)
             {
@@ -27,7 +34,8 @@ __global__ void instance_iou_cuda_kernel(int nInstance, int nProposal, long* pro
                     intersection += 1;
                 }
             }
-            proposals_iou[proposal_id * nInstance + instance_id] =
+
+            proposals_iou[instanceOffset + instance_id + proposal_id * total_gt_instances] =
                 (float)intersection /
                 ((float)(proposal_total + instance_total - intersection) + 1e-5);
         }
@@ -39,12 +47,15 @@ __global__ void instance_iou_cuda_kernel(int nInstance, int nProposal, long* pro
 // input: instance_labels (N), long, 0~total_nInst-1, -100
 // input: instance_pointnum (total_nInst), int
 // output: proposals_iou (nProposal, total_nInst), float
-void instance_iou_kernel_wrapper(int nInstance, int nProposal, long* proposals_idx,
-                                 long* proposals_offset, long* instance_labels,
-                                 long* instance_pointnum, float* proposals_iou)
+void instance_iou_kernel_wrapper(long total_gt_instances, long max_gt_instances,
+                                 const long* nInstance, int nProposal, const long* proposals_idx,
+                                 const long* proposals_offset, const long* instance_labels,
+                                 const long* offset_num_gt_instances, const long* batch,
+                                 const long* instance_pointnum, float* proposals_iou)
 {
+    auto stream = at::cuda::getCurrentCUDAStream();
     instance_iou_cuda_kernel<<<std::min(nProposal, THREADS * THREADS),
-                               std::min(nInstance, THREADS)>>>(nInstance, nProposal, proposals_idx,
-                                                               proposals_offset, instance_labels,
-                                                               instance_pointnum, proposals_iou);
+                               std::min(max_gt_instances, (long)THREADS), 0, stream>>>(
+        total_gt_instances, nInstance, nProposal, proposals_idx, proposals_offset, instance_labels,
+        offset_num_gt_instances, batch, instance_pointnum, proposals_iou);
 }
diff --git a/test/test_metrics.py b/test/test_metrics.py
@@ -13,31 +13,44 @@
 
 
 class TestInstanceIou(unittest.TestCase):
-    def test_simple(self):
+    def test_simple(self, cuda=False):
         gt_instances = torch.tensor([1, 2, 1, 2, 2, 3, 0])
         proposed_instances = [
             torch.tensor([0, 2]),  # 100% instance 1
             torch.tensor([1, 4]),  # 2/3 of instance 2
             torch.tensor([3, 5]),  # 1/3 of instance 2 and 1/1 of instance 3
         ]
-
+        if cuda:
+            proposed_instances = [c.cuda() for c in proposed_instances]
+            gt_instances = gt_instances.cuda()
         ious = instance_iou(proposed_instances, gt_instances)
-        torch.testing.assert_allclose(ious, torch.tensor([[1, 0, 0], [0, 2 / 3.0, 0], [0, 1.0 / 4.0, 1.0 / 2.0]]))
+        torch.testing.assert_allclose(ious.cpu(), torch.tensor([[1, 0, 0], [0, 2 / 3.0, 0], [0, 1.0 / 4.0, 1.0 / 2.0]]))
 
-    @run_if_cuda
-    def test_simple_cuda(self):
-        gt_instances = torch.tensor([1, 2, 1, 2, 2, 3, 0]).cuda()
+    def test_batch(self, cuda=False):
+        gt_instances = torch.tensor([1, 2, 1, 2, 2, 3, 0])
+        batch = torch.tensor([0, 0, 1, 1, 1, 1, 1])
         proposed_instances = [
-            torch.tensor([0, 2]).cuda(),  # 100% instance 1
-            torch.tensor([1, 4]).cuda(),  # 2/3 of instance 2
-            torch.tensor([3, 5]).cuda(),  # 1/3 of instance 2 and 1/1 of instance 3
+            torch.tensor([0, 1]),  # 50% instance 1, 50% instance 2 of sample 1
+            torch.tensor([3, 4]),  # 100% instance 2 of sample 2
+            torch.tensor([5]),  # 100% of instance 3 of sample 2
         ]
-
-        ious = instance_iou(proposed_instances, gt_instances)
+        if cuda:
+            proposed_instances = [c.cuda() for c in proposed_instances]
+            gt_instances = gt_instances.cuda()
+            batch = batch.cuda()
+        ious = instance_iou(proposed_instances, gt_instances, batch=batch)
         torch.testing.assert_allclose(
-            ious, torch.tensor([[1, 0, 0], [0, 2 / 3.0, 0], [0, 1.0 / 4.0, 1.0 / 2.0]]).cuda(),
+            ious.cpu(), torch.tensor([[0.5, 0.5, 0, 0, 0], [0, 0, 0, 1, 0], [0, 0, 0, 0, 1],]),
         )
 
+    @run_if_cuda
+    def test_simple_cuda(self):
+        self.test_simple(cuda=True)
+
+    @run_if_cuda
+    def test_batch_cuda(self):
+        self.test_batch(cuda=True)
+
     @run_if_cuda
     def test_same(self):
         gt_instances = torch.randint(0, 10, (1000,))
diff --git a/torch_points_kernels/metrics.py b/torch_points_kernels/metrics.py
@@ -1,5 +1,5 @@
 import torch
-from typing import List
+from typing import List, Optional
 import numpy as np
 import numba
 
@@ -8,23 +8,31 @@
 
 
 @numba.jit(nopython=True, parallel=True)
-def _instance_iou_cpu(instance_idx, instance_offsets, gt_instances, gt_instance_sizes, num_gt_instances):
+def _instance_iou_cpu(
+    instance_idx, instance_offsets, gt_instances, gt_instance_sizes, num_gt_instances: np.array, batch: np.array,
+):
     num_proposed_instances = len(instance_offsets) - 1
-    iou = np.zeros((num_proposed_instances, num_gt_instances))
+    iou = np.zeros((num_proposed_instances, num_gt_instances.sum()))
+    offset_num_gt_instances = np.concatenate((np.array([0]), num_gt_instances.cumsum()))
     for proposed_instance in range(num_proposed_instances):
         instance = instance_idx[instance_offsets[proposed_instance] : instance_offsets[proposed_instance + 1]]
-        for instance_id in numba.prange(1, num_gt_instances + 1):
+        sample_idx = batch[instance[0]]
+        gt_count_offset = offset_num_gt_instances[sample_idx]
+        sample_instance_count = num_gt_instances[sample_idx]
+        for instance_id in numba.prange(1, sample_instance_count + 1):
             intersection = 0
             for idx in instance:
                 if gt_instances[idx] == instance_id:
                     intersection += 1
-            iou[proposed_instance, instance_id - 1] = intersection / float(
-                len(instance) + gt_instance_sizes[instance_id - 1] - intersection
+            iou[proposed_instance, gt_count_offset + instance_id - 1] = intersection / float(
+                len(instance) + gt_instance_sizes[gt_count_offset + instance_id - 1] - intersection
             )
     return iou
 
 
-def instance_iou(instance_idx: List[torch.Tensor], gt_instances: torch.Tensor):
+def instance_iou(
+    instance_idx: List[torch.Tensor], gt_instances: torch.Tensor, batch: Optional[torch.Tensor] = None,
+):
     """ Computes the IoU between each proposed instance in instance_idx and ground truth instances. Returns a
     tensor of shape [instance_idx.shape[0], num_instances] that contains the iou between the proposed instances and all gt instances
     Instance label 0 is reserved for non instance points
@@ -41,29 +49,48 @@ def instance_iou(instance_idx: List[torch.Tensor], gt_instances: torch.Tensor):
     -------
     ious: torch.Tensor[nb_proposals, nb_groundtruth]
     """
+    if batch is None:
+        batch = torch.zeros_like(gt_instances)
+
+    # Gather number of gt instances per batch and size of those instances
     gt_instance_sizes = []
-    num_gt_instances = torch.max(gt_instances).item()
-    for instance_id in range(1, num_gt_instances + 1):
-        gt_instance_sizes.append(torch.sum(gt_instances == instance_id))
+    num_gt_instances = []
+    batch_size = batch[-1] + 1
+    for s in range(batch_size):
+        batch_mask = batch == s
+        sample_gt_instances = gt_instances[batch_mask]
+        sample_num_gt_instances = torch.max(sample_gt_instances).item()
+        num_gt_instances.append(sample_num_gt_instances)
+        for instance_id in range(1, sample_num_gt_instances + 1):
+            gt_instance_sizes.append(torch.sum(sample_gt_instances == instance_id))
     gt_instance_sizes = torch.stack(gt_instance_sizes)
+    num_gt_instances = torch.tensor(num_gt_instances)
 
+    # Instance offset when flatten
     instance_offsets = [0]
     cum_offset = 0
     for instance in instance_idx:
         cum_offset += instance.shape[0]
         instance_offsets.append(cum_offset)
 
+    # Compute ious
     instance_idx = torch.cat(instance_idx)
     if gt_instances.is_cuda:
         return tpcuda.instance_iou_cuda(
-            instance_idx, torch.tensor(instance_offsets).cuda(), gt_instances, gt_instance_sizes, num_gt_instances,
+            instance_idx.cuda(),
+            torch.tensor(instance_offsets).cuda(),
+            gt_instances.cuda(),
+            gt_instance_sizes.cuda(),
+            num_gt_instances.cuda(),
+            batch.cuda(),
         )
     else:
         res = _instance_iou_cpu(
             instance_idx.numpy(),
             np.asarray(instance_offsets),
             gt_instances.numpy(),
             gt_instance_sizes.numpy(),
-            num_gt_instances,
+            num_gt_instances.numpy(),
+            batch.numpy(),
         )
         return torch.tensor(res).float()