Add extra utils get_sparse_indices(...) and trunc_csr(...)

Frankie Robertson · Frankie Robertson · commit d197e5ce4271 · 2021-03-26T17:42:28.000+02:00
diff --git a/sklearn_ann/utils.py b/sklearn_ann/utils.py
@@ -7,12 +7,37 @@ def check_metric(metric, metrics):
         raise ValueError(f"Unknown metric {metric!r}. Valid metrics are {metrics!r}")
 
 
+def get_sparse_indices(mat, idx):
+    """Return the ``mat.indices`` entries that ``mat.indptr`` assigns to row ``idx``."""
+    row_bounds = slice(mat.indptr[idx], mat.indptr[idx + 1])
+    return mat.indices[row_bounds]
+
+
 def get_sparse_row(mat, idx):
     start_idx = mat.indptr[idx]
     end_idx = mat.indptr[idx + 1]
     return zip(mat.indices[start_idx:end_idx], mat.data[start_idx:end_idx])
 
 
+def trunc_csr(csr, k):
+    """Keep only the first ``k`` stored entries of each row of CSR matrix ``csr``.
+
+    Returns a new ``csr_matrix`` with ``csr.shape``; shorter rows are kept whole.
+    """
+    indptr = np.zeros_like(csr.indptr)
+    data, indices = [], []
+    for row_idx in range(len(csr.indptr) - 1):
+        start_idx = csr.indptr[row_idx]
+        end_idx = min(csr.indptr[row_idx + 1], start_idx + k)
+        data.append(csr.data[start_idx:end_idx])
+        indices.append(csr.indices[start_idx:end_idx])
+        indptr[row_idx + 1] = indptr[row_idx] + (end_idx - start_idx)
+    if not data:  # zero rows: np.concatenate() rejects an empty list
+        data, indices = [csr.data[:0]], [csr.indices[:0]]
+    kept = (np.concatenate(data), np.concatenate(indices), indptr)
+    return csr_matrix(kept, shape=csr.shape)  # inferred shape could drop columns
+
+
 def or_else_csrs(csr1, csr2):
     # Possible TODO: Could use numba/Cython to speed this up?
     if csr1.shape != csr2.shape: