|
| 1 | +from typing import Optional, Sequence, Tuple |
1 | 2 | import numba |
2 | 3 | import numpy as np |
3 | 4 | from numba import typed, types |
| 5 | +from numpy.typing import NDArray |
4 | 6 | from rdkit import Chem |
5 | 7 | from tqdm import tqdm |
6 | 8 |
|
@@ -255,6 +257,105 @@ def count_fingerprint_keys(fingerprints): |
255 | 257 | return unique_keys[order], count_arr[order], first_arr[order] |
256 | 258 |
|
257 | 259 |
|
def merge_fingerprints(
    fingerprints: Sequence[Tuple[NDArray[np.integer], NDArray[np.floating]]],
    weights: Optional[NDArray[np.floating]] = None,
) -> Tuple[NDArray[np.integer], NDArray[np.floating]]:
    """
    Merge multiple sparse Morgan (count/TF-IDF) fingerprints into a single
    weighted-average fingerprint.

    Parameters
    ----------
    fingerprints :
        Sequence of (bits, values) pairs.
        - bits: 1D integer array of bit indices (non-zero entries)
        - values: 1D float array of TF-IDF (or other) weights,
          same length as ``bits``.
    weights :
        Optional 1D array-like of length ``len(fingerprints)`` with one
        weight per fingerprint. Each fingerprint's values are scaled by
        its weight, then the merged fingerprint is normalized by the sum
        of all weights.

        - If None, all fingerprints are weighted equally (weight = 1.0).

    Returns
    -------
    merged_bits, merged_values :
        - merged_bits: 1D integer array of unique bit indices
        - merged_values: 1D float32 array of weighted-average values per
          bit (sum over all weighted fingerprints, divided by
          sum(weights)). Both arrays have consistent dtypes on every
          return path, including the empty-input cases.

    Raises
    ------
    ValueError
        If ``weights`` has the wrong length or a non-positive sum, or if
        any fingerprint's bits and values arrays differ in length.
    """

    def _empty() -> Tuple[NDArray[np.integer], NDArray[np.floating]]:
        # Single definition of the "no data" result so every return path
        # agrees on dtypes (int64 bits, float32 values).
        return np.array([], dtype=np.int64), np.array([], dtype=np.float32)

    n_fps = len(fingerprints)
    if n_fps == 0:
        return _empty()

    if weights is not None:
        w = np.asarray(weights, dtype=np.float64).ravel()
        if w.shape[0] != n_fps:
            raise ValueError(
                f"weights must have length {n_fps}, got {w.shape[0]}"
            )
        total_weight = float(w.sum())
        if total_weight <= 0.0:
            raise ValueError("Sum of weights must be positive.")
    else:
        # Equal weighting: skip per-fingerprint scaling entirely and
        # normalize by the fingerprint count.
        w = None
        total_weight = float(n_fps)

    # Collect all indices and (weighted) values before concatenating once.
    bits_list = []
    vals_list = []

    for i, (bits, vals) in enumerate(fingerprints):
        bits = np.asarray(bits)
        vals = np.asarray(vals, dtype=np.float64)

        if bits.shape[0] != vals.shape[0]:
            raise ValueError(
                f"Fingerprint {i}: bits and values must have same length, "
                f"got {bits.shape[0]} and {vals.shape[0]}"
            )

        if w is not None:
            vals = vals * w[i]

        bits_list.append(bits)
        vals_list.append(vals)

    all_bits = np.concatenate(bits_list)
    all_vals = np.concatenate(vals_list)

    if all_bits.size == 0:
        # Every fingerprint in the sequence was empty.
        return _empty()

    # Group by bit index and sum the (weighted) values. Accumulation stays
    # in float64 for accuracy; only the final result is narrowed.
    unique_bits, inverse = np.unique(all_bits, return_inverse=True)
    summed_vals = np.bincount(inverse, weights=all_vals)

    # Weighted average: divide by sum of weights.
    avg_vals = summed_vals / total_weight

    # np.unique already preserves all_bits' integer dtype; only the values
    # need tightening to float32.
    return unique_bits, avg_vals.astype(np.float32, copy=False)
| 358 | + |
258 | 359 | ### ------------------------ |
259 | 360 | ### Bit Scaling and Weighing |
260 | 361 | ### ------------------------ |
|
0 commit comments