|
| 1 | +#author https://github.com/akgohain |
| 2 | + |
| 3 | +import os |
| 4 | +import gc |
| 5 | +import math |
| 6 | +import h5py |
| 7 | +import numpy as np |
| 8 | +from tqdm import tqdm |
| 9 | +import argparse |
| 10 | +from scipy import spatial |
| 11 | +from concurrent.futures import ProcessPoolExecutor, as_completed |
| 12 | + |
# Number of z-slices each worker reads per chunk; tuned for memory footprint.
chunk_size = 25 #25

# Downsampling stride applied to the y/x axes when reading each chunk.
ds_factor = 1

# Physical voxel size in nm as (z, y, x); y/x scale with the downsampling factor.
voxel_dims = (30, 8 * ds_factor, 8 * ds_factor)

# Root directory for the sample-data pipeline inputs/outputs.
sample_dir = '/home/rothmr/hydra/sample/'


# Input into args to compute for all neurons
names_20 = ["KR4", "KR5", "KR6", "SHL55", "PN3", "LUX2", "SHL20", "KR11", "KR10",
            "RGC2", "KM4", "NET12", "NET10", "NET11", "PN7", "SHL18",
            "SHL24", "SHL28", "RGC7", "SHL17"]
| 26 | + |
| 27 | + |
def process_chunk_range(params):
    """Accumulate per-label voxel-coordinate sums and counts for one z-chunk.

    Args:
        params: tuple ``(start, end, file_path, ds_factor, name)`` where
            ``[start, end)`` is the z-slice range to read from dataset
            "main", ``ds_factor`` strides the y/x axes, and ``name`` is
            unused here (kept so the caller's job tuples stay uniform).

    Returns:
        dict mapping label -> {'sum': summed (z, y, x) voxel indices with z
        shifted back to full-volume coordinates, 'count': voxel count}.
    """
    start, end, file_path, ds_factor, name = params
    chunk_stats = {}
    # SWMR mode lets multiple worker processes read the file concurrently.
    with h5py.File(file_path, 'r', swmr=True) as f:
        chunk = f["main"][start:end, ::ds_factor, ::ds_factor]

    for label in np.unique(chunk):
        if label == 0:  # background voxels carry no vesicle
            continue
        coords = np.argwhere(chunk == label)
        # Shift z into full-volume coordinates; y/x remain in downsampled
        # index space (harmless while ds_factor == 1 — TODO confirm callers
        # compensate if a larger factor is ever used).
        coords[:, 0] += start
        # np.unique yields each label exactly once per chunk, so plain
        # assignment suffices — the original merge branch was unreachable,
        # as was the coords.size == 0 guard (every unique label has voxels).
        chunk_stats[label] = {'sum': coords.sum(axis=0),
                              'count': coords.shape[0]}

    del chunk
    gc.collect()  # keep worker RSS low between chunks
    return chunk_stats
| 54 | + |
def process_chunks(file_path, ds_factor, chunk_size, name, num_workers=None):
    """Fan per-chunk label statistics out to a process pool and merge them.

    Splits the z-axis of dataset "main" into chunk_size-slice ranges, runs
    process_chunk_range on each range in parallel, and folds every partial
    result into a single label -> {'sum', 'count'} mapping.
    """
    with h5py.File(file_path, 'r', swmr=True) as f:
        depth = f["main"].shape[0]

    # One job tuple per z-range; the final range may be shorter than chunk_size.
    jobs = [
        (z0, min(z0 + chunk_size, depth), file_path, ds_factor, name)
        for z0 in range(0, depth, chunk_size)
    ]

    merged = {}
    with ProcessPoolExecutor(max_workers=num_workers) as pool:
        pending = [pool.submit(process_chunk_range, job) for job in jobs]
        progress = tqdm(as_completed(pending), total=len(pending),
                        desc="Processing chunks", ncols=100)
        for done in progress:
            for label, stats in done.result().items():
                if label not in merged:
                    merged[label] = stats
                else:
                    merged[label]['sum'] += stats['sum']
                    merged[label]['count'] += stats['count']
    return merged
| 76 | + |
| 77 | + |
def compute_metadata(chunk_stats, voxel_dims):
    """Derive per-vesicle COM, volume, and radius from accumulated stats.

    With voxel_dims = (dz, dy, dx) in nm, each label gets:
      - com: mean voxel index, sum / count (still in voxel units)
      - volume_nm: count * (dz * dy * dx)
      - radius_nm (micaela/shulin's method): sqrt(count * dy * dx / pi),
        i.e. the radius of a circle whose area equals the labelled
        cross-sectional area
    """
    metadata = {}
    dz, dy, dx = voxel_dims
    for label, stats in chunk_stats.items():
        voxel_count = stats['count']
        metadata[label] = {
            'com': stats['sum'] / voxel_count,
            'volume_nm': voxel_count * (dz * dy * dx),
            'radius_nm': math.sqrt((voxel_count * (dy * dx)) / math.pi),
        }
    return metadata
| 96 | + |
def compute_density(metadata, voxel_dims, kd_radius=500):
    """Attach local vesicle-density estimates to each entry of metadata.

    Each COM is converted from voxel indices to physical nm coordinates,
    a KDTree counts neighbours within kd_radius nm (every point counts
    itself), and density = neighbour_count / kd_radius**2 is stored along
    with a min-max normalised copy (left unnormalised when all densities
    are identical).
    """
    scale = np.asarray(voxel_dims, dtype=float)
    labels = list(metadata.keys())
    # Voxel-index COM -> physical nm coordinates, one row per vesicle.
    points = np.array([np.asarray(metadata[lbl]['com']) * scale
                       for lbl in labels])

    print("total num of ves: ", len(labels))

    # Neighbour counts within kd_radius nm (self-inclusive).
    tree = spatial.KDTree(points)
    counts = np.array([len(hits)
                       for hits in tree.query_ball_tree(tree, kd_radius)])
    density = counts / (kd_radius ** 2)

    # Min-max normalisation; degenerate (constant) case passes through.
    lo, hi = np.min(density), np.max(density)
    normalized = (density - lo) / (hi - lo) if hi > lo else density

    for idx, lbl in enumerate(labels):
        metadata[lbl]['density'] = density[idx]
        metadata[lbl]['normalized_density'] = normalized[idx]
    return metadata
| 140 | + |
def process_vesicle_data(name, vesicle_type="lv"):
    """Run the full chunked metadata pipeline for one neuron's vesicles.

    Processes either the large-vesicle ("lv") or small-vesicle ("sv")
    segmentation of `name`: computes COM, volume, radius, and KDTree
    density, prefixes every label with the vesicle type so LV/SV labels
    stay unique, and writes one metadata line per vesicle to a text file.
    Returns the label-prefixed metadata dict.
    """
    file_prefix = "vesicle_big_" if vesicle_type == "lv" else "vesicle_small_"
    file_path = f"/data/projects/weilab/dataset/hydra/results/{file_prefix}{name}_30-8-8.h5"

    # The sample pipeline supplies its own (LV-only) prediction volume.
    if name == "sample" and vesicle_type == "lv":
        file_path = f"{sample_dir}sample_data/7-13_pred_filtered.h5"

    print(f"Starting chunked processing for {vesicle_type} data of {name}...")
    chunk_stats = process_chunks(file_path, ds_factor, chunk_size, name)
    metadata = compute_metadata(chunk_stats, voxel_dims)
    metadata = compute_density(metadata, voxel_dims, kd_radius=500)

    # Prefix labels with the vesicle type so LV and SV ids never collide.
    metadata = {f"{vesicle_type}_{label}": data
                for label, data in metadata.items()}

    # Make sure the per-neuron output directory exists before writing.
    os.makedirs(f"metadata/{name}/", exist_ok=True)
    output_file = f"metadata/{name}/{name}_{vesicle_type}_com_mapping.txt"
    if name == "sample":
        output_file = f"{sample_dir}sample_outputs/sample_com_mapping.txt"

    with open(output_file, "w") as f:
        for label, data in metadata.items():
            f.write(f"{data['com']}: ('{vesicle_type}', {label}, {data['volume_nm']}, {data['radius_nm']}, {data['density']}, {data['normalized_density']})\n")
    print(f"Chunked processing complete for {vesicle_type} data of {name}!")
    return metadata
| 183 | + |
if __name__ == "__main__":
    # CLI entry point: run the sample neuron only, or the full 20-neuron sweep.
    parser = argparse.ArgumentParser()
    parser.add_argument("--which_neurons", type=str, help="all or sample?") #enter as "all" or "sample"
    which_neurons = parser.parse_args().which_neurons

    if which_neurons == "sample":
        # Only LV metadata is needed for the near-counts example pipeline.
        lv_metadata = process_vesicle_data("sample", vesicle_type="lv")
    elif which_neurons == "all":
        for name in names_20:
            lv_metadata = process_vesicle_data(name, vesicle_type="lv")
            sv_metadata = process_vesicle_data(name, vesicle_type="sv")
| 199 | + |
| 200 | + |
| 201 | + |
| 202 | + |
0 commit comments