image_dataset.py (forked from cytomining/DeepProfiler)

import numpy as np
import pandas as pd
import os

import deepprofiler.dataset.pixels
import deepprofiler.dataset.utils
import deepprofiler.dataset.metadata
import deepprofiler.dataset.target
import deepprofiler.imaging.boxes


class ImageLocations(object):

    def __init__(self, metadata_training, getImagePaths, targets):
        self.keys = []
        self.images = []
        self.targets = []
        self.outlines = []
        for i, r in metadata_training.iterrows():
            key, image, outl = getImagePaths(r)
            self.keys.append(key)
            self.images.append(image)
            self.targets.append([t.get_values(r) for t in targets])
            self.outlines.append(outl)
        print("Reading single-cell locations")

    def load_loc(self, params):
        # Load cell locations for one image
        i, config = params
        loc = deepprofiler.imaging.boxes.get_locations(self.keys[i], config)
        loc["ID"] = loc.index
        loc["ImageKey"] = self.keys[i]
        loc["ImagePaths"] = "#".join(self.images[i])
        loc["Target"] = self.targets[i][0]
        loc["Outlines"] = self.outlines[i]
        print("Image", i, ":", len(loc), "cells", end="\r")
        return loc

    def load_locations(self, config):
        # Use parallel tools to read all cells as quickly as possible
        process = deepprofiler.dataset.utils.Parallel(config, numProcs=config["train"]["sampling"]["workers"])
        data = process.compute(self.load_loc, [x for x in range(len(self.keys))])
        process.close()
        return data
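

# Usage sketch (illustrative, not part of the original module): ImageLocations is
# normally constructed inside ImageDataset.prepare_training_locations() below,
# roughly as follows, where `dset` is an ImageDataset and `config` is the
# DeepProfiler configuration dictionary:
#
#   image_loc = ImageLocations(dset.meta.train, dset.get_image_paths, dset.targets)
#   locations = pd.concat(image_loc.load_locations(config))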


class ImageDataset():

    def __init__(self, metadata, sampling_field, channels, dataRoot, keyGen, config):
        self.meta = metadata  # Metadata object with a valid dataframe
        self.channels = channels  # List of column names corresponding to each channel file
        self.root = dataRoot  # Path to the directory of images
        self.keyGen = keyGen  # Function that returns the image key given its record in the metadata
        self.sampling_field = sampling_field  # Field in the metadata used to sample images evenly
        self.sampling_values = metadata.data[sampling_field].unique()
        self.targets = []  # Array of tasks in a multi-task setting (only one task supported)
        self.outlines = None  # Use of outlines if available
        self.config = config  # The configuration file

    def get_image_paths(self, r):
        key = self.keyGen(r)
        list_images = [r[ch] for ch in self.channels]
        paths = [(os.path.split(r[ch]))[0] for ch in self.channels]
        image = [list_images[ch] if os.path.isdir(paths[ch]) else self.root + "/" + list_images[ch] for ch in range(len(paths))]
        outlines = self.outlines
        if outlines is not None:
            outlines = self.outlines + r["Outlines"]
        return (key, image, outlines)

    def prepare_training_locations(self):
        # Load single-cell locations in one data frame
        image_loc = ImageLocations(self.meta.train, self.get_image_paths, self.targets)
        locations = image_loc.load_locations(self.config)
        locations = pd.concat(locations)

        # Group by image and count the number of single cells per image in the column ID
        self.training_images = locations.groupby(["ImageKey", "Target"])["ID"].count().reset_index()

        workers = self.config["train"]["sampling"]["workers"]
        batch_size = self.config["train"]["model"]["params"]["batch_size"]
        cache_size = self.config["train"]["sampling"]["cache_size"]
        self.sampling_factor = self.config["train"]["sampling"]["factor"]

        # Count the total number of single cells
        self.total_single_cells = len(locations)
        # Median number of images per class
        self.sample_images = int(np.median(self.training_images.groupby("Target").count()["ID"]))
        # Number of classes
        targets = len(self.training_images["Target"].unique())
        self.config["num_classes"] = targets
        # Median number of single cells per image (column ID holds counts as a result of the groupby above)
        self.sample_locations = int(np.median(self.training_images["ID"]))
        # Set the target number of single cells per epoch, assuming a balanced set
        self.cells_per_epoch = int(targets * self.sample_images * self.sample_locations * self.sampling_factor)
        # Number of images that each worker should load at a time
        self.images_per_worker = int(batch_size / workers)
        # Percentage of all cells that will be loaded in memory at a given moment in the queue
        self.cache_coverage = 100 * (cache_size / self.cells_per_epoch)
        # Number of gradient updates required to approximately use all cells in an epoch
        self.steps_per_epoch = int(self.cells_per_epoch / batch_size)

        self.data_rotation = 0
        self.cache_records = 0
        self.shuffle_training_images()

    def show_setup(self):
        print(" || => Total single cells:", self.total_single_cells)
        print(" || => Median # of images per class:", self.sample_images)
        print(" || => Number of classes:", len(self.training_images["Target"].unique()))
        print(" || => Median # of cells per image:", self.sample_locations)
        print(" || => Approx. cells per epoch (with balanced sampling):", self.cells_per_epoch)
        print(" || => Images sampled per worker:", self.images_per_worker)
        print(" || => Cache data coverage: {}%".format(int(self.cache_coverage)))
        print(" || => Steps per epoch:", self.steps_per_epoch)

    def show_stats(self):  # Deprecated?
        # Proportion of images loaded by workers out of all images they should load in one epoch (recall)
        worker_efficiency = int(100 * (self.data_rotation / self.training_sample.shape[0]))
        # Proportion of single cells placed in the cache out of all those that should be used in one epoch
        cache_usage = int(100 * self.cache_records / self.cells_per_epoch)
        # print("Training set coverage: {}% (worker efficiency). Data rotation: {}% (cache usage).".format(
        #     worker_efficiency,
        #     cache_usage)
        # )
        self.data_rotation = 0
        self.cache_records = 0
        return {"worker_efficiency": worker_efficiency, "cache_usage": cache_usage}

    def shuffle_training_images(self):
        # Images in the original metadata file are resampled at each epoch
        sample = []
        for c in self.meta.train[self.sampling_field].unique():
            # Sample the same number of images per class. Oversample if the class has fewer images than needed
            mask = self.meta.train[self.sampling_field] == c
            available = self.meta.train[mask].shape[0]
            rec = self.meta.train[mask].sample(n=self.sample_images, replace=available < self.sample_images)
            sample.append(rec)
        # Shuffle and restart pointers. Note that the training sample holds images, not single cells.
        self.training_sample = pd.concat(sample)
        self.training_sample = self.training_sample.sample(frac=1.0).reset_index(drop=True)
        self.batch_pointer = 0

    def get_train_batch(self, lock):
        # Select the next group of available images for cropping
        lock.acquire()
        df = self.training_sample[self.batch_pointer:self.batch_pointer + self.images_per_worker].copy()
        self.batch_pointer += self.images_per_worker
        self.data_rotation += self.images_per_worker
        if self.batch_pointer > self.training_sample.shape[0]:
            self.shuffle_training_images()
        lock.release()

        # Prepare the batch and cropping information for these images
        batch = {"keys": [], "images": [], "targets": [], "locations": []}
        sample = max(1, int(self.sample_locations * self.sampling_factor))
        for k, r in df.iterrows():
            key, image, outl = self.get_image_paths(r)
            batch["keys"].append(key)
            batch["targets"].append([t.get_values(r) for t in self.targets])
            batch["images"].append(deepprofiler.dataset.pixels.openImage(image, outl))
            batch["locations"].append(deepprofiler.imaging.boxes.get_locations(key, self.config, random_sample=sample))
        return batch

    def scan(self, f, frame="train", check=lambda k: True):
        if frame == "all":
            frame = self.meta.data.iterrows()
        elif frame == "val":
            frame = self.meta.val.iterrows()
        else:
            frame = self.meta.train.iterrows()

        images = [(i, self.get_image_paths(r), r) for i, r in frame]
        for img in images:
            # img => (index, (key, paths, outlines), metadata record)
            index = img[0]
            meta = img[2]
            if check(meta):
                image = deepprofiler.dataset.pixels.openImage(img[1][1], img[1][2])
                f(index, image, meta)
        return

    def number_of_records(self, dataset):
        if dataset == "all":
            return len(self.meta.data)
        elif dataset == "val":
            return len(self.meta.val)
        elif dataset == "train":
            return len(self.meta.train)
        else:
            return 0

    def add_target(self, new_target):
        self.targets.append(new_target)


def read_dataset(config, mode="train"):
    # Read the metadata and split the dataset into training and validation
    metadata = deepprofiler.dataset.metadata.Metadata(config["paths"]["index"], dtype=None)
    if config["prepare"]["compression"]["implement"]:
        metadata.data.replace({".tiff": ".png", ".tif": ".png"}, inplace=True, regex=True)

    # Add outlines if specified
    outlines = None
    if "outlines" in config["prepare"].keys() and config["prepare"]["outlines"] != "":
        df = pd.read_csv(config["paths"]["metadata"] + "/outlines.csv")
        metadata.mergeOutlines(df)
        outlines = config["paths"]["root"] + "inputs/outlines/"

    print(metadata.data.info())

    # Split training data
    if mode == "train" and config["train"]["model"]["crop_generator"] == "crop_generator":
        split_field = config["train"]["partition"]["split_field"]
        trainingFilter = lambda df: df[split_field].isin(config["train"]["partition"]["training"])
        validationFilter = lambda df: df[split_field].isin(config["train"]["partition"]["validation"])
        metadata.splitMetadata(trainingFilter, validationFilter)

    # Create a dataset
    keyGen = lambda r: "{}/{}-{}".format(r["Metadata_Plate"], r["Metadata_Well"], r["Metadata_Site"])
    dset = ImageDataset(
        metadata,
        config["dataset"]["metadata"]["label_field"],
        config["dataset"]["images"]["channels"],
        config["paths"]["images"],
        keyGen,
        config
    )

    # Add training targets
    for t in config["train"]["partition"]["targets"]:
        new_target = deepprofiler.dataset.target.MetadataColumnTarget(t, metadata.data[t].unique())
        dset.add_target(new_target)

    # Activate outlines for masking if needed
    if config["dataset"]["locations"]["mask_objects"]:
        dset.outlines = outlines

    # When training with sampled_crop_generator, locations do not need to be read again
    if mode == "train" and config["train"]["model"]["crop_generator"] == "crop_generator":
        dset.prepare_training_locations()

    return dset
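

if __name__ == "__main__":
    # Usage sketch (not part of the original module): a minimal way to drive
    # read_dataset() from a DeepProfiler JSON configuration file passed on the
    # command line. The config is assumed to contain the keys accessed above
    # ("paths", "dataset", "prepare", "train") and to set
    # config["train"]["model"]["crop_generator"] to "crop_generator", so that
    # training locations are prepared and show_setup() has statistics to print.
    import json
    import sys

    with open(sys.argv[1]) as config_file:
        example_config = json.load(config_file)

    example_dset = read_dataset(example_config, mode="train")
    example_dset.show_setup()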