Skip to content

Commit ddcaf23

Browse files
authored
Merge pull request #37 from computational-metabolomics/new_features
Update portals, peaklist and peak_matrix models, and fix minor bugs
2 parents cde30e3 + 02eeca7 commit ddcaf23

15 files changed

+116
-73
lines changed

README.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ will help you to make the PR if you are new to `git`.
3232
Developers & Contributors
3333
-------------------------
3434
- Ralf J. M. Weber ([email protected]) - `University of Birmingham (UK) <http://www.birmingham.ac.uk/index.aspx>`_
35-
- Jiarui (Albert) Zhou ([email protected]) - `University of Birmingham (UK) <http://www.birmingham.ac.uk/index.aspx>`_
35+
- Jiarui (Albert) Zhou ([email protected]) - `University of Birmingham (UK) <http://www.birmingham.ac.uk/index.aspx>`_, `HIT Shenzhen (China) <http://www.hitsz.edu.cn>`_
3636
- Thomas N. Lawson ([email protected]) - `University of Birmingham (UK) <http://www.birmingham.ac.uk/index.aspx>`_
3737

3838

dimspy/models/peak_matrix.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -401,6 +401,7 @@ def rsd(self, *args, **kwargs):
401401
402402
:param args: tags or untyped tag values for RSD calculation, no value = calculate over all samples
403403
:param kwargs: typed tags for RSD calculation, , no value = calculate over all samples
404+
:param on_attr: calculate RSD on given attribute. Default = "intensity"
404405
:param flagged_only: whether to calculate on flagged peaks only. Default = True
405406
:type: numpy array
406407
@@ -413,6 +414,7 @@ def rsd(self, *args, **kwargs):
413414
corresponding rsd value will be set to np.nan.
414415
415416
"""
417+
on_attr = kwargs.pop('on_attr') if kwargs.has_key('on_attr') else 'intensity'
416418
flagged_only = kwargs.pop('flagged_only') if kwargs.has_key('flagged_only') else True
417419

418420
if self.shape[0] < 2:
@@ -423,7 +425,7 @@ def rsd(self, *args, **kwargs):
423425
unmask_peakmatrix(self, *args, **kwargs)) as m:
424426
if m.shape[0] == 0: raise AttributeError('peak matrix does not have label(s) [%s]' %
425427
join(map(lambda x: str(x)[1:-1], (args, kwargs)), ', '))
426-
ints = m.attr_matrix('intensity', flagged_only)
428+
ints = m.attr_matrix(on_attr, flagged_only)
427429
rsd = m._present_std(ints, 0, flagged_only) / m._present_mean(ints, 0, flagged_only) * 100
428430

429431
rsd[np.where(map(lambda x: len(set(x[np.nonzero(x)])) == 1, ints.T))] = np.nan # only one valid value

dimspy/models/peaklist.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -140,11 +140,16 @@ def ID(self):
140140
Property of the peaklist ID.
141141
142142
:getter: returns the peaklist ID
143+
:setter: set the peaklist ID
143144
:type: same as input ID
144145
145146
"""
146147
return self._id
147148

149+
@ID.setter
150+
def ID(self, value):
151+
self._id = str(value)
152+
148153
@property
149154
def size(self):
150155
"""

dimspy/portals/mzml_portal.py

Lines changed: 29 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -14,16 +14,21 @@
1414
import pymzml
1515
import numpy as np
1616
import zipfile
17+
from copy import deepcopy
1718
from dimspy.models.peaklist import PeakList
1819
from dimspy.experiment import mz_range_from_header
1920

2021

2122
class Mzml:
22-
def __init__(self, filename="", archive=None):
23+
def __init__(self, filename="", archive=None, preload=True):
2324
self.filename = filename
2425
self.archive = archive
26+
self._preload = preload
27+
self._cache = None
2528

2629
def run(self):
30+
if self._cache is not None: return self._cache
31+
2732
if not self.filename.lower().endswith(".mzml") and not self.filename.lower().endswith(".mzml.gz") and not self.filename.lower().endswith(".zip"):
2833
raise IOError('Incorrect file format for mzML parser')
2934
if self.archive is not None:
@@ -32,11 +37,15 @@ def run(self):
3237
zf = zipfile.ZipFile(self.archive, 'r')
3338
if self.filename not in zf.namelist():
3439
raise IOError("{} does not exist in zip file".format(self.filename))
35-
return pymzml.run.Reader('', file_object=zf.open(self.filename))
40+
dat = pymzml.run.Reader('', file_object=zf.open(self.filename))
41+
if self._preload: dat = self._cache = tuple(map(deepcopy, dat))
42+
return dat
3643
elif self.filename.lower().endswith(".mzml") or self.filename.lower().endswith(".mzml.gz"):
3744
if not os.path.isfile(self.filename):
3845
raise IOError("{} does not exist".format(self.filename))
39-
return pymzml.run.Reader(self.filename)
46+
dat = pymzml.run.Reader(self.filename)
47+
if self._preload: dat = self._cache = tuple(map(deepcopy, dat))
48+
return dat
4049
else:
4150
return None
4251

@@ -62,7 +71,10 @@ def peaklist(self, scan_id, function_noise="median"):
6271
for scan in self.run():
6372
if scan["id"] == scan_id:
6473

65-
mzs, ints = zip(*scan.peaks)
74+
if len(scan.peaks) > 0:
75+
mzs, ints = zip(*scan.peaks)
76+
else:
77+
mzs, ints = [], []
6678

6779
scan_time = scan["MS:1000016"]
6880
tic = scan["total ion current"]
@@ -101,10 +113,21 @@ def tics(self):
101113
# print self.run()[2]
102114
for scan in self.run():
103115
if scan["id"] == "TIC":
104-
tics = zip(*scan.peaks)[1]
105-
return tics
116+
return zip(*scan.peaks)[1]
106117
return
107118

119+
def injection_times(self):
120+
injection_times = {}
121+
for scan in self.run():
122+
injection_times[scan['id']] = None
123+
for element in scan.xmlTree:
124+
if "MS:1000927" == element.get('accession'):
125+
injection_times[scan['id']] = float(element.get("value"))
126+
break
127+
if scan['id'] not in injection_times:
128+
injection_times[scan['id']] = None
129+
return injection_times
130+
108131
def scan_dependents(self):
109132
l = []
110133
for scan in self.run():

dimspy/portals/thermo_raw_portal.py

Lines changed: 34 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -24,66 +24,53 @@
2424

2525

2626
def mz_range_from_header(h):
27-
"""
28-
Extract a list of headers / .
29-
:rtype: list
30-
"""
3127
return [float(m) for m in re.findall(r'([\w\.-]+)-([\w\.-]+)', h)[0]]
3228

3329

3430
class ThermoRaw:
35-
"""
36-
Extract a list of headers / .
37-
:rtype: list
38-
"""
31+
3932
def __init__(self, filename):
4033
self.run = RawFileReader.RawFileReaderAdapter.FileFactory(filename)
4134
self.run.SelectInstrument(Business.Device.MS, 1)
35+
self.filename = filename
4236

4337
def headers(self):
44-
"""
45-
Extract a particular scan from a *.raw file and return a PeakList objects
46-
:rtype: dict
47-
"""
38+
4839
sids = collections.OrderedDict()
4940
for scan_id in range(self.run.RunHeaderEx.FirstSpectrum, self.run.RunHeaderEx.LastSpectrum + 1):
5041
sids.setdefault(str(self.run.GetFilterForScanNumber(scan_id).Filter), []).append(scan_id)
5142
return sids
5243

5344
def scan_ids(self):
54-
"""
55-
Extract a particular scan from a *.raw file and return a PeakList objects
56-
:rtype: dict
57-
"""
45+
5846
sids = collections.OrderedDict()
5947
for scan_id in range(self.run.RunHeaderEx.FirstSpectrum, self.run.RunHeaderEx.LastSpectrum + 1):
6048
sids[scan_id] = str(self.run.GetFilterForScanNumber(scan_id).Filter)
6149
return sids
6250

6351
def peaklist(self, scan_id, function_noise="noise_packets"):
64-
"""
65-
Extract a particular scan from a *.raw file and return a PeakList objects
6652

67-
:param scan_ids:
68-
:rtype: list
69-
"""
7053
if function_noise not in ["noise_packets", "mean", "median", "mad"]:
7154
raise ValueError("select a function that is available [noise_packets, mean, median, mad]")
7255

7356
scan = self.run.GetCentroidStream(scan_id, False)
74-
75-
mz_ibn = zip(scan.Masses, scan.Intensities, scan.Baselines, scan.Noises) # SignalToNoise not available
76-
mz_ibn.sort()
77-
mzs, ints, baseline, noise = zip(*mz_ibn)
57+
if scan.Masses is not None:
58+
mz_ibn = zip(scan.Masses, scan.Intensities, scan.Baselines, scan.Noises) # SignalToNoise not available
59+
mz_ibn.sort()
60+
mzs, ints, baseline, noise = zip(*mz_ibn)
61+
else:
62+
mzs, ints, baseline, noise = [], [], [], []
7863

7964
if function_noise == "noise_packets":
8065
snr = [p.SignalToNoise for p in scan.GetCentroids()]
81-
elif function_noise == "median":
66+
elif function_noise == "median" and len(ints) > 0:
8267
snr = ints / np.median(ints)
83-
elif function_noise == "mean":
68+
elif function_noise == "mean" and len(ints) > 0:
8469
snr = ints / np.mean(ints)
85-
elif function_noise == "mad":
70+
elif function_noise == "mad" and len(ints) > 0:
8671
snr = ints / np.median(np.abs(np.subtract(ints, np.median(ints))))
72+
else:
73+
snr = []
8774

8875
scan_stats = self.run.GetScanStatsForScanNumber(scan_id)
8976

@@ -119,39 +106,43 @@ def peaklist(self, scan_id, function_noise="noise_packets"):
119106
tic=tic,
120107
function_noise=function_noise)
121108

122-
pl.add_attribute('snr', snr)
123-
pl.add_attribute('noise', noise)
124-
pl.add_attribute('baseline', baseline)
109+
if len(pl.mz) > 0:
110+
pl.add_attribute('snr', snr)
111+
pl.add_attribute('noise', noise)
112+
pl.add_attribute('baseline', baseline)
113+
125114
return pl
126115

127116
def peaklists(self, scan_ids, function_noise="noise_packets"):
128-
"""
129-
Extract the scans from a *.raw file and return a list of PeakList objects
130-
131-
:param scan_ids:
132-
:rtype: list
133-
134-
"""
135117
if function_noise not in ["noise_packets", "mean", "median", "mad"]:
136118
raise ValueError("select a function that is available [noise_packets, mean, median, mad]")
137119

138120
return [self.peaklist(scan_id, function_noise=function_noise) for scan_id in scan_ids]
139121

140122
def tics(self):
141-
# somehow i can not access the scans directly when run() uses an open archive object
142-
# print self.run()[2]
143-
tics = []
123+
tics = collections.OrderedDict()
144124
for scan_id in range(self.run.RunHeaderEx.FirstSpectrum, self.run.RunHeaderEx.LastSpectrum + 1):
145125
scan_stats = self.run.GetScanStatsForScanNumber(scan_id)
146-
tics.append(scan_stats.TIC)
126+
tics[scan_id].append(scan_stats.TIC)
147127
return tics
148128

129+
def injection_times(self):
130+
injection_times = collections.OrderedDict()
131+
for scan_id in range(self.run.RunHeaderEx.FirstSpectrum, self.run.RunHeaderEx.LastSpectrum + 1):
132+
extra_values = list(self.run.GetTrailerExtraInformation(scan_id).Values)
133+
extra_labels = list(self.run.GetTrailerExtraInformation(scan_id).Labels)
134+
for i, label in enumerate(extra_labels):
135+
if "Ion Injection Time (ms):" == label:
136+
injection_times[scan_id] = float(extra_values[i])
137+
if scan_id not in injection_times:
138+
injection_times[scan_id] = None
139+
return injection_times
140+
149141
def scan_dependents(self):
150142
l = []
151143
for scan_id in range(self.run.RunHeaderEx.FirstSpectrum, self.run.RunHeaderEx.LastSpectrum + 1):
152144
gsd = self.run.GetScanDependents(scan_id, 5)
153145
if gsd is not None:
154146
for i, d in enumerate(gsd.ScanDependentDetailArray):
155-
print scan_id, self.run.GetFilterForScanNumber(scan_id).Filter, d.ScanIndex, d.FilterString
156147
l.append([scan_id, d.ScanIndex])
157148
return l

dimspy/process/peak_filters.py

Lines changed: 15 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -67,46 +67,48 @@ def filter_ringing(pl, threshold, bin_size=1.0, flag_name='ringing_flag', flag_i
6767
return pl.add_attribute(flag_name, pl.intensity > (mask * threshold), is_flag=True, on_index=flag_index)
6868

6969

70-
def filter_mz_ranges(pl, mz_remove_rngs, flag_name='mz_range_remove_flag', flag_index=None):
70+
def filter_mz_ranges(pl, mz_ranges, flag_name='mz_ranges_flag', flagged_only=False, flag_index=None):
7171
"""
7272
Peaklist mz range filter.
73-
7473
:param pl: the target peaklist
75-
:param mz_remove_rngs: the mz ranges to remove. Must be in the format of [(mz_min1, mz_max2), (mz_min2, mz_max2), ...]
74+
:param mz_ranges: the mz ranges to remove. Must be in the format of [(mz_min1, mz_max2), (mz_min2, mz_max2), ...]
7675
:param flag_name: name of the new flag attribute. Default = 'mz_range_remove_flag'
7776
:param flag_index: index of the new flag to be inserted into the peaklist. Default = None
7877
:rtype: PeakList object
79-
8078
This filter will remove all the peaks whose mz values are within any of the ranges in the mz_remove_rngs.
81-
8279
"""
83-
mzrs_removed_flags = np.ones(pl.shape[0], dtype=bool)
84-
for mzr in mz_remove_rngs:
80+
if flagged_only:
81+
flags = np.ones(pl.shape[0], dtype=bool)
82+
else:
83+
flags = np.ones(pl.full_size, dtype=bool)
84+
85+
for mzr in mz_ranges:
8586
if len(mzr) != 2:
8687
raise ValueError('mzr_remove: Provide a list of "start" and "end" values for each m/z range that needs to be removed.')
8788
if mzr[0] >= mzr[1]:
8889
raise ValueError('mzr_remove: Start value cannot be larger then end value.')
89-
mzrs_removed_flags[(pl.mz >= mzr[0]) & (pl.mz <= mzr[1])] = False
90-
pl.add_attribute(flag_name, mzrs_removed_flags, is_flag=True, on_index=flag_index)
90+
flags[(pl.get_attribute("mz", flagged_only) >= mzr[0]) & (pl.get_attribute("mz", flagged_only) <= mzr[1])] = False
91+
pl.add_attribute(flag_name, flags, flagged_only=flagged_only, is_flag=True, on_index=flag_index)
9192
return pl
9293

9394

9495
# PeakMatrix filters
95-
def filter_rsd(pm, rsd_threshold, qc_tag, flag_name='rsd_flag'):
96+
def filter_rsd(pm, rsd_threshold, qc_tag, on_attr = 'intensity', flag_name='rsd_flag'):
9697
"""
9798
PeakMatrix RSD filter.
9899
99100
:param pm: the target peak matrix
100101
:param rsd_threshold: threshold of the RSD of the QC samples
101102
:param qc_tag: tag (label) to unmask qc samples
103+
:param on_attr: calculate RSD on given attribute. Default = "intensity"
102104
:param flag_name: name of the new flag. Default = 'rsd_flag'
103105
:rtype: PeakMatrix object
104106
105107
This filter will calculate the RSD values of the QC samples. A peak with a QC RSD value larger than the
106108
threshold will be unflagged.
107109
108110
"""
109-
rsd_values = pm.rsd(qc_tag)
111+
rsd_values = pm.rsd(qc_tag, on_attr = on_attr)
110112
if np.any(np.isnan(rsd_values)):
111113
logging.warning('nan found in QC rsd values, filter might not work properly')
112114

@@ -136,10 +138,10 @@ def filter_fraction(pm, fraction_threshold, within_classes=False, class_tag_type
136138
raise KeyError('must provide class tag type for within classes filtering')
137139
if not all(map(lambda t: t.has_tag_type(class_tag_type), pm.peaklist_tags)):
138140
raise AttributeError('not all tags have tag type [%s]' % class_tag_type)
139-
flg = np.ones(pm.shape[1])
141+
flg = np.zeros(pm.shape[1])
140142
for tag in pm.tags_of(class_tag_type):
141143
with unmask_peakmatrix(pm, tag) as m:
142-
flg = np.logical_and(flg, (m.fraction >= fraction_threshold))
144+
flg = np.logical_or(flg, (m.fraction >= fraction_threshold))
143145
pm.add_flag(flag_name, flg)
144146
return pm
145147

dimspy/process/replicate_processing.py

Lines changed: 13 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -119,7 +119,7 @@ def read_scans(fn, source, function_noise, min_scans=1, filter_scan_events=None)
119119
return scans
120120

121121

122-
def average_replicate_scans(name, pls, ppm=2.0, min_fraction=0.8, rsd_thres=30.0, block_size=5000, ncpus=None):
122+
def average_replicate_scans(name, pls, ppm=2.0, min_fraction=0.8, rsd_thres=30.0, rsd_on="intensity", block_size=5000, ncpus=None):
123123

124124
emlst = np.array(map(lambda x: x.size == 0, pls))
125125
if np.sum(emlst) > 0:
@@ -137,18 +137,26 @@ def average_replicate_scans(name, pls, ppm=2.0, min_fraction=0.8, rsd_thres=30.0
137137
if v is not None:
138138
pl_avg.metadata[k].append(v)
139139

140-
pl_avg.add_attribute("snr", pm.attr_mean_vector('snr'), on_index=2)
140+
if rsd_on != "intensity":
141+
pl_avg.add_attribute(rsd_on, pm.attr_mean_vector(rsd_on), on_index=2)
142+
rsd_label = "rsd_{}".format(rsd_on)
143+
shift = 1
144+
else:
145+
rsd_label = "rsd"
146+
shift = 0
147+
148+
pl_avg.add_attribute("snr", pm.attr_mean_vector('snr'), on_index=2+shift)
141149
pl_avg.add_attribute("snr_flag", np.ones(pl_avg.full_size), flagged_only=False, is_flag=True)
142150

143-
pl_avg.add_attribute("rsd", pm.rsd(flagged_only=False), on_index=5)
151+
pl_avg.add_attribute(rsd_label, pm.rsd(on_attr=rsd_on, flagged_only=False), on_index=5+shift)
144152

145153
if min_fraction is not None:
146154
pl_avg.add_attribute("fraction_flag", (pm.present / float(pm.shape[0])) >= min_fraction, flagged_only=False, is_flag=True)
147155
if rsd_thres is not None:
148156
if pm.shape[0] == 1:
149157
logging.warning('applying RSD filter on single scan, all peaks removed')
150-
rsd_flag = map(lambda x: not np.isnan(x) and x < rsd_thres, pl_avg.get_attribute("rsd", flagged_only=False))
151-
pl_avg.add_attribute("rsd_flag", rsd_flag, flagged_only=False, is_flag=True)
158+
rsd_flag = map(lambda x: not np.isnan(x) and x < rsd_thres, pl_avg.get_attribute(rsd_label, flagged_only=False))
159+
pl_avg.add_attribute("{}_flag".format(rsd_label), rsd_flag, flagged_only=False, is_flag=True)
152160
return pl_avg
153161

154162

0 commit comments

Comments
 (0)