Skip to content

Commit 8d880ad

Browse files
committed
MS level filter and progress
1 parent: 5d955be; commit: 8d880ad

File tree

1 file changed

+179
-151
lines changed

1 file changed

+179
-151
lines changed

Writer/ParquetSpectrumWriter.cs

Lines changed: 179 additions & 151 deletions
Original file line number | Diff line number | Diff line change
@@ -9,6 +9,7 @@
99
using ThermoFisher.CommonCore.Data.Business;
1010
using ThermoFisher.CommonCore.Data.FilterEnums;
1111
using ThermoFisher.CommonCore.Data.Interfaces;
12+
using ThermoRawFileParser.Writer.MzML;
1213

1314
namespace ThermoRawFileParser.Writer
1415
{
@@ -60,9 +61,6 @@ public override void Write(IRawDataPlus raw, int firstScanNumber, int lastScanNu
6061
throw new RawFileParserException("No MS data in RAW file, no output will be produced");
6162
}
6263

63-
//TODO: Correct iterator based on MS-level filter
64-
var enumerator = raw.GetFilteredScanEnumerator(" ");
65-
6664
ConfigureWriter(".mzparquet");
6765

6866
ParquetSerializerOptions opts = new ParquetSerializerOptions();
@@ -71,195 +69,225 @@ public override void Write(IRawDataPlus raw, int firstScanNumber, int lastScanNu
7169

7270
var data = new List<MzParquet>();
7371

74-
foreach (var scanNumber in enumerator)
75-
{
76-
var scanFilter = raw.GetFilterForScanNumber(scanNumber);
77-
78-
// Get the scan event for this scan number
79-
var scanEvent = raw.GetScanEventForScanNumber(scanNumber);
80-
81-
// Get scan ms level
82-
var msLevel = (int)scanFilter.MSOrder;
72+
var lastScanProgress = 0;
8373

84-
// Get Scan
85-
var scan = Scan.FromFile(raw, scanNumber);
74+
Log.Info(String.Format("Processing {0} MS scans", +(1 + lastScanNumber - firstScanNumber)));
8675

87-
ScanTrailer trailerData;
76+
for (var scanNumber = firstScanNumber; scanNumber <= lastScanNumber; scanNumber++)
77+
{
78+
if (ParseInput.LogFormat == LogFormat.DEFAULT)
79+
{
80+
var scanProgress = (int)((double)scanNumber / (lastScanNumber - firstScanNumber + 1) * 100);
81+
if (scanProgress % ProgressPercentageStep == 0)
82+
{
83+
if (scanProgress != lastScanProgress)
84+
{
85+
Console.Write("" + scanProgress + "% ");
86+
lastScanProgress = scanProgress;
87+
}
88+
}
89+
}
8890

8991
try
9092
{
91-
trailerData = new ScanTrailer(raw.GetTrailerExtraInformation(scanNumber));
93+
int level = (int)raw.GetScanEventForScanNumber(scanNumber).MSOrder; //applying MS level filter
94+
if (ParseInput.MsLevel.Contains(level))
95+
AddScan(raw, scanNumber, data);
9296
}
9397
catch (Exception ex)
9498
{
95-
Log.WarnFormat("Cannot load trailer infromation for scan {0} due to following exception\n{1}", scanNumber, ex.Message);
96-
ParseInput.NewWarn();
97-
trailerData = new ScanTrailer();
99+
Log.Error($"Scan #{scanNumber} cannot be processed because of the following exception: {ex.Message}\n{ex.StackTrace}");
100+
ParseInput.NewError();
98101
}
99102

100-
int? trailer_charge = trailerData.AsPositiveInt("Charge State:");
101-
double? trailer_mz = trailerData.AsDouble("Monoisotopic M/Z:");
102-
double? trailer_isolationWidth = trailerData.AsDouble("MS" + msLevel + " Isolation Width:");
103-
double? FAIMSCV = null;
104-
if (trailerData.AsBool("FAIMS Voltage On:").GetValueOrDefault(false))
105-
FAIMSCV = trailerData.AsDouble("FAIMS CV:");
106-
107-
double rt = raw.RetentionTimeFromScanNumber(scanNumber);
108-
int precursor_scan = 0;
109-
PrecursorData precursor_data = new PrecursorData
110-
{
111-
isolation_lower = null,
112-
isolation_upper = null,
113-
mz = null
114-
115-
};
116-
117-
if (msLevel == 1)
103+
// If we have enough ions to write a row group, do so
104+
// - some row groups might have more than this number of ions
105+
// but this ensures that all ions from a single scan are always
106+
// present in the same row group (critical property of mzparquet)
107+
if (data.Count >= 1_048_576)
118108
{
119-
// Keep track of scan number for precursor reference
120-
_precursorScanNumbers[""] = scanNumber;
121-
_precursorTree[scanNumber] = new PrecursorInfo();
109+
var task = ParquetSerializer.SerializeAsync(data, Writer.BaseStream, opts);
110+
task.Wait();
111+
opts.Append = true;
112+
data.Clear();
113+
Log.Debug("Writing next row group");
122114
}
123-
else if (msLevel > 1)
124-
{
125-
// Keep track of scan number and isolation m/z for precursor reference
126-
var result = _filterStringIsolationMzPattern.Match(scanEvent.ToString());
127-
if (result.Success)
128-
{
129-
if (_precursorScanNumbers.ContainsKey(result.Groups[1].Value))
130-
{
131-
_precursorScanNumbers.Remove(result.Groups[1].Value);
132-
}
115+
}
133116

134-
_precursorScanNumbers.Add(result.Groups[1].Value, scanNumber);
135-
}
117+
// serialize any remaining ions into the final row group
118+
if (data.Count > 0)
119+
{
120+
var task = ParquetSerializer.SerializeAsync(data, Writer.BaseStream, opts);
121+
task.Wait();
122+
Log.Debug("Writing final row group");
123+
}
124+
}
136125

137-
//update precursor scan if it is provided in trailer data
138-
var trailerMasterScan = trailerData.AsPositiveInt("Master Scan Number:");
139-
if (trailerMasterScan.HasValue)
140-
{
141-
precursor_scan = trailerMasterScan.Value;
142-
}
143-
else //try getting it from the scan filter
144-
{
145-
precursor_scan = GetParentFromScanString(result.Groups[1].Value);
146-
}
126+
private void AddScan(IRawDataPlus raw, int scanNumber, List<MzParquet> data)
127+
{
128+
var scanFilter = raw.GetFilterForScanNumber(scanNumber);
147129

148-
//finding precursor scan failed
149-
if (precursor_scan == -2)
150-
{
151-
Log.Warn($"Cannot find precursor scan for scan# {scanNumber}");
152-
_precursorTree[-2] = new PrecursorInfo(0, msLevel, FindLastReaction(scanEvent, msLevel), null);
153-
}
130+
// Get the scan event for this scan number
131+
var scanEvent = raw.GetScanEventForScanNumber(scanNumber);
154132

155-
//Parsing the last reaction
156-
try
157-
{
158-
try //since there is no direct way to get the number of reactions available, it is necessary to try and fail
159-
{
160-
scanEvent.GetReaction(_precursorTree[precursor_scan].ReactionCount);
161-
}
162-
catch (ArgumentOutOfRangeException ex)
163-
{
164-
Log.Debug($"Using Tribrid decision tree fix for scan# {scanNumber}");
165-
//Is it a decision tree scheduled scan on tribrid?
166-
if (msLevel == _precursorTree[precursor_scan].MSLevel)
167-
{
168-
precursor_scan = GetParentFromScanString(result.Groups[1].Value);
169-
}
170-
else
171-
{
172-
throw new RawFileParserException(
173-
$"Tribrid decision tree fix failed - cannot get reaction# {_precursorTree[precursor_scan].ReactionCount} from {scanEvent.ToString()}",
174-
ex);
175-
}
176-
}
133+
// Get scan ms level
134+
var msLevel = (int)scanFilter.MSOrder;
177135

178-
// Get Precursor m/z and isolation window borders
179-
precursor_data = GetPrecursorData(precursor_scan, scanEvent, trailer_mz, trailer_isolationWidth, out var reactionCount);
136+
// Get Scan
137+
var scan = Scan.FromFile(raw, scanNumber);
138+
ScanTrailer trailerData;
180139

181-
//save precursor information for later reference
182-
_precursorTree[scanNumber] = new PrecursorInfo(precursor_scan, msLevel, reactionCount, null);
183-
}
184-
catch (Exception e)
185-
{
186-
var extra = (e.InnerException is null) ? "" : $"\n{e.InnerException.StackTrace}";
140+
try
141+
{
142+
trailerData = new ScanTrailer(raw.GetTrailerExtraInformation(scanNumber));
143+
}
144+
catch (Exception ex)
145+
{
146+
Log.WarnFormat("Cannot load trailer infromation for scan {0} due to following exception\n{1}", scanNumber, ex.Message);
147+
ParseInput.NewWarn();
148+
trailerData = new ScanTrailer();
149+
}
150+
151+
int? trailer_charge = trailerData.AsPositiveInt("Charge State:");
152+
double? trailer_mz = trailerData.AsDouble("Monoisotopic M/Z:");
153+
double? trailer_isolationWidth = trailerData.AsDouble("MS" + msLevel + " Isolation Width:");
154+
double? FAIMSCV = null;
155+
if (trailerData.AsBool("FAIMS Voltage On:").GetValueOrDefault(false))
156+
FAIMSCV = trailerData.AsDouble("FAIMS CV:");
187157

188-
Log.Warn($"Could not get precursor data for scan# {scanNumber} - precursor information for this and dependent scans will be empty\nException details:{e.Message}\n{e.StackTrace}\n{extra}");
189-
ParseInput.NewWarn();
158+
double rt = raw.RetentionTimeFromScanNumber(scanNumber);
159+
int precursor_scan = 0;
160+
PrecursorData precursor_data = new PrecursorData
161+
{
162+
isolation_lower = null,
163+
isolation_upper = null,
164+
mz = null
165+
166+
};
190167

191-
_precursorTree[scanNumber] = new PrecursorInfo(precursor_scan, 1, 0, null);
168+
if (msLevel == 1)
169+
{
170+
// Keep track of scan number for precursor reference
171+
_precursorScanNumbers[""] = scanNumber;
172+
_precursorTree[scanNumber] = new PrecursorInfo();
173+
}
174+
else if (msLevel > 1)
175+
{
176+
// Keep track of scan number and isolation m/z for precursor reference
177+
var result = _filterStringIsolationMzPattern.Match(scanEvent.ToString());
178+
if (result.Success)
179+
{
180+
if (_precursorScanNumbers.ContainsKey(result.Groups[1].Value))
181+
{
182+
_precursorScanNumbers.Remove(result.Groups[1].Value);
192183
}
193184

185+
_precursorScanNumbers.Add(result.Groups[1].Value, scanNumber);
194186
}
195187

196-
double[] masses;
197-
double[] intensities;
188+
//update precursor scan if it is provided in trailer data
189+
var trailerMasterScan = trailerData.AsPositiveInt("Master Scan Number:");
190+
if (trailerMasterScan.HasValue)
191+
{
192+
precursor_scan = trailerMasterScan.Value;
193+
}
194+
else //try getting it from the scan filter
195+
{
196+
precursor_scan = GetParentFromScanString(result.Groups[1].Value);
197+
}
198198

199-
if (!ParseInput.NoPeakPicking.Contains(msLevel))
199+
//finding precursor scan failed
200+
if (precursor_scan == -2)
200201
{
201-
// Check if the scan has a centroid stream
202-
if (scan.HasCentroidStream)
202+
Log.Warn($"Cannot find precursor scan for scan# {scanNumber}");
203+
_precursorTree[-2] = new PrecursorInfo(0, msLevel, FindLastReaction(scanEvent, msLevel), null);
204+
}
205+
206+
//Parsing the last reaction
207+
try
208+
{
209+
try //since there is no direct way to get the number of reactions available, it is necessary to try and fail
203210
{
204-
masses = scan.CentroidScan.Masses;
205-
intensities = scan.CentroidScan.Intensities;
211+
scanEvent.GetReaction(_precursorTree[precursor_scan].ReactionCount);
206212
}
207-
else // otherwise take the segmented (low res) scan
213+
catch (ArgumentOutOfRangeException ex)
208214
{
209-
// If the spectrum is profile perform centroiding
210-
var segmentedScan = scanEvent.ScanData == ScanDataType.Profile
211-
? Scan.ToCentroid(scan).SegmentedScan
212-
: scan.SegmentedScan;
213-
214-
masses = segmentedScan.Positions;
215-
intensities = segmentedScan.Intensities;
215+
Log.Debug($"Using Tribrid decision tree fix for scan# {scanNumber}");
216+
//Is it a decision tree scheduled scan on tribrid?
217+
if (msLevel == _precursorTree[precursor_scan].MSLevel)
218+
{
219+
precursor_scan = GetParentFromScanString(result.Groups[1].Value);
220+
}
221+
else
222+
{
223+
throw new RawFileParserException(
224+
$"Tribrid decision tree fix failed - cannot get reaction# {_precursorTree[precursor_scan].ReactionCount} from {scanEvent.ToString()}",
225+
ex);
226+
}
216227
}
228+
229+
// Get Precursor m/z and isolation window borders
230+
precursor_data = GetPrecursorData(precursor_scan, scanEvent, trailer_mz, trailer_isolationWidth, out var reactionCount);
231+
232+
//save precursor information for later reference
233+
_precursorTree[scanNumber] = new PrecursorInfo(precursor_scan, msLevel, reactionCount, null);
217234
}
218-
else // use the segmented data as is
235+
catch (Exception e)
219236
{
220-
masses = scan.SegmentedScan.Positions;
221-
intensities = scan.SegmentedScan.Intensities;
222-
}
237+
var extra = (e.InnerException is null) ? "" : $"\n{e.InnerException.StackTrace}";
223238

224-
// Add a row to parquet file for every m/z value in this scan
225-
for (int i = 0; i < masses.Length; i++)
226-
{
227-
MzParquet m;
228-
m.rt = (float)rt;
229-
m.scan = (uint)scanNumber;
230-
m.level = (uint)msLevel;
231-
m.intensity = (float)intensities[i];
232-
m.mz = (float)masses[i];
233-
m.isolation_lower = precursor_data.isolation_lower;
234-
m.isolation_upper = precursor_data.isolation_upper;
235-
m.precursor_scan = precursor_scan;
236-
m.precursor_mz = precursor_data.mz;
237-
m.precursor_charge = (uint?)trailer_charge;
238-
m.ion_mobility = (float?)FAIMSCV;
239-
data.Add(m);
239+
Log.Warn($"Could not get precursor data for scan# {scanNumber} - precursor information for this and dependent scans will be empty\nException details:{e.Message}\n{e.StackTrace}\n{extra}");
240+
ParseInput.NewWarn();
241+
242+
_precursorTree[scanNumber] = new PrecursorInfo(precursor_scan, 1, 0, null);
240243
}
241244

242-
// If we have enough ions to write a row group, do so
243-
// - some row groups might have more than this number of ions
244-
// but this ensures that all ions from a single scan are always
245-
// present in the same row group (critical property of mzparquet)
246-
if (data.Count >= 1_048_576)
245+
}
246+
247+
double[] masses;
248+
double[] intensities;
249+
250+
if (!ParseInput.NoPeakPicking.Contains(msLevel))
251+
{
252+
// Check if the scan has a centroid stream
253+
if (scan.HasCentroidStream)
247254
{
248-
var task = ParquetSerializer.SerializeAsync(data, Writer.BaseStream, opts);
249-
task.Wait();
250-
opts.Append = true;
251-
data.Clear();
252-
Log.Debug("Writing next row group");
255+
masses = scan.CentroidScan.Masses;
256+
intensities = scan.CentroidScan.Intensities;
253257
}
258+
else // otherwise take the segmented (low res) scan
259+
{
260+
// If the spectrum is profile perform centroiding
261+
var segmentedScan = scanEvent.ScanData == ScanDataType.Profile
262+
? Scan.ToCentroid(scan).SegmentedScan
263+
: scan.SegmentedScan;
254264

265+
masses = segmentedScan.Positions;
266+
intensities = segmentedScan.Intensities;
267+
}
268+
}
269+
else // use the segmented data as is
270+
{
271+
masses = scan.SegmentedScan.Positions;
272+
intensities = scan.SegmentedScan.Intensities;
255273
}
256274

257-
// serialize any remaining ions into the final row group
258-
if (data.Count > 0)
275+
// Add a row to parquet file for every m/z value in this scan
276+
for (int i = 0; i < masses.Length; i++)
259277
{
260-
var task = ParquetSerializer.SerializeAsync(data, Writer.BaseStream, opts);
261-
task.Wait();
262-
Log.Debug("Writing final row group");
278+
MzParquet m;
279+
m.rt = (float)rt;
280+
m.scan = (uint)scanNumber;
281+
m.level = (uint)msLevel;
282+
m.intensity = (float)intensities[i];
283+
m.mz = (float)masses[i];
284+
m.isolation_lower = precursor_data.isolation_lower;
285+
m.isolation_upper = precursor_data.isolation_upper;
286+
m.precursor_scan = precursor_scan;
287+
m.precursor_mz = precursor_data.mz;
288+
m.precursor_charge = (uint?)trailer_charge;
289+
m.ion_mobility = (float?)FAIMSCV;
290+
data.Add(m);
263291
}
264292
}
265293

0 commit comments

Comments
 (0)