Skip to content

Commit 01cd3af

Browse files
authored
Merge pull request #192
mzParquet writer v0.2 full implementation
2 parents 2009886 + 1ef4d88 commit 01cd3af

File tree

7 files changed

+448
-341
lines changed

7 files changed

+448
-341
lines changed

MainClass.cs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -772,6 +772,9 @@ private static void RegularParametersParsing(string[] args)
772772
if (parseInput.OutputFormat == OutputFormat.IndexMzML) parseInput.OutputFormat = OutputFormat.MzML;
773773
}
774774

775+
// Switch off gzip compression for Parquet
776+
if (parseInput.OutputFormat == OutputFormat.Parquet) parseInput.Gzip = false;
777+
775778
parseInput.MaxLevel = parseInput.MsLevel.Max();
776779

777780
if (parseInput.S3Url != null && parseInput.S3AccessKeyId != null &&

ThermoRawFileParserTest/ThermoRawFileParserTest.csproj

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@
3232
<PrivateAssets>all</PrivateAssets>
3333
<IncludeAssets>runtime; build; native; contentfiles; analyzers; buildtransitive</IncludeAssets>
3434
</PackageReference>
35+
<PackageReference Include="Parquet.Net" Version="5.0.1" />
3536
</ItemGroup>
3637

3738
<ItemGroup>

ThermoRawFileParserTest/WriterTests.cs

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
using System.Xml.Serialization;
66
using IO.Mgf;
77
using NUnit.Framework;
8+
using Parquet;
89
using ThermoRawFileParser;
910
using ThermoRawFileParser.Writer.MzML;
1011

@@ -281,5 +282,58 @@ public void TestMzML_MS2()
281282

282283
Assert.That(testMzMl.run.chromatogramList.chromatogram[0].defaultArrayLength, Is.EqualTo(95));
283284
}
285+
286+
[Test]
287+
public void TestParquetCentroid()
288+
{
289+
// Get temp path for writing the test mzParquet file
290+
var tempFilePath = Path.GetTempPath();
291+
292+
var testRawFile = Path.Combine(AppDomain.CurrentDomain.BaseDirectory, @"Data/small.RAW");
293+
var parseInput = new ParseInput(testRawFile, null, tempFilePath, OutputFormat.Parquet);
294+
295+
RawFileParser.Parse(parseInput);
296+
297+
// Actual test
298+
var parquetFilePath = Path.Combine(tempFilePath, "small.mzparquet");
299+
300+
using (var parquetReader = ParquetReader.CreateAsync(parquetFilePath).Result)
301+
{
302+
var groupReader = parquetReader.OpenRowGroupReader(0);
303+
var schema = parquetReader.Schema;
304+
var scanColumn = groupReader.ReadColumnAsync(schema.FindDataField("scan")).Result;
305+
306+
Assert.That(scanColumn.NumValues, Is.EqualTo(48520));
307+
Assert.That(scanColumn.Statistics.DistinctCount, Is.EqualTo(48));
308+
Assert.That((from int p in scanColumn.Data where p == 22 select p).Count(), Is.EqualTo(1632));
309+
}
310+
}
311+
312+
[Test]
313+
public void TestParquetProfile()
314+
{
315+
// Get temp path for writing the test mzParquet file
316+
var tempFilePath = Path.GetTempPath();
317+
318+
var testRawFile = Path.Combine(AppDomain.CurrentDomain.BaseDirectory, @"Data/small.RAW");
319+
var parseInput = new ParseInput(testRawFile, null, tempFilePath, OutputFormat.Parquet);
320+
parseInput.NoPeakPicking = new HashSet<int> { 1, 2 };
321+
322+
RawFileParser.Parse(parseInput);
323+
324+
// Actual test
325+
var parquetFilePath = Path.Combine(tempFilePath, "small.mzparquet");
326+
327+
using (var parquetReader = ParquetReader.CreateAsync(parquetFilePath).Result)
328+
{
329+
var groupReader = parquetReader.OpenRowGroupReader(0);
330+
var schema = parquetReader.Schema;
331+
var scanColumn = groupReader.ReadColumnAsync(schema.FindDataField("scan")).Result;
332+
333+
Assert.That(scanColumn.NumValues, Is.EqualTo(305213));
334+
Assert.That(scanColumn.Statistics.DistinctCount, Is.EqualTo(48));
335+
Assert.That((from int p in scanColumn.Data where p == 22 select p).Count(), Is.EqualTo(17758));
336+
}
337+
}
284338
}
285339
}

Writer/MgfSpectrumWriter.cs

Lines changed: 3 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,6 @@
11
using System;
2-
using System.Collections.Generic;
32
using System.Globalization;
4-
using System.Linq;
53
using System.Reflection;
6-
using System.Text.RegularExpressions;
74
using log4net;
85
using ThermoFisher.CommonCore.Data.Business;
96
using ThermoFisher.CommonCore.Data.FilterEnums;
@@ -19,15 +16,9 @@ public class MgfSpectrumWriter : SpectrumWriter
1916
private const string PositivePolarity = "+";
2017
private const string NegativePolarity = "-";
2118

22-
// Filter string
23-
private readonly Regex _filterStringIsolationMzPattern = new Regex(@"ms\d+ (.+?) \[");
24-
2519
// Precursor scan number for MSn scans
2620
private int _precursorScanNumber;
2721

28-
// Precursor scan number (value) and isolation m/z (key) for reference in the precursor element of an MSn spectrum
29-
private readonly Dictionary<string, int> _precursorScanNumbers = new Dictionary<string, int>();
30-
3122
public MgfSpectrumWriter(ParseInput parseInput) : base(parseInput)
3223
{
3324
ParseInput.MsLevel.Remove(1); // MS1 spectra are not supposed to be in MGF
@@ -126,23 +117,7 @@ public override void Write(IRawDataPlus rawFile, int firstScanNumber, int lastSc
126117
}
127118
else //try getting it from the scan filter
128119
{
129-
var parts = Regex.Split(result.Groups[1].Value, " ");
130-
131-
//find the position of the first (from the end) precursor with a different mass
132-
//to account for possible supplementary activations written in the filter
133-
var lastIonMass = parts.Last().Split('@').First();
134-
int last = parts.Length;
135-
while (last > 0 &&
136-
parts[last - 1].Split('@').First() == lastIonMass)
137-
{
138-
last--;
139-
}
140-
141-
string parentFilter = String.Join(" ", parts.Take(last));
142-
if (_precursorScanNumbers.ContainsKey(parentFilter))
143-
{
144-
_precursorScanNumber = _precursorScanNumbers[parentFilter];
145-
}
120+
_precursorScanNumber = GetParentFromScanString(result.Groups[1].Value);
146121
}
147122

148123
if (_precursorScanNumber > 0)
@@ -151,7 +126,8 @@ public override void Write(IRawDataPlus rawFile, int firstScanNumber, int lastSc
151126
}
152127
else
153128
{
154-
Log.Error($"Failed finding precursor for {scanNumber}");
129+
Log.Error($"Cannot find precursor scan for scan# {scanNumber}");
130+
_precursorTree[-2] = new PrecursorInfo(0, msLevel, FindLastReaction(scanEvent, msLevel), null);
155131
ParseInput.NewError();
156132
}
157133
}

Writer/MzMlSpectrumWriter.cs

Lines changed: 2 additions & 90 deletions
Original file line numberDiff line numberDiff line change
@@ -27,8 +27,6 @@ public class MzMlSpectrumWriter : SpectrumWriter
2727
private static readonly ILog Log =
2828
LogManager.GetLogger(MethodBase.GetCurrentMethod().DeclaringType);
2929

30-
private readonly Regex _filterStringIsolationMzPattern = new Regex(@"ms\d+ (.+?) \[");
31-
3230
// Tune version < 3 produces multiple trailer entry like "SPS Mass [number]"
3331
private readonly Regex _spSentry = new Regex(@"SPS Mass\s+\d+:");
3432

@@ -45,12 +43,6 @@ public class MzMlSpectrumWriter : SpectrumWriter
4543
private readonly Dictionary<IonizationModeType, CVParamType> _ionizationTypes =
4644
new Dictionary<IonizationModeType, CVParamType>();
4745

48-
// Precursor scan number (value) and isolation m/z (key) for reference in the precursor element of an MSn spectrum
49-
private readonly Dictionary<string, int> _precursorScanNumbers = new Dictionary<string, int>();
50-
51-
//Precursor information for scans
52-
private Dictionary<int, PrecursorInfo> _precursorTree = new Dictionary<int, PrecursorInfo>();
53-
5446
private const string SourceFileId = "RAW1";
5547
private readonly XmlSerializerFactory _factory = new XmlSerializerFactory();
5648
private const string Ns = "http://psi.hupo.org/ms/mzml";
@@ -68,8 +60,6 @@ public MzMlSpectrumWriter(ParseInput parseInput) : base(parseInput)
6860
_mzMlNamespace.Add(string.Empty, "http://psi.hupo.org/ms/mzml");
6961
_doIndexing = ParseInput.OutputFormat == OutputFormat.IndexMzML;
7062
_osOffset = Environment.NewLine == "\n" ? 0 : 1;
71-
_precursorScanNumbers[""] = -1;
72-
_precursorTree[-1] = new PrecursorInfo();
7363
}
7464

7565
/// <inheritdoc />
@@ -639,7 +629,6 @@ public override void Write(IRawDataPlus rawFile, int firstScanNumber, int lastSc
639629

640630
_writer.WriteValue(BitConverter.ToString(hash).Replace("-", "").ToLowerInvariant());
641631
_writer.WriteEndElement(); // fileChecksum
642-
643632
_writer.WriteEndElement(); // indexedmzML
644633
}
645634

@@ -652,21 +641,6 @@ public override void Write(IRawDataPlus rawFile, int firstScanNumber, int lastSc
652641

653642
Writer.Flush();
654643
Writer.Close();
655-
656-
//This section is not necessary?
657-
/*if (_doIndexing)
658-
{
659-
try
660-
{
661-
cryptoStream.Flush();
662-
cryptoStream.Close();
663-
}
664-
catch (System.ObjectDisposedException e)
665-
{
666-
// Cannot access a closed file. CryptoStream was already closed when closing _writer
667-
Log.Warn($"Warning: {e.Message}");
668-
}
669-
}*/
670644
}
671645

672646
// In case of indexed mzML, change the extension from xml to mzML and check for the gzip option
@@ -1286,7 +1260,7 @@ private SpectrumType ConstructMSSpectrum(int scanNumber)
12861260
int? charge = trailerData.AsPositiveInt("Charge State:");
12871261
double? monoisotopicMz = trailerData.AsDouble("Monoisotopic M/Z:");
12881262
double? ionInjectionTime = trailerData.AsDouble("Ion Injection Time (ms):");
1289-
double? isolationWidth = trailerData.AsDouble("MS" + (int) scanFilter.MSOrder + " Isolation Width:");
1263+
double? isolationWidth = trailerData.AsDouble("MS" + msLevel + " Isolation Width:");
12901264
double? FAIMSCV = null;
12911265
if (trailerData.AsBool("FAIMS Voltage On:").GetValueOrDefault(false))
12921266
FAIMSCV = trailerData.AsDouble("FAIMS CV:");
@@ -1374,6 +1348,7 @@ private SpectrumType ConstructMSSpectrum(int scanNumber)
13741348
{
13751349
Log.Warn($"Cannot find precursor scan for scan# {scanNumber}");
13761350
_precursorTree[-2] = new PrecursorInfo(0, msLevel, FindLastReaction(scanEvent, msLevel), new PrecursorType[0]);
1351+
ParseInput.NewWarn();
13771352
}
13781353

13791354
try
@@ -2011,46 +1986,6 @@ private SpectrumType ConstructMSSpectrum(int scanNumber)
20111986

20121987
return spectrum;
20131988
}
2014-
2015-
private int FindLastReaction(IScanEvent scanEvent, int msLevel)
2016-
{
2017-
int lastReactionIndex = msLevel - 2;
2018-
2019-
//iteratively try to find the last available index for a reaction
2020-
while(true)
2021-
{
2022-
try
2023-
{
2024-
scanEvent.GetReaction(lastReactionIndex + 1);
2025-
}
2026-
catch (ArgumentOutOfRangeException)
2027-
{
2028-
//stop trying
2029-
break;
2030-
}
2031-
2032-
lastReactionIndex++;
2033-
}
2034-
2035-
//supplemental activation flag is on -> one of the levels (not necessarily the last one) used supplemental activation
2036-
//check last two activations
2037-
if (scanEvent.SupplementalActivation == TriState.On)
2038-
{
2039-
var lastActivation = scanEvent.GetReaction(lastReactionIndex).ActivationType;
2040-
var beforeLastActivation = scanEvent.GetReaction(lastReactionIndex - 1).ActivationType;
2041-
2042-
if ((beforeLastActivation == ActivationType.ElectronTransferDissociation || beforeLastActivation == ActivationType.ElectronCaptureDissociation) &&
2043-
(lastActivation == ActivationType.CollisionInducedDissociation || lastActivation == ActivationType.HigherEnergyCollisionalDissociation))
2044-
return lastReactionIndex - 1; //ETD or ECD followed by HCD or CID -> supplemental activation in the last level (move the last reaction one step back)
2045-
else
2046-
return lastReactionIndex;
2047-
}
2048-
else //just use the last one
2049-
{
2050-
return lastReactionIndex;
2051-
}
2052-
}
2053-
20541989
private SpectrumType ConstructPDASpectrum(int scanNumber, int instrumentNumber)
20551990
{
20561991
// Get each scan from the RAW file
@@ -2631,29 +2566,6 @@ private PrecursorListType ConstructPrecursorList(int precursorScanNumber, IScanE
26312566

26322567
}
26332568

2634-
private int GetParentFromScanString(string scanString)
2635-
{
2636-
var parts = Regex.Split(scanString, " ");
2637-
2638-
//find the position of the first (from the end) precursor with a different mass
2639-
//to account for possible supplementary activations written in the filter
2640-
var lastIonMass = parts.Last().Split('@').First();
2641-
int last = parts.Length;
2642-
while (last > 0 &&
2643-
parts[last - 1].Split('@').First() == lastIonMass)
2644-
{
2645-
last--;
2646-
}
2647-
2648-
string parentFilter = String.Join(" ", parts.Take(last));
2649-
if (_precursorScanNumbers.ContainsKey(parentFilter))
2650-
{
2651-
return _precursorScanNumbers[parentFilter];
2652-
}
2653-
2654-
return -2; //unsuccessful parsing
2655-
}
2656-
26572569
/// <summary>
26582570
/// Populate the scan list element. Full version used for mass spectra,
26592571
/// having Scan Event, scan Filter etc

0 commit comments

Comments
 (0)