Skip to content

Commit 5021515

Browse files
committed
Test for ParquetWriter
1 parent 1e9b500 commit 5021515

File tree

4 files changed

+47
-26
lines changed

4 files changed

+47
-26
lines changed

ThermoRawFileParserTest/ThermoRawFileParserTest.csproj

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@
3232
<PrivateAssets>all</PrivateAssets>
3333
<IncludeAssets>runtime; build; native; contentfiles; analyzers; buildtransitive</IncludeAssets>
3434
</PackageReference>
35+
<PackageReference Include="Parquet.Net" Version="5.0.1" />
3536
</ItemGroup>
3637

3738
<ItemGroup>

ThermoRawFileParserTest/WriterTests.cs

Lines changed: 38 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
using System.Xml.Serialization;
66
using IO.Mgf;
77
using NUnit.Framework;
8+
using Parquet;
89
using ThermoRawFileParser;
910
using ThermoRawFileParser.Writer.MzML;
1011

@@ -283,7 +284,7 @@ public void TestMzML_MS2()
283284
}
284285

285286
[Test]
286-
public void TestParquet()
287+
public void TestParquetCentroid()
287288
{
288289
// Get temp path for writing the test mzparquet file
289290
var tempFilePath = Path.GetTempPath();
@@ -294,17 +295,45 @@ public void TestParquet()
294295
RawFileParser.Parse(parseInput);
295296

296297
// Actual test
297-
//var xmlSerializer = new XmlSerializer(typeof(mzMLType));
298-
//var testMzMl = (mzMLType)xmlSerializer.Deserialize(new FileStream(
299-
// Path.Combine(tempFilePath, "small.mzML"), FileMode.Open, FileAccess.Read, FileShare.ReadWrite));
298+
var parquetFilePath = Path.Combine(tempFilePath, "small.mzparquet");
300299

301-
//Assert.That(testMzMl.run.spectrumList.count, Is.EqualTo("48"));
302-
//Assert.That(testMzMl.run.spectrumList.spectrum.Length, Is.EqualTo(48));
300+
using (var parquetReader = ParquetReader.CreateAsync(parquetFilePath).Result)
301+
{
302+
var groupReader = parquetReader.OpenRowGroupReader(0);
303+
var schema = parquetReader.Schema;
304+
var scanColumn = groupReader.ReadColumnAsync(schema.FindDataField("scan")).Result;
303305

304-
//Assert.That(testMzMl.run.chromatogramList.count, Is.EqualTo("1"));
305-
//Assert.That(testMzMl.run.chromatogramList.chromatogram.Length, Is.EqualTo(1));
306+
Assert.That(scanColumn.NumValues, Is.EqualTo(48520));
307+
Assert.That(scanColumn.Statistics.DistinctCount, Is.EqualTo(48));
308+
Assert.That((from int p in scanColumn.Data where p == 22 select p).Count(), Is.EqualTo(1632));
309+
}
310+
}
311+
312+
[Test]
313+
public void TestParquetProfile()
314+
{
315+
// Get temp path for writing the test mzparquet file
316+
var tempFilePath = Path.GetTempPath();
306317

307-
//Assert.That(testMzMl.run.chromatogramList.chromatogram[0].defaultArrayLength, Is.EqualTo(48));
318+
var testRawFile = Path.Combine(AppDomain.CurrentDomain.BaseDirectory, @"Data/small.RAW");
319+
var parseInput = new ParseInput(testRawFile, null, tempFilePath, OutputFormat.Parquet);
320+
parseInput.NoPeakPicking = new HashSet<int> { 1, 2 };
321+
322+
RawFileParser.Parse(parseInput);
323+
324+
// Actual test
325+
var parquetFilePath = Path.Combine(tempFilePath, "small.mzparquet");
326+
327+
using (var parquetReader = ParquetReader.CreateAsync(parquetFilePath).Result)
328+
{
329+
var groupReader = parquetReader.OpenRowGroupReader(0);
330+
var schema = parquetReader.Schema;
331+
var scanColumn = groupReader.ReadColumnAsync(schema.FindDataField("scan")).Result;
332+
333+
Assert.That(scanColumn.NumValues, Is.EqualTo(305213));
334+
Assert.That(scanColumn.Statistics.DistinctCount, Is.EqualTo(48));
335+
Assert.That((from int p in scanColumn.Data where p == 22 select p).Count(), Is.EqualTo(17758));
336+
}
308337
}
309338
}
310339
}

Writer/MzMlSpectrumWriter.cs

Lines changed: 0 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -641,21 +641,6 @@ public override void Write(IRawDataPlus rawFile, int firstScanNumber, int lastSc
641641

642642
Writer.Flush();
643643
Writer.Close();
644-
645-
//This section is not necessary?
646-
/*if (_doIndexing)
647-
{
648-
try
649-
{
650-
cryptoStream.Flush();
651-
cryptoStream.Close();
652-
}
653-
catch (System.ObjectDisposedException e)
654-
{
655-
// Cannot access a closed file. CryptoStream was already closed when closing _writer
656-
Log.Warn($"Warning: {e.Message}");
657-
}
658-
}*/
659644
}
660645

661646
// In case of indexed mzML, change the extension from xml to mzML and check for the gzip option

Writer/ParquetSpectrumWriter.cs

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,8 @@ public class ParquetSpectrumWriter : SpectrumWriter
3636
private static readonly ILog Log =
3737
LogManager.GetLogger(MethodBase.GetCurrentMethod().DeclaringType);
3838

39+
private const int ParquetRowGroupSize = 1_048_576;
40+
3941
public ParquetSpectrumWriter(ParseInput parseInput) : base(parseInput)
4042
{
4143
//nothing to do here
@@ -49,7 +51,7 @@ public override void Write(IRawDataPlus raw, int firstScanNumber, int lastScanNu
4951
}
5052

5153
ConfigureWriter(".mzparquet");
52-
54+
5355
ParquetSerializerOptions opts = new ParquetSerializerOptions();
5456
opts.CompressionLevel = System.IO.Compression.CompressionLevel.Fastest;
5557
opts.CompressionMethod = Parquet.CompressionMethod.Zstd;
@@ -91,7 +93,7 @@ public override void Write(IRawDataPlus raw, int firstScanNumber, int lastScanNu
9193
// - some row groups might have more than this number of ions
9294
// but this ensures that all ions from a single scan are always
9395
// present in the same row group (critical property of mzparquet)
94-
if (data.Count >= 1_048_576)
96+
if (data.Count >= ParquetRowGroupSize)
9597
{
9698
var task = ParquetSerializer.SerializeAsync(data, Writer.BaseStream, opts);
9799
task.Wait();
@@ -108,6 +110,10 @@ public override void Write(IRawDataPlus raw, int firstScanNumber, int lastScanNu
108110
task.Wait();
109111
Log.Debug("Writing final row group");
110112
}
113+
114+
// Release the OS file handle
115+
Writer.Flush();
116+
Writer.Close();
111117
}
112118

113119
private void AddScan(IRawDataPlus raw, int scanNumber, List<MzParquet> data)

0 commit comments

Comments
 (0)