Skip to content

Commit 78fd5ad

Browse files
authored
Merge pull request #938 from jan-van-dieren/Nested-Object-Parsing
Extended matlab parser with structures, cell matrices and character matrices
2 parents 738c0c1 + 3e84f2a commit 78fd5ad

File tree

8 files changed

+519
-29
lines changed

8 files changed

+519
-29
lines changed

data/Matlab/cell-array-nested.mat

377 Bytes
Binary file not shown.

data/Matlab/struct-nested.mat

287 Bytes
Binary file not shown.

src/Data.Matlab/Data.Matlab.csproj

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,5 +64,6 @@ Control.Describe now includes CPU architecture and family identifier if know</Pa
6464
<PrivateAssets>all</PrivateAssets>
6565
<IncludeAssets>runtime; build; native; contentfiles; analyzers; buildtransitive</IncludeAssets>
6666
</PackageReference>
67+
<PackageReference Include="OneOf" Version="3.0.223" />
6768
</ItemGroup>
6869
</Project>

src/Data.Matlab/MatlabReader.cs

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,18 @@ public static Matrix<T> Unpack<T>(MatlabMatrix matrixData)
7070
return Parser.ParseMatrix<T>(matrixData.Data);
7171
}
7272

73+
/// <summary>
74+
/// Unpacks a MATLAB object that cannot be mapped to a MathNet.Numerics matrix. This could be any nesting of cell matrices and structures.
75+
/// The nesting must bottom out on either a MathNet.Numerics matrix (if the lowest level values are matrices of a numeric type) or a character matrix (if the lowest level value is a matrix of a char type).
76+
/// Since structures and cell matrices can have different data types in different fields and can even be nested, the type of a field value cannot be more specific than a nested object.
77+
/// </summary>
78+
/// <param name="structData">The MATLAB matrix whose raw <c>Data</c> bytes are handed to the parser.</param>
/// <returns>The parsed value, wrapped in a <see cref="NestedObject"/>.</returns>
79+
public static NestedObject NonNumeric(MatlabMatrix structData)
80+
{
81+
return Parser.ParseNonNumeric(structData.Data);
82+
}
83+
84+
7385
/// <summary>
7486
/// Read the first or a specific matrix from a MATLAB file stream.
7587
/// </summary>

src/Data.Matlab/MatlabStructure.cs

Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,92 @@
1+
// <copyright file="MatlabStructure.cs" company="Math.NET">
2+
// Math.NET Numerics, part of the Math.NET Project
3+
// https://numerics.mathdotnet.com
4+
// https://github.com/mathnet/mathnet-numerics
5+
//
6+
// Copyright (c) 2009-$CURRENT_YEAR$ Math.NET
7+
//
8+
// Permission is hereby granted, free of charge, to any person
9+
// obtaining a copy of this software and associated documentation
10+
// files (the "Software"), to deal in the Software without
11+
// restriction, including without limitation the rights to use,
12+
// copy, modify, merge, publish, distribute, sublicense, and/or sell
13+
// copies of the Software, and to permit persons to whom the
14+
// Software is furnished to do so, subject to the following
15+
// conditions:
16+
//
17+
// The above copyright notice and this permission notice shall be
18+
// included in all copies or substantial portions of the Software.
19+
//
20+
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
21+
// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
22+
// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
23+
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
24+
// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
25+
// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
26+
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
27+
// OTHER DEALINGS IN THE SOFTWARE.
28+
// </copyright>
29+
30+
using MathNet.Numerics.LinearAlgebra;
31+
using OneOf;
32+
using System.Collections.Generic;
33+
using System.Text;
34+
35+
namespace MathNet.Numerics.Data.Matlab
36+
{
37+
/// <summary>
/// A MATLAB structure, represented as a dictionary that maps each field name to the value stored in that field.
/// </summary>
public class MatlabStructure : Dictionary<string, NestedObject>
38+
{
39+
}
40+
41+
/// <summary>
/// A MATLAB cell matrix: a two-dimensional grid in which every cell holds a <see cref="NestedObject"/>.
/// </summary>
public class MatlabCellMatrix
42+
{
43+
/// <summary>
/// The cell contents, indexed as [row, column]. The array reference is fixed after construction; individual cells are assigned by the caller.
/// </summary>
public NestedObject[,] Data { get; private set; }
44+
45+
/// <summary>
/// Allocates a grid of the given size. Every cell is null until it is assigned.
/// </summary>
/// <param name="rows">Number of rows.</param>
/// <param name="cols">Number of columns.</param>
public MatlabCellMatrix(int rows, int cols)
46+
{
47+
Data = new NestedObject[rows, cols];
48+
}
49+
}
50+
51+
/// <summary>
/// A MATLAB character matrix. Every cell holds one decoded character as a string.
/// </summary>
public class MatlabCharMatrix
{
    /// <summary>
    /// The encoding the characters were stored in. Typically not that relevant once the data has been decoded.
    /// </summary>
    public Encoding Encoding { get; private set; }

    /// <summary>
    /// The decoded characters, indexed as [row, column]. Cells are strings rather than chars because
    /// a single stored character may decode to more than one .NET char (for example with UTF32 data).
    /// </summary>
    public string[,] Data { get; private set; }

    /// <summary>
    /// Allocates a character matrix of the given size. Every cell is null until it is assigned.
    /// </summary>
    /// <param name="rows">Number of rows.</param>
    /// <param name="cols">Number of columns.</param>
    /// <param name="encoding">The encoding the characters were stored in.</param>
    public MatlabCharMatrix(int rows, int cols, Encoding encoding)
    {
        Encoding = encoding;

        Data = new string[rows, cols];
    }

    /// <summary>
    /// Returns each row as a single string, built by joining the row's cells from the first column to the last.
    /// </summary>
    /// <returns>
    /// One string per row. Unassigned (null) cells contribute nothing, and a matrix without columns
    /// yields empty strings rather than null entries.
    /// </returns>
    public string[] ConcatRows()
    {
        int rowCount = Data.GetLength(0);
        int columnCount = Data.GetLength(1);

        var result = new string[rowCount];
        var builder = new StringBuilder(columnCount);

        for (int row = 0; row < rowCount; row++)
        {
            // One builder is reused for all rows so the cost stays linear in the row length.
            builder.Clear();

            for (int col = 0; col < columnCount; col++)
            {
                builder.Append(Data[row, col]);
            }

            result[row] = builder.ToString();
        }

        return result;
    }
}
85+
86+
/// <summary>
/// A value found while unpacking non-numeric MATLAB data. It holds exactly one of: a <see cref="MatlabStructure"/>,
/// a <see cref="MatlabCellMatrix"/>, a <see cref="MatlabCharMatrix"/> or a <see cref="MatlabMatrix"/>.
/// </summary>
public class NestedObject : OneOfBase<MatlabStructure, MatlabCellMatrix, MatlabCharMatrix, MatlabMatrix>
87+
{
88+
/// <summary>
/// Wraps one of the supported value types.
/// </summary>
/// <param name="input">The value to wrap; it is passed straight to the OneOf base class.</param>
public NestedObject(OneOf<MatlabStructure, MatlabCellMatrix, MatlabCharMatrix, MatlabMatrix> input) : base(input)
89+
{
90+
}
91+
}
92+
}

src/Data.Matlab/Parser.cs

Lines changed: 226 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -141,51 +141,248 @@ internal static List<MatlabMatrix> ParseFile(Stream stream)
141141
internal static Matrix<T> ParseMatrix<T>(byte[] data)
    where T : struct, IEquatable<T>, IFormattable
{
    // ParseObject reads the array header first and then hands the reader over,
    // positioned just past the array name, together with the header values.
    Func<BinaryReader, ArrayClass, bool, int, int, Matrix<T>> parser = (reader, arrayClass, complex, rows, columns) =>
    {
        switch (arrayClass)
        {
            case ArrayClass.Sparse:
                return PopulateSparseMatrix<T>(reader, complex, rows, columns);
            case ArrayClass.Function:
            case ArrayClass.Character:
            case ArrayClass.Object:
            case ArrayClass.Structure:
            case ArrayClass.Cell:
            case ArrayClass.Unknown:
                // Name the offending class so the caller can tell why the array was rejected.
                throw new NotSupportedException($"MATLAB array class {arrayClass} cannot be parsed as a numeric matrix.");
            default:
                // Every remaining array class is read as a dense matrix.
                return PopulateDenseMatrix<T>(reader, complex, rows, columns);
        }
    };

    return ParseObject(data, parser);
}
155165

156-
// Dimensions Array tag (8 bytes)
157-
reader.BaseStream.Seek(4, SeekOrigin.Current);
158-
var numDimensions = reader.ReadInt32()/8;
159-
if (numDimensions > 2)
166+
/// <summary>
/// For parsing anything that cannot be mapped to a MathNet.Numerics matrix:
/// a character matrix, a structure or a cell matrix.
/// </summary>
/// <param name="data">The raw bytes of the array, starting with its array flags.</param>
/// <returns>The parsed value, wrapped in a <see cref="NestedObject"/>.</returns>
/// <exception cref="NotSupportedException">The array class is not character, structure or cell.</exception>
internal static NestedObject ParseNonNumeric(byte[] data)
{
    // ParseObject reads the array header first and then hands the reader over,
    // positioned just past the array name, together with the header values.
    Func<BinaryReader, ArrayClass, bool, int, int, NestedObject> parser = (reader, arrayClass, complex, rows, columns) =>
    {
        switch (arrayClass)
        {
            case ArrayClass.Character:
                return PopulateCharacterMatrix(reader, rows, columns);
            case ArrayClass.Structure:
                return PopulateStructure(reader);
            case ArrayClass.Cell:
                return PopulateCellMatrix(reader, complex, rows, columns);
            default:
                // Covers ArrayClass.Unknown as well as every numeric class.
                throw new NotSupportedException($"MATLAB array class {arrayClass} cannot be parsed as a nested object; only character, structure and cell arrays are supported here.");
        }
    };

    return ParseObject(data, parser);
}
167194

168-
// Array name
169-
ReadElementTag(reader, out _, out var size, out var isSmallBlock);
170-
reader.BaseStream.Seek(size, SeekOrigin.Current);
171-
SkipElementPadding(reader, size, isSmallBlock);
195+
/// <summary>
/// Reads the character data element of a character array and distributes the decoded text over a
/// <see cref="MatlabCharMatrix"/> in column-major order (all rows of the first column, then the next column).
/// </summary>
/// <param name="reader">Reader positioned on the tag of the character data element.</param>
/// <param name="rows">Number of rows taken from the array header.</param>
/// <param name="columns">Number of columns taken from the array header.</param>
/// <returns>The character matrix, wrapped in a <see cref="NestedObject"/>.</returns>
/// <exception cref="NotImplementedException">The data element is not stored as UTF8, UTF16 or UTF32.</exception>
private static NestedObject PopulateCharacterMatrix(BinaryReader reader, int rows, int columns)
{
    ReadElementTag(reader, out var type, out var size, out _);

    Encoding encoding;

    switch (type)
    {
        case DataType.Utf8:
            encoding = Encoding.UTF8;
            break;
        case DataType.Utf16:
            encoding = Encoding.Unicode;
            break;
        case DataType.Utf32:
            encoding = Encoding.UTF32;
            break;
        default:
            throw new NotImplementedException($"Could not parse char array due to unsupported encoding: {type}");
    }

    // Decode the whole payload in one go. Reading a fixed number of bytes per cell is wrong for
    // UTF8 (1 to 4 bytes per character; Encoding.UTF8.IsSingleByte is false) and for UTF32 (4 bytes).
    string text = encoding.GetString(reader.ReadBytes(size));

    // UTF16 data is split per 16 bit code unit, the other encodings per code point.
    // NOTE(review): assumes a MATLAB char is one UTF16 code unit - confirm against the MAT-file specification.
    bool splitByCodePoint = type != DataType.Utf16;

    var result = new MatlabCharMatrix(rows, columns, encoding);
    int position = 0;

    for (int col = 0; col < columns; col++)
    {
        for (int row = 0; row < rows; row++)
        {
            if (position >= text.Length)
            {
                // The payload held fewer characters than the header announced.
                result.Data[row, col] = string.Empty;
                continue;
            }

            int length = splitByCodePoint && char.IsSurrogatePair(text, position) ? 2 : 1;
            result.Data[row, col] = text.Substring(position, length);
            position += length;
        }
    }

    return new NestedObject(result);
}
239+
240+
/// <summary>
/// Wraps the raw array bytes in a reader, consumes the array header and lets the supplied
/// callback turn the remaining data into the requested result type.
/// </summary>
/// <param name="data">The raw bytes of the array, starting with its array flags.</param>
/// <param name="parser">
/// Callback that receives the reader (positioned just past the header) together with the
/// array class, the complex flag, the row count and the column count read from the header.
/// </param>
/// <returns>Whatever the callback returns.</returns>
internal static T ParseObject<T>(byte[] data, Func<BinaryReader, ArrayClass, bool, int, int, T> parser)
{
    using (var memory = new MemoryStream(data))
    using (var input = new BinaryReader(memory))
    {
        // The array name is part of the header but is of no use to the callback.
        var header = ParseObjectHeader(input);

        return parser(input, header.arrayClass, header.complex, header.rows, header.columns);
    }
}
253+
254+
/// <summary>
/// Reads the object header (array flags, dimensions and array name) and skips any padding that follows the name.
/// </summary>
/// <param name="reader">Reader positioned on the array flags tag.</param>
/// <returns>The array class, the complex flag, the row and column count, and the array name.</returns>
/// <exception cref="NotSupportedException">The array has more than two dimensions.</exception>
private static (ArrayClass arrayClass, bool complex, int rows, int columns, string name) ParseObjectHeader(BinaryReader reader)
{
    // Array Flags tag (8 bytes)
    reader.BaseStream.Seek(8, SeekOrigin.Current);

    // Array Flags data: flags (byte 3), class (byte 4) (8 bytes)
    ArrayClass arrayClass = (ArrayClass)reader.ReadByte();
    var flags = reader.ReadByte();
    bool complex = (flags & (byte)ArrayFlags.Complex) == (byte)ArrayFlags.Complex;
    reader.BaseStream.Seek(6, SeekOrigin.Current);

    // Dimensions Array tag (8 bytes)
    reader.BaseStream.Seek(4, SeekOrigin.Current);
    var numDimensions = reader.ReadInt32() / 8;
    if (numDimensions > 2)
    {
        throw new NotSupportedException("Only 1 and 2 dimensional arrays are supported.");
    }

    // Dimensions Array data: row and column count (8 bytes)
    int rows = reader.ReadInt32();
    int columns = reader.ReadInt32();

    // Array name. ReadBytes keeps reading until it has the requested number of bytes or the stream
    // ends, so a short read can no longer leave a zero-padded tail in the decoded name.
    ReadElementTag(reader, out _, out var size, out var isSmallBlock);
    string name = Encoding.UTF8.GetString(reader.ReadBytes(size));

    SkipElementPadding(reader, size, isSmallBlock);

    return (arrayClass, complex, rows, columns, name);
}
293+
294+
/// <summary>
/// Reads the fields of a structure. Nested structures, cell matrices and character matrices are parsed
/// recursively; any other field is kept as an unparsed <see cref="MatlabMatrix"/>.
/// </summary>
/// <param name="reader">Reader positioned just past the array header of the structure.</param>
/// <returns>The structure, wrapped in a <see cref="NestedObject"/>.</returns>
/// <exception cref="InvalidDataException">The stored field name length is not positive although field names are present.</exception>
private static NestedObject PopulateStructure(BinaryReader reader)
{
    // after the common fields for all arrays a structure has the length for the field names as a short data element
    // according to the docs MATLAB always sets this to 32 bytes (31 chars + NULL), but the stored value is used as is
    ReadElementTag(reader, out _, out _, out _);

    int nameLength = reader.ReadInt32();

    // field names are saved as an miINT8 data element
    // each name is padded to align on 32 bytes and NULL terminated
    ReadElementTag(reader, out _, out var size, out var isSmallBlock);

    if (size > 0 && nameLength <= 0)
    {
        // A length of zero would never advance the loop below and would grow the list without bound.
        throw new InvalidDataException($"Invalid structure field name length: {nameLength}");
    }

    var fieldNames = new List<string>();

    for (int bytesRead = 0; bytesRead < size; bytesRead += nameLength)
    {
        byte[] currentName = reader.ReadBytes(nameLength);
        fieldNames.Add(Encoding.UTF8.GetString(currentName).TrimEnd((char)0));
    }

    SkipElementPadding(reader, size, isSmallBlock);

    // each field of the structure could be any type supported by a matlab file
    var result = new MatlabStructure();

    foreach (string fieldName in fieldNames)
    {
        // to use the regular array parsing methods we need to know how much data to give them
        ReadElementTag(reader, out _, out var fieldSize, out _);

        // Remember where the field starts. The header length depends on the array name,
        // so rewinding by a fixed number of bytes is only right for empty or very short names.
        long fieldStart = reader.BaseStream.Position;

        // we also need to know what the array class is (maybe a nested structure or a cell)
        (ArrayClass arrayClass, _, _, _, string name) = ParseObjectHeader(reader);

        // reset reader back to the start of the field for the parsers that follow
        reader.BaseStream.Seek(fieldStart, SeekOrigin.Begin);

        byte[] arrayData = reader.ReadBytes(fieldSize);

        switch (arrayClass)
        {
            case ArrayClass.Structure:
            case ArrayClass.Cell:
            case ArrayClass.Character:
                result.Add(fieldName, ParseNonNumeric(arrayData));
                break;
            default:
                // Left unparsed so the caller can pick the element type when unpacking it.
                result.Add(fieldName, new NestedObject(new MatlabMatrix(name, arrayData)));
                break;
        }
    }

    return new NestedObject(result);
}
350+
351+
/// <summary>
/// Reads the cells of a cell matrix in column-major order (all rows of the first column, then the next column).
/// Nested structures, cell matrices and character matrices are parsed recursively; any other cell is kept
/// as an unparsed <see cref="MatlabMatrix"/>.
/// </summary>
/// <param name="reader">Reader positioned just past the array header of the cell matrix.</param>
/// <param name="complex">The complex flag from the array header; not used when reading cells.</param>
/// <param name="rows">Number of rows taken from the array header.</param>
/// <param name="columns">Number of columns taken from the array header.</param>
/// <returns>The cell matrix, wrapped in a <see cref="NestedObject"/>.</returns>
private static NestedObject PopulateCellMatrix(BinaryReader reader, bool complex, int rows, int columns)
{
    var result = new MatlabCellMatrix(rows, columns);

    for (int col = 0; col < columns; col++)
    {
        for (int row = 0; row < rows; row++)
        {
            // to use the regular array parsing methods we need to know how much data to give them
            ReadElementTag(reader, out _, out var cellSize, out _);

            // Remember where the cell starts. The header length depends on the array name,
            // so rewinding by a fixed number of bytes is only right for empty or very short names.
            long cellStart = reader.BaseStream.Position;

            // we also need to know what the array class is (maybe a nested structure or a cell)
            (ArrayClass arrayClass, _, _, _, string name) = ParseObjectHeader(reader);

            // reset reader back to the start of the cell for the parsers that follow
            reader.BaseStream.Seek(cellStart, SeekOrigin.Begin);

            byte[] arrayData = reader.ReadBytes(cellSize);

            switch (arrayClass)
            {
                case ArrayClass.Structure:
                case ArrayClass.Cell:
                case ArrayClass.Character:
                    result.Data[row, col] = ParseNonNumeric(arrayData);
                    break;
                default:
                    // Left unparsed so the caller can pick the element type when unpacking it.
                    result.Data[row, col] = new NestedObject(new MatlabMatrix(name, arrayData));
                    break;
            }
        }
    }

    return new NestedObject(result);
}
190387

191388
/// <summary>

0 commit comments

Comments
 (0)