Skip to content

Commit 56b4826

Browse files
feat: read parquet resources (#151)
* feat: parquet reader * fix: review heuristic to determine reader * feat: support multiple files when reading parquet --------- Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com>
1 parent 721f0ad commit 56b4826

File tree

11 files changed

+496
-13
lines changed

11 files changed

+496
-13
lines changed

src/Packata.ResourceReaders.Testing/Packata.ResourceReaders.Testing.csproj

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,10 @@
1-
<Project Sdk="Microsoft.NET.Sdk">
1+
<Project Sdk="Microsoft.NET.Sdk">
22
<ItemGroup>
3+
<None Remove="Resources\iris.parquet" />
34
<None Remove="Resources\my-book.xlsx" />
45
</ItemGroup>
56
<ItemGroup>
7+
<EmbeddedResource Include="Resources\iris.parquet" />
68
<EmbeddedResource Include="Resources\my-book.xlsx" />
79
</ItemGroup>
810
<ItemGroup>
3.74 KB
Binary file not shown.
Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
using System;
2+
using System.Collections.Generic;
3+
using System.Linq;
4+
using System.Reflection;
5+
using System.Text;
6+
using System.Threading.Tasks;
7+
using Moq;
8+
using NUnit.Framework;
9+
using Packata.Core;
10+
using Packata.Core.PathHandling;
11+
using Packata.Core.Testing.PathHandling;
12+
using Packata.ResourceReaders.Tabular;
13+
14+
namespace Packata.ResourceReaders.Testing.Tabular;
15+
/// <summary>
/// Tests that a reader built by <see cref="ParquetReaderBuilder"/> can read the
/// embedded <c>iris.parquet</c> resource through a mocked file system.
/// </summary>
public class ParquetReaderBuilderTests
{
    /// <summary>
    /// Creates a <see cref="LocalPath"/> backed by a mocked <see cref="IFileSystem"/>
    /// that serves the content of the embedded <c>iris.parquet</c> resource.
    /// </summary>
    /// <returns>A path whose mocked file system reports "my-resource-path" as existing.</returns>
    /// <exception cref="FileNotFoundException">The embedded resource cannot be located in the executing assembly.</exception>
    private static LocalPath GetPath()
    {
        var resourceName = $"{typeof(ResourceTests).Namespace}.Resources.iris.parquet";
        using var stream = Assembly.GetExecutingAssembly()
                                   .GetManifestResourceStream(resourceName)
                           ?? throw new FileNotFoundException("Resource not found", resourceName);

        // Cache the bytes once instead of handing out the copy target itself:
        // after CopyTo the target stream is positioned at its end, and sharing a single
        // instance would break as soon as one consumer disposes it.
        using var buffer = new MemoryStream();
        stream.CopyTo(buffer);
        var parquetBytes = buffer.ToArray();

        var fileSystem = new Mock<IFileSystem>();
        fileSystem.Setup(x => x.Exists("my-resource-path")).Returns(true);
        // Every OpenRead call gets its own read-only stream positioned at the start.
        fileSystem.Setup(x => x.OpenRead("my-resource-path"))
                  .Returns(() => new MemoryStream(parquetBytes, writable: false));
        return new LocalPath(fileSystem.Object, "", "my-resource-path");
    }

    /// <summary>
    /// Builds a reader for a resource declared with the "parquet" format and verifies
    /// the first row through the indexers and typed accessors of the data reader.
    /// </summary>
    [Test]
    public void ToDataReader_ExistingLocalResource_ReturnsIDataReader()
    {
        var resource = new Resource
        {
            Paths = [GetPath()],
            Type = "table",
            Name = "my-resource",
            Format = "parquet"
        };
        var builder = new ParquetReaderBuilder();
        builder.Configure(resource);
        var reader = builder.Build();

        using var dataReader = reader.ToDataReader(resource);

        Assert.That(dataReader, Is.Not.Null);
        Assert.That(dataReader.Read(), Is.True);

        using (Assert.EnterMultipleScope())
        {
            // First column: accessed by ordinal, by name and through the typed getter.
            Assert.That(dataReader[0], Is.EqualTo(5.10));
            Assert.That(dataReader["Sepal.Length"], Is.EqualTo(5.10));
            Assert.That(dataReader.GetValue(0), Is.EqualTo(5.10));
            Assert.That(dataReader.GetDouble(0), Is.EqualTo(5.10));
            Assert.That(dataReader.GetName(0), Is.EqualTo("Sepal.Length"));
            Assert.That(dataReader.GetOrdinal("Sepal.Length"), Is.EqualTo(0));

            // Fifth column: same set of accessors for a string value.
            Assert.That(dataReader[4], Is.EqualTo("setosa"));
            Assert.That(dataReader["Species"], Is.EqualTo("setosa"));
            Assert.That(dataReader.GetValue(4), Is.EqualTo("setosa"));
            Assert.That(dataReader.GetString(4), Is.EqualTo("setosa"));
            Assert.That(dataReader.GetName(4), Is.EqualTo("Species"));
            Assert.That(dataReader.GetOrdinal("Species"), Is.EqualTo(4));
        }
    }
}
Lines changed: 113 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,113 @@
1+
using System;
2+
using System.Collections.Generic;
3+
using System.IO;
4+
using System.Linq;
5+
using System.Reflection;
6+
using System.Text;
7+
using System.Threading.Tasks;
8+
using Moq;
9+
using NUnit.Framework;
10+
using Packata.Core;
11+
using Packata.Core.PathHandling;
12+
using Packata.Core.Testing.PathHandling;
13+
using Packata.ResourceReaders.Tabular;
14+
15+
namespace Packata.ResourceReaders.Testing.Tabular;
16+
/// <summary>
/// Tests for <see cref="ParquetReader"/> using the embedded <c>iris.parquet</c>
/// resource and mocked <see cref="IFileSystem"/> instances.
/// </summary>
public class ParquetReaderTests
{
    /// <summary>
    /// Test-case source yielding a single <see cref="LocalPath"/> whose mocked file system
    /// serves the embedded <c>iris.parquet</c> content under "my-resource-path".
    /// </summary>
    /// <exception cref="FileNotFoundException">The embedded resource cannot be located in the executing assembly.</exception>
    private static IEnumerable<IPath> GetPaths()
    {
        var resourceName = $"{typeof(ResourceTests).Namespace}.Resources.iris.parquet";
        using var stream = Assembly.GetExecutingAssembly()
                                   .GetManifestResourceStream(resourceName)
                           ?? throw new FileNotFoundException("Resource not found", resourceName);

        using var tmp = new MemoryStream();
        stream.CopyTo(tmp);
        var parquetBytes = tmp.ToArray(); // cache the bytes once

        var fileSystem = new Mock<IFileSystem>();
        fileSystem.Setup(fs => fs.Exists("my-resource-path")).Returns(true);
        fileSystem.Setup(fs => fs.OpenRead("my-resource-path"))
                  .Returns(() => new MemoryStream(parquetBytes, writable: false)); // fresh stream per call
        yield return new LocalPath(fileSystem.Object, "", "my-resource-path");
    }

    /// <summary>
    /// Reads the resource row by row: checks the first two rows, then expects
    /// 148 further rows before the reader reports that no row is left.
    /// </summary>
    [Test]
    [TestCaseSource(nameof(GetPaths))]
    public void ToDataReader_ExistingLocalResource_ReturnsIDataReader(IPath path)
    {
        var resource = new Resource() { Paths = [path], Type = "table", Name = "my-resource" };
        var wrapper = new ParquetReaderWrapper();
        var reader = new ParquetReader(wrapper);

        // Dispose the reader at the end of the test so the underlying stream is released.
        using var dataReader = reader.ToDataReader(resource);

        Assert.That(dataReader, Is.Not.Null);
        Assert.That(dataReader.Read(), Is.True);
        using (Assert.EnterMultipleScope())
        {
            Assert.That(dataReader[0], Is.EqualTo(5.10));
            Assert.That(dataReader[4], Is.EqualTo("setosa"));
        }
        Assert.That(dataReader.Read(), Is.True);
        using (Assert.EnterMultipleScope())
        {
            Assert.That(dataReader[0], Is.EqualTo(4.90));
            Assert.That(dataReader[4], Is.EqualTo("setosa"));
        }

        // Two rows were consumed above; 148 more are expected before the end.
        for (int i = 0; i < 148; i++)
        {
            Assert.That(dataReader.Read(), Is.True);
        }
        Assert.That(dataReader.Read(), Is.False);
    }

    /// <summary>
    /// A resource without any path must be rejected with an <see cref="InvalidOperationException"/>.
    /// </summary>
    [Test]
    public void ToDataReader_NoResource_Throws()
    {
        var resource = new Resource() { Paths = [], Type = "table", Name = "my-resource" };
        var wrapper = new ParquetReaderWrapper();
        var reader = new ParquetReader(wrapper);
        Assert.Throws<InvalidOperationException>(() => reader.ToDataReader(resource));
    }

    /// <summary>
    /// A resource listing the same path twice still yields a <see cref="ParquetDataReader"/>.
    /// </summary>
    [Test]
    [TestCaseSource(nameof(GetPaths))]
    public void ToDataReader_TwoResources_DataReader(IPath path)
    {
        var resource = new Resource() { Paths = [path, path], Type = "table", Name = "my-resource" };
        var wrapper = new ParquetReaderWrapper();
        var reader = new ParquetReader(wrapper);

        // Dispose the reader at the end of the test so the underlying streams are released.
        using var dr = reader.ToDataReader(resource);

        Assert.That(dr, Is.Not.Null);
        Assert.That(dr, Is.TypeOf<ParquetDataReader>());
    }

    /// <summary>
    /// A path reported as missing by the file system results in a <see cref="FileNotFoundException"/>.
    /// </summary>
    [Test]
    public void ToDataReader_FileNotFound_ThrowsFileNotFoundException()
    {
        // Setup a path that doesn't exist
        var fileSystem = new Mock<IFileSystem>();
        fileSystem.Setup(x => x.Exists("non-existent-path")).Returns(false);
        var path = new LocalPath(fileSystem.Object, "", "non-existent-path");

        var resource = new Resource() { Paths = [path], Type = "table", Name = "my-resource" };
        var wrapper = new ParquetReaderWrapper();
        var reader = new ParquetReader(wrapper);

        Assert.Throws<FileNotFoundException>(() => reader.ToDataReader(resource));
    }

    /// <summary>
    /// An <see cref="IOException"/> raised while opening the file is surfaced to the caller.
    /// </summary>
    [Test]
    public void ToDataReader_IOError_ThrowsIOException()
    {
        // Setup a path that throws an IO exception when opened
        var fileSystem = new Mock<IFileSystem>();
        fileSystem.Setup(x => x.Exists("error-path")).Returns(true);
        fileSystem.Setup(x => x.OpenRead("error-path")).Throws<IOException>();
        var path = new LocalPath(fileSystem.Object, "", "error-path");

        var resource = new Resource() { Paths = [path], Type = "table", Name = "my-resource" };
        var wrapper = new ParquetReaderWrapper();
        var reader = new ParquetReader(wrapper);

        Assert.Throws<IOException>(() => reader.ToDataReader(resource));
    }
}

src/Packata.ResourceReaders.Testing/TabularReaderFactoryTests.cs

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,4 +55,33 @@ public void SetHeuristic_UseDialectProperties_ApplyHeuristics()
5555
reader = factory.Create(new Resource() { Type = "table" });
5656
Assert.That(reader, Is.EqualTo(structuredReader.Object));
5757
}
58+
59+
/// <summary>
/// Creates a reader for a table resource whose dialect declares the given type and
/// verifies that the factory returns an instance of the expected reader type.
/// </summary>
/// <remarks>
/// Despite the method name, the expected type varies per test case
/// (delimited, spreadsheet and database readers are all covered).
/// </remarks>
[Test]
[TestCase("delimited", typeof(DelimitedReader))]
[TestCase("spreadsheet", typeof(SpreadsheetReader))]
[TestCase("database", typeof(DatabaseReader))]
public void Create_WithTableDialectType_ReturnsDelimitedReader(string dialectType, Type expected)
{
    var dialect = new TableDelimitedDialect() { Type = dialectType };
    var resource = new Resource() { Type = "table", Dialect = dialect };
    var sut = new TabularReaderFactory();

    var created = sut.Create(resource);

    Assert.That(created, Is.InstanceOf(expected));
}
69+
70+
/// <summary>
/// Creates a reader for a table resource that only declares a format and verifies
/// that the factory returns an instance of the reader type expected for that format.
/// </summary>
[Test]
[TestCase("csv", typeof(DelimitedReader))]
[TestCase("tsv", typeof(DelimitedReader))]
[TestCase("psv", typeof(DelimitedReader))]
[TestCase("csv.gz", typeof(DelimitedReader))]
[TestCase("tsv.gz", typeof(DelimitedReader))]
[TestCase("psv.gz", typeof(DelimitedReader))]
[TestCase("parquet", typeof(ParquetReader))]
[TestCase("pqt", typeof(ParquetReader))]
[TestCase("xls", typeof(SpreadsheetReader))]
[TestCase("xlsx", typeof(SpreadsheetReader))]
public void Create_WithTableFormat_ReturnsExpectedReader(string format, Type expected)
{
    var resource = new Resource() { Type = "table", Format = format };
    var sut = new TabularReaderFactory();

    var created = sut.Create(resource);

    Assert.That(created, Is.InstanceOf(expected));
}
5887
}

src/Packata.ResourceReaders/Inference/MediaTypeBasedDialectInference.cs

Lines changed: 16 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -15,18 +15,23 @@ public override bool TryInfer(Resource resource, out TableDelimitedDialect? dial
1515
if (string.IsNullOrEmpty(mediaType))
1616
return false;
1717

18-
if (!mediaType.StartsWith("text/", StringComparison.OrdinalIgnoreCase))
19-
return false;
20-
21-
mediaType = mediaType.ToLowerInvariant().Substring(5).Split([';'])[0];
22-
var format = mediaType switch
18+
string? format;
19+
if (mediaType.StartsWith("text/", StringComparison.OrdinalIgnoreCase))
2320
{
24-
"csv" => "csv",
25-
"tsv" => "tsv",
26-
"tab-separated-values" => "tsv",
27-
"psv" => "psv",
28-
_ => null
29-
};
21+
mediaType = mediaType.ToLowerInvariant().Substring(5).Split([';'])[0];
22+
format = mediaType switch
23+
{
24+
"csv" => "csv",
25+
"tsv" => "tsv",
26+
"tab-separated-values" => "tsv",
27+
"psv" => "psv",
28+
_ => null
29+
};
30+
}
31+
else
32+
{
33+
return false;
34+
}
3035

3136
return TryInferFromFormat(format, out dialect);
3237
}

0 commit comments

Comments
 (0)