Skip to content

Commit 8a699ac

Browse files
committed
feat: add file reader interface
1 parent a14c2b7 commit 8a699ac

File tree

4 files changed

+170
-0
lines changed

4 files changed

+170
-0
lines changed

src/iceberg/file_format.h

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing,
13+
* software distributed under the License is distributed on an
14+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
* KIND, either express or implied. See the License for the
16+
* specific language governing permissions and limitations
17+
* under the License.
18+
*/
19+
20+
#pragma once
21+
22+
/// \file iceberg/file_format.h
23+
/// File format used by Iceberg.
24+
25+
#include "iceberg/iceberg_export.h"
26+
27+
namespace iceberg {
28+
29+
/// \brief File format type
30+
enum class ICEBERG_EXPORT FileFormatType {
31+
kParquet,
32+
kAvro,
33+
kOrc,
34+
kPuffin,
35+
kMetadata,
36+
};
37+
38+
} // namespace iceberg

src/iceberg/file_reader.h

Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,81 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing,
13+
* software distributed under the License is distributed on an
14+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
* KIND, either express or implied. See the License for the
16+
* specific language governing permissions and limitations
17+
* under the License.
18+
*/
19+
20+
#pragma once
21+
22+
/// \file iceberg/file_reader.h
23+
/// Reader interface for file formats like Parquet, Avro and ORC.
24+
25+
#include <functional>
#include <memory>
#include <variant>
27+
28+
#include "iceberg/arrow_c_data.h"
29+
#include "iceberg/result.h"
30+
#include "iceberg/type_fwd.h"
31+
32+
namespace iceberg {
33+
34+
/// \brief Base reader class to read data from different file formats.
35+
class ICEBERG_EXPORT Reader {
36+
public:
37+
virtual ~Reader() = default;
38+
39+
/// \brief Read next data from file.
40+
///
41+
/// \return std::monostate if the reader has no more data, otherwise `ArrowArray` or
42+
/// `StructLike` depending on the data layout by the reader implementation.
43+
using Data =
44+
std::variant<std::monostate, ArrowArray, std::reference_wrapper<const StructLike>>;
45+
virtual Result<Data> Next() = 0;
46+
47+
enum class DataLayout { kArrowArray, kStructLike };
48+
49+
/// \brief Get the data layout returned by `Next()` of the reader.
50+
virtual DataLayout data_layout() const = 0;
51+
};
52+
53+
/// \brief Wrapper of `Reader` to always return `StructLike`.
54+
///
55+
/// If the data layout of the wrapped reader is `ArrowArray`, the data will be converted
56+
/// to `StructLike`; otherwise, the data will be returned as is without any cost.
57+
class ICEBERG_EXPORT StructLikeReader : public Reader {
58+
public:
59+
explicit StructLikeReader(std::unique_ptr<Reader> reader);
60+
61+
/// \brief Always read data into `StructLike` or monostate if no more data.
62+
Result<Data> Next() final;
63+
64+
DataLayout data_layout() const final { return DataLayout::kStructLike; }
65+
};
66+
67+
/// \brief Wrapper of `Reader` to always return `ArrowArray`.
68+
///
69+
/// If the data layout of the wrapped reader is `StructLike`, the data will be converted
70+
/// to `ArrowArray`; otherwise, the data will be returned as is without any cost.
71+
class ICEBERG_EXPORT BatchReader : public Reader {
72+
public:
73+
explicit BatchReader(std::unique_ptr<Reader> reader);
74+
75+
/// \brief Always read data into `ArrowArray` or monostate if no more data.
76+
Result<Data> Next() final;
77+
78+
DataLayout data_layout() const final { return DataLayout::kArrowArray; }
79+
};
80+
81+
} // namespace iceberg

src/iceberg/manifest_reader.h

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing,
13+
* software distributed under the License is distributed on an
14+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
* KIND, either express or implied. See the License for the
16+
* specific language governing permissions and limitations
17+
* under the License.
18+
*/
19+
20+
#pragma once
21+
22+
/// \file iceberg/manifest_reader.h
23+
/// Data reader interface for manifest files.
24+
25+
#include <memory>
26+
#include <span>
27+
28+
#include "iceberg/file_reader.h"
29+
30+
namespace iceberg {
31+
32+
/// \brief Read manifest entries from a manifest file.
33+
class ICEBERG_EXPORT ManifestReader {
34+
public:
35+
virtual Result<std::span<std::unique_ptr<class ManifestEntry>>> Entries() const = 0;
36+
37+
private:
38+
std::unique_ptr<StructLikeReader> reader_;
39+
};
40+
41+
/// \brief Read manifest files from a manifest list file.
42+
class ICEBERG_EXPORT ManifestListReader {
43+
public:
44+
virtual Result<std::span<std::unique_ptr<class ManifestFile>>> Files() const = 0;
45+
46+
private:
47+
std::unique_ptr<StructLikeReader> reader_;
48+
};
49+
50+
} // namespace iceberg

src/iceberg/type_fwd.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -101,6 +101,7 @@ struct SnapshotRef;
101101
struct StatisticsFile;
102102
struct TableMetadata;
103103

104+
enum class FileFormatType;
104105
enum class SnapshotRefType;
105106
enum class TransformType;
106107

0 commit comments

Comments
 (0)