Skip to content

Commit 694c619

Browse files
committed
feat: add file reader interface
1 parent a14c2b7 commit 694c619

File tree

3 files changed

+170
-0
lines changed

3 files changed

+170
-0
lines changed

src/iceberg/file_format.h

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing,
13+
* software distributed under the License is distributed on an
14+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
* KIND, either express or implied. See the License for the
16+
* specific language governing permissions and limitations
17+
* under the License.
18+
*/
19+
20+
#pragma once
21+
22+
/// \file iceberg/file_format.h
23+
/// File format used by Iceberg.
24+
25+
#include "iceberg/iceberg_export.h"
26+
27+
namespace iceberg {
28+
29+
/// \brief File format type
30+
enum class ICEBERG_EXPORT FileFormatType {
31+
kParquet,
32+
kAvro,
33+
kOrc,
34+
kPuffin,
35+
kMetadata,
36+
};
37+
38+
} // namespace iceberg

src/iceberg/file_reader.h

Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,82 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing,
13+
* software distributed under the License is distributed on an
14+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
* KIND, either express or implied. See the License for the
16+
* specific language governing permissions and limitations
17+
* under the License.
18+
*/
19+
20+
#pragma once
21+
22+
/// \file iceberg/file_reader.h
23+
/// Reader interface for file formats like Parquet, Avro and ORC.
24+
25+
#include <functional>
26+
#include <memory>
27+
#include <variant>
28+
29+
#include "iceberg/arrow_c_data.h"
30+
#include "iceberg/result.h"
31+
#include "iceberg/type_fwd.h"
32+
33+
namespace iceberg {
34+
35+
/// \brief Base reader class to read data from different file formats.
36+
class ICEBERG_EXPORT Reader {
37+
public:
38+
virtual ~Reader() = default;
39+
40+
/// \brief Read next data from file.
41+
///
42+
/// \return std::monostate if the reader has no more data, otherwise `ArrowArray` or
43+
/// `StructLike` depending on the data layout by the reader implementation.
44+
using Data =
45+
std::variant<std::monostate, ArrowArray, std::reference_wrapper<const StructLike>>;
46+
virtual Result<Data> Next() = 0;
47+
48+
enum class DataLayout { kArrowArray, kStructLike };
49+
50+
/// \brief Get the data layout returned by `Next()` of the reader.
51+
virtual DataLayout data_layout() const = 0;
52+
};
53+
54+
/// \brief Wrapper of `Reader` to always return `StructLike`.
55+
///
56+
/// If the data layout of the wrapped reader is `ArrowArray`, the data will be converted
57+
/// to `StructLike`; otherwise, the data will be returned as is without any cost.
58+
class ICEBERG_EXPORT StructLikeReader : public Reader {
59+
public:
60+
explicit StructLikeReader(std::unique_ptr<Reader> reader);
61+
62+
/// \brief Always read data into `StructLike` or monostate if no more data.
63+
Result<Data> Next() final;
64+
65+
DataLayout data_layout() const final { return DataLayout::kStructLike; }
66+
};
67+
68+
/// \brief Wrapper of `Reader` to always return `ArrowArray`.
69+
///
70+
/// If the data layout of the wrapped reader is `StructLike`, the data will be converted
71+
/// to `ArrowArray`; otherwise, the data will be returned as is without any cost.
72+
class ICEBERG_EXPORT BatchReader : public Reader {
73+
public:
74+
explicit BatchReader(std::unique_ptr<Reader> reader);
75+
76+
/// \brief Always read data into `ArrowArray` or monostate if no more data.
77+
Result<Data> Next() final;
78+
79+
DataLayout data_layout() const final { return DataLayout::kArrowArray; }
80+
};
81+
82+
} // namespace iceberg

src/iceberg/manifest_reader.h

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing,
13+
* software distributed under the License is distributed on an
14+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
* KIND, either express or implied. See the License for the
16+
* specific language governing permissions and limitations
17+
* under the License.
18+
*/
19+
20+
#pragma once
21+
22+
/// \file iceberg/manifest_reader.h
23+
/// Data reader interface for manifest files.
24+
25+
#include <memory>
26+
#include <span>
27+
28+
#include "iceberg/file_reader.h"
29+
30+
namespace iceberg {
31+
32+
/// \brief Read manifest entries from a manifest file.
33+
class ICEBERG_EXPORT ManifestReader {
34+
public:
35+
virtual Result<std::span<std::unique_ptr<class ManifestEntry>>> Entries() const = 0;
36+
37+
private:
38+
std::unique_ptr<StructLikeReader> reader_;
39+
};
40+
41+
/// \brief Read manifest files from a manifest list file.
42+
class ICEBERG_EXPORT ManifestListReader {
43+
public:
44+
virtual Result<std::span<std::unique_ptr<class ManifestFile>>> Files() const = 0;
45+
46+
private:
47+
std::unique_ptr<StructLikeReader> reader_;
48+
};
49+
50+
} // namespace iceberg

0 commit comments

Comments
 (0)