1+ /*
2+ * Licensed to the Apache Software Foundation (ASF) under one
3+ * or more contributor license agreements. See the NOTICE file
4+ * distributed with this work for additional information
5+ * regarding copyright ownership. The ASF licenses this file
6+ * to you under the Apache License, Version 2.0 (the
7+ * "License"); you may not use this file except in compliance
8+ * with the License. You may obtain a copy of the License at
9+ *
10+ * http://www.apache.org/licenses/LICENSE-2.0
11+ *
12+ * Unless required by applicable law or agreed to in writing,
13+ * software distributed under the License is distributed on an
14+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+ * KIND, either express or implied. See the License for the
16+ * specific language governing permissions and limitations
17+ * under the License.
18+ */
19+ package org .apache .sysds .runtime .io ;
20+
21+ import java .io .IOException ;
22+ import java .io .InputStream ;
23+
24+ import org .apache .hadoop .conf .Configuration ;
25+ import org .apache .hadoop .fs .Path ;
26+ import org .apache .parquet .example .data .Group ;
27+ import org .apache .parquet .hadoop .ParquetFileReader ;
28+ import org .apache .parquet .hadoop .ParquetReader ;
29+ import org .apache .parquet .hadoop .example .GroupReadSupport ;
30+ import org .apache .parquet .hadoop .metadata .ParquetMetadata ;
31+ import org .apache .parquet .hadoop .util .HadoopInputFile ;
32+ import org .apache .parquet .schema .MessageType ;
33+ import org .apache .parquet .schema .PrimitiveType ;
34+ import org .apache .sysds .common .Types .ValueType ;
35+ import org .apache .sysds .conf .ConfigurationManager ;
36+ import org .apache .sysds .runtime .DMLRuntimeException ;
37+ import org .apache .sysds .runtime .frame .data .FrameBlock ;
38+ import org .apache .sysds .runtime .util .HDFSTool ;
39+
40+ /**
41+ * Single-threaded frame parquet reader.
42+ *
43+ */
44+ public class FrameReaderParquet extends FrameReader {
45+
46+ /**
47+ * Reads a Parquet file from HDFS and converts it into a FrameBlock.
48+ *
49+ * @param fname The HDFS file path to the Parquet file.
50+ * @param schema The expected data types of the columns.
51+ * @param names The names of the columns.
52+ * @param rlen The expected number of rows.
53+ * @param clen The expected number of columns.
54+ * @return A FrameBlock containing the data read from the Parquet file.
55+ */
56+ @ Override
57+ public FrameBlock readFrameFromHDFS (String fname , ValueType [] schema , String [] names , long rlen , long clen ) throws IOException , DMLRuntimeException {
58+ // Prepare file access
59+ Configuration conf = ConfigurationManager .getCachedJobConf ();
60+ Path path = new Path (fname );
61+
62+ // Check existence and non-empty file
63+ if (!HDFSTool .existsFileOnHDFS (path .toString ())) {
64+ throw new IOException ("File does not exist on HDFS: " + fname );
65+ }
66+
67+ // Allocate output frame block
68+ ValueType [] lschema = createOutputSchema (schema , clen );
69+ String [] lnames = createOutputNames (names , clen );
70+ FrameBlock ret = createOutputFrameBlock (lschema , lnames , rlen );
71+
72+ // Read Parquet file
73+ readParquetFrameFromHDFS (path , conf , ret , lschema , rlen , clen );
74+
75+ return ret ;
76+ }
77+
78+ /**
79+ * Reads data from a Parquet file on HDFS and fills the provided FrameBlock.
80+ * The method retrieves the Parquet schema from the file footer, maps the required column names
81+ * to their corresponding indices, and then uses a ParquetReader to iterate over each row.
82+ * Data is extracted based on the column type and set into the output FrameBlock.
83+ *
84+ * @param path The HDFS path to the Parquet file.
85+ * @param conf The Hadoop configuration.
86+ * @param dest The FrameBlock to populate with data.
87+ * @param schema The expected value types for the output columns.
88+ * @param rlen The expected number of rows.
89+ * @param clen The expected number of columns.
90+ */
91+ protected void readParquetFrameFromHDFS (Path path , Configuration conf , FrameBlock dest , ValueType [] schema , long rlen , long clen ) throws IOException {
92+ // Retrieve schema from Parquet footer
93+ ParquetMetadata metadata = ParquetFileReader .open (HadoopInputFile .fromPath (path , conf )).getFooter ();
94+ MessageType parquetSchema = metadata .getFileMetaData ().getSchema ();
95+
96+ // Map column names to Parquet schema indices
97+ String [] columnNames = dest .getColumnNames ();
98+ int [] columnIndices = new int [columnNames .length ];
99+ for (int i = 0 ; i < columnNames .length ; i ++) {
100+ columnIndices [i ] = parquetSchema .getFieldIndex (columnNames [i ]);
101+ }
102+
103+ // Read data usind ParquetReader
104+ try (ParquetReader <Group > rowReader = ParquetReader .builder (new GroupReadSupport (), path )
105+ .withConf (conf )
106+ .build ()) {
107+
108+ Group group ;
109+ int row = 0 ;
110+ while ((group = rowReader .read ()) != null ) {
111+ for (int col = 0 ; col < clen ; col ++) {
112+ int colIndex = columnIndices [col ];
113+ if (group .getFieldRepetitionCount (colIndex ) > 0 ) {
114+ PrimitiveType .PrimitiveTypeName type = parquetSchema .getType (columnNames [col ]).asPrimitiveType ().getPrimitiveTypeName ();
115+ switch (type ) {
116+ case INT32 :
117+ dest .set (row , col , group .getInteger (colIndex , 0 ));
118+ break ;
119+ case INT64 :
120+ dest .set (row , col , group .getLong (colIndex , 0 ));
121+ break ;
122+ case FLOAT :
123+ dest .set (row , col , group .getFloat (colIndex , 0 ));
124+ break ;
125+ case DOUBLE :
126+ dest .set (row , col , group .getDouble (colIndex , 0 ));
127+ break ;
128+ case BOOLEAN :
129+ dest .set (row , col , group .getBoolean (colIndex , 0 ));
130+ break ;
131+ case BINARY :
132+ dest .set (row , col , group .getBinary (colIndex , 0 ).toStringUsingUTF8 ());
133+ break ;
134+ default :
135+ throw new IOException ("Unsupported data type: " + type );
136+ }
137+ } else {
138+ dest .set (row , col , null );
139+ }
140+ }
141+ row ++;
142+ }
143+
144+ // Check frame dimensions
145+ if (row != rlen ) {
146+ throw new IOException ("Mismatch in row count: expected " + rlen + ", but got " + row );
147+ }
148+ }
149+ }
150+
151+ //not implemented
152+ @ Override
153+ public FrameBlock readFrameFromInputStream (InputStream is , ValueType [] schema , String [] names , long rlen , long clen )
154+ throws IOException , DMLRuntimeException {
155+ throw new UnsupportedOperationException ("Unimplemented method 'readFrameFromInputStream'" );
156+ }
157+ }
0 commit comments