@@ -32,14 +32,14 @@ import org.apache.hadoop.conf.Configuration
 import org.apache.hadoop.fs.{FileStatus, Path}
 import org.apache.hadoop.mapreduce.Job

-import org.apache.spark.TaskContext
+import org.apache.spark.{SparkException, TaskContext}
 import org.apache.spark.internal.Logging
 import org.apache.spark.sql.SparkSession
 import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.execution.datasources.{FileFormat, OutputWriterFactory, PartitionedFile}
 import org.apache.spark.sql.sources.{DataSourceRegister, Filter}
 import org.apache.spark.sql.types.StructType
-import org.apache.spark.util.SerializableConfiguration
+import org.apache.spark.util.{SerializableConfiguration, Utils}

 private[avro] class AvroFileFormat extends FileFormat
   with DataSourceRegister with Logging with Serializable {
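
The import changes pull in SparkException, thrown when an unreadable file is hit and corrupt files are not being ignored, and Spark's internal Utils, whose tryWithResource implements the classic loan pattern used later in the patch. Utils is private[spark], so the sketch below is a hand-rolled equivalent for illustration, not the actual Spark source:

import java.io.Closeable

// Illustrative loan pattern: acquire a resource, hand it to a function,
// and close it no matter how the function exits.
def tryWithResource[R <: Closeable, T](createResource: => R)(f: R => T): T = {
  val resource = createResource
  try f(resource) finally resource.close()
}
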
@@ -59,36 +59,13 @@ private[avro] class AvroFileFormat extends FileFormat
     val conf = spark.sessionState.newHadoopConf()
     val parsedOptions = new AvroOptions(options, conf)

-    // Schema evolution is not supported yet. Here we only pick a single random sample file to
-    // figure out the schema of the whole dataset.
-    val sampleFile =
-      if (parsedOptions.ignoreExtension) {
-        files.headOption.getOrElse {
-          throw new FileNotFoundException("Files for schema inferring have been not found.")
-        }
-      } else {
-        files.find(_.getPath.getName.endsWith(".avro")).getOrElse {
-          throw new FileNotFoundException(
-            "No Avro files found. If files don't have .avro extension, set ignoreExtension to true")
-        }
-      }
-
     // User can specify an optional avro json schema.
     val avroSchema = parsedOptions.schema
       .map(new Schema.Parser().parse)
       .getOrElse {
-        val in = new FsInput(sampleFile.getPath, conf)
-        try {
-          val reader = DataFileReader.openReader(in, new GenericDatumReader[GenericRecord]())
-          try {
-            reader.getSchema
-          } finally {
-            reader.close()
-          }
-        } finally {
-          in.close()
-        }
-      }
+        inferAvroSchemaFromFiles(files, conf, parsedOptions.ignoreExtension,
+          spark.sessionState.conf.ignoreCorruptFiles)
+      }

     SchemaConverters.toSqlType(avroSchema).dataType match {
       case t: StructType => Some(t)
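
With the inline sampling logic replaced by a single call, inferSchema stays small and the interesting behavior moves into inferAvroSchemaFromFiles, added in the next hunk. One detail worth calling out: the helper is built on files.iterator.map { ... }.collectFirst { ... }, and collectFirst on an iterator is lazy, so only files up to the first readable one are ever opened. (It is also why returning the DataFileReader out of the loan block is fine: Avro parses the file header, which carries the schema, eagerly when the reader is opened, so getSchema still works after the underlying FsInput is closed.) A small self-contained sketch of the laziness property, with toy values rather than the Spark code:

import scala.collection.mutable.ArrayBuffer

// collectFirst on an iterator stops consuming at the first match,
// so elements after the first readable "file" are never evaluated.
val probed = ArrayBuffer.empty[Int]
val first = Iterator(1, 2, 3, 4)
  .map { i => probed += i; if (i >= 2) Some(i) else None }
  .collectFirst { case Some(i) => i }
// first == Some(2); probed == ArrayBuffer(1, 2)
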
@@ -100,6 +77,51 @@ private[avro] class AvroFileFormat extends FileFormat
     }
   }

+  private def inferAvroSchemaFromFiles(
+      files: Seq[FileStatus],
+      conf: Configuration,
+      ignoreExtension: Boolean,
+      ignoreCorruptFiles: Boolean): Schema = {
+    // Schema evolution is not supported yet. Here we only pick first random readable sample file to
+    // figure out the schema of the whole dataset.
+    val avroReader = files.iterator.map { f =>
+      val path = f.getPath
+      if (!ignoreExtension && !path.getName.endsWith(".avro")) {
+        None
+      } else {
+        Utils.tryWithResource {
+          new FsInput(path, conf)
+        } { in =>
+          try {
+            Some(DataFileReader.openReader(in, new GenericDatumReader[GenericRecord]()))
+          } catch {
+            case e: IOException =>
+              if (ignoreCorruptFiles) {
+                logWarning(s"Skipped the footer in the corrupted file: $path", e)
+                None
+              } else {
+                throw new SparkException(s"Could not read file: $path", e)
+              }
+          }
+        }
+      }
+    }.collectFirst {
+      case Some(reader) => reader
+    }
+
+    avroReader match {
+      case Some(reader) =>
+        try {
+          reader.getSchema
+        } finally {
+          reader.close()
+        }
+      case None =>
+        throw new FileNotFoundException(
+          "No Avro files found. If files don't have .avro extension, set ignoreExtension to true")
+    }
+  }
+
   override def shortName(): String = "avro"

   override def isSplitable(
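
Taken together, the patch changes what users see when a directory contains an unreadable Avro file: with spark.sql.files.ignoreCorruptFiles set to true, schema inference logs a warning and moves on to the next candidate file; with it unset (the default), the read fails fast with a SparkException naming the offending path. A hedged usage sketch, assuming the "avro" short name is registered and using a made-up input path:

import org.apache.spark.sql.SparkSession

// Sketch only: the path is hypothetical; the behavior described (skipping a
// corrupt sample file during schema inference) is what this patch introduces.
val spark = SparkSession.builder()
  .master("local[*]")
  .appName("avro-ignore-corrupt")
  .config("spark.sql.files.ignoreCorruptFiles", "true")
  .getOrCreate()

val df = spark.read.format("avro").load("/path/to/avro-dir")  // hypothetical path
df.printSchema()  // schema comes from the first readable sample file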