
Commit a470a34

Initial pass at adding ORC to Iceberg.
Known problems:

* Doesn't do schema evolution.
* Doesn't include column size metrics.
* Doesn't properly handle timestamp with timezone.
* Doesn't do the schema mangling for partitions.
1 parent: b82956a

20 files changed (+2821, -169)


.gitignore

Lines changed: 2 additions & 1 deletion
@@ -3,4 +3,5 @@
 *.iml
 # gradle build
 .gradle
-build
+build
+out

api/src/main/java/com/netflix/iceberg/FileFormat.java

Lines changed: 1 addition & 0 deletions
@@ -22,6 +22,7 @@
  * Enum of supported file formats.
  */
 public enum FileFormat {
+  ORC("orc"),
   PARQUET("parquet"),
   AVRO("avro");

api/src/main/java/com/netflix/iceberg/UpdateProperties.java

Lines changed: 6 additions & 0 deletions
@@ -47,4 +47,10 @@ public interface UpdateProperties extends PendingUpdate<Map<String, String>> {
    */
   UpdateProperties remove(String key);
 
+  /**
+   * Set the default file format for the table.
+   * @param format the file format to use for new data files
+   * @return this for method chaining
+   */
+  UpdateProperties format(FileFormat format);
 }

build.gradle

Lines changed: 17 additions & 0 deletions
@@ -60,6 +60,7 @@ subprojects {
 
   ext {
     avroVersion = '1.8.2'
+    orcVersion = '1.4.2'
     parquetVersion = '1.9.1-SNAPSHOT'
 
     jacksonVersion = '2.6.7'
@@ -114,6 +115,19 @@ project(':iceberg-core') {
   }
 }
 
+project(':iceberg-orc') {
+  dependencies {
+    compile project(':iceberg-api')
+    compile project(':iceberg-core')
+
+    compile "org.apache.orc:orc-core:$orcVersion:nohive"
+
+    compileOnly('org.apache.hadoop:hadoop-client:2.7.3') {
+      exclude group: 'org.apache.avro', module: 'avro'
+    }
+  }
+}
+
 project(':iceberg-parquet') {
   dependencies {
     compile project(':iceberg-api')
@@ -137,6 +151,7 @@ project(':iceberg-spark') {
     compile project(':iceberg-common')
     compile project(':iceberg-avro')
     compile project(':iceberg-core')
+    compile project(':iceberg-orc')
     compile project(':iceberg-parquet')
 
     compileOnly "org.apache.avro:avro:$avroVersion"
@@ -174,10 +189,12 @@ project(':iceberg-runtime') {
     shadow project(':iceberg-common')
     shadow project(':iceberg-avro')
     shadow project(':iceberg-core')
+    shadow project(':iceberg-orc')
     shadow project(':iceberg-parquet')
     shadow project(':iceberg-spark')
 
     shadow "org.apache.avro:avro:$avroVersion"
+    shadow "org.apache.orc:orc-core:$orcVersion:nohive"
     shadow "org.apache.parquet:parquet-avro:$parquetVersion"
   }

core/src/main/java/com/netflix/iceberg/PropertiesUpdate.java

Lines changed: 6 additions & 0 deletions
@@ -68,6 +68,12 @@ public UpdateProperties remove(String key) {
     return this;
   }
 
+  @Override
+  public UpdateProperties format(FileFormat format) {
+    set(TableProperties.DEFAULT_FILE_FORMAT, format.name());
+    return this;
+  }
+
   @Override
   public Map<String, String> apply() {
     this.base = ops.refresh();
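
Together with the new FileFormat.ORC constant, this lets a writer record and recover a table's default format. A minimal sketch of the round trip, assuming a Table handle named `table`; the AVRO fallback below is an illustrative assumption, not something this commit specifies:

// set the default: format() stores format.name() under DEFAULT_FILE_FORMAT
table.updateProperties()
    .format(FileFormat.ORC)
    .commit();

// read it back; the fallback value here is assumed for illustration
String name = table.properties()
    .getOrDefault(TableProperties.DEFAULT_FILE_FORMAT, FileFormat.AVRO.name());
FileFormat format = FileFormat.valueOf(name);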
orc/src/main/java/com/netflix/iceberg/orc/ORC.java

Lines changed: 154 additions & 0 deletions

/*
 * Copyright 2018 Hortonworks
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.netflix.iceberg.orc;

import com.google.common.base.Preconditions;
import com.netflix.iceberg.PartitionSpec;
import com.netflix.iceberg.Schema;
import com.netflix.iceberg.io.InputFile;
import com.netflix.iceberg.io.OutputFile;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.orc.OrcFile;
import org.apache.orc.Reader;
import org.apache.orc.TypeDescription;

import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Properties;

public class ORC {
  private ORC() {
  }

  public static WriteBuilder write(OutputFile file) {
    return new WriteBuilder(file);
  }

  public static class WriteBuilder {
    private final OutputFile file;
    private Schema schema = null;
    private PartitionSpec spec = null;
    private Configuration conf = null;
    private final Properties tableProperties = new Properties();
    private Map<String, byte[]> metadata = new HashMap<>();

    private WriteBuilder(OutputFile file) {
      this.file = file;
    }

    public WriteBuilder partitionSpec(PartitionSpec spec) {
      this.spec = spec;
      return this;
    }

    public WriteBuilder metadata(String property, String value) {
      metadata.put(property, value.getBytes(StandardCharsets.UTF_8));
      return this;
    }

    public WriteBuilder tableProperties(Properties properties) {
      tableProperties.putAll(properties);
      return this;
    }

    public WriteBuilder schema(Schema schema) {
      this.schema = schema;
      return this;
    }

    public WriteBuilder conf(Configuration conf) {
      this.conf = conf;
      return this;
    }

    public OrcFileAppender build() {
      Preconditions.checkNotNull(schema, "Schema is required");
      if (conf == null) {
        conf = new Configuration();
      }
      OrcFile.WriterOptions options =
          OrcFile.writerOptions(tableProperties, conf);
      return new OrcFileAppender(schema, spec, file, options, metadata);
    }
  }

  public static ReadBuilder read(InputFile file) {
    return new ReadBuilder(file);
  }

  public static class ReadBuilder {
    private final InputFile file;
    private com.netflix.iceberg.Schema schema = null;
    private Long start = null;
    private Long length = null;
    private Configuration conf = null;

    private ReadBuilder(InputFile file) {
      Preconditions.checkNotNull(file, "Input file cannot be null");
      this.file = file;
    }

    /**
     * Restricts the read to the given range: [start, start + length).
     *
     * @param start the start position for this read
     * @param length the length of the range this read should scan
     * @return this builder for method chaining
     */
    public ReadBuilder split(long start, long length) {
      this.start = start;
      this.length = length;
      return this;
    }

    public ReadBuilder schema(com.netflix.iceberg.Schema schema) {
      this.schema = schema;
      return this;
    }

    public ReadBuilder conf(Configuration conf) {
      this.conf = conf;
      return this;
    }

    public OrcIterator build() {
      Preconditions.checkNotNull(schema, "Schema is required");
      try {
        Path path = new Path(file.location());
        if (conf == null) {
          conf = new Configuration();
        }
        Reader reader = OrcFile.createReader(path, OrcFile.readerOptions(conf));
        List<Integer> columnIds = new ArrayList<>();
        TypeDescription orcSchema = TypeConversion.toOrc(schema, columnIds);
        Reader.Options options = reader.options();
        if (start != null) {
          options.range(start, length);
        }
        options.schema(orcSchema);
        return new OrcIterator(path, orcSchema, reader.rows(options));
      } catch (IOException e) {
        throw new RuntimeException("Can't open " + file.location(), e);
      }
    }
  }
}
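
A minimal usage sketch of the two builders above, assuming an OutputFile/InputFile pair, an Iceberg Schema, and a populated VectorizedRowBatch are already in hand; all names here are illustrative:

import com.netflix.iceberg.Schema;
import com.netflix.iceberg.io.InputFile;
import com.netflix.iceberg.io.OutputFile;
import org.apache.orc.storage.ql.exec.vector.VectorizedRowBatch;

import java.io.IOException;

class OrcRoundTrip {
  static void roundTrip(OutputFile outFile, InputFile inFile,
                        Schema schema, VectorizedRowBatch batch) throws IOException {
    // write side: the builder wires the schema and user metadata into the appender
    OrcFileAppender appender = ORC.write(outFile)
        .schema(schema)
        .metadata("written.by", "example")  // stored as ORC user metadata
        .build();
    appender.add(batch);
    appender.close();

    // read side: OrcIterator streams row batches back from the file
    OrcIterator rows = ORC.read(inFile)
        .schema(schema)
        .split(0, Long.MAX_VALUE)  // optional: restrict the read to a byte range
        .build();
  }
}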
orc/src/main/java/com/netflix/iceberg/orc/OrcFileAppender.java

Lines changed: 117 additions & 0 deletions

/*
 * Copyright 2018 Hortonworks
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.netflix.iceberg.orc;

import com.netflix.iceberg.Metrics;
import com.netflix.iceberg.PartitionSpec;
import com.netflix.iceberg.Schema;
import com.netflix.iceberg.io.FileAppender;
import com.netflix.iceberg.io.OutputFile;
import org.apache.hadoop.fs.Path;
import org.apache.orc.ColumnStatistics;
import org.apache.orc.OrcFile;
import org.apache.orc.TypeDescription;
import org.apache.orc.Writer;
import org.apache.orc.storage.ql.exec.vector.VectorizedRowBatch;

import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

/**
 * A file appender that writes VectorizedRowBatch values to an ORC file.
 */
public class OrcFileAppender implements FileAppender<VectorizedRowBatch> {
  private final Writer writer;
  private final TypeDescription orcSchema;
  private final List<Integer> columnIds = new ArrayList<>();
  private final Path path;

  public static final String COLUMN_NUMBERS_ATTRIBUTE = "iceberg.column.ids";

  static ByteBuffer buildIdString(List<Integer> list) {
    StringBuilder buffer = new StringBuilder();
    for (int i = 0; i < list.size(); ++i) {
      if (i != 0) {
        buffer.append(',');
      }
      buffer.append(list.get(i));
    }
    return ByteBuffer.wrap(buffer.toString().getBytes(StandardCharsets.UTF_8));
  }

  OrcFileAppender(Schema schema,
                  PartitionSpec spec,
                  OutputFile file,
                  OrcFile.WriterOptions options,
                  Map<String, byte[]> metadata) {
    orcSchema = TypeConversion.toOrc(schema, columnIds);
    options.setSchema(orcSchema);
    path = new Path(file.location());
    try {
      writer = OrcFile.createWriter(path, options);
    } catch (IOException e) {
      throw new RuntimeException("Can't create file " + path, e);
    }
    writer.addUserMetadata(COLUMN_NUMBERS_ATTRIBUTE, buildIdString(columnIds));
    metadata.forEach(
        (key, value) -> writer.addUserMetadata(key, ByteBuffer.wrap(value)));
  }

  @Override
  public void add(VectorizedRowBatch datum) {
    try {
      writer.addRowBatch(datum);
    } catch (IOException e) {
      throw new RuntimeException("Problem writing to ORC file " + path, e);
    }
  }

  @Override
  public Metrics metrics() {
    try {
      long rows = writer.getNumberOfRows();
      ColumnStatistics[] stats = writer.getStatistics();
      // we don't currently have columnSizes or distinct counts.
      Map<Integer, Long> valueCounts = new HashMap<>();
      Map<Integer, Long> nullCounts = new HashMap<>();
      for (int c = 1; c < stats.length; ++c) {
        int fieldId = columnIds.get(c);
        valueCounts.put(fieldId, stats[c].getNumberOfValues());
      }
      for (TypeDescription child : orcSchema.getChildren()) {
        int c = child.getId();
        int fieldId = columnIds.get(c);
        nullCounts.put(fieldId, rows - stats[c].getNumberOfValues());
      }
      return new Metrics(rows, null, valueCounts, nullCounts);
    } catch (IOException e) {
      throw new RuntimeException("Can't get statistics " + path, e);
    }
  }

  @Override
  public void close() throws IOException {
    writer.close();
  }

  public TypeDescription getSchema() {
    return orcSchema;
  }
}
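
Because the appender stores the Iceberg field ids as ORC user metadata under COLUMN_NUMBERS_ATTRIBUTE, a reader can map ORC column statistics back to Iceberg fields. A sketch of the recovery side, assuming an open org.apache.orc.Reader; the helper itself is illustrative and not part of this commit:

import org.apache.orc.Reader;

import java.nio.ByteBuffer;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;

class ColumnIdReader {
  // parse the comma-separated id list written by buildIdString;
  // index i holds the Iceberg field id for ORC column id i
  static List<Integer> readColumnIds(Reader reader) {
    ByteBuffer raw = reader.getMetadataValue(OrcFileAppender.COLUMN_NUMBERS_ATTRIBUTE);
    String csv = StandardCharsets.UTF_8.decode(raw).toString();
    List<Integer> ids = new ArrayList<>();
    for (String part : csv.split(",")) {
      ids.add(Integer.parseInt(part));
    }
    return ids;
  }
}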
