@@ -1,25 +1,16 @@
 package io.cdap.plugin.gcp.gcs.sink;

-import com.google.cloud.storage.Blob;
-import com.google.common.annotations.VisibleForTesting;
 import io.cdap.cdap.api.data.format.StructuredRecord;
 import io.cdap.cdap.etl.api.validation.FormatContext;
 import io.cdap.cdap.etl.api.validation.ValidatingOutputFormat;
-import io.cdap.plugin.gcp.common.GCPUtils;
-import io.cdap.plugin.gcp.gcs.StorageClient;
 import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.io.NullWritable;
 import org.apache.hadoop.mapreduce.JobContext;
-import org.apache.hadoop.mapreduce.JobStatus;
 import org.apache.hadoop.mapreduce.OutputCommitter;
 import org.apache.hadoop.mapreduce.OutputFormat;
 import org.apache.hadoop.mapreduce.RecordWriter;
 import org.apache.hadoop.mapreduce.TaskAttemptContext;
-import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter;
 import org.apache.hadoop.util.ReflectionUtils;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;

 import java.io.IOException;
 import java.util.HashMap;
@@ -29,8 +20,6 @@
  * OutputFormatProvider for GCSSink
  */
 public class GCSOutputFormatProvider implements ValidatingOutputFormat {
-
-  private static final Logger LOG = LoggerFactory.getLogger(GCSOutputFormatProvider.class);
   private static final String DELEGATE_OUTPUTFORMAT_CLASSNAME = "gcssink.delegate.outputformat.classname";
   private static final String OUTPUT_FOLDER = "gcssink.metric.output.folder";
   public static final String RECORD_COUNT_FORMAT = "recordcount.%s";
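
A note on the kept RECORD_COUNT_FORMAT constant: it builds a per-task Configuration key under which the sink's record writer stashes its count and the committer (removed in the next hunk) reads it back. A minimal sketch of that round trip, assuming the writer side uses Configuration.setLong (the writer is not shown in this hunk, and the task attempt id below is illustrative):

    import org.apache.hadoop.conf.Configuration;

    public class RecordCountKeyDemo {
      public static void main(String[] args) {
        Configuration conf = new Configuration();
        // Mirrors RECORD_COUNT_FORMAT = "recordcount.%s"; the attempt id is illustrative.
        String key = String.format("recordcount.%s", "attempt_1700000000000_0001_r_000000_0");
        conf.setLong(key, 42L);                    // writer side: stash the count
        System.out.println(conf.getLong(key, 0L)); // committer side: read it back (prints 42)
      }
    }
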
@@ -102,133 +91,6 @@ public OutputCommitter getOutputCommitter(TaskAttemptContext taskAttemptContext) |
     }
   }

-  /**
-   * OutputCommitter for GCS
-   */
-  public static class GCSOutputCommitter extends OutputCommitter {
-
-    private final OutputCommitter delegate;
-
-    public GCSOutputCommitter(OutputCommitter delegate) {
-      this.delegate = delegate;
-    }
-
-    @Override
-    public void setupJob(JobContext jobContext) throws IOException {
-      delegate.setupJob(jobContext);
-    }
-
-    @Override
-    public void cleanupJob(JobContext jobContext) throws IOException {
-      delegate.cleanupJob(jobContext);
-    }
-
-    @Override
-    public void commitJob(JobContext jobContext) throws IOException {
-      delegate.commitJob(jobContext);
-    }
-
-    @Override
-    public void abortJob(JobContext jobContext, JobStatus.State state) throws IOException {
-      delegate.abortJob(jobContext, state);
-    }
-
-    @Override
-    public void setupTask(TaskAttemptContext taskAttemptContext) throws IOException {
-      delegate.setupTask(taskAttemptContext);
-    }
-
-    @Override
-    public boolean needsTaskCommit(TaskAttemptContext taskAttemptContext) throws IOException {
-      return delegate.needsTaskCommit(taskAttemptContext);
-    }
-
-    @Override
-    public void commitTask(TaskAttemptContext taskAttemptContext) throws IOException {
-      /* On commitTask there is some inconsistency across Hadoop implementations in where the output file is stored.
-         For some implementations it appears under the path returned by FileOutputCommitter#getCommittedTaskPath, and
-         for some it does not. Before the commit, the files are consistently present under the path returned by
-         FileOutputCommitter#getTaskAttemptPath. Hence, find the output file under the task attempt path and attach
-         the metadata before the commit happens; after the commit, the file will have been moved out of that path. */
-      try {
-        updateMetricMetaData(taskAttemptContext);
-      } catch (Exception exception) {
-        LOG.warn("Unable to record metric for task. Metric emitted for the number of affected rows may be incorrect.",
-                 exception);
-      }
-
-      delegate.commitTask(taskAttemptContext);
-    }
-
-    private void updateMetricMetaData(TaskAttemptContext taskAttemptContext) throws IOException {
-      if (!(delegate instanceof FileOutputCommitter)) {
-        return;
-      }
-
-      FileOutputCommitter fileOutputCommitter = (FileOutputCommitter) delegate;
-      Configuration configuration = taskAttemptContext.getConfiguration();
-      // The task is not yet committed, so the output should still be available in the attempt path.
-      Path taskAttemptPath = fileOutputCommitter.getTaskAttemptPath(taskAttemptContext);
-      if (configuration == null || taskAttemptPath == null) {
-        return;
-      }
-
-      // Read the count from the configuration.
-      String keyInConfig = String.format(RECORD_COUNT_FORMAT, taskAttemptContext.getTaskAttemptID());
-      Map<String, String> metaData = new HashMap<>();
-      metaData.put(GCSBatchSink.RECORD_COUNT, String.valueOf(configuration.getLong(keyInConfig, 0L)));
-      StorageClient storageClient = getStorageClient(configuration);
-      // Update metadata on the output file present in the directory for this task.
-      Blob blob = storageClient.pickABlob(taskAttemptPath.toString());
-      if (blob == null) {
-        LOG.info("Could not find a file in path {} to apply count metadata.", taskAttemptPath);
-        return;
-      }
-      blob.toBuilder().setContentType(configuration.get(GCSBatchSink.CONTENT_TYPE)).setMetadata(metaData).build()
-        .update();
-    }
-
-    @VisibleForTesting
-    StorageClient getStorageClient(Configuration configuration) throws IOException {
-      String project = configuration.get(GCPUtils.FS_GS_PROJECT_ID);
-      String serviceAccount = null;
-      boolean isServiceAccountFile = GCPUtils.SERVICE_ACCOUNT_TYPE_FILE_PATH
-        .equals(configuration.get(GCPUtils.SERVICE_ACCOUNT_TYPE));
-      if (isServiceAccountFile) {
-        serviceAccount = configuration.get(GCPUtils.CLOUD_JSON_KEYFILE, null);
-      } else {
-        serviceAccount = configuration.get(String.format("%s.%s", GCPUtils.CLOUD_JSON_KEYFILE_PREFIX,
-                                                         GCPUtils.CLOUD_ACCOUNT_JSON_SUFFIX));
-      }
-      return StorageClient.create(project, serviceAccount, isServiceAccountFile);
-    }
-
-    @Override
-    public void abortTask(TaskAttemptContext taskAttemptContext) throws IOException {
-      delegate.abortTask(taskAttemptContext);
-    }
-
-    @Override
-    public boolean isCommitJobRepeatable(JobContext jobContext) throws IOException {
-      return delegate.isCommitJobRepeatable(jobContext);
-    }
-
-    @Override
-    public boolean isRecoverySupported(JobContext jobContext) throws IOException {
-      return delegate.isRecoverySupported(jobContext);
-    }
-
-    @Override
-    public boolean isRecoverySupported() {
-      return delegate.isRecoverySupported();
-    }
-
-    @Override
-    public void recoverTask(TaskAttemptContext taskContext) throws IOException {
-      delegate.recoverTask(taskContext);
-    }
-  }
-
   /**
    * RecordWriter for GCSSink
    */
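
For reference on what the removed commitTask path produced: each part file uploaded by the sink carried its record count in the object's user metadata alongside the configured content type. A reader could verify that with the google-cloud-storage client; a minimal sketch, where the bucket and object names are illustrative and the "recordcount" key is an assumption standing in for the value of GCSBatchSink.RECORD_COUNT:

    import com.google.cloud.storage.Blob;
    import com.google.cloud.storage.BlobId;
    import com.google.cloud.storage.Storage;
    import com.google.cloud.storage.StorageOptions;

    public class RecordCountMetadataCheck {
      public static void main(String[] args) {
        // Uses Application Default Credentials; bucket and object names are illustrative.
        Storage storage = StorageOptions.getDefaultInstance().getService();
        Blob blob = storage.get(BlobId.of("my-bucket", "output/part-r-00000"));
        if (blob != null && blob.getMetadata() != null) {
          // "recordcount" is an assumed key; the committer wrote GCSBatchSink.RECORD_COUNT.
          System.out.println("records: " + blob.getMetadata().get("recordcount"));
          System.out.println("content type: " + blob.getContentType());
        }
      }
    }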
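The @@ hunk header above names getOutputCommitter, whose closing braces open that hunk; before this change, that method presumably handed the delegate's committer to the removed GCSOutputCommitter wrapper. A hedged sketch of that wiring, reusing the delegate class-name key kept above; the surrounding class here is a stand-in for illustration, not the plugin's actual output format:

    import java.io.IOException;
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.mapreduce.JobContext;
    import org.apache.hadoop.mapreduce.OutputCommitter;
    import org.apache.hadoop.mapreduce.OutputFormat;
    import org.apache.hadoop.mapreduce.RecordWriter;
    import org.apache.hadoop.mapreduce.TaskAttemptContext;
    import org.apache.hadoop.util.ReflectionUtils;

    /** Stand-in showing the delegate-and-wrap pattern; not the plugin's actual class. */
    public class DelegatingGCSOutputFormat<K, V> extends OutputFormat<K, V> {
      static final String DELEGATE_KEY = "gcssink.delegate.outputformat.classname";

      @SuppressWarnings("unchecked")
      private OutputFormat<K, V> createDelegate(Configuration conf) throws IOException {
        String className = conf.get(DELEGATE_KEY);
        try {
          // Instantiate the configured delegate output format reflectively.
          return (OutputFormat<K, V>) ReflectionUtils.newInstance(conf.getClassByName(className), conf);
        } catch (ClassNotFoundException e) {
          throw new IOException("Unable to load delegate output format " + className, e);
        }
      }

      @Override
      public RecordWriter<K, V> getRecordWriter(TaskAttemptContext context)
        throws IOException, InterruptedException {
        return createDelegate(context.getConfiguration()).getRecordWriter(context);
      }

      @Override
      public void checkOutputSpecs(JobContext context) throws IOException, InterruptedException {
        createDelegate(context.getConfiguration()).checkOutputSpecs(context);
      }

      @Override
      public OutputCommitter getOutputCommitter(TaskAttemptContext context)
        throws IOException, InterruptedException {
        // Before this change, the committer would presumably have been wrapped here:
        //   return new GCSOutputCommitter(delegate.getOutputCommitter(context));
        return createDelegate(context.getConfiguration()).getOutputCommitter(context);
      }
    }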