This repository was archived by the owner on Nov 11, 2022. It is now read-only.

Commit 5c096ca

jasonkuster authored and dhalperi committed
Upgrade to Apache Beam, version 2.0.0 (#571)
* Upgrade to Apache Beam, version 2.0.0
  Signed-off-by: Jason Kuster <[email protected]>

* Revert enforcer plugin changes since they seem to be beam-only.
  Signed-off-by: Jason Kuster <[email protected]>

* Pull request comments.
  Signed-off-by: Jason Kuster <[email protected]>
1 parent bba951d commit 5c096ca


42 files changed (+883 −637 lines)

maven-archetypes/examples-java8/pom.xml

Lines changed: 1 addition & 0 deletions
@@ -74,6 +74,7 @@
       </plugins>
     </pluginManagement>
   </build>
+
   <!-- Dependency section from Beam omitted. Used in Beam for parallelizing build to ensure
        relevant sections are built, but that is not being done here. -->
 </project>

maven-archetypes/examples-java8/src/main/resources/archetype-resources/pom.xml

Lines changed: 11 additions & 3 deletions
@@ -27,6 +27,7 @@

   <properties>
     <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+    <surefire-plugin.version>2.20</surefire-plugin.version>
   </properties>

   <repositories>
@@ -58,7 +59,7 @@
       <plugin>
         <groupId>org.apache.maven.plugins</groupId>
         <artifactId>maven-surefire-plugin</artifactId>
-        <version>2.19.1</version>
+        <version>${surefire-plugin.version}</version>
         <configuration>
           <parallel>all</parallel>
           <threadCount>4</threadCount>
@@ -68,19 +69,26 @@
           <dependency>
             <groupId>org.apache.maven.surefire</groupId>
             <artifactId>surefire-junit47</artifactId>
-            <version>2.19.1</version>
+            <version>${surefire-plugin.version}</version>
           </dependency>
         </dependencies>
       </plugin>

+      <!-- Ensure that the Maven jar plugin runs before the Maven
+           shade plugin by listing the plugin higher within the file. -->
+      <plugin>
+        <groupId>org.apache.maven.plugins</groupId>
+        <artifactId>maven-jar-plugin</artifactId>
+      </plugin>
+
       <!--
         Configures `mvn package` to produce a bundled jar ("fat jar") for runners
         that require this for job submission to a cluster.
       -->
       <plugin>
         <groupId>org.apache.maven.plugins</groupId>
         <artifactId>maven-shade-plugin</artifactId>
-        <version>2.4.1</version>
+        <version>3.0.0</version>
         <executions>
           <execution>
             <phase>package</phase>

maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/DebuggingWordCount.java

Lines changed: 11 additions & 13 deletions
@@ -22,14 +22,14 @@
 import java.util.regex.Pattern;
 import org.apache.beam.sdk.Pipeline;
 import org.apache.beam.sdk.io.TextIO;
+import org.apache.beam.sdk.metrics.Counter;
+import org.apache.beam.sdk.metrics.Metrics;
 import org.apache.beam.sdk.options.Default;
 import org.apache.beam.sdk.options.Description;
 import org.apache.beam.sdk.options.PipelineOptionsFactory;
 import org.apache.beam.sdk.testing.PAssert;
-import org.apache.beam.sdk.transforms.Aggregator;
 import org.apache.beam.sdk.transforms.DoFn;
 import org.apache.beam.sdk.transforms.ParDo;
-import org.apache.beam.sdk.transforms.Sum;
 import org.apache.beam.sdk.values.KV;
 import org.apache.beam.sdk.values.PCollection;
 import org.slf4j.Logger;
@@ -51,7 +51,7 @@
  * <p>New Concepts:
  * <pre>
  *   1. Logging using SLF4J, even in a distributed environment
- *   2. Creating a custom aggregator (runners have varying levels of support)
+ *   2. Creating a custom metric (runners have varying levels of support)
  *   3. Testing your Pipeline via PAssert
  * </pre>
  *
@@ -90,29 +90,27 @@ public FilterTextFn(String pattern) {
     }

     /**
-     * Concept #2: A custom aggregator can track values in your pipeline as it runs. Each
-     * runner provides varying levels of support for aggregators, and may expose them
+     * Concept #2: A custom metric can track values in your pipeline as it runs. Each
+     * runner provides varying levels of support for metrics, and may expose them
      * in a dashboard, etc.
      */
-    private final Aggregator<Long, Long> matchedWords =
-        createAggregator("matchedWords", Sum.ofLongs());
-    private final Aggregator<Long, Long> unmatchedWords =
-        createAggregator("unmatchedWords", Sum.ofLongs());
+    private final Counter matchedWords = Metrics.counter(FilterTextFn.class, "matchedWords");
+    private final Counter unmatchedWords = Metrics.counter(FilterTextFn.class, "unmatchedWords");

     @ProcessElement
     public void processElement(ProcessContext c) {
       if (filter.matcher(c.element().getKey()).matches()) {
         // Log at the "DEBUG" level each element that we match. When executing this pipeline
         // these log lines will appear only if the log level is set to "DEBUG" or lower.
         LOG.debug("Matched: " + c.element().getKey());
-        matchedWords.addValue(1L);
+        matchedWords.inc();
         c.output(c.element());
       } else {
         // Log at the "TRACE" level each element that is not matched. Different log levels
         // can be used to control the verbosity of logging providing an effective mechanism
         // to filter less important information.
         LOG.trace("Did not match: " + c.element().getKey());
-        unmatchedWords.addValue(1L);
+        unmatchedWords.inc();
       }
     }
   }
@@ -138,7 +136,7 @@ public static void main(String[] args) {
     Pipeline p = Pipeline.create(options);

     PCollection<KV<String, Long>> filteredWords =
-        p.apply("ReadLines", TextIO.Read.from(options.getInputFile()))
+        p.apply("ReadLines", TextIO.read().from(options.getInputFile()))
         .apply(new WordCount.CountWords())
         .apply(ParDo.of(new FilterTextFn(options.getFilterPattern())));

@@ -151,7 +149,7 @@ public static void main(String[] args) {
      * <p>Below we verify that the set of filtered words matches our expected counts. Note
      * that PAssert does not provide any output and that successful completion of the
      * Pipeline implies that the expectations were met. Learn more at
-     * https://cloud.google.com/dataflow/pipelines/testing-your-pipeline on how to test
+     * https://beam.apache.org/documentation/pipelines/test-your-pipeline/ on how to test
      * your Pipeline and see {@link DebuggingWordCountTest} for an example unit test.
      */
     List<KV<String, Long>> expectedResults = Arrays.asList(
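
Side note on the migration above: in Beam 2.0.0 the Aggregator API is gone, and counters are declared with Metrics.counter(namespace, name) and bumped with inc(). A minimal standalone sketch of the same pattern (the DoFn and counter names here are illustrative, not part of this commit):

import org.apache.beam.sdk.metrics.Counter;
import org.apache.beam.sdk.metrics.Metrics;
import org.apache.beam.sdk.transforms.DoFn;

// Hypothetical DoFn showing the Beam 2.0.0 metrics pattern used in this diff.
class CountShortWordsFn extends DoFn<String, String> {
  // Metrics.counter(namespace, name) replaces createAggregator(name, Sum.ofLongs()).
  private final Counter shortWords = Metrics.counter(CountShortWordsFn.class, "shortWords");

  @ProcessElement
  public void processElement(ProcessContext c) {
    if (c.element().length() < 4) {
      shortWords.inc(); // replaces addValue(1L) on the old Aggregator
    }
    c.output(c.element());
  }
}

Unlike aggregators, metric values are queried from the PipelineResult after submission rather than read off the transform, which is why the fields no longer need to be tied to the DoFn for reporting.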

maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/MinimalWordCount.java

Lines changed: 4 additions & 3 deletions
@@ -17,6 +17,7 @@
  */
 package ${package};

+import ${package}.common.ExampleUtils;
 import org.apache.beam.sdk.Pipeline;
 import org.apache.beam.sdk.io.TextIO;
 import org.apache.beam.sdk.options.PipelineOptions;
@@ -74,7 +75,7 @@ public static void main(String[] args) {
     // the input text (a set of Shakespeare's texts).

     // This example reads a public data set consisting of the complete works of Shakespeare.
-    p.apply(TextIO.Read.from("gs://apache-beam-samples/shakespeare/*"))
+    p.apply(TextIO.read().from("gs://apache-beam-samples/shakespeare/*"))

     // Concept #2: Apply a ParDo transform to our PCollection of text lines. This ParDo invokes a
     // DoFn (defined in-line) on each element that tokenizes the text line into individual words.
@@ -83,7 +84,7 @@ public static void main(String[] args) {
     .apply("ExtractWords", ParDo.of(new DoFn<String, String>() {
       @ProcessElement
       public void processElement(ProcessContext c) {
-        for (String word : c.element().split("[^a-zA-Z']+")) {
+        for (String word : c.element().split(ExampleUtils.TOKENIZER_PATTERN)) {
           if (!word.isEmpty()) {
             c.output(word);
           }
@@ -110,7 +111,7 @@ public String apply(KV<String, Long> input) {
     // formatted strings) to a series of text files.
     //
     // By default, it will write to a set of files with names like wordcount-00001-of-00005
-    .apply(TextIO.Write.to("wordcounts"));
+    .apply(TextIO.write().to("wordcounts"));

     // Run the pipeline.
     p.run().waitUntilFinish();
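
All of the TextIO changes in this commit follow one mechanical rule: the static inner classes TextIO.Read and TextIO.Write become the factory methods TextIO.read() and TextIO.write(). A minimal sketch of the 2.0.0-style call shape (class name and paths are placeholders, not from the commit):

import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.io.TextIO;
import org.apache.beam.sdk.options.PipelineOptionsFactory;

public class TextIoSketch {
  public static void main(String[] args) {
    Pipeline p = Pipeline.create(PipelineOptionsFactory.fromArgs(args).create());

    // Beam 2.0.0 style: fluent factory methods instead of static inner classes.
    p.apply(TextIO.read().from("input.txt"))   // was: TextIO.Read.from(...)
     .apply(TextIO.write().to("wordcounts"));  // was: TextIO.Write.to(...)

    p.run().waitUntilFinish();
  }
}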

maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/MinimalWordCountJava8.java

Lines changed: 7 additions & 7 deletions
@@ -55,17 +55,17 @@ public static void main(String[] args) {

     Pipeline p = Pipeline.create(options);

-    p.apply(TextIO.Read.from("gs://apache-beam-samples/shakespeare/*"))
-     .apply(FlatMapElements.via((String word) -> Arrays.asList(word.split("[^a-zA-Z']+")))
-         .withOutputType(TypeDescriptors.strings()))
+    p.apply(TextIO.read().from("gs://apache-beam-samples/shakespeare/*"))
+     .apply(FlatMapElements
+         .into(TypeDescriptors.strings())
+         .via((String word) -> Arrays.asList(word.split("[^\\p{L}]+"))))
      .apply(Filter.by((String word) -> !word.isEmpty()))
      .apply(Count.<String>perElement())
      .apply(MapElements
-         .via((KV<String, Long> wordCount) -> wordCount.getKey() + ": " + wordCount.getValue())
-         .withOutputType(TypeDescriptors.strings()))
-
+         .into(TypeDescriptors.strings())
+         .via((KV<String, Long> wordCount) -> wordCount.getKey() + ": " + wordCount.getValue()))
      // CHANGE 3/3: The Google Cloud Storage path is required for outputting the results to.
-     .apply(TextIO.Write.to("gs://YOUR_OUTPUT_BUCKET/AND_OUTPUT_PREFIX"));
+     .apply(TextIO.write().to("gs://YOUR_OUTPUT_BUCKET/AND_OUTPUT_PREFIX"));

     p.run().waitUntilFinish();
   }
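
The lambda-based transforms change shape too: withOutputType(...) after via(...) becomes into(...) before via(...). A sketch of just that piece, assuming some existing PCollection<String> of lines (the helper class here is hypothetical):

import java.util.Arrays;
import org.apache.beam.sdk.transforms.FlatMapElements;
import org.apache.beam.sdk.values.PCollection;
import org.apache.beam.sdk.values.TypeDescriptors;

class TokenizeSketch {
  // Hypothetical helper; 'lines' stands in for any PCollection<String> in a real pipeline.
  static PCollection<String> tokenize(PCollection<String> lines) {
    return lines.apply(
        FlatMapElements
            .into(TypeDescriptors.strings())  // declare the output type first...
            .via((String line) -> Arrays.asList(line.split("[^\\p{L}]+"))));  // ...then the lambda
  }
}

Declaring the output type up front lets the SDK infer a coder for the lambda's results, which is the motivation for the reordering.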

maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/WindowedWordCount.java

Lines changed: 12 additions & 31 deletions
@@ -21,7 +21,7 @@
 import java.util.concurrent.ThreadLocalRandom;
 import ${package}.common.ExampleBigQueryTableOptions;
 import ${package}.common.ExampleOptions;
-import ${package}.common.WriteWindowedFilesDoFn;
+import ${package}.common.WriteOneFilePerWindow;
 import org.apache.beam.sdk.Pipeline;
 import org.apache.beam.sdk.PipelineResult;
 import org.apache.beam.sdk.io.TextIO;
@@ -31,11 +31,9 @@
 import org.apache.beam.sdk.options.PipelineOptions;
 import org.apache.beam.sdk.options.PipelineOptionsFactory;
 import org.apache.beam.sdk.transforms.DoFn;
-import org.apache.beam.sdk.transforms.GroupByKey;
+import org.apache.beam.sdk.transforms.MapElements;
 import org.apache.beam.sdk.transforms.ParDo;
-import org.apache.beam.sdk.transforms.windowing.BoundedWindow;
 import org.apache.beam.sdk.transforms.windowing.FixedWindows;
-import org.apache.beam.sdk.transforms.windowing.IntervalWindow;
 import org.apache.beam.sdk.transforms.windowing.Window;
 import org.apache.beam.sdk.values.KV;
 import org.apache.beam.sdk.values.PCollection;
@@ -53,7 +51,7 @@
  *
  * <p>Basic concepts, also in the MinimalWordCount, WordCount, and DebuggingWordCount examples:
  * Reading text files; counting a PCollection; writing to GCS; executing a Pipeline both locally
- * and using a selected runner; defining DoFns; creating a custom aggregator;
+ * and using a selected runner; defining DoFns;
  * user-defined PTransforms; defining PipelineOptions.
  *
  * <p>New Concepts:
@@ -163,12 +161,15 @@ public interface Options extends WordCount.WordCountOptions,
     @Default.InstanceFactory(DefaultToMinTimestampPlusOneHour.class)
     Long getMaxTimestampMillis();
     void setMaxTimestampMillis(Long value);
+
+    @Description("Fixed number of shards to produce per window, or null for runner-chosen sharding")
+    Integer getNumShards();
+    void setNumShards(Integer numShards);
   }

   public static void main(String[] args) throws IOException {
     Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
     final String output = options.getOutput();
-    final Duration windowSize = Duration.standardMinutes(options.getWindowSize());
     final Instant minTimestamp = new Instant(options.getMinTimestampMillis());
     final Instant maxTimestamp = new Instant(options.getMaxTimestampMillis());

@@ -180,7 +181,7 @@ public static void main(String[] args) throws IOException {
      */
     PCollection<String> input = pipeline
       /** Read from the GCS file. */
-      .apply(TextIO.Read.from(options.getInputFile()))
+      .apply(TextIO.read().from(options.getInputFile()))
       // Concept #2: Add an element timestamp, using an artificial time just to show windowing.
       // See AddTimestampFn for more detail on this.
       .apply(ParDo.of(new AddTimestampFn(minTimestamp, maxTimestamp)));
@@ -203,33 +204,13 @@ public static void main(String[] args) throws IOException {
     PCollection<KV<String, Long>> wordCounts = windowedWords.apply(new WordCount.CountWords());

     /**
-     * Concept #5: Customize the output format using windowing information
-     *
-     * <p>At this point, the data is organized by window. We're writing text files and have no
-     * late data, so for simplicity we can use the window as the key and {@link GroupByKey} to get
-     * one output file per window. (if we had late data this key would not be unique)
-     *
-     * <p>To access the window in a {@link DoFn}, add a {@link BoundedWindow} parameter. This will
-     * be automatically detected and populated with the window for the current element.
-     */
-    PCollection<KV<IntervalWindow, KV<String, Long>>> keyedByWindow =
-        wordCounts.apply(
-            ParDo.of(
-                new DoFn<KV<String, Long>, KV<IntervalWindow, KV<String, Long>>>() {
-                  @ProcessElement
-                  public void processElement(ProcessContext context, IntervalWindow window) {
-                    context.output(KV.of(window, context.element()));
-                  }
-                }));
-
-    /**
-     * Concept #6: Format the results and write to a sharded file partitioned by window, using a
+     * Concept #5: Format the results and write to a sharded file partitioned by window, using a
      * simple ParDo operation. Because there may be failures followed by retries, the
      * writes must be idempotent, but the details of writing to files is elided here.
      */
-    keyedByWindow
-        .apply(GroupByKey.<IntervalWindow, KV<String, Long>>create())
-        .apply(ParDo.of(new WriteWindowedFilesDoFn(output)));
+    wordCounts
+        .apply(MapElements.via(new WordCount.FormatAsTextFn()))
+        .apply(new WriteOneFilePerWindow(output, options.getNumShards()));

     PipelineResult result = pipeline.run();
     try {
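
The windowing step itself (upstream of this hunk) is unchanged by the commit; only the write path moves from the hand-rolled GroupByKey-per-window into the shared WriteOneFilePerWindow helper, whose implementation lives elsewhere in the examples. For context, the standard fixed-window assignment that feeds it looks like this (a sketch assuming a one-minute window size; the helper class is illustrative):

import org.apache.beam.sdk.transforms.windowing.FixedWindows;
import org.apache.beam.sdk.transforms.windowing.Window;
import org.apache.beam.sdk.values.PCollection;
import org.joda.time.Duration;

class WindowingSketch {
  // Assigns each element of 'input' to a fixed one-minute window; downstream
  // aggregations such as WordCount.CountWords then produce one result per window.
  static PCollection<String> windowed(PCollection<String> input) {
    return input.apply(Window.<String>into(FixedWindows.of(Duration.standardMinutes(1))));
  }
}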

maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/WordCount.java

Lines changed: 10 additions & 10 deletions
@@ -17,21 +17,22 @@
  */
 package ${package};

+import ${package}.common.ExampleUtils;
 import org.apache.beam.sdk.Pipeline;
 import org.apache.beam.sdk.io.TextIO;
+import org.apache.beam.sdk.metrics.Counter;
+import org.apache.beam.sdk.metrics.Metrics;
 import org.apache.beam.sdk.options.Default;
 import org.apache.beam.sdk.options.Description;
 import org.apache.beam.sdk.options.PipelineOptions;
 import org.apache.beam.sdk.options.PipelineOptionsFactory;
 import org.apache.beam.sdk.options.Validation.Required;
-import org.apache.beam.sdk.transforms.Aggregator;
 import org.apache.beam.sdk.transforms.Count;
 import org.apache.beam.sdk.transforms.DoFn;
 import org.apache.beam.sdk.transforms.MapElements;
 import org.apache.beam.sdk.transforms.PTransform;
 import org.apache.beam.sdk.transforms.ParDo;
 import org.apache.beam.sdk.transforms.SimpleFunction;
-import org.apache.beam.sdk.transforms.Sum;
 import org.apache.beam.sdk.values.KV;
 import org.apache.beam.sdk.values.PCollection;

@@ -44,8 +45,8 @@
  * pipeline, for introduction of additional concepts.
  *
  * <p>For a detailed walkthrough of this example, see
- * <a href="http://beam.apache.org/use/walkthroughs/">
- * http://beam.apache.org/use/walkthroughs/
+ * <a href="https://beam.apache.org/get-started/wordcount-example/">
+ * https://beam.apache.org/get-started/wordcount-example/
  * </a>
  *
  * <p>Basic concepts, also in the MinimalWordCount example:
@@ -86,17 +87,16 @@ public class WordCount {
    * to a ParDo in the pipeline.
    */
   static class ExtractWordsFn extends DoFn<String, String> {
-    private final Aggregator<Long, Long> emptyLines =
-        createAggregator("emptyLines", Sum.ofLongs());
+    private final Counter emptyLines = Metrics.counter(ExtractWordsFn.class, "emptyLines");

     @ProcessElement
     public void processElement(ProcessContext c) {
       if (c.element().trim().isEmpty()) {
-        emptyLines.addValue(1L);
+        emptyLines.inc();
       }

       // Split the line into words.
-      String[] words = c.element().split("[^a-zA-Z']+");
+      String[] words = c.element().split(ExampleUtils.TOKENIZER_PATTERN);

       // Output each word encountered into the output PCollection.
       for (String word : words) {
@@ -176,10 +176,10 @@ public static void main(String[] args) {

     // Concepts #2 and #3: Our pipeline applies the composite CountWords transform, and passes the
     // static FormatAsTextFn() to the ParDo transform.
-    p.apply("ReadLines", TextIO.Read.from(options.getInputFile()))
+    p.apply("ReadLines", TextIO.read().from(options.getInputFile()))
      .apply(new CountWords())
      .apply(MapElements.via(new FormatAsTextFn()))
-     .apply("WriteCounts", TextIO.Write.to(options.getOutput()));
+     .apply("WriteCounts", TextIO.write().to(options.getOutput()));

     p.run().waitUntilFinish();
   }
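
Centralizing the tokenizer in ExampleUtils.TOKENIZER_PATTERN also changes behavior: judging by the MinimalWordCountJava8 hunk above, the shared pattern is the Unicode-aware "[^\\p{L}]+" rather than the ASCII-only "[^a-zA-Z']+". A standalone comparison of the two (illustrative, not part of the commit):

import java.util.Arrays;

class TokenizerComparison {
  public static void main(String[] args) {
    String line = "café naïve";
    // The old ASCII pattern splits inside accented words:
    System.out.println(Arrays.toString(line.split("[^a-zA-Z']+"))); // [caf, na, ve]
    // The Unicode letter class \p{L} keeps them intact:
    System.out.println(Arrays.toString(line.split("[^\\p{L}]+")));  // [café, naïve]
  }
}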

maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/common/ExampleBigQueryTableOptions.java

Lines changed: 1 addition & 1 deletion
@@ -18,10 +18,10 @@
 package ${package}.common;

 import com.google.api.services.bigquery.model.TableSchema;
+import org.apache.beam.sdk.extensions.gcp.options.GcpOptions;
 import org.apache.beam.sdk.options.Default;
 import org.apache.beam.sdk.options.DefaultValueFactory;
 import org.apache.beam.sdk.options.Description;
-import org.apache.beam.sdk.options.GcpOptions;
 import org.apache.beam.sdk.options.PipelineOptions;

 /**

maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/common/ExamplePubsubTopicAndSubscriptionOptions.java

Lines changed: 1 addition & 1 deletion
@@ -17,10 +17,10 @@
  */
 package ${package}.common;

+import org.apache.beam.sdk.extensions.gcp.options.GcpOptions;
 import org.apache.beam.sdk.options.Default;
 import org.apache.beam.sdk.options.DefaultValueFactory;
 import org.apache.beam.sdk.options.Description;
-import org.apache.beam.sdk.options.GcpOptions;
 import org.apache.beam.sdk.options.PipelineOptions;

 /**

maven-archetypes/examples-java8/src/main/resources/archetype-resources/src/main/java/common/ExamplePubsubTopicOptions.java

Lines changed: 1 addition & 1 deletion
@@ -17,10 +17,10 @@
  */
 package ${package}.common;

+import org.apache.beam.sdk.extensions.gcp.options.GcpOptions;
 import org.apache.beam.sdk.options.Default;
 import org.apache.beam.sdk.options.DefaultValueFactory;
 import org.apache.beam.sdk.options.Description;
-import org.apache.beam.sdk.options.GcpOptions;
 import org.apache.beam.sdk.options.PipelineOptions;

 /**
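
All three options files pick up the same relocation: in Beam 2.0.0, GcpOptions moves from org.apache.beam.sdk.options to org.apache.beam.sdk.extensions.gcp.options. Usage is unchanged; only the import differs, as in this sketch (the class name is illustrative):

import org.apache.beam.sdk.extensions.gcp.options.GcpOptions;
import org.apache.beam.sdk.options.PipelineOptionsFactory;

class GcpOptionsSketch {
  public static void main(String[] args) {
    // Same interface as before the move; only the package changed.
    GcpOptions options = PipelineOptionsFactory.fromArgs(args).as(GcpOptions.class);
    System.out.println("GCP project: " + options.getProject());
  }
}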
