File tree Expand file tree Collapse file tree 3 files changed +74
-0
lines changed
src/main/scala/com/techmonad/learn Expand file tree Collapse file tree 3 files changed +74
-0
lines changed Original file line number Diff line number Diff line change 1515``` shell script
1616 sbt " runMain com.techmonad.learn.RDDOps"
1717```
18+ #### [ Run Structured Streaming operations] ( https://github.com/techmonad/learning-spark/blob/master/src/main/scala/com/techmonad/learn/StructuredStreamingOps.scala )
19+ ``` shell script
20+ // start netcat server and paste some text
21+ nc -lk 9999
22+
23+ //Now start streaming app
24+ sbt "runMain com.techmonad.learn.StructuredStreamingOps"
25+ ```
Original file line number Diff line number Diff line change 11package com .techmonad .learn
22
3+ import org .apache .spark .broadcast .Broadcast
34import org .apache .spark .rdd .RDD
5+ import org .apache .spark .util .LongAccumulator
46
57object RDDOps extends SparkSessionProvider {
68
@@ -19,7 +21,9 @@ object RDDOps extends SparkSessionProvider {
1921 .split(" \\ s+" )
2022 .filter { word => word.length > 0 }
2123 }
24+ // .countByValue() //OR
2225 .map { word => (word, 1 ) }
26+ // .countByKey() // OR
2327 .reduceByKey { case (count1, count2) => count1 + count2 }
2428
2529 wordCounts.collect.foreach(println)
@@ -67,6 +71,20 @@ object RDDOps extends SparkSessionProvider {
6771 val userDetailsRight : RDD [(Int , (Option [User ], Detail ))] = userWithId.rightOuterJoin(detailWithId)
6872 userDetailsRight.collect.foreach(println)
6973
74+ // Accumulator
75+ val acc : LongAccumulator = sc.longAccumulator(" acc" )
76+ // change the value
77+ acc.add(2 )
78+
79+ println(acc.value)
80+
81+ // broadcast the id = 1212 on all the machine in the cluster
82+ val bcId : Broadcast [Int ] = sc.broadcast(1212 )
83+
84+ // get Id on any worker nodes
85+ val id : Int = bcId.value
86+ println(id)
87+
7088 spark.stop()
7189 }
7290
package com.techmonad.learn

import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions._
import org.apache.spark.sql.streaming.StreamingQuery

/**
 * Word-count over a text stream read from a local socket, using Spark
 * Structured Streaming.
 *
 * Run a netcat server first (`nc -lk 9999`), then start this app with
 * `sbt "runMain com.techmonad.learn.StructuredStreamingOps"` and type
 * lines into the netcat session; running word counts are printed to the
 * console on every trigger.
 *
 * NOTE(review): `spark` is assumed to come from `SparkSessionProvider`
 * (declared elsewhere in this project) — confirm it supplies an active
 * `SparkSession`.
 */
object StructuredStreamingOps extends SparkSessionProvider {

  def main(args: Array[String]): Unit = {
    import spark.implicits._

    // Source: each line received on localhost:9999 becomes one row with a
    // single string column named "value".
    val streamDF: DataFrame =
      spark
        .readStream
        .format("socket")
        .option("host", "localhost")
        .option("port", 9999)
        .load()

    // Split each line on runs of whitespace, flatten to one word per row,
    // then count occurrences per word across the whole stream.
    val words: DataFrame =
      streamDF
        .withColumn("words", split($"value", "\\s+"))
        .withColumn("word", explode($"words"))
        .select("word")
        .groupBy("word")
        .count()

    // "complete" output mode re-emits the entire aggregated counts table on
    // every trigger (required for streaming aggregations without watermarks).
    val query: StreamingQuery =
      words
        .writeStream
        .outputMode("complete")
        .format("console")
        .start()

    // Block the driver until the query is stopped or fails.
    query.awaitTermination()

    /* Alternative sink: handle each micro-batch as a plain DataFrame.
    words
      .writeStream
      .foreachBatch { (df, batchNo) =>
        df.show()
      }
      .start()
      .awaitTermination()
    */
  }

}
You can’t perform that action at this time.
0 commit comments