Skip to content

Commit fa1337d

Browse files
committed
Introduction to RDD, Dataset and Dataframe
0 parents  commit fa1337d

File tree

13 files changed

+537
-0
lines changed

13 files changed

+537
-0
lines changed

.gitignore

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
logs
2+
target
3+
/.idea
4+
/.idea_modules
5+
/.classpath
6+
/.project
7+
/.settings
8+
/RUNNING_PID
9+
*.log

README.md

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
# learning-spark
2+
3+
##### Run dataset operation
4+
```shell
5+
sbt "runMain com.techmonad.learn.DataSetOps"
6+
```
7+
8+
##### Run dataframe operation
9+
```shell
10+
sbt "runMain com.techmonad.learn.DataFrameOps"
11+
```
12+
13+
14+
##### Run RDD operation
15+
```shell
16+
sbt "runMain com.techmonad.learn.RDDOps"
17+
```

build.sbt

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
name := "learning-spark"
2+
3+
version := "0.1"
4+
5+
scalaVersion := "2.12.10"
6+
7+
8+
libraryDependencies ++= Seq(
9+
"org.apache.spark" %% "spark-sql" % "3.1.2"
10+
)

data/tweets.csv

Lines changed: 219 additions & 0 deletions
Large diffs are not rendered by default.

data/user-details.csv

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
id,employer,location,salary
2+
1, Google, SFO,2000
3+
2, Yahoo, LA,5000
4+
3, Microsoft, GA,10000
5+
10, Apple, CA,9000
6+
1, Google, SFO,120000

data/users.csv

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
id,name,email
2+
3+
4+
5+

data/words.txt

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
bob rob rob
2+
bob joy jai
3+
rob
4+
bob
5+
joy
6+
jai

project/build.properties

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
sbt.version = 1.3.8
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
PATTERN =[%-5p]-[%d]-[%c]-[%m]%n
2+
3+
log4j.rootLogger=WARN, consoleAppender, fileAppender
4+
5+
log4j.appender.consoleAppender=org.apache.log4j.ConsoleAppender
6+
log4j.appender.consoleAppender.layout=org.apache.log4j.PatternLayout
7+
log4j.appender.consoleAppender.layout.ConversionPattern=${PATTERN}
8+
9+
log4j.appender.fileAppender=org.apache.log4j.RollingFileAppender
10+
log4j.appender.fileAppender.layout=org.apache.log4j.PatternLayout
11+
log4j.appender.fileAppender.layout.ConversionPattern=${PATTERN}
12+
log4j.appender.fileAppender.File=sparkApp.log
Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
package com.techmonad.learn
2+
3+
import org.apache.spark.sql.DataFrame
4+
import org.apache.spark.sql.functions._
5+
6+
object DataFrameOps extends SparkSessionProvider {
7+
8+
def main(args: Array[String]): Unit = {
9+
10+
val df: DataFrame =
11+
spark
12+
.read
13+
.text("data/words.txt")
14+
println("###############Word Count################")
15+
val wordCounts =
16+
df
17+
.withColumn("words", split(col("value"), "\\s+"))
18+
.withColumn("word", explode(col("words")))
19+
.select("word")
20+
.groupBy("word")
21+
.agg(count("word").as("count"))
22+
wordCounts.show()
23+
24+
// Joins
25+
val users =
26+
spark
27+
.read
28+
.option("delimiter", ",")
29+
.option("header", "true")
30+
.csv("data/users.csv")
31+
users.show()
32+
33+
val userDetails =
34+
spark
35+
.read
36+
.option("header", "true")
37+
.option("delimiter", ",")
38+
.csv("data/user-details.csv")
39+
40+
userDetails.show()
41+
42+
println("#############Inner Join###################")
43+
val innerJoin: DataFrame = users.join(userDetails, Seq("id"), "inner")
44+
innerJoin.show()
45+
46+
println("#############Left Join###################")
47+
val leftJoin: DataFrame = users.join(userDetails, Seq("id"), "left")
48+
leftJoin.show()
49+
50+
println("#############Right Join###################")
51+
val rightJoin: DataFrame = users.join(userDetails, Seq("id"), "right")
52+
rightJoin.show()
53+
54+
55+
println("#############full Join###################")
56+
val fullJoin: DataFrame = users.join(userDetails, Seq("id"), "full")
57+
fullJoin.show()
58+
59+
spark.stop()
60+
}
61+
62+
}

0 commit comments

Comments
 (0)