Skip to content

Commit 2c76206

Browse files
authored
BAEL-9258: Concatenate Two Data Frames With the Same Column Name (#18851)
1 parent 7f9e74a commit 2c76206

File tree

2 files changed

+172
-0
lines changed

2 files changed

+172
-0
lines changed
Lines changed: 89 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,89 @@
1+
package com.baeldung.spark.dataframeconcat;
2+
3+
import org.apache.spark.sql.Dataset;
4+
import org.apache.spark.sql.Row;
5+
import org.apache.spark.sql.SparkSession;
6+
import org.slf4j.Logger;
7+
import org.slf4j.LoggerFactory;
8+
9+
import java.util.Arrays;
10+
import java.util.List;
11+
12+
public class ConcatRowsExample {
13+
14+
private static final Logger logger = LoggerFactory.getLogger(ConcatRowsExample.class);
15+
16+
public static void main(String[] args) {
17+
SparkSession spark = SparkSession.builder()
18+
.appName("Row-wise Concatenation Example")
19+
.master("local[*]")
20+
.getOrCreate();
21+
22+
try {
23+
// Create sample data
24+
List<Person> data1 = Arrays.asList(
25+
new Person(1, "Alice"),
26+
new Person(2, "Bob")
27+
);
28+
29+
List<Person> data2 = Arrays.asList(
30+
new Person(3, "Charlie"),
31+
new Person(4, "Diana")
32+
);
33+
34+
Dataset<Row> df1 = spark.createDataFrame(data1, Person.class);
35+
Dataset<Row> df2 = spark.createDataFrame(data2, Person.class);
36+
37+
logger.info("First DataFrame:");
38+
df1.show();
39+
40+
logger.info("Second DataFrame:");
41+
df2.show();
42+
43+
// Row-wise concatenation using reusable method
44+
Dataset<Row> combined = concatenateDataFrames(df1, df2);
45+
46+
logger.info("After row-wise concatenation:");
47+
combined.show();
48+
} finally {
49+
spark.stop();
50+
}
51+
}
52+
53+
/**
54+
* Concatenates two DataFrames row-wise using unionByName.
55+
* This method is extracted for reusability and testing.
56+
*/
57+
public static Dataset<Row> concatenateDataFrames(Dataset<Row> df1, Dataset<Row> df2) {
58+
return df1.unionByName(df2);
59+
}
60+
61+
public static class Person implements java.io.Serializable {
62+
private int id;
63+
private String name;
64+
65+
public Person() {
66+
}
67+
68+
public Person(int id, String name) {
69+
this.id = id;
70+
this.name = name;
71+
}
72+
73+
public int getId() {
74+
return id;
75+
}
76+
77+
public void setId(int id) {
78+
this.id = id;
79+
}
80+
81+
public String getName() {
82+
return name;
83+
}
84+
85+
public void setName(String name) {
86+
this.name = name;
87+
}
88+
}
89+
}
Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,83 @@
1+
package com.baeldung.spark.dataframeconcat;
2+
3+
import org.apache.spark.sql.Dataset;
4+
import org.apache.spark.sql.Row;
5+
import org.apache.spark.sql.SparkSession;
6+
import org.junit.jupiter.api.*;
7+
8+
import java.util.Arrays;
9+
10+
import static org.junit.jupiter.api.Assertions.*;
11+
12+
class ConcatRowsExampleUnitTest {
13+
14+
private static SparkSession spark;
15+
private Dataset<Row> df1;
16+
private Dataset<Row> df2;
17+
18+
@BeforeAll
19+
static void setupClass() {
20+
spark = SparkSession.builder()
21+
.appName("Row-wise Concatenation Test")
22+
.master("local[*]")
23+
.getOrCreate();
24+
}
25+
26+
@BeforeEach
27+
void setup() {
28+
df1 = spark.createDataFrame(
29+
Arrays.asList(
30+
new ConcatRowsExample.Person(1, "Alice"),
31+
new ConcatRowsExample.Person(2, "Bob")
32+
),
33+
ConcatRowsExample.Person.class
34+
);
35+
36+
df2 = spark.createDataFrame(
37+
Arrays.asList(
38+
new ConcatRowsExample.Person(3, "Charlie"),
39+
new ConcatRowsExample.Person(4, "Diana")
40+
),
41+
ConcatRowsExample.Person.class
42+
);
43+
}
44+
45+
@AfterAll
46+
static void tearDownClass() {
47+
spark.stop();
48+
}
49+
50+
@Test
51+
void givenTwoDataFrames_whenConcatenated_thenRowCountMatches() {
52+
Dataset<Row> combined = ConcatRowsExample.concatenateDataFrames(df1, df2);
53+
54+
assertEquals(
55+
4,
56+
combined.count(),
57+
"The combined DataFrame should have 4 rows"
58+
);
59+
}
60+
61+
@Test
62+
void givenTwoDataFrames_whenConcatenated_thenSchemaRemainsSame() {
63+
Dataset<Row> combined = ConcatRowsExample.concatenateDataFrames(df1, df2);
64+
65+
assertEquals(
66+
df1.schema(),
67+
combined.schema(),
68+
"Schema should remain consistent after concatenation"
69+
);
70+
}
71+
72+
@Test
73+
void givenTwoDataFrames_whenConcatenated_thenDataContainsExpectedName() {
74+
Dataset<Row> combined = ConcatRowsExample.concatenateDataFrames(df1, df2);
75+
76+
assertTrue(
77+
combined
78+
.filter("name = 'Charlie'")
79+
.count() > 0,
80+
"Combined DataFrame should contain Charlie"
81+
);
82+
}
83+
}

0 commit comments

Comments
 (0)