Skip to content

Commit 7626ed2

Browse files
committed
Update API docs
1 parent 948ca8c commit 7626ed2

File tree

3 files changed

+181
-19
lines changed

3 files changed

+181
-19
lines changed

README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -112,6 +112,7 @@ The Spark Connect C++ client is **not a replacement** for Python or Scala Spark
112112
| Analytics | Window Functions || Planned |
113113
| Catalog | Table/Database Management || Planned |
114114
| Streaming | Structured Streaming || Not Implemented |
115+
| GraphFrames | Graph processing & analytics || Implemented |
115116

116117
---
117118

docs/API_REFERENCE.md

Lines changed: 161 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1199,4 +1199,165 @@ df.show();
11991199
| Grant | Martin | 72 | grrm@cmpny.com | Grant Martin | adult |
12001200
| Hannah | Abbott | 18 | h.abbott@hogwarts... | Hannah Abbott | minor |
12011201
+----------------------+----------------------+----------------------+----------------------+----------------------+----------------------+
1202+
```
1203+
1204+
## GraphFrames
1205+
1206+
### Page Rank
1207+
1208+
```cpp
1209+
DataFrame *vertices = nullptr;
1210+
DataFrame *edges = nullptr;
1211+
1212+
vertices = spark->sql(R"(
1213+
SELECT CAST(id AS INT) AS id, name, age FROM VALUES
1214+
(1, 'Alice', 34),
1215+
(2, 'Bob', 36),
1216+
(3, 'Charlie', 30),
1217+
(4, 'Anne', 29)
1218+
AS people(id, name, age)
1219+
)");
1220+
1221+
edges = spark->sql(R"(
1222+
SELECT CAST(src AS INT) AS src, CAST(dst AS INT) AS dst, relationship FROM VALUES
1223+
(1, 2, 'friend'),
1224+
(2, 3, 'follow'),
1225+
(3, 1, 'friend'),
1226+
(1, 4, 'colleague')
1227+
AS connections(src, dst, relationship)
1228+
)");
1229+
1230+
auto gf = GraphFrame(*vertices, *edges);
1231+
1232+
auto rows = gf().pageRank(0.15, 5).collect();
1233+
gf().pageRank(0.15, 5).show();
1234+
```
1235+
1236+
### Motif Matching
1237+
1238+
```cpp
1239+
auto gf = GraphFrame(*vertices, *edges);
1240+
1241+
gf().find("(a)-[e]->(b)");
1242+
gf().find("(a)-[e1]->(b); (b)-[e2]->(c); (c)-[e3]->(a)");
1243+
gf().find("(a)-[e1]->(b); (b)-[e2]->(c)");
1244+
gf().find("(a)-[e]->(b)");
1245+
gf().find("(a)-[e]->(b)").show();
1246+
```
1247+
1248+
### Triplets
1249+
1250+
```cpp
1251+
auto gf = GraphFrame(*vertices, *edges);
1252+
gf().triplets();
1253+
gf().triplets().show();
1254+
```
1255+
1256+
### Filter Edges
1257+
1258+
```cpp
1259+
auto gf = GraphFrame(*vertices, *edges);
1260+
gf().filterEdges("relationship = 'friend'");
1261+
gf().filterEdges(col("relationship") == lit("friend"));
1262+
gf().filterEdges("relationship = 'enemy'");
1263+
gf().filterEdges("relationship = 'friend'").show();
1264+
```
1265+
1266+
### Filter Vertices
1267+
1268+
```cpp
1269+
auto gf = GraphFrame(*vertices, *edges);
1270+
gf().filterVertices("age < 34");
1271+
gf().filterVertices(col("age") < lit(34));
1272+
gf().filterVertices("age > 100");
1273+
gf().filterVertices("age < 34").show();
1274+
```
1275+
1276+
### Drop Isolated Vertices
1277+
1278+
```cpp
1279+
auto gf = GraphFrame(*vertices, *edges);
1280+
gf().dropIsolatedVertices();
1281+
1282+
auto v_with_isolated = spark->sql(R"(
1283+
SELECT * FROM VALUES
1284+
(1, 'Alice', 34),
1285+
(2, 'Bob', 36),
1286+
(3, 'Charlie', 30),
1287+
(4, 'Anne', 29),
1288+
(99, 'Ghost', 99)
1289+
AS people(id, name, age)
1290+
)");
1291+
1292+
GraphFrame(v_with_isolated, *edges).dropIsolatedVertices().show();
1293+
```
1294+
1295+
### Breadth First Search
1296+
1297+
```cpp
1298+
auto gf = GraphFrame(*vertices, *edges);
1299+
gf().bfs("id = 1", "id = 3");
1300+
gf().bfs("id = 4", "id = 1");
1301+
gf().bfs("id = 1", "id = 2", "relationship = 'friend'");
1302+
gf().bfs(col("id") == lit(1), col("id") == lit(3));
1303+
gf().bfs("id = 1", "id = 3").show();
1304+
```
1305+
1306+
### Connected Components
1307+
1308+
```cpp
1309+
auto gf = GraphFrame(*vertices, *edges);
1310+
gf().connectedComponents();
1311+
gf().connectedComponents().show();
1312+
```
1313+
1314+
### Strongly Connected Components
1315+
1316+
```cpp
1317+
auto gf = GraphFrame(*vertices, *edges);
1318+
gf().stronglyConnectedComponents(10);
1319+
gf().stronglyConnectedComponents();
1320+
gf().stronglyConnectedComponents().show();
1321+
```
1322+
1323+
### Shortest Paths
1324+
1325+
```cpp
1326+
auto gf = GraphFrame(*vertices, *edges);
1327+
gf().shortestPaths(std::vector<int32_t>{1, 3});
1328+
gf().shortestPaths(std::vector<int32_t>{1});
1329+
gf().shortestPaths(std::vector<int32_t>{1}).show();
1330+
```
1331+
1332+
### Triangle Count
1333+
1334+
```cpp
1335+
auto gf = GraphFrame(*vertices, *edges);
1336+
1337+
gf().triangleCount();
1338+
gf().triangleCount().show();
1339+
1340+
auto rows = gf().triangleCount().collect();
1341+
1342+
std::map<int32_t, int64_t> counts;
1343+
for (auto &row : rows)
1344+
counts[row.get<int32_t>("id")] = row.get<int64_t>("count");
1345+
```
1346+
1347+
### Label Propagation
1348+
1349+
```cpp
1350+
auto gf = GraphFrame(*vertices, *edges);
1351+
gf().labelPropagation(5);
1352+
```
1353+
1354+
### Method Chaining (GraphFrames)
1355+
1356+
```cpp
1357+
// Feed GraphFrames results into plain DataFrame operations
1358+
auto result = gf()
1359+
.find("(a)-[e]->(b)")
1360+
.filter("e.relationship = 'friend'");
1361+
1362+
auto ranked = gf().pageRank(0.15, 5).filter("pagerank > 0.0");
12021363
```

tests/graphframes.cpp

Lines changed: 19 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -420,7 +420,25 @@ TEST_F(SparkIntegrationTest, TriangleCountShow)
420420
}
421421

422422
// --------------------------------------------------------------------------
423-
// Chaining - GraphFrames results into plain DataFrame ops
423+
// labelPropagation() — omitted
424+
//
425+
// Important:
426+
// Requires increased Spark executor memory in some cases.
427+
// Especially when running alongside other long-running algorithms
428+
// (ConnectedComponents, SCC, TriangleCount)
429+
// --------------------------------------------------------------------------
430+
TEST_F(SparkIntegrationTest, LabelPropagationReturnsOneRowPerVertex)
431+
{
432+
EXPECT_EQ(gfCount(gf().labelPropagation(5)), 4);
433+
}
434+
435+
TEST_F(SparkIntegrationTest, LabelPropagationHasLabelColumn)
436+
{
437+
EXPECT_THAT(gfColumns(gf().labelPropagation(5)), Contains("label"));
438+
}
439+
440+
// --------------------------------------------------------------------------
441+
// Chaining - GraphFrames result into plain DataFrame ops
424442
// --------------------------------------------------------------------------
425443
TEST_F(SparkIntegrationTest, FindThenFilter)
426444
{
@@ -441,22 +459,4 @@ TEST_F(SparkIntegrationTest, PageRankOnSubgraph)
441459
auto sub_v = vertices->filter("age >= 30");
442460
auto sub_e = edges->filter("src IN (1,2,3) AND dst IN (1,2,3)");
443461
EXPECT_EQ(gfCount(GraphFrame(sub_v, sub_e).pageRank(0.15, 3)), 3);
444-
}
445-
446-
// --------------------------------------------------------------------------
447-
// labelPropagation() — omitted
448-
//
449-
// Important:
450-
// Requires increased Spark executor memory in some cases.
451-
// Especially when running alongside other long-running algorithms
452-
// (ConnectedComponents, SCC, TriangleCount)
453-
// --------------------------------------------------------------------------
454-
TEST_F(SparkIntegrationTest, LabelPropagationReturnsOneRowPerVertex)
455-
{
456-
EXPECT_EQ(gfCount(gf().labelPropagation(5)), 4);
457-
}
458-
459-
TEST_F(SparkIntegrationTest, LabelPropagationHasLabelColumn)
460-
{
461-
EXPECT_THAT(gfColumns(gf().labelPropagation(5)), Contains("label"));
462462
}

0 commit comments

Comments
 (0)