Skip to content

Commit 752af69

Browse files
Add PolBlogs Dataset (#105)
* add PolBlogs * add PolBlogs * add tests * add tests * updated MLDatasets.jl * add docstrings and tests * add docstrings * updated docs * updated docs * corrected index * corrected spelling * updated type * updated type * spelling corrected * updated docstring * updated docstring * resolved renaming conficts * updated docs * minor changes * resolved styling issues Co-authored-by: Carlo Lucibello <[email protected]> * renamed function * updated docstrings * updated type Co-authored-by: Carlo Lucibello <[email protected]>
1 parent e828d74 commit 752af69

File tree

5 files changed

+93
-1
lines changed

5 files changed

+93
-1
lines changed

docs/src/datasets/graphs.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,9 @@ CiteSeer.dataset
1010
Cora
1111
Cora.dataset
1212
OGBDataset
13+
PolBlogs
14+
PolBlogs.edge_index
15+
PolBlogs.labels
1316
PubMed
1417
PubMed.dataset
1518
TUDataset

src/MLDatasets.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,7 @@ include("planetoid.jl")
5959
include("CiteSeer/CiteSeer.jl")
6060
include("TUDataset/TUDataset.jl")
6161
include("OGBDataset/OGBDataset.jl")
62-
62+
include("PolBlogs/PolBlogs.jl")
6363

6464
function __init__()
6565
# initialize optional dependencies

src/PolBlogs/PolBlogs.jl

Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
export PolBlogs
2+
3+
4+
"""
    PolBlogs

The Political Blogs dataset from the paper
[The Political Blogosphere and the 2004 US Election: Divided they Blog](https://dl.acm.org/doi/10.1145/1134271.1134277).

`PolBlogs` is a graph with 1,490 vertices (representing political blogs) and
19,025 edges (links between blogs).

The links are automatically extracted from a crawl of the front page of each blog.

Each vertex receives a label indicating the political leaning of the blog:
liberal or conservative.

# Interface

- [`PolBlogs.edge_index`](@ref)
- [`PolBlogs.labels`](@ref)
"""
module PolBlogs

using DataDeps
using DelimitedFiles
using ..MLDatasets: datafile

export edge_index, labels

const LINK = "https://netset.telecom-paris.fr/datasets/polblogs.tar.gz"
const DEPNAME = "PolBlogs"
# File names looked up inside the data dependency: edge list first, labels second.
const DATA = ["adjacency.csv", "labels.csv"]

# Register the PolBlogs data dependency with DataDeps when the module is loaded.
# The archive at `LINK` is passed through `unpack` after it has been fetched.
function __init__()
    register(DataDep(DEPNAME,
        """
        Dataset : The $DEPNAME dataset
        Website : $LINK
        """,
        LINK,
        post_fetch_method = unpack
    ))
end

# Read the comma-separated file `filename` of this dataset and convert the parsed
# values to an `Int64` matrix. `dir` is forwarded unchanged to `datafile`.
function _read_int_matrix(filename, dir)
    path = datafile(DEPNAME, filename, dir)
    return Matrix{Int64}(readdlm(path, ','))
end

"""
    edge_index(; dir = nothing)

Return a 19025 x 2 `Matrix{Int64}` of edge indices. Each row represents one edge:
the first column holds the source node and the second column holds the target node.

# Keywords

- `dir`: forwarded to `MLDatasets.datafile` to locate the dataset files.
  Defaults to `nothing`.

# Examples

```julia-repl
using MLDatasets: PolBlogs
adj = PolBlogs.edge_index()
```
"""
function edge_index(; dir = nothing)
    return _read_int_matrix(DATA[1], dir)
end

"""
    labels(; dir = nothing)

Return a `Vector{Int64}` containing the 1490 node labels.

# Keywords

- `dir`: forwarded to `MLDatasets.datafile` to locate the dataset files.
  Defaults to `nothing`.

# Examples

```julia-repl
using MLDatasets: PolBlogs
labels = PolBlogs.labels()
```
"""
function labels(; dir = nothing)
    # The file is parsed as a matrix; flatten it so callers receive a plain vector.
    return vec(_read_int_matrix(DATA[2], dir))
end

end #module

test/runtests.jl

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@ dataset_tests = [
2727
"tst_cora.jl",
2828
"tst_pubmed.jl",
2929
"tst_tudataset.jl",
30+
"tst_polblogs.jl",
3031
]
3132

3233
container_tests = [

test/tst_polblogs.jl

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
# Resolve the PolBlogs data dependency up front. `DATADEPS_ALWAYS_ACCEPT` is the
# environment variable DataDeps.jl reads to skip the interactive download prompt;
# the previous spelling (`DATADEPS_ALWAY_ACCEPT`) was not recognized, so it had no effect.
data_dir = withenv("DATADEPS_ALWAYS_ACCEPT" => "true") do
    datadep"PolBlogs"
end

# Check the element types and the documented sizes of the two accessors.
@testset "PolBlogs" begin
    adj = PolBlogs.edge_index()
    labels = PolBlogs.labels()
    @test adj isa Matrix{Int64}
    @test labels isa Vector{Int64}
    @test size(adj) == (19025, 2)
    @test size(labels) == (1490,)
end

0 commit comments

Comments
 (0)