This repository was archived by the owner on Oct 8, 2020. It is now read-only.

Commit 180869a

Merge branch 'develop'

2 parents: f4c0f83 + d98c9a6

File tree: 25 files changed, +1725 −575 lines

.gitignore

Lines changed: 3 additions & 0 deletions
@@ -19,3 +19,6 @@ project/plugins/project/
 .worksheet
 .idea/
 *.iml
+
+deptree.txt
+

pom.xml

Lines changed: 18 additions & 11 deletions
@@ -4,7 +4,7 @@
 	<modelVersion>4.0.0</modelVersion>
 	<groupId>net.sansa-stack</groupId>
 	<artifactId>sansa-examples-parent_2.11</artifactId>
-	<version>2017-12</version>
+	<version>2018-06</version>
 	<packaging>pom</packaging>
 	<name>SANSA-Examples - Parent</name>
 	<description>SANSA examples</description>
@@ -58,11 +58,11 @@
 		<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
 		<scala.version>2.11.11</scala.version>
 		<scala.binary.version>2.11</scala.binary.version>
-		<spark.version>2.2.1</spark.version>
-		<flink.version>1.4.0</flink.version>
-		<hadoop.version>2.7.0</hadoop.version>
-		<jena.version>3.5.0</jena.version>
-		<sansa.version>0.3.0</sansa.version>
+		<spark.version>2.3.1</spark.version>
+		<flink.version>1.5.0</flink.version>
+		<hadoop.version>2.8.3</hadoop.version>
+		<jena.version>3.7.0</jena.version>
+		<sansa.version>0.4.0</sansa.version>
 	</properties>

 	<dependencyManagement>
@@ -166,13 +166,13 @@
 			<!-- RDF Layer -->
 			<dependency>
 				<groupId>${project.groupId}</groupId>
-				<artifactId>sansa-rdf-spark-bundle_${scala.binary.version}</artifactId>
+				<artifactId>sansa-rdf-spark_${scala.binary.version}</artifactId>
 				<version>${sansa.version}</version>
 			</dependency>

 			<dependency>
 				<groupId>${project.groupId}</groupId>
-				<artifactId>sansa-rdf-flink-core_${scala.binary.version}</artifactId>
+				<artifactId>sansa-rdf-flink_${scala.binary.version}</artifactId>
 				<version>${sansa.version}</version>
 			</dependency>
 			<!-- OWL Layer -->
@@ -190,7 +190,7 @@
 			<!-- Query Layer -->
 			<dependency>
 				<groupId>${project.groupId}</groupId>
-				<artifactId>sansa-query-spark-bundle_${scala.binary.version}</artifactId>
+				<artifactId>sansa-query-spark_${scala.binary.version}</artifactId>
 				<version>${sansa.version}</version>
 			</dependency>

@@ -224,6 +224,14 @@
 				<version>${sansa.version}</version>
 			</dependency>

+			<dependency>
+				<groupId>com.holdenkarau</groupId>
+				<artifactId>spark-testing-base_${scala.binary.version}</artifactId>
+				<version>2.3.0_0.9.0</version>
+				<scope>test</scope>
+			</dependency>
+
+
 		</dependencies>
 	</dependencyManagement>

@@ -284,12 +292,11 @@
 			<plugin>
 				<groupId>com.amashchenko.maven.plugin</groupId>
 				<artifactId>gitflow-maven-plugin</artifactId>
-				<version>1.3.1</version>
+				<version>1.8.0</version>
 				<configuration>
 					<gitFlowConfig>
 						<versionTagPrefix>v</versionTagPrefix>
 					</gitFlowConfig>
-					<pushRemote>false</pushRemote>
 				</configuration>
 			</plugin>
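Note: the newly managed test dependency com.holdenkarau:spark-testing-base (2.3.0_0.9.0, matching the Spark 2.3.x line) provides shared Spark fixtures for ScalaTest-based suites. Purely as an illustration (this test class is not part of the commit, and it assumes ScalaTest is also on the test classpath), a module that adopts the dependency could write a test such as:

	import com.holdenkarau.spark.testing.SharedSparkContext
	import org.scalatest.FunSuite

	// Hypothetical test, not part of this commit: SharedSparkContext supplies a
	// SparkContext (`sc`) that is started once and reused across the suite.
	class TripleCountSuite extends FunSuite with SharedSparkContext {

	  test("an RDD built from a few mock triples has the expected size") {
	    val triples = Seq(
	      ("http://example.org/s1", "http://example.org/p", "http://example.org/o1"),
	      ("http://example.org/s2", "http://example.org/p", "http://example.org/o2"))

	    val rdd = sc.parallelize(triples) // `sc` comes from SharedSparkContext
	    assert(rdd.count() === 2)
	  }
	}

Keeping the version in dependencyManagement lets the child modules (as in sansa-examples-spark below) declare the artifact without repeating the version.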

sansa-examples-flink/pom.xml

Lines changed: 2 additions & 2 deletions
@@ -5,7 +5,7 @@
 	<parent>
 		<artifactId>sansa-examples-parent_2.11</artifactId>
 		<groupId>net.sansa-stack</groupId>
-		<version>2017-12</version>
+		<version>2018-06</version>
 	</parent>
 	<artifactId>sansa-examples-flink_2.11</artifactId>
 	<name>SANSA Examples - Apache Flink</name>
@@ -15,7 +15,7 @@
 		<!-- SANSA RDF -->
 		<dependency>
 			<groupId>${project.groupId}</groupId>
-			<artifactId>sansa-rdf-flink-core_${scala.binary.version}</artifactId>
+			<artifactId>sansa-rdf-flink_${scala.binary.version}</artifactId>
 		</dependency>

 		<!-- SANSA OWL -->

sansa-examples-spark/pom.xml

Lines changed: 10 additions & 3 deletions
@@ -5,7 +5,7 @@
 	<parent>
 		<artifactId>sansa-examples-parent_2.11</artifactId>
 		<groupId>net.sansa-stack</groupId>
-		<version>2017-12</version>
+		<version>2018-06</version>
 	</parent>
 	<artifactId>sansa-examples-spark_2.11</artifactId>
 	<name>SANSA Examples - Apache Spark</name>
@@ -59,7 +59,7 @@
 			<version>0.1.0-SNAPSHOT</version> </dependency> -->
 		<dependency>
 			<groupId>${project.groupId}</groupId>
-			<artifactId>sansa-rdf-spark-bundle_${scala.binary.version}</artifactId>
+			<artifactId>sansa-rdf-spark_${scala.binary.version}</artifactId>
 		</dependency>

 		<!-- SANSA OWL -->
@@ -89,7 +89,7 @@
 		<!-- SANSA Querying -->
 		<dependency>
 			<groupId>${project.groupId}</groupId>
-			<artifactId>sansa-query-spark-bundle_${scala.binary.version}</artifactId>
+			<artifactId>sansa-query-spark_${scala.binary.version}</artifactId>
 			<exclusions>
 				<exclusion>
 					<groupId>org.eclipse.jetty</groupId>
@@ -146,6 +146,13 @@
 			<artifactId>scopt_${scala.binary.version}</artifactId>
 		</dependency>

+		<!--
+		<dependency>
+			<groupId>com.holdenkarau</groupId>
+			<artifactId>spark-testing-base_${scala.binary.version}</artifactId>
+		</dependency>
+		-->
+
 	</dependencies>

 	<build>

sansa-examples-spark/src/main/resources/AnomalyDetection/dataset.nt

Lines changed: 831 additions & 242 deletions
Large diffs are not rendered by default.

Lines changed: 10 additions & 0 deletions
@@ -0,0 +1,10 @@
+# This configuration file contains the settings for the assessment.
+rdf.qualityassessment.dataset.prefixes=["http://dbpedia.org/"]
+
+rdf.qualityassessment.dataset.subject="http://dbpedia.org/ontology/Person"
+rdf.qualityassessment.dataset.property="http://commons.dbpedia.org/property/source"
+
+rdf.qualityassessment.dataset.lowerBound=0.1
+rdf.qualityassessment.dataset.upperBound=0.9
+
+rdf.qualityassessment.dataset.shortUri.threshold = 95
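Note: the new settings file uses HOCON-style keys and values. How SANSA actually loads it is not shown in this diff; purely as a sketch, and assuming the file can be read as a classpath resource via Typesafe Config (the resource name below is hypothetical), the values could be accessed like this:

	import com.typesafe.config.ConfigFactory
	import scala.collection.JavaConverters._

	// Sketch only: the resource name and the use of Typesafe Config are assumptions;
	// only the keys are taken from the configuration file above.
	object AssessmentSettings {
	  private val conf = ConfigFactory.load("rdf_quality_assessment")

	  val prefixes   = conf.getStringList("rdf.qualityassessment.dataset.prefixes").asScala
	  val subject    = conf.getString("rdf.qualityassessment.dataset.subject")
	  val property   = conf.getString("rdf.qualityassessment.dataset.property")
	  val lowerBound = conf.getDouble("rdf.qualityassessment.dataset.lowerBound")
	  val upperBound = conf.getDouble("rdf.qualityassessment.dataset.upperBound")
	  val shortUriThreshold = conf.getInt("rdf.qualityassessment.dataset.shortUri.threshold")
	}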

sansa-examples-spark/src/main/scala/net/sansa_stack/examples/spark/ml/clustering/BorderFlowClustering.scala

Lines changed: 40 additions & 8 deletions
@@ -3,47 +3,79 @@ package net.sansa_stack.examples.spark.ml.clustering
 import scala.collection.mutable
 import org.apache.spark.sql.SparkSession
 import org.apache.log4j.{ Level, Logger }
-import net.sansa_stack.ml.spark.clustering.BorderFlow
+import net.sansa_stack.ml.spark.clustering.{ BorderFlow, FirstHardeninginBorderFlow }
+import org.apache.jena.riot.Lang
+import net.sansa_stack.rdf.spark.io._
+import net.sansa_stack.rdf.spark.model.graph._

 object BorderFlowClustering {

   def main(args: Array[String]) {
     parser.parse(args, Config()) match {
       case Some(config) =>
-        run(config.in)
+        run(config.alg, config.in, config.out, config.outevlsoft, config.outevlhard)
       case None =>
         println(parser.usage)
     }
   }

-  def run(input: String): Unit = {
+  def run(algName: String, input: String, output: String, outputevlsoft: String, outputevlhard: String): Unit = {

     val spark = SparkSession.builder
-      .appName(s"BorderFlow example ( $input )")
+      .appName(s"BorderFlow example: $algName ( $input )")
       .master("local[*]")
+      .config("spark.hadoop.validateOutputSpecs", "false")
       .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
       .getOrCreate()

     println("============================================")
-    println("| Border Flow example |")
+    println(s"| Border Flow example ($algName) |")
     println("============================================")

-    BorderFlow(spark, input)
+    val lang = Lang.NTRIPLES
+    val triples = spark.rdf(lang)(input)
+    val graph = triples.asStringGraph()
+
+    val borderflow = algName match {
+      case "borderflow" => BorderFlow(spark, graph, output, outputevlsoft, outputevlhard)
+      case "firsthardening" => FirstHardeninginBorderFlow(spark, graph, output, outputevlhard)
+      case _ =>
+        throw new RuntimeException("'" + algName + "' - Not supported, yet.")
+    }

     spark.stop

   }

-  case class Config(in: String = "")
+  case class Config(alg: String = "borderflow", in: String = "", out: String = "", outevlsoft: String = "", outevlhard: String = "")

   val parser = new scopt.OptionParser[Config]("BorderFlow") {

     head("BorderFlow: an example BorderFlow app.")

+    opt[String]('a', "algName").required().valueName("{borderflow | firsthardening }").
+      action((x, c) => c.copy(alg = x)).
+      text("BorderFlow algorithm type")
+
     opt[String]('i', "input").required().valueName("<path>").
       action((x, c) => c.copy(in = x)).
       text("path to file contains the input files")

+    opt[String]('o', "out").required().valueName("<directory>").
+      action((x, c) => c.copy(out = x)).
+      text("the output directory")
+
+    opt[String]('e', "outevlsoft").optional().valueName("<directory>").
+      action((x, c) => c.copy(outevlsoft = x)).
+      text("the outevlsoft directory (used only for alg 'borderflow')")
+
+    opt[String]('h', "outevlhard").required().valueName("<directory>").
+      action((x, c) => c.copy(outevlhard = x)).
+      text("the outevlhard directory")
+
     help("help").text("prints this usage text")
+    checkConfig(c =>
+      if (c.alg == "borderflow" && c.outevlsoft.isEmpty) failure("Option --outevlsoft must not be empty if alg 'borderflow' is set")
+      else success)
   }
-}
+}
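Note: with the reworked parser, the example now needs an algorithm name and output directories in addition to the input. A minimal local invocation with placeholder paths (only the option names and the two algorithm values come from the parser above) might look like:

	// Hypothetical invocation; all paths are placeholders.
	object RunBorderFlowExample {
	  def main(args: Array[String]): Unit = {
	    net.sansa_stack.examples.spark.ml.clustering.BorderFlowClustering.main(Array(
	      "-a", "borderflow",              // or "firsthardening"
	      "-i", "path/to/input.nt",        // N-Triples input
	      "-o", "/tmp/borderflow/out",     // output directory
	      "-e", "/tmp/borderflow/evlsoft", // required only when -a is "borderflow"
	      "-h", "/tmp/borderflow/evlhard"))
	  }
	}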

sansa-examples-spark/src/main/scala/net/sansa_stack/examples/spark/ml/clustering/RDFGraphPIClustering.scala

Lines changed: 20 additions & 60 deletions
@@ -3,95 +3,51 @@ package net.sansa_stack.examples.spark.ml.clustering
 import scala.collection.mutable
 import org.apache.spark.sql.SparkSession
 import org.apache.log4j.{ Level, Logger }
-import org.apache.spark.graphx.GraphLoader
 import org.apache.jena.riot.{ Lang, RDFDataMgr }
 import java.io.ByteArrayInputStream
-import org.apache.spark.graphx._
-import org.apache.spark.rdd.RDD
-import net.sansa_stack.ml.spark.clustering.{ RDFGraphPICClustering => RDFGraphPICClusteringAlg }
+import org.apache.jena.riot.Lang
+import net.sansa_stack.rdf.spark.io._
+import net.sansa_stack.rdf.spark.model.graph._
+import net.sansa_stack.rdf._
+import net.sansa_stack.ml.spark.clustering.RDFGraphPowerIterationClustering

 object RDFGraphPIClustering {

   def main(args: Array[String]) {
     parser.parse(args, Config()) match {
       case Some(config) =>
-        run(config.in, config.k, config.maxIterations)
+        run(config.in, config.out, config.k, config.maxIterations)
       case None =>
         println(parser.usage)
     }
   }

-  def run(input: String, k: Int, maxIterations: Int): Unit = {
+  def run(input: String, output: String, k: Int, maxIterations: Int): Unit = {

     val spark = SparkSession.builder
       .appName(s"Power Iteration Clustering example ( $input )")
       .master("local[*]")
       .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
       .getOrCreate()
-
-    Logger.getRootLogger.setLevel(Level.ERROR)
+    System.setProperty("spark.akka.frameSize", "2000")

     println("============================================")
     println("| Power Iteration Clustering example |")
     println("============================================")

-    val sparkSession = SparkSession.builder
-      .master("local[*]")
-      .appName(" Power Iteration Clustering example (" + input + ")")
-      .getOrCreate()
-    Logger.getRootLogger.setLevel(Level.ERROR)
-
-    // Load the graph
-    //val graph = GraphLoader.edgeListFile(sparkSession.sparkContext, input)
-
-    // Load the RDF dataset
-    val RDFfile = sparkSession.sparkContext.textFile(input).map(line =>
-      RDFDataMgr.createIteratorTriples(new ByteArrayInputStream(line.getBytes), Lang.NTRIPLES, null).next())
-
-    val r = RDFfile.map(f => {
-      val s = f.getSubject.getURI
-      val p = f.getPredicate.getURI
-      val o = f.getObject.getURI
-
-      (s, p, o)
-    })
+    val lang = Lang.NTRIPLES
+    val triples = spark.rdf(lang)(input)

-    val v11 = r.map(f => f._1)
-    val v22 = r.map(f => f._3)
-    val indexedmap = (v11.union(v22)).distinct().zipWithIndex()
+    val graph = triples.asStringGraph()

-    val vertices: RDD[(VertexId, String)] = indexedmap.map(x => (x._2, x._1))
-    val _iriToId: RDD[(String, VertexId)] = indexedmap.map(x => (x._1, x._2))
-
-    val tuples = r.keyBy(f => f._1).join(indexedmap).map({
-      case (k, ((s, p, o), si)) => (o, (si, p))
-    })
-
-    val edgess: RDD[Edge[String]] = tuples.join(indexedmap).map({
-      case (k, ((si, p), oi)) => Edge(si, oi, p)
-    })
-
-    val graph = org.apache.spark.graphx.Graph(vertices, edgess)
-
-    val model = RDFGraphPICClusteringAlg(sparkSession, graph, k, maxIterations).run()
-
-    val clusters = model.assignments.collect().groupBy(_.cluster).mapValues(_.map(_.id))
-    val assignments = clusters.toList.sortBy { case (k, v) => v.length }
-    val assignmentsStr = assignments
-      .map {
-        case (k, v) =>
-          s"$k -> ${v.sorted.mkString("[", ",", "]")}"
-      }.mkString(",")
-    val sizesStr = assignments.map {
-      _._2.size
-    }.sorted.mkString("(", ",", ")")
-    println(s"Cluster assignments: $assignmentsStr\ncluster sizes: $sizesStr")
+    val cluster = RDFGraphPowerIterationClustering(spark, graph, output, k, maxIterations)
+    cluster.saveAsTextFile(output)

     spark.stop

   }

-  case class Config(in: String = "", k: Int = 3, maxIterations: Int = 50)
+  case class Config(in: String = "", out: String = "", k: Int = 2, maxIterations: Int = 5)

   val defaultParams = Config()

@@ -100,9 +56,13 @@ object RDFGraphPIClustering {
     head("PowerIterationClusteringExample: an example PIC app using concentric circles.")

     opt[String]('i', "input").required().valueName("<path>")
-      .text(s"path to file that contains the input files (in N-Triple format)")
+      .text(s"path (local/hdfs) to file that contains the input files (in N-Triple format)")
      .action((x, c) => c.copy(in = x))

+    opt[String]('o', "out").required().valueName("<directory>").
+      action((x, c) => c.copy(out = x)).
+      text("the output directory")
+
     opt[Int]('k', "k")
       .text(s"number of circles (/clusters), default: ${defaultParams.k}")
       .action((x, c) => c.copy(k = x))
@@ -113,4 +73,4 @@ object RDFGraphPIClustering {

     help("help").text("prints this usage text")
   }
-}
+}
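Note: after this change the example writes its cluster assignments to an output directory instead of printing them. A minimal local invocation with placeholder paths (the option names and the new defaults k = 2, maxIterations = 5 are taken from the diff above) could be:

	// Hypothetical invocation; all paths are placeholders.
	object RunPICExample {
	  def main(args: Array[String]): Unit = {
	    net.sansa_stack.examples.spark.ml.clustering.RDFGraphPIClustering.main(Array(
	      "-i", "path/to/input.nt",  // N-Triples input (local or HDFS)
	      "-o", "/tmp/pic/clusters", // output directory, written via saveAsTextFile
	      "-k", "4"))                // number of clusters; defaults to 2 if omitted
	  }
	}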
