This repository was archived by the owner on Oct 8, 2020. It is now read-only.

Commit 180869a

Merge branch 'develop'

2 parents: f4c0f83 + d98c9a6

File tree: 25 files changed, +1725 −575 lines

.gitignore

Lines changed: 3 additions & 0 deletions
@@ -19,3 +19,6 @@ project/plugins/project/
 .worksheet
 .idea/
 *.iml
+
+deptree.txt
+

pom.xml

Lines changed: 18 additions & 11 deletions
@@ -4,7 +4,7 @@
 	<modelVersion>4.0.0</modelVersion>
 	<groupId>net.sansa-stack</groupId>
 	<artifactId>sansa-examples-parent_2.11</artifactId>
-	<version>2017-12</version>
+	<version>2018-06</version>
 	<packaging>pom</packaging>
 	<name>SANSA-Examples - Parent</name>
 	<description>SANSA examples</description>
@@ -58,11 +58,11 @@
 		<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
 		<scala.version>2.11.11</scala.version>
 		<scala.binary.version>2.11</scala.binary.version>
-		<spark.version>2.2.1</spark.version>
-		<flink.version>1.4.0</flink.version>
-		<hadoop.version>2.7.0</hadoop.version>
-		<jena.version>3.5.0</jena.version>
-		<sansa.version>0.3.0</sansa.version>
+		<spark.version>2.3.1</spark.version>
+		<flink.version>1.5.0</flink.version>
+		<hadoop.version>2.8.3</hadoop.version>
+		<jena.version>3.7.0</jena.version>
+		<sansa.version>0.4.0</sansa.version>
 	</properties>

 	<dependencyManagement>
@@ -166,13 +166,13 @@
 			<!-- RDF Layer -->
 			<dependency>
 				<groupId>${project.groupId}</groupId>
-				<artifactId>sansa-rdf-spark-bundle_${scala.binary.version}</artifactId>
+				<artifactId>sansa-rdf-spark_${scala.binary.version}</artifactId>
 				<version>${sansa.version}</version>
 			</dependency>

 			<dependency>
 				<groupId>${project.groupId}</groupId>
-				<artifactId>sansa-rdf-flink-core_${scala.binary.version}</artifactId>
+				<artifactId>sansa-rdf-flink_${scala.binary.version}</artifactId>
 				<version>${sansa.version}</version>
 			</dependency>
 			<!-- OWL Layer -->
@@ -190,7 +190,7 @@
 			<!-- Query Layer -->
 			<dependency>
 				<groupId>${project.groupId}</groupId>
-				<artifactId>sansa-query-spark-bundle_${scala.binary.version}</artifactId>
+				<artifactId>sansa-query-spark_${scala.binary.version}</artifactId>
 				<version>${sansa.version}</version>
 			</dependency>

@@ -224,6 +224,14 @@
 				<version>${sansa.version}</version>
 			</dependency>

+			<dependency>
+				<groupId>com.holdenkarau</groupId>
+				<artifactId>spark-testing-base_${scala.binary.version}</artifactId>
+				<version>2.3.0_0.9.0</version>
+				<scope>test</scope>
+			</dependency>
+
+
 		</dependencies>
 	</dependencyManagement>

@@ -284,12 +292,11 @@
 			<plugin>
 				<groupId>com.amashchenko.maven.plugin</groupId>
 				<artifactId>gitflow-maven-plugin</artifactId>
-				<version>1.3.1</version>
+				<version>1.8.0</version>
 				<configuration>
 					<gitFlowConfig>
 						<versionTagPrefix>v</versionTagPrefix>
 					</gitFlowConfig>
-					<pushRemote>false</pushRemote>
 				</configuration>
 			</plugin>
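Note: the newly managed test dependency com.holdenkarau:spark-testing-base (2.3.0_0.9.0, matching the Spark 2.3.x line) provides shared Spark fixtures for ScalaTest-based suites. Purely as an illustration (this test class is not part of the commit, and it assumes ScalaTest is also on the test classpath), a module that adopts the dependency could write a test such as:

	import com.holdenkarau.spark.testing.SharedSparkContext
	import org.scalatest.FunSuite

	// Hypothetical test, not part of this commit: SharedSparkContext supplies a
	// SparkContext (`sc`) that is started once and reused across the suite.
	class TripleCountSuite extends FunSuite with SharedSparkContext {

	  test("an RDD built from a few mock triples has the expected size") {
	    val triples = Seq(
	      ("http://example.org/s1", "http://example.org/p", "http://example.org/o1"),
	      ("http://example.org/s2", "http://example.org/p", "http://example.org/o2"))

	    val rdd = sc.parallelize(triples) // `sc` comes from SharedSparkContext
	    assert(rdd.count() === 2)
	  }
	}

Keeping the version in dependencyManagement lets the child modules (as in sansa-examples-spark below) declare the artifact without repeating the version.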

sansa-examples-flink/pom.xml

Lines changed: 2 additions & 2 deletions
@@ -5,7 +5,7 @@
 	<parent>
 		<artifactId>sansa-examples-parent_2.11</artifactId>
 		<groupId>net.sansa-stack</groupId>
-		<version>2017-12</version>
+		<version>2018-06</version>
 	</parent>
 	<artifactId>sansa-examples-flink_2.11</artifactId>
 	<name>SANSA Examples - Apache Flink</name>
@@ -15,7 +15,7 @@
 		<!-- SANSA RDF -->
 		<dependency>
 			<groupId>${project.groupId}</groupId>
-			<artifactId>sansa-rdf-flink-core_${scala.binary.version}</artifactId>
+			<artifactId>sansa-rdf-flink_${scala.binary.version}</artifactId>
 		</dependency>

 		<!-- SANSA OWL -->

sansa-examples-spark/pom.xml

Lines changed: 10 additions & 3 deletions
@@ -5,7 +5,7 @@
 	<parent>
 		<artifactId>sansa-examples-parent_2.11</artifactId>
 		<groupId>net.sansa-stack</groupId>
-		<version>2017-12</version>
+		<version>2018-06</version>
 	</parent>
 	<artifactId>sansa-examples-spark_2.11</artifactId>
 	<name>SANSA Examples - Apache Spark</name>
@@ -59,7 +59,7 @@
 			<version>0.1.0-SNAPSHOT</version> </dependency> -->
 		<dependency>
 			<groupId>${project.groupId}</groupId>
-			<artifactId>sansa-rdf-spark-bundle_${scala.binary.version}</artifactId>
+			<artifactId>sansa-rdf-spark_${scala.binary.version}</artifactId>
 		</dependency>

 		<!-- SANSA OWL -->
@@ -89,7 +89,7 @@
 		<!-- SANSA Querying -->
 		<dependency>
 			<groupId>${project.groupId}</groupId>
-			<artifactId>sansa-query-spark-bundle_${scala.binary.version}</artifactId>
+			<artifactId>sansa-query-spark_${scala.binary.version}</artifactId>
 			<exclusions>
 				<exclusion>
 					<groupId>org.eclipse.jetty</groupId>
@@ -146,6 +146,13 @@
 			<artifactId>scopt_${scala.binary.version}</artifactId>
 		</dependency>

+		<!--
+		<dependency>
+			<groupId>com.holdenkarau</groupId>
+			<artifactId>spark-testing-base_${scala.binary.version}</artifactId>
+		</dependency>
+		-->
+
 	</dependencies>

 	<build>

sansa-examples-spark/src/main/resources/AnomalyDetection/dataset.nt

Lines changed: 831 additions & 242 deletions
Large diffs are not rendered by default.

Lines changed: 10 additions & 0 deletions
@@ -0,0 +1,10 @@
+# This configuration file contains the settings for the assessment.
+rdf.qualityassessment.dataset.prefixes=["http://dbpedia.org/"]
+
+rdf.qualityassessment.dataset.subject="http://dbpedia.org/ontology/Person"
+rdf.qualityassessment.dataset.property="http://commons.dbpedia.org/property/source"
+
+rdf.qualityassessment.dataset.lowerBound=0.1
+rdf.qualityassessment.dataset.upperBound=0.9
+
+rdf.qualityassessment.dataset.shortUri.threshold = 95
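Note: the new settings file uses HOCON-style keys and values. How SANSA actually loads it is not shown in this diff; purely as a sketch, and assuming the file can be read as a classpath resource via Typesafe Config (the resource name below is hypothetical), the values could be accessed like this:

	import com.typesafe.config.ConfigFactory
	import scala.collection.JavaConverters._

	// Sketch only: the resource name and the use of Typesafe Config are assumptions;
	// only the keys are taken from the configuration file above.
	object AssessmentSettings {
	  private val conf = ConfigFactory.load("rdf_quality_assessment")

	  val prefixes   = conf.getStringList("rdf.qualityassessment.dataset.prefixes").asScala
	  val subject    = conf.getString("rdf.qualityassessment.dataset.subject")
	  val property   = conf.getString("rdf.qualityassessment.dataset.property")
	  val lowerBound = conf.getDouble("rdf.qualityassessment.dataset.lowerBound")
	  val upperBound = conf.getDouble("rdf.qualityassessment.dataset.upperBound")
	  val shortUriThreshold = conf.getInt("rdf.qualityassessment.dataset.shortUri.threshold")
	}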

sansa-examples-spark/src/main/scala/net/sansa_stack/examples/spark/ml/clustering/BorderFlowClustering.scala

Lines changed: 40 additions & 8 deletions
@@ -3,47 +3,79 @@ package net.sansa_stack.examples.spark.ml.clustering
 import scala.collection.mutable
 import org.apache.spark.sql.SparkSession
 import org.apache.log4j.{ Level, Logger }
-import net.sansa_stack.ml.spark.clustering.BorderFlow
+import net.sansa_stack.ml.spark.clustering.{ BorderFlow, FirstHardeninginBorderFlow }
+import org.apache.jena.riot.Lang
+import net.sansa_stack.rdf.spark.io._
+import net.sansa_stack.rdf.spark.model.graph._

 object BorderFlowClustering {

   def main(args: Array[String]) {
     parser.parse(args, Config()) match {
       case Some(config) =>
-        run(config.in)
+        run(config.alg, config.in, config.out, config.outevlsoft, config.outevlhard)
       case None =>
         println(parser.usage)
     }
   }

-  def run(input: String): Unit = {
+  def run(algName: String, input: String, output: String, outputevlsoft: String, outputevlhard: String): Unit = {

     val spark = SparkSession.builder
-      .appName(s"BorderFlow example ( $input )")
+      .appName(s"BorderFlow example: $algName ( $input )")
       .master("local[*]")
+      .config("spark.hadoop.validateOutputSpecs", "false")
       .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
       .getOrCreate()

     println("============================================")
-    println("| Border Flow example |")
+    println(s"| Border Flow example ($algName) |")
     println("============================================")

-    BorderFlow(spark, input)
+    val lang = Lang.NTRIPLES
+    val triples = spark.rdf(lang)(input)
+    val graph = triples.asStringGraph()
+
+    val borderflow = algName match {
+      case "borderflow" => BorderFlow(spark, graph, output, outputevlsoft, outputevlhard)
+      case "firsthardening" => FirstHardeninginBorderFlow(spark, graph, output, outputevlhard)
+      case _ =>
+        throw new RuntimeException("'" + algName + "' - Not supported, yet.")
+    }

     spark.stop

   }

-  case class Config(in: String = "")
+  case class Config(alg: String = "borderflow", in: String = "", out: String = "", outevlsoft: String = "", outevlhard: String = "")

   val parser = new scopt.OptionParser[Config]("BorderFlow") {

     head("BorderFlow: an example BorderFlow app.")

+    opt[String]('a', "algName").required().valueName("{borderflow | firsthardening }").
+      action((x, c) => c.copy(alg = x)).
+      text("BorderFlow algorithm type")
+
     opt[String]('i', "input").required().valueName("<path>").
       action((x, c) => c.copy(in = x)).
       text("path to file contains the input files")

+    opt[String]('o', "out").required().valueName("<directory>").
+      action((x, c) => c.copy(out = x)).
+      text("the output directory")
+
+    opt[String]('e', "outevlsoft").optional().valueName("<directory>").
+      action((x, c) => c.copy(outevlsoft = x)).
+      text("the outevlsoft directory (used only for alg 'borderflow')")
+
+    opt[String]('h', "outevlhard").required().valueName("<directory>").
+      action((x, c) => c.copy(outevlhard = x)).
+      text("the outevlhard directory")
+
     help("help").text("prints this usage text")
+    checkConfig(c =>
+      if (c.alg == "borderflow" && c.outevlsoft.isEmpty) failure("Option --outevlsoft must not be empty if alg 'borderflow' is set")
+      else success)
   }
-}
+}
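Note: with the reworked parser, the example now needs an algorithm name and output directories in addition to the input. A minimal local invocation with placeholder paths (only the option names and the two algorithm values come from the parser above) might look like:

	// Hypothetical invocation; all paths are placeholders.
	object RunBorderFlowExample {
	  def main(args: Array[String]): Unit = {
	    net.sansa_stack.examples.spark.ml.clustering.BorderFlowClustering.main(Array(
	      "-a", "borderflow",              // or "firsthardening"
	      "-i", "path/to/input.nt",        // N-Triples input
	      "-o", "/tmp/borderflow/out",     // output directory
	      "-e", "/tmp/borderflow/evlsoft", // required only when -a is "borderflow"
	      "-h", "/tmp/borderflow/evlhard"))
	  }
	}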

sansa-examples-spark/src/main/scala/net/sansa_stack/examples/spark/ml/clustering/RDFGraphPIClustering.scala

Lines changed: 20 additions & 60 deletions
@@ -3,95 +3,51 @@ package net.sansa_stack.examples.spark.ml.clustering
 import scala.collection.mutable
 import org.apache.spark.sql.SparkSession
 import org.apache.log4j.{ Level, Logger }
-import org.apache.spark.graphx.GraphLoader
 import org.apache.jena.riot.{ Lang, RDFDataMgr }
 import java.io.ByteArrayInputStream
-import org.apache.spark.graphx._
-import org.apache.spark.rdd.RDD
-import net.sansa_stack.ml.spark.clustering.{ RDFGraphPICClustering => RDFGraphPICClusteringAlg }
+import org.apache.jena.riot.Lang
+import net.sansa_stack.rdf.spark.io._
+import net.sansa_stack.rdf.spark.model.graph._
+import net.sansa_stack.rdf._
+import net.sansa_stack.ml.spark.clustering.RDFGraphPowerIterationClustering

 object RDFGraphPIClustering {

   def main(args: Array[String]) {
     parser.parse(args, Config()) match {
       case Some(config) =>
-        run(config.in, config.k, config.maxIterations)
+        run(config.in, config.out, config.k, config.maxIterations)
       case None =>
         println(parser.usage)
     }
   }

-  def run(input: String, k: Int, maxIterations: Int): Unit = {
+  def run(input: String, output: String, k: Int, maxIterations: Int): Unit = {

     val spark = SparkSession.builder
       .appName(s"Power Iteration Clustering example ( $input )")
       .master("local[*]")
       .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
       .getOrCreate()
-
-    Logger.getRootLogger.setLevel(Level.ERROR)
+    System.setProperty("spark.akka.frameSize", "2000")

     println("============================================")
     println("| Power Iteration Clustering example |")
     println("============================================")

-    val sparkSession = SparkSession.builder
-      .master("local[*]")
-      .appName(" Power Iteration Clustering example (" + input + ")")
-      .getOrCreate()
-    Logger.getRootLogger.setLevel(Level.ERROR)
-
-    // Load the graph
-    //val graph = GraphLoader.edgeListFile(sparkSession.sparkContext, input)
-
-    // Load the RDF dataset
-    val RDFfile = sparkSession.sparkContext.textFile(input).map(line =>
-      RDFDataMgr.createIteratorTriples(new ByteArrayInputStream(line.getBytes), Lang.NTRIPLES, null).next())
-
-    val r = RDFfile.map(f => {
-      val s = f.getSubject.getURI
-      val p = f.getPredicate.getURI
-      val o = f.getObject.getURI
-
-      (s, p, o)
-    })
+    val lang = Lang.NTRIPLES
+    val triples = spark.rdf(lang)(input)

-    val v11 = r.map(f => f._1)
-    val v22 = r.map(f => f._3)
-    val indexedmap = (v11.union(v22)).distinct().zipWithIndex()
+    val graph = triples.asStringGraph()

-    val vertices: RDD[(VertexId, String)] = indexedmap.map(x => (x._2, x._1))
-    val _iriToId: RDD[(String, VertexId)] = indexedmap.map(x => (x._1, x._2))
-
-    val tuples = r.keyBy(f => f._1).join(indexedmap).map({
-      case (k, ((s, p, o), si)) => (o, (si, p))
-    })
-
-    val edgess: RDD[Edge[String]] = tuples.join(indexedmap).map({
-      case (k, ((si, p), oi)) => Edge(si, oi, p)
-    })
-
-    val graph = org.apache.spark.graphx.Graph(vertices, edgess)
-
-    val model = RDFGraphPICClusteringAlg(sparkSession, graph, k, maxIterations).run()
-
-    val clusters = model.assignments.collect().groupBy(_.cluster).mapValues(_.map(_.id))
-    val assignments = clusters.toList.sortBy { case (k, v) => v.length }
-    val assignmentsStr = assignments
-      .map {
-        case (k, v) =>
-          s"$k -> ${v.sorted.mkString("[", ",", "]")}"
-      }.mkString(",")
-    val sizesStr = assignments.map {
-      _._2.size
-    }.sorted.mkString("(", ",", ")")
-    println(s"Cluster assignments: $assignmentsStr\ncluster sizes: $sizesStr")
+    val cluster = RDFGraphPowerIterationClustering(spark, graph, output, k, maxIterations)
+    cluster.saveAsTextFile(output)

     spark.stop

   }

-  case class Config(in: String = "", k: Int = 3, maxIterations: Int = 50)
+  case class Config(in: String = "", out: String = "", k: Int = 2, maxIterations: Int = 5)

   val defaultParams = Config()

@@ -100,9 +56,13 @@ object RDFGraphPIClustering {
     head("PowerIterationClusteringExample: an example PIC app using concentric circles.")

     opt[String]('i', "input").required().valueName("<path>")
-      .text(s"path to file that contains the input files (in N-Triple format)")
+      .text(s"path (local/hdfs) to file that contains the input files (in N-Triple format)")
      .action((x, c) => c.copy(in = x))

+    opt[String]('o', "out").required().valueName("<directory>").
+      action((x, c) => c.copy(out = x)).
+      text("the output directory")
+
     opt[Int]('k', "k")
       .text(s"number of circles (/clusters), default: ${defaultParams.k}")
       .action((x, c) => c.copy(k = x))
@@ -113,4 +73,4 @@ object RDFGraphPIClustering {

     help("help").text("prints this usage text")
   }
-}
+}
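Note: after this change the example writes its cluster assignments to an output directory instead of printing them. A minimal local invocation with placeholder paths (the option names and the new defaults k = 2, maxIterations = 5 are taken from the diff above) could be:

	// Hypothetical invocation; all paths are placeholders.
	object RunPICExample {
	  def main(args: Array[String]): Unit = {
	    net.sansa_stack.examples.spark.ml.clustering.RDFGraphPIClustering.main(Array(
	      "-i", "path/to/input.nt",  // N-Triples input (local or HDFS)
	      "-o", "/tmp/pic/clusters", // output directory, written via saveAsTextFile
	      "-k", "4"))                // number of clusters; defaults to 2 if omitted
	  }
	}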
