Skip to content
This repository was archived by the owner on Oct 8, 2020. It is now read-only.

Commit 9b0ae6e

Browse files
More robust N-Triples parser
1 parent 6ea2c60 commit 9b0ae6e

File tree

4 files changed

+88
-13
lines changed

4 files changed

+88
-13
lines changed
Lines changed: 75 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,9 @@
11
package net.sansa_stack.inference.utils
22

3+
import java.util.regex.Pattern
4+
5+
import scala.util.Try
6+
37
import net.sansa_stack.inference.data.RDFTriple
48

59
/**
@@ -8,10 +12,76 @@ import net.sansa_stack.inference.data.RDFTriple
812
* @author Lorenz Buehmann
913
*/
1014
class NTriplesStringToRDFTriple
11-
extends Function1[String, RDFTriple]
12-
with java.io.Serializable {
13-
override def apply(s: String): RDFTriple = {
14-
val tokens = s.replace("<", "").replace(">", "").split(" ") // split by white space
15-
RDFTriple(tokens(0), tokens(1), tokens(2))
15+
extends Function1[String, Option[RDFTriple]]
16+
with java.io.Serializable
17+
with Logging {
18+
19+
// val pattern: Pattern = Pattern.compile("^<([^>]+)>|(?<!<)([^>]+)(?<!>)\\s*<([^>]+)>\\s*<?([^>]+)>?\\s*\\.$")
20+
21+
/**
  * Pre-compiled pattern for one N-Triples statement. The margin-stripped pieces are
  * joined into a single line by removing the newlines before compilation.
  *
  * Capture groups used by the parser:
  *  - group 1: the whole subject term
  *  - group 2: the subject IRI without angle brackets (null if the subject is not `<...>`)
  *  - group 3: a subject that is not wrapped in angle brackets
  *  - group 4: the predicate IRI without angle brackets
  *  - group 5: the whole object term
  *  - group 6: the object IRI without angle brackets (null if the object is not exactly `<...>`)
  *  - group 7: any other object text
  *
  * The statement has to end with a `.`; optional whitespace is allowed between the terms.
  */
val pattern: Pattern = Pattern.compile(
22+
"""|^
23+
|(<([^>]*)>|(?<!<)([^>]+)(?<!>))
24+
|\s*
25+
|<([^>]+)>
26+
|\s*
27+
|(<([^>]+)>|(.*))
28+
|\s*\.$
29+
""".stripMargin.replaceAll("\n", "").trim)
30+
31+
/**
  * Converts one line of N-Triples text into a triple.
  *
  * @param s the line to convert
  * @return the result of `parseRegexPattern` for `s`
  */
override def apply(s: String): Option[RDFTriple] =
  parseRegexPattern(s) // alternative implementation: parseRegexSplit(s)
35+
36+
/**
  * Parses a single N-Triples line with the pre-compiled `pattern`.
  *
  * IRIs are returned without the enclosing angle brackets. Any other subject or object
  * term is returned as written, with surrounding whitespace removed.
  *
  * @param s the line to parse
  * @return the parsed triple, or `None` if the line does not match the pattern or an
  *         exception is raised while parsing
  */
def parseRegexPattern(s: String): Option[RDFTriple] = Try {
  val matcher = pattern.matcher(s)

  if (matcher.matches) {
    // Group 2 is the IRI between '<' and '>'. It is null for a subject that is not wrapped
    // in angle brackets; group 1 then holds the raw term. That alternative is greedy and
    // also captures the whitespace in front of the predicate, so it must be trimmed.
    val subject = Option(matcher.group(2)).getOrElse(matcher.group(1).trim)

    // Same scheme for the object: group 6 is the bare IRI, group 7 anything else.
    val obj = Option(matcher.group(6)).getOrElse(matcher.group(7).trim)

    Some(RDFTriple(subject, matcher.group(4), obj))
  } else {
    // A malformed line is an expected outcome: log it and skip it, no exception needed.
    warn(s"WARN: Illegal N-Triples syntax. Ignoring triple $s")
    None
  }
}.toOption.flatten
62+
63+
/**
  * Parses a single N-Triples line by cutting off the terminating `.` and splitting the
  * remainder on whitespace into subject, predicate and object.
  *
  * Terms wrapped in angle brackets are returned without the brackets, which matches what
  * `parseRegexPattern` returns for IRIs. Everything else is returned as written.
  *
  * @param s the line to parse
  * @return the parsed triple, or `None` if the line has no `.` or fewer than three terms
  */
def parseRegexSplit(s: String): Option[RDFTriple] = Try {
  // Removes the enclosing '<' and '>' of an IRI; other terms are left untouched.
  def stripBrackets(term: String): String =
    if (term.length >= 2 && term.startsWith("<") && term.endsWith(">")) {
      term.substring(1, term.length - 1)
    } else {
      term
    }

  val line = s.trim

  // Drop the statement terminator exactly once. Cutting at the last '.' a second time
  // would truncate the object inside an IRI or a literal.
  // A line without any '.' makes substring throw, which Try turns into None.
  val statement = line.substring(0, line.lastIndexOf('.')).trim

  // Limit 3 keeps whitespace inside the object (e.g. in a literal) intact.
  // Fewer than three terms raises a MatchError, which Try turns into None.
  val Array(subject, predicate, obj) = statement.split("\\s+", 3)

  RDFTriple(stripBrackets(subject), stripBrackets(predicate), stripBrackets(obj.trim))
}.toOption
70+
1771
}
72+
73+
object NTriplesStringToRDFTriple {

  /**
    * Parses four hard-coded sample lines and prints each result to standard output.
    *
    * @param args ignored
    */
  def main(args: Array[String]): Unit = {
    val samples = Seq(
      // empty IRI as subject
      "<> <http://www.w3.org/1999/02/22-rdf-syntax-ns#rest> <http://www.w3.org/1999/02/22-rdf-syntax-ns#nil> .",
      // subject that is not wrapped in angle brackets
      "_:genid32 <http://www.w3.org/1999/02/22-rdf-syntax-ns#rest> <http://www.w3.org/1999/02/22-rdf-syntax-ns#nil> .",
      // IRIs in all three positions
      "<http://example.org/a> <http://www.w3.org/1999/02/22-rdf-syntax-ns#rest> <http://www.w3.org/1999/02/22-rdf-syntax-ns#nil> .",
      // typed literal as object
      "<http://example.org/a> <http://www.w3.org/1999/02/22-rdf-syntax-ns#rest> \"3\"^^<http://ex.org/int> ."
    )

    samples.foreach { line =>
      println(new NTriplesStringToRDFTriple().apply(line))
    }
  }
}
86+
87+

sansa-inference-flink/src/main/scala/net/sansa_stack/inference/flink/data/RDFGraphLoader.scala

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,9 @@ object RDFGraphLoader {
4646

4747
val tmp: List[String] = paths.map(path => path.toString).toList
4848

49-
val triples = tmp.map(f => env.readTextFile(f).map(new NTriplesStringToRDFTriple())).reduce(_ union _).name("triples")
49+
val converter = new NTriplesStringToRDFTriple()
50+
51+
val triples = tmp.map(f => env.readTextFile(f).flatMap(line => converter.apply(line))).reduce(_ union _).name("triples")
5052

5153
RDFGraph(triples)
5254
}

sansa-inference-spark/src/main/scala/net/sansa_stack/inference/spark/data/loader/RDFGraphLoader.scala

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -5,10 +5,7 @@ import java.net.URI
55
import scala.language.implicitConversions
66

77
import org.apache.spark.SparkContext
8-
import org.apache.spark.rdd.RDD
9-
import org.apache.spark.sql.sources.{BaseRelation, RelationProvider, SchemaRelationProvider, TableScan}
10-
import org.apache.spark.sql.types.{StringType, StructField, StructType}
11-
import org.apache.spark.sql.{Dataset, Row, SQLContext, SparkSession}
8+
import org.apache.spark.sql.{Dataset, SparkSession}
129
import org.slf4j.LoggerFactory
1310

1411
import net.sansa_stack.inference.data.{RDFTriple, SQLSchema, SQLSchemaDefault}
@@ -41,9 +38,11 @@ object RDFGraphLoader {
4138
logger.info("loading triples from disk...")
4239
val startTime = System.currentTimeMillis()
4340

41+
val converter = new NTriplesStringToRDFTriple()
42+
4443
val triples = session.sparkContext
4544
.textFile(path, minPartitions) // read the text file
46-
.map(new NTriplesStringToRDFTriple()) // convert to triple object
45+
.flatMap(line => converter.apply(line)) // convert to triple object
4746
// .repartition(minPartitions)
4847

4948
// logger.info("finished loading " + triples.count() + " triples in " + (System.currentTimeMillis()-startTime) + "ms.")
@@ -88,9 +87,11 @@ object RDFGraphLoader {
8887
logger.info("loading triples from disk...")
8988
val startTime = System.currentTimeMillis()
9089

90+
val converter = new NTriplesStringToRDFTriple()
91+
9192
val triples = session.sparkContext
9293
.textFile(path, minPartitions) // read the text file
93-
.map(new NTriplesStringToRDFTriple()) // convert to triple object
94+
.flatMap(line => converter.apply(line)) // convert to triple object
9495

9596
// logger.info("finished loading " + triples.count() + " triples in " +
9697
// (System.currentTimeMillis()-startTime) + "ms.")

sansa-inference-spark/src/main/scala/net/sansa_stack/inference/spark/data/loader/sql/NTriplesRelation.scala

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,9 @@ class NTriplesRelation(location: String, userSchema: StructType)
3030
.sparkContext
3131
.textFile(location)
3232

33-
val rows = rdd.map(new NTriplesStringToRDFTriple()).map(t => Row.fromSeq(Seq(t.s, t.p, t.o)))
33+
val converter = new NTriplesStringToRDFTriple()
34+
35+
val rows = rdd.flatMap(x => converter.apply(x)).map(t => Row.fromSeq(Seq(t.s, t.p, t.o)))
3436

3537
rows
3638
}

0 commit comments

Comments
 (0)