Skip to content

Commit ef628b4

Browse files
authored
Use validate-iri.js to validate IRIs
- By default, the "pragmatic" mode is used (scheme validation + no invalid characters according to RDF Turtle).
- The `iriValidationStrategy` option allows customizing the validation strategy.

Benchmark using the GeoSpecies Knowledge Base (1.8M triples):
- no IRI validation: 8.338s
- pragmatic IRI validation: 9.116s
- full IRI validation: 12.053s
1 parent 5213223 commit ef628b4

File tree

5 files changed

+42
-63
lines changed

5 files changed

+42
-63
lines changed

README.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -90,12 +90,13 @@ myParser.import(myTextStream)
9090
Optionally, the following parameters can be set in the `RdfXmlParser` constructor:
9191

9292
* `dataFactory`: A custom [RDFJS DataFactory](http://rdf.js.org/#datafactory-interface) to construct terms and triples. _(Default: `require('@rdfjs/data-model')`)_
93-
* `baseIRI`: An initital default base IRI. _(Default: `''`)_
93+
* `baseIRI`: An initial default base IRI. _(Default: `''`)_
9494
* `defaultGraph`: The default graph for constructing [quads](http://rdf.js.org/#dom-datafactory-quad). _(Default: `defaultGraph()`)_
9595
* `strict`: If the internal SAX parser should parse XML in strict mode, and error if it is invalid. _(Default: `false`)_
9696
* `trackPosition`: If the internal position (line, column) should be tracked and emitted in error messages. _(Default: `false`)_
9797
* `allowDuplicateRdfIds`: By default [multiple occurrences of the same `rdf:ID` value are not allowed](https://www.w3.org/TR/rdf-syntax-grammar/#section-Syntax-ID-xml-base). By setting this option to `true`, this uniqueness check can be disabled. _(Default: `false`)_
9898
* `validateUri`: By default, the parser validates each URI. _(Default: `true`)_
99+
* `iriValidationStrategy`: Allows customizing the IRI validation strategy via the `IriValidationStrategy` enumeration. _(Default: `IriValidationStrategy.Pragmatic`)_
99100

100101
```javascript
101102
new RdfXmlParser({

lib/RdfXmlParser.ts

Lines changed: 14 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -2,15 +2,12 @@ import * as RDF from "@rdfjs/types";
22
import {resolve} from "relative-to-absolute-iri";
33
import {SaxesParser, SaxesTagNS} from "saxes";
44
import {PassThrough, Transform} from "readable-stream";
5-
import EventEmitter = NodeJS.EventEmitter;
65
import {ParseError} from "./ParseError";
76
import {DataFactory} from "rdf-data-factory";
7+
import {IriValidationStrategy, validateIri} from "validate-iri";
8+
import EventEmitter = NodeJS.EventEmitter;
89

910
export class RdfXmlParser extends Transform implements RDF.Sink<EventEmitter, RDF.Stream> {
10-
11-
// Regex for valid IRIs
12-
public static readonly IRI_REGEX: RegExp = /^([A-Za-z][A-Za-z0-9+-.]*):[^ "<>{}|\\\[\]`]*$/;
13-
1411
public static readonly MIME_TYPE = 'application/rdf+xml';
1512

1613
public static readonly RDF = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#';
@@ -51,6 +48,7 @@ export class RdfXmlParser extends Transform implements RDF.Sink<EventEmitter, RD
5148
private readonly allowDuplicateRdfIds?: boolean;
5249
private readonly saxParser: SaxesParser;
5350
private readonly validateUri: boolean;
51+
private readonly iriValidationStrategy: IriValidationStrategy;
5452

5553
private readonly activeTagStack: IActiveTag[] = [];
5654
private readonly nodeIds: {[id: string]: boolean} = {};
@@ -74,25 +72,15 @@ export class RdfXmlParser extends Transform implements RDF.Sink<EventEmitter, RD
7472
if (this.validateUri !== false) {
7573
this.validateUri = true;
7674
}
75+
if (!this.iriValidationStrategy) {
76+
this.iriValidationStrategy = this.validateUri ? IriValidationStrategy.Pragmatic : IriValidationStrategy.None;
77+
}
7778

7879
this.saxParser = new SaxesParser({ xmlns: true, position: this.trackPosition });
7980

8081
this.attachSaxListeners();
8182
}
8283

83-
/**
84-
* Check if the given IRI is valid.
85-
* @param {string} iri A potential IRI.
86-
* @return {boolean} If the given IRI is valid.
87-
*/
88-
public static isValidIri(iri: string): boolean {
89-
return RdfXmlParser.IRI_REGEX.test(iri);
90-
}
91-
92-
get uriValidationEnabled() {
93-
return this.validateUri;
94-
}
95-
9684
/**
9785
* Parses the given text stream into a quad stream.
9886
* @param {NodeJS.EventEmitter} stream A text stream.
@@ -148,8 +136,9 @@ export class RdfXmlParser extends Transform implements RDF.Sink<EventEmitter, RD
148136
*/
149137
public uriToNamedNode(uri: string): RDF.NamedNode {
150138
// Validate URI
151-
if (this.uriValidationEnabled && !RdfXmlParser.isValidIri(uri)) {
152-
throw this.newParseError(`Invalid URI: ${uri}`);
139+
const uriValidationResult = validateIri(uri, this.iriValidationStrategy);
140+
if (uriValidationResult instanceof Error) {
141+
throw this.newParseError(uriValidationResult.message);
153142
}
154143
return this.dataFactory.namedNode(uri);
155144
}
@@ -701,6 +690,11 @@ export interface IRdfXmlParserArgs {
701690
* By default, it is equal to true.
702691
*/
703692
validateUri?: boolean;
693+
/**
694+
* Allows customizing the IRI validation strategy via the `IriValidationStrategy` enumeration.
695+
* By default, the "pragmatic" strategy is used.
696+
*/
697+
iriValidationStrategy?: IriValidationStrategy;
704698
}
705699

706700
export interface IActiveTag {

package.json

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -95,7 +95,8 @@
9595
"rdf-data-factory": "^1.1.0",
9696
"relative-to-absolute-iri": "^1.0.0",
9797
"readable-stream": "^4.0.0",
98-
"saxes": "^6.0.0"
98+
"saxes": "^6.0.0",
99+
"validate-iri": "^1.0.0"
99100
},
100101
"sideEffects": false
101102
}

test/RdfXmlParser-test.ts

Lines changed: 19 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ import {DataFactory} from "rdf-data-factory";
77
const streamifyString = require('streamify-string');
88
const streamifyArray = require('streamify-array');
99
import arrayifyStream from 'arrayify-stream';
10+
import {IriValidationStrategy} from "validate-iri";
1011
const quad = require('rdf-quad');
1112
const DF = new DataFactory();
1213

@@ -26,7 +27,7 @@ describe('RdfXmlParser', () => {
2627
expect((<any> instance).defaultGraph).toBe(DF.defaultGraph());
2728
expect((<any> instance).saxParser).toBeInstanceOf(SaxesParser);
2829
expect((<any> instance).validateUri).toBeTruthy();
29-
expect(instance.uriValidationEnabled).toBeTruthy();
30+
expect((<any> instance).iriValidationStrategy).toEqual(IriValidationStrategy.Pragmatic);
3031
});
3132

3233
it('should be constructable with empty args', () => {
@@ -37,7 +38,7 @@ describe('RdfXmlParser', () => {
3738
expect((<any> instance).defaultGraph).toBe(DF.defaultGraph());
3839
expect((<any> instance).saxParser).toBeInstanceOf(SaxesParser);
3940
expect((<any> instance).validateUri).toBeTruthy();
40-
expect(instance.uriValidationEnabled).toBeTruthy();
41+
expect((<any> instance).iriValidationStrategy).toEqual(IriValidationStrategy.Pragmatic);
4142
});
4243

4344
it('should be constructable with args with a custom data factory', () => {
@@ -84,41 +85,7 @@ describe('RdfXmlParser', () => {
8485
const instance = new RdfXmlParser({ validateUri: false });
8586
expect(instance).toBeInstanceOf(RdfXmlParser);
8687
expect((<any> instance).validateUri).toBeFalsy();
87-
expect(instance.uriValidationEnabled).toBeFalsy();
88-
});
89-
90-
describe('#isValidIri', () => {
91-
it('should be false for null', async () => {
92-
expect(RdfXmlParser.isValidIri(null)).toBeFalsy();
93-
});
94-
95-
it('should be false for an empty string', async () => {
96-
expect(RdfXmlParser.isValidIri('')).toBeFalsy();
97-
});
98-
99-
it('should be false for an abc', async () => {
100-
expect(RdfXmlParser.isValidIri('abc')).toBeFalsy();
101-
});
102-
103-
it('should be true for an abc:def', async () => {
104-
expect(RdfXmlParser.isValidIri('abc:def')).toBeTruthy();
105-
});
106-
107-
it('should be true for an http://google.com', async () => {
108-
expect(RdfXmlParser.isValidIri('http://google.com')).toBeTruthy();
109-
});
110-
111-
it('should be false for an http://google.com<', async () => {
112-
expect(RdfXmlParser.isValidIri('http://google.com<')).toBeFalsy();
113-
});
114-
115-
it('should be false for an http://google .com', async () => {
116-
expect(RdfXmlParser.isValidIri('http://google .com')).toBeFalsy();
117-
});
118-
119-
it('should be false for an invalid scheme', async () => {
120-
expect(RdfXmlParser.isValidIri('%http://google.com')).toBeFalsy();
121-
});
88+
expect((<any> instance).iriValidationStrategy).toEqual(IriValidationStrategy.None);
12289
});
12390

12491
describe('a default instance', () => {
@@ -162,12 +129,12 @@ abc`)).rejects.toBeTruthy();
162129

163130
it('create error on a URI with an invalid scheme', () => {
164131
expect(() => parser.valueToUri('%https://example.com/', {}))
165-
.toThrow(new Error('Invalid URI: %https://example.com/'));
132+
.toThrow(new Error('Invalid IRI according to RDF Turtle: \'%https://example.com/\''));
166133
});
167134

168135
it('create error on a URI with an invalid character', () => {
169136
expect(() => parser.valueToUri('https://example.com/<', {}))
170-
.toThrow(new Error('Invalid URI: https://example.com/<'));
137+
.toThrow(new Error('Invalid IRI according to RDF Turtle: \'https://example.com/<\''));
171138
});
172139

173140
it('create a named node from a relative URI when a baseIRI is given', () => {
@@ -397,7 +364,7 @@ abc`)).rejects.toBeTruthy();
397364
<ex:prop>1</ex:prop>
398365
<rdf:Description>
399366
</rdf:RDF>`)).rejects.toEqual(
400-
new Error('Invalid URI: #abc'));
367+
new Error('Invalid IRI according to RDF Turtle: \'#abc\''));
401368
});
402369

403370
// 2.10
@@ -782,7 +749,7 @@ abc`)).rejects.toBeTruthy();
782749
<ns1:b rdf:resource="_:bnode"/>
783750
</rdf:Description>
784751
785-
</rdf:RDF>`)).rejects.toEqual(new Error('Invalid URI: _:bnode'));
752+
</rdf:RDF>`)).rejects.toEqual(new Error('Invalid IRI according to RDF Turtle: \'_:bnode\''));
786753
});
787754

788755
// Illegal XML name production
@@ -2373,6 +2340,17 @@ abc`)).rejects.toBeTruthy();
23732340
expect(streamParser.read(1)).toBeFalsy();
23742341
expect(streamParser.writable).toBeFalsy();
23752342
});
2343+
2344+
2345+
it('should properly support XML encoded URIs', async () => {
2346+
expect(parse(parser, `<?xml version="1.0" encoding="utf-8" ?>
2347+
<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
2348+
xmlns:ns0="b:">
2349+
<rdf:Description rdf:about="a:&#xA;">
2350+
<ns0:b rdf:resource="c:c"/>
2351+
</rdf:Description>
2352+
</rdf:RDF>`)).rejects.toBeTruthy();
2353+
});
23762354
});
23772355
});
23782356

yarn.lock

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3981,6 +3981,11 @@ v8-to-istanbul@^9.0.1:
39813981
"@types/istanbul-lib-coverage" "^2.0.1"
39823982
convert-source-map "^1.6.0"
39833983

3984+
validate-iri@^1.0.0:
3985+
version "1.0.0"
3986+
resolved "https://registry.yarnpkg.com/validate-iri/-/validate-iri-1.0.0.tgz#a109600246e8a7515ecd238cdcddb7ca54b95d2d"
3987+
integrity sha512-4htbVgPOAS8RihVeyp/Pq5bnpKKhA2FcpsYTTION9rejFSZuIUX80SzO/+WMtSR3OVV1NFJx3J6DjrAZw83eCA==
3988+
39843989
validate-npm-package-license@^3.0.1:
39853990
version "3.0.4"
39863991
resolved "https://registry.yarnpkg.com/validate-npm-package-license/-/validate-npm-package-license-3.0.4.tgz#fc91f6b9c7ba15c857f4cb2c5defeec39d4f410a"

0 commit comments

Comments
 (0)