Skip to content

Commit 92213a5

Browse files
Primary keys validation
(cherry picked from commit 82d5991)
1 parent e2a41c3 commit 92213a5

File tree

13 files changed

+295
-4
lines changed

13 files changed

+295
-4
lines changed

pom.xml

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727
<junit.version>5.13.2</junit.version>
2828
<slf4j-simple.version>2.0.17</slf4j-simple.version>
2929
<apache-commons-collections4.version>4.5.0</apache-commons-collections4.version>
30+
<externalsortinginjava.version>0.6.2</externalsortinginjava.version>
3031
<maven-compiler-plugin.version>3.14.0</maven-compiler-plugin.version>
3132
<maven-dependency-plugin.version>3.8.1</maven-dependency-plugin.version>
3233
<maven-source-plugin.version>3.3.1</maven-source-plugin.version>
@@ -231,6 +232,13 @@
231232
<version>${tableschema-java-version}</version>
232233
</dependency>
233234

235+
<!-- Sorting -->
236+
<dependency>
237+
<groupId>com.google.code.externalsortinginjava</groupId>
238+
<artifactId>externalsortinginjava</artifactId>
239+
<version>${externalsortinginjava.version}</version>
240+
</dependency>
241+
234242
<!-- Unit Testing -->
235243
<dependency>
236244
<groupId>org.junit.jupiter</groupId>

src/main/java/io/frictionlessdata/datapackage/resource/AbstractResource.java

Lines changed: 69 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
import com.fasterxml.jackson.annotation.JsonProperty;
77
import com.fasterxml.jackson.core.JsonProcessingException;
88
import com.fasterxml.jackson.databind.ObjectMapper;
9+
import com.google.code.externalsorting.ExternalSort;
910
import io.frictionlessdata.datapackage.Dialect;
1011
import io.frictionlessdata.datapackage.JSONBase;
1112
import io.frictionlessdata.datapackage.Package;
@@ -14,10 +15,7 @@
1415
import io.frictionlessdata.datapackage.exceptions.DataPackageValidationException;
1516
import io.frictionlessdata.datapackage.fk.PackageForeignKey;
1617
import io.frictionlessdata.tableschema.Table;
17-
import io.frictionlessdata.tableschema.exception.ForeignKeyException;
18-
import io.frictionlessdata.tableschema.exception.JsonSerializingException;
19-
import io.frictionlessdata.tableschema.exception.TableIOException;
20-
import io.frictionlessdata.tableschema.exception.TypeInferringException;
18+
import io.frictionlessdata.tableschema.exception.*;
2119
import io.frictionlessdata.tableschema.field.Field;
2220
import io.frictionlessdata.tableschema.fk.ForeignKey;
2321
import io.frictionlessdata.tableschema.io.FileReference;
@@ -40,6 +38,7 @@
4038
import java.nio.file.Files;
4139
import java.nio.file.Path;
4240
import java.util.*;
41+
import java.util.stream.Collectors;
4342

4443
/**
4544
* Abstract base implementation of a Resource.
@@ -368,6 +367,7 @@ public List<Table> getTables() throws Exception {
368367
return tables;
369368
}
370369

370+
@Override
371371
public void checkRelations(Package pkg) {
372372
if (null != schema) {
373373
List<PackageForeignKey> fks = new ArrayList<>();
@@ -439,6 +439,71 @@ public void checkRelations(Package pkg) {
439439
}
440440
}
441441

442+
@Override
443+
public void checkPrimaryKeys() {
444+
if (null != schema) {
445+
Object pkObj = schema.getPrimaryKey();
446+
if (pkObj == null) {
447+
return; // no primary key defined
448+
}
449+
450+
// Normalize PK fields
451+
String[] pkFields;
452+
if (pkObj instanceof String) {
453+
pkFields = new String[]{(String) pkObj};
454+
} else if (pkObj instanceof String[]) {
455+
pkFields = (String[]) pkObj;
456+
} else {
457+
throw new PrimaryKeyException("Unsupported primary key type: " + pkObj.getClass());
458+
}
459+
460+
try {
461+
// Dump all keys to a temporary file
462+
Path tempFile = Files.createTempFile("pk-check", ".txt");
463+
try (BufferedWriter writer = Files.newBufferedWriter(tempFile)) {
464+
List<Object> data = this.getData(true, false, true, false);
465+
for (Object d : data) {
466+
Map<String, Object> row = (Map<String, Object>) d;
467+
String key = Arrays.stream(pkFields)
468+
.map(f -> String.valueOf(row.get(f)))
469+
.collect(Collectors.joining("\t"));
470+
writer.write(key);
471+
writer.newLine();
472+
}
473+
}
474+
475+
// Use ExternalSort to sort the file
476+
File inputFile = tempFile.toFile();
477+
File sortedFile = Files.createTempFile("pk-check-sorted", ".txt").toFile();
478+
479+
List<File> tempChunks = ExternalSort.sortInBatch(inputFile);
480+
ExternalSort.mergeSortedFiles(tempChunks, sortedFile);
481+
482+
// Scan sorted file line-by-line for duplicates
483+
try (BufferedReader reader = new BufferedReader(new FileReader(sortedFile))) {
484+
String prev = null;
485+
String line;
486+
while ((line = reader.readLine()) != null) {
487+
if (line.equals(prev)) {
488+
throw new PrimaryKeyException(
489+
"Primary key violation in resource '" + this.getName() +
490+
"': duplicate key " + line
491+
);
492+
}
493+
prev = line;
494+
}
495+
}
496+
497+
// Cleanup
498+
Files.deleteIfExists(tempFile);
499+
Files.deleteIfExists(sortedFile.toPath());
500+
501+
} catch (Exception e) {
502+
throw new PrimaryKeyException("Error validating primary keys: " + e.getMessage());
503+
}
504+
}
505+
}
506+
442507
public void validate(Package pkg) {
443508

444509
try {

src/main/java/io/frictionlessdata/datapackage/resource/Resource.java

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -364,6 +364,8 @@ static ResourceBuilder builder(String resourceName) {
364364

365365
void checkRelations(Package pkg) throws Exception;
366366

367+
void checkPrimaryKeys() throws Exception;
368+
367369
/**
368370
* Recreate a Resource object from a JSON descriptor, a base path to resolve relative file paths against
369371
* and a flag that tells us whether we are reading from inside a ZIP archive.
Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
1+
/*
2+
* Licensed under the Apache License, Version 2.0 (the "License");
3+
* you may not use this file except in compliance with the License.
4+
* You may obtain a copy of the License at
5+
*
6+
* http://www.apache.org/licenses/LICENSE-2.0
7+
*
8+
* Unless required by applicable law or agreed to in writing, software
9+
* distributed under the License is distributed on an "AS IS" BASIS,
10+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11+
* See the License for the specific language governing permissions and
12+
* limitations under the License.
13+
*/
14+
package io.frictionlessdata.datapackage;
15+
16+
import io.frictionlessdata.datapackage.resource.Resource;
17+
import io.frictionlessdata.tableschema.exception.PrimaryKeyException;
18+
import org.junit.jupiter.api.DisplayName;
19+
import org.junit.jupiter.api.Test;
20+
21+
import java.nio.file.Path;
22+
23+
import static org.junit.jupiter.api.Assertions.*;
24+
import static org.junit.jupiter.api.Assertions.assertEquals;
25+
26+
public class PrimaryKeysTest {
27+
28+
@Test
29+
@DisplayName("Test the uniqueness of simple primary keys - invalid case")
30+
void testPrimaryKeysUniqueInvalid() throws Exception {
31+
Path resourcePath = TestUtil.getResourcePath("/fixtures/datapackages/primary-keys/simple/primary_keys_csv_invalid.json");
32+
Package pkg = new Package(resourcePath, true);
33+
Resource teams = pkg.getResource("teams");
34+
35+
Throwable ex = assertThrows(Exception.class, teams::checkPrimaryKeys);
36+
assertInstanceOf(PrimaryKeyException.class, ex);
37+
assertEquals("Error validating primary keys: Primary key violation in resource 'teams': duplicate key 1", ex.getMessage());
38+
}
39+
40+
@Test
41+
@DisplayName("Test the uniqueness of simple primary keys - valid case")
42+
void testPrimaryKeysUniqueValid() throws Exception {
43+
Path resourcePath = TestUtil.getResourcePath("/fixtures/datapackages/primary-keys/simple/primary_keys_csv_valid.json");
44+
Package pkg = new Package(resourcePath, true);
45+
Resource teams = pkg.getResource("teams");
46+
47+
assertDoesNotThrow(teams::checkPrimaryKeys);
48+
}
49+
50+
@Test
51+
@DisplayName("Test the uniqueness of composite primary keys - invalid case")
52+
void testCompositePrimaryKeysUniqueInvalid() throws Exception {
53+
Path resourcePath = TestUtil.getResourcePath("/fixtures/datapackages/primary-keys/composite/primary_keys_csv_invalid.json");
54+
Package pkg = new Package(resourcePath, true);
55+
Resource teams = pkg.getResource("teams");
56+
57+
Throwable ex = assertThrows(Exception.class, teams::checkPrimaryKeys);
58+
assertInstanceOf(PrimaryKeyException.class, ex);
59+
assertEquals("Error validating primary keys: Primary key violation in resource 'teams': duplicate key UK\tLondon", ex.getMessage());
60+
}
61+
62+
@Test
63+
@DisplayName("Test the uniqueness of composite primary keys - valid case")
64+
void testCompositePrimaryKeysUniqueValid() throws Exception {
65+
Path resourcePath = TestUtil.getResourcePath("/fixtures/datapackages/primary-keys/composite/primary_keys_csv_valid.json");
66+
Package pkg = new Package(resourcePath, true);
67+
Resource teams = pkg.getResource("teams");
68+
69+
assertDoesNotThrow(teams::checkPrimaryKeys);
70+
}
71+
}

src/test/java/io/frictionlessdata/datapackage/resource/NonTabularResourceTest.java

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -736,6 +736,10 @@ public String getSerializationFormat() {
736736
public void checkRelations(Package aPackage) throws Exception {
737737
}
738738

739+
@Override
740+
public void checkPrimaryKeys() throws Exception {
741+
}
742+
739743
@Override
740744
public void validate(Package aPackage) {
741745
}
Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
{
2+
"name": "foreign-keys",
3+
"resources": [
4+
{
5+
"name": "teams",
6+
"profile": "tabular-data-resource",
7+
"encoding": "UTF-8",
8+
"format": "csv",
9+
"schema": {
10+
"fields": [
11+
{
12+
"name": "name",
13+
"type": "string"
14+
},
15+
{
16+
"name": "country",
17+
"type": "string"
18+
},
19+
{
20+
"name": "city",
21+
"type": "string"
22+
}
23+
],
24+
"primaryKey": ["country", "city"]
25+
},
26+
"path": "teams.csv"
27+
}
28+
]
29+
}
Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
{
2+
"name": "foreign-keys",
3+
"resources": [
4+
{
5+
"name": "teams",
6+
"profile": "tabular-data-resource",
7+
"encoding": "UTF-8",
8+
"format": "csv",
9+
"schema": {
10+
"fields": [
11+
{
12+
"name": "name",
13+
"type": "string"
14+
},
15+
{
16+
"name": "country",
17+
"type": "string"
18+
},
19+
{
20+
"name": "city",
21+
"type": "string"
22+
}
23+
],
24+
"primaryKey": ["country", "city"]
25+
},
26+
"path": "teams-valid.csv"
27+
}
28+
]
29+
}
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
name,country,city
2+
Arsenal,UK,London
3+
Real,Spain,Madrid
4+
Bayern,Germany,Munich
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
name,country,city
2+
Arsenal,UK,London
3+
Real,Spain,Madrid
4+
Bayern,Germany,Munich
5+
Chelsea,UK,London
Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
{
2+
"name": "foreign-keys",
3+
"resources": [
4+
{
5+
"name": "teams",
6+
"profile": "tabular-data-resource",
7+
"encoding": "UTF-8",
8+
"format": "csv",
9+
"schema": {
10+
"fields": [
11+
{
12+
"name": "id",
13+
"type": "integer",
14+
"constraints": {
15+
"required": true,
16+
"unique": true
17+
}
18+
},
19+
{
20+
"name": "name",
21+
"type": "string"
22+
},
23+
{
24+
"name": "city",
25+
"type": "string"
26+
}
27+
],
28+
"primaryKey": "id"
29+
},
30+
"path": "teams.csv"
31+
}
32+
]
33+
}

0 commit comments

Comments
 (0)