diff --git a/pom.xml b/pom.xml index 1587d56..441dc78 100644 --- a/pom.xml +++ b/pom.xml @@ -27,6 +27,7 @@ 5.13.2 2.0.17 4.5.0 + 0.6.2 3.14.0 3.8.1 3.3.1 @@ -231,6 +232,13 @@ ${tableschema-java-version} + + + com.google.code.externalsortinginjava + externalsortinginjava + ${externalsortinginjava.version} + + org.junit.jupiter diff --git a/src/main/java/io/frictionlessdata/datapackage/resource/AbstractResource.java b/src/main/java/io/frictionlessdata/datapackage/resource/AbstractResource.java index dd53e73..4d0e51c 100644 --- a/src/main/java/io/frictionlessdata/datapackage/resource/AbstractResource.java +++ b/src/main/java/io/frictionlessdata/datapackage/resource/AbstractResource.java @@ -6,6 +6,7 @@ import com.fasterxml.jackson.annotation.JsonProperty; import com.fasterxml.jackson.core.JsonProcessingException; import com.fasterxml.jackson.databind.ObjectMapper; +import com.google.code.externalsorting.ExternalSort; import io.frictionlessdata.datapackage.Dialect; import io.frictionlessdata.datapackage.JSONBase; import io.frictionlessdata.datapackage.Package; @@ -14,10 +15,7 @@ import io.frictionlessdata.datapackage.exceptions.DataPackageValidationException; import io.frictionlessdata.datapackage.fk.PackageForeignKey; import io.frictionlessdata.tableschema.Table; -import io.frictionlessdata.tableschema.exception.ForeignKeyException; -import io.frictionlessdata.tableschema.exception.JsonSerializingException; -import io.frictionlessdata.tableschema.exception.TableIOException; -import io.frictionlessdata.tableschema.exception.TypeInferringException; +import io.frictionlessdata.tableschema.exception.*; import io.frictionlessdata.tableschema.field.Field; import io.frictionlessdata.tableschema.fk.ForeignKey; import io.frictionlessdata.tableschema.io.FileReference; @@ -40,6 +38,7 @@ import java.nio.file.Files; import java.nio.file.Path; import java.util.*; +import java.util.stream.Collectors; /** * Abstract base implementation of a Resource. 
@@ -368,6 +367,7 @@ public List getTables() throws Exception {
         return tables;
     }
 
+    @Override
     public void checkRelations(Package pkg) {
         if (null != schema) {
             List fks = new ArrayList<>();
@@ -439,6 +439,113 @@ public void checkRelations(Package pkg) {
         }
     }
 
+    /**
+     * Checks that the primary key declared in the schema is unique across all rows.
+     *
+     * <p>Returns silently when there is no schema or the schema declares no primary key.
+     * Otherwise the key of every row is written to a temporary file (the parts of a composite
+     * key joined by a tab), the file is sorted with {@link ExternalSort}, and the sorted output
+     * is scanned for equal neighbouring lines. Both temporary files are deleted afterwards,
+     * whether or not the check succeeds.
+     *
+     * @throws PrimaryKeyException if the primary key is neither a {@code String} nor a
+     *         {@code String[]}, if two rows share a key, or if reading or sorting the data fails
+     */
+    @Override
+    public void checkPrimaryKeys() {
+        if (null == schema) {
+            return;
+        }
+        Object pkObj = schema.getPrimaryKey();
+        if (pkObj == null) {
+            return; // no primary key defined
+        }
+
+        // Normalize PK fields
+        String[] pkFields;
+        if (pkObj instanceof String) {
+            pkFields = new String[]{(String) pkObj};
+        } else if (pkObj instanceof String[]) {
+            pkFields = (String[]) pkObj;
+        } else {
+            throw new PrimaryKeyException("Unsupported primary key type: " + pkObj.getClass());
+        }
+
+        // Declared outside the try block so the finally clause can clean up on every exit path
+        Path keyFile = null;
+        Path sortedFile = null;
+        try {
+            // Dump all keys to a temporary file, one key per line
+            keyFile = Files.createTempFile("pk-check", ".txt");
+            try (BufferedWriter writer = Files.newBufferedWriter(keyFile, StandardCharsets.UTF_8)) {
+                List<?> rows = this.getData(true, false, true, false);
+                for (Object r : rows) {
+                    Map<?, ?> row = (Map<?, ?>) r;
+                    String key = Arrays.stream(pkFields)
+                            .map(f -> escapeKeyPart(String.valueOf(row.get(f))))
+                            .collect(Collectors.joining("\t"));
+                    writer.write(key);
+                    writer.newLine();
+                }
+            }
+
+            // Use ExternalSort to sort the file
+            // NOTE(review): these overloads presumably use the platform default charset while the
+            // key file is written and read as UTF-8 - confirm, and pass the charset if so
+            sortedFile = Files.createTempFile("pk-check-sorted", ".txt");
+            List<File> chunks = ExternalSort.sortInBatch(keyFile.toFile());
+            ExternalSort.mergeSortedFiles(chunks, sortedFile.toFile());
+
+            // Scan sorted file line-by-line for duplicates
+            try (BufferedReader reader = new BufferedReader(
+                    new FileReader(sortedFile.toFile(), StandardCharsets.UTF_8))) {
+                String prev = null;
+                String line;
+                while ((line = reader.readLine()) != null) {
+                    if (line.equals(prev)) {
+                        throw new PrimaryKeyException(
+                                "Primary key violation in resource '" + this.getName()
+                                        + "': duplicate key " + line
+                        );
+                    }
+                    prev = line;
+                }
+            }
+        } catch (Exception e) {
+            // Keep the established message format but preserve the original exception as the cause
+            PrimaryKeyException failure =
+                    new PrimaryKeyException("Error validating primary keys: " + e.getMessage());
+            failure.initCause(e);
+            throw failure;
+        } finally {
+            deleteQuietly(keyFile);
+            deleteQuietly(sortedFile);
+        }
+    }
+
+    /**
+     * Escapes backslash, tab, line feed and carriage return in one key part, so that a value
+     * containing them can neither imitate the part separator nor span several lines.
+     */
+    private static String escapeKeyPart(String part) {
+        return part.replace("\\", "\\\\")
+                .replace("\t", "\\t")
+                .replace("\n", "\\n")
+                .replace("\r", "\\r");
+    }
+
+    /** Deletes a temporary file if present; a failed cleanup must not mask the validation result. */
+    private static void deleteQuietly(Path file) {
+        if (file == null) {
+            return;
+        }
+        try {
+            Files.deleteIfExists(file);
+        } catch (java.io.IOException ignored) {
+            // best-effort cleanup of a temporary file
+        }
+    }
+
     public void validate(Package pkg) {
 
         try {
diff --git a/src/main/java/io/frictionlessdata/datapackage/resource/Resource.java b/src/main/java/io/frictionlessdata/datapackage/resource/Resource.java
index fe5d73f..0544c73 100644
--- a/src/main/java/io/frictionlessdata/datapackage/resource/Resource.java
+++ b/src/main/java/io/frictionlessdata/datapackage/resource/Resource.java
@@ -364,6 +364,8 @@ static ResourceBuilder builder(String resourceName) {
 
     void checkRelations(Package pkg) throws Exception;
 
+    void checkPrimaryKeys() throws Exception;
+
     /**
      * Recreate a Resource object from a JSON descriptor, a base path to resolve relative file paths against
      * and a flag that tells us whether we are reading from inside a ZIP archive.
diff --git a/src/test/java/io/frictionlessdata/datapackage/PrimaryKeysTest.java b/src/test/java/io/frictionlessdata/datapackage/PrimaryKeysTest.java
new file mode 100644
index 0000000..4e02d61
--- /dev/null
+++ b/src/test/java/io/frictionlessdata/datapackage/PrimaryKeysTest.java
@@ -0,0 +1,71 @@
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package io.frictionlessdata.datapackage;
+
+import io.frictionlessdata.datapackage.resource.Resource;
+import io.frictionlessdata.tableschema.exception.PrimaryKeyException;
+import org.junit.jupiter.api.DisplayName;
+import org.junit.jupiter.api.Test;
+
+import java.nio.file.Path;
+
+import static org.junit.jupiter.api.Assertions.assertDoesNotThrow;
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertThrows;
+
+/** Verifies primary-key uniqueness checks for single-field and composite keys. */
+public class PrimaryKeysTest {
+    // Common prefix of the primary-key fixture descriptors used by every test below
+    private static final String FIXTURES = "/fixtures/datapackages/primary-keys/";
+
+    /** Loads the given fixture descriptor and returns its "teams" resource. */
+    private static Resource loadTeams(String descriptor) throws Exception {
+        Path resourcePath = TestUtil.getResourcePath(FIXTURES + descriptor);
+        return new Package(resourcePath, true).getResource("teams");
+    }
+
+    @Test
+    @DisplayName("Test the uniqueness of simple primary keys - invalid case")
+    void testPrimaryKeysUniqueInvalid() throws Exception {
+        Resource teams = loadTeams("simple/primary_keys_csv_invalid.json");
+
+        PrimaryKeyException ex = assertThrows(PrimaryKeyException.class, teams::checkPrimaryKeys);
+        assertEquals("Error validating primary keys: Primary key violation in resource 'teams': duplicate key 1", ex.getMessage());
+    }
+
+    @Test
+    @DisplayName("Test the uniqueness of simple primary keys - valid case")
+    void testPrimaryKeysUniqueValid() throws Exception {
+        Resource teams = loadTeams("simple/primary_keys_csv_valid.json");
+
+        assertDoesNotThrow(teams::checkPrimaryKeys);
+    }
+
+    @Test
+    @DisplayName("Test the uniqueness of composite primary keys - invalid case")
+    void testCompositePrimaryKeysUniqueInvalid() throws Exception {
+        Resource teams = loadTeams("composite/primary_keys_csv_invalid.json");
+
+        PrimaryKeyException ex = assertThrows(PrimaryKeyException.class, teams::checkPrimaryKeys);
+        assertEquals("Error validating primary keys: Primary key violation in resource 'teams': duplicate key UK\tLondon", ex.getMessage());
+    }
+
+    @Test
+    @DisplayName("Test the uniqueness of composite primary keys - valid case")
+    void testCompositePrimaryKeysUniqueValid() throws Exception {
+        Resource teams = loadTeams("composite/primary_keys_csv_valid.json");
+
+        assertDoesNotThrow(teams::checkPrimaryKeys);
+    }
+}
diff --git a/src/test/java/io/frictionlessdata/datapackage/resource/NonTabularResourceTest.java b/src/test/java/io/frictionlessdata/datapackage/resource/NonTabularResourceTest.java
index 6f01c6d..384a186 100644
--- a/src/test/java/io/frictionlessdata/datapackage/resource/NonTabularResourceTest.java
+++ b/src/test/java/io/frictionlessdata/datapackage/resource/NonTabularResourceTest.java
@@ -736,6 +736,10 @@ public String getSerializationFormat() {
         public void checkRelations(Package aPackage) throws Exception {
         }
 
+        @Override
+        public void checkPrimaryKeys() throws Exception {
+        }
+
         @Override
         public void validate(Package aPackage) {
         }
diff --git a/src/test/resources/fixtures/datapackages/primary-keys/composite/primary_keys_csv_invalid.json b/src/test/resources/fixtures/datapackages/primary-keys/composite/primary_keys_csv_invalid.json
new file mode 100644
index 0000000..4264323
--- /dev/null
+++ b/src/test/resources/fixtures/datapackages/primary-keys/composite/primary_keys_csv_invalid.json
@@ -0,0 +1,29 @@
+{
+  "name": "foreign-keys",
+  "resources": [
+    {
+      "name": "teams",
+      "profile": "tabular-data-resource",
+      "encoding": "UTF-8",
+      "format": "csv",
+      "schema": {
+        "fields": [
+          {
+            "name": "name",
+            "type": "string"
+          },
+          {
+            "name": "country",
+            "type": "string"
+          },
+          {
+            "name": "city",
+            "type": "string"
+          }
+        ],
+        "primaryKey": ["country", "city"]
+      },
+      "path": "teams.csv"
+    }
+  ]
+}
\ No newline at end of file
diff --git 
a/src/test/resources/fixtures/datapackages/primary-keys/composite/primary_keys_csv_valid.json b/src/test/resources/fixtures/datapackages/primary-keys/composite/primary_keys_csv_valid.json new file mode 100644 index 0000000..07c1241 --- /dev/null +++ b/src/test/resources/fixtures/datapackages/primary-keys/composite/primary_keys_csv_valid.json @@ -0,0 +1,29 @@ +{ + "name": "foreign-keys", + "resources": [ + { + "name": "teams", + "profile": "tabular-data-resource", + "encoding": "UTF-8", + "format": "csv", + "schema": { + "fields": [ + { + "name": "name", + "type": "string" + }, + { + "name": "country", + "type": "string" + }, + { + "name": "city", + "type": "string" + } + ], + "primaryKey": ["country", "city"] + }, + "path": "teams-valid.csv" + } + ] +} \ No newline at end of file diff --git a/src/test/resources/fixtures/datapackages/primary-keys/composite/teams-valid.csv b/src/test/resources/fixtures/datapackages/primary-keys/composite/teams-valid.csv new file mode 100644 index 0000000..8a8159f --- /dev/null +++ b/src/test/resources/fixtures/datapackages/primary-keys/composite/teams-valid.csv @@ -0,0 +1,4 @@ +name,country,city +Arsenal,UK,London +Real,Spain,Madrid +Bayern,Germany,Munich \ No newline at end of file diff --git a/src/test/resources/fixtures/datapackages/primary-keys/composite/teams.csv b/src/test/resources/fixtures/datapackages/primary-keys/composite/teams.csv new file mode 100644 index 0000000..233f276 --- /dev/null +++ b/src/test/resources/fixtures/datapackages/primary-keys/composite/teams.csv @@ -0,0 +1,5 @@ +name,country,city +Arsenal,UK,London +Real,Spain,Madrid +Bayern,Germany,Munich +Chelsea,UK,London \ No newline at end of file diff --git a/src/test/resources/fixtures/datapackages/primary-keys/simple/primary_keys_csv_invalid.json b/src/test/resources/fixtures/datapackages/primary-keys/simple/primary_keys_csv_invalid.json new file mode 100644 index 0000000..d51b947 --- /dev/null +++ 
b/src/test/resources/fixtures/datapackages/primary-keys/simple/primary_keys_csv_invalid.json @@ -0,0 +1,33 @@ +{ + "name": "foreign-keys", + "resources": [ + { + "name": "teams", + "profile": "tabular-data-resource", + "encoding": "UTF-8", + "format": "csv", + "schema": { + "fields": [ + { + "name": "id", + "type": "integer", + "constraints": { + "required": true, + "unique": true + } + }, + { + "name": "name", + "type": "string" + }, + { + "name": "city", + "type": "string" + } + ], + "primaryKey": "id" + }, + "path": "teams.csv" + } + ] +} \ No newline at end of file diff --git a/src/test/resources/fixtures/datapackages/primary-keys/simple/primary_keys_csv_valid.json b/src/test/resources/fixtures/datapackages/primary-keys/simple/primary_keys_csv_valid.json new file mode 100644 index 0000000..b2ad306 --- /dev/null +++ b/src/test/resources/fixtures/datapackages/primary-keys/simple/primary_keys_csv_valid.json @@ -0,0 +1,33 @@ +{ + "name": "foreign-keys", + "resources": [ + { + "name": "teams", + "profile": "tabular-data-resource", + "encoding": "UTF-8", + "format": "csv", + "schema": { + "fields": [ + { + "name": "id", + "type": "integer", + "constraints": { + "required": true, + "unique": true + } + }, + { + "name": "name", + "type": "string" + }, + { + "name": "city", + "type": "string" + } + ], + "primaryKey": "id" + }, + "path": "teams-valid.csv" + } + ] +} \ No newline at end of file diff --git a/src/test/resources/fixtures/datapackages/primary-keys/simple/teams-valid.csv b/src/test/resources/fixtures/datapackages/primary-keys/simple/teams-valid.csv new file mode 100644 index 0000000..1d69a7d --- /dev/null +++ b/src/test/resources/fixtures/datapackages/primary-keys/simple/teams-valid.csv @@ -0,0 +1,4 @@ +id,name,city +1,Arsenal,London +2,Real,Madrid +3,Bayern,Munich \ No newline at end of file diff --git a/src/test/resources/fixtures/datapackages/primary-keys/simple/teams.csv b/src/test/resources/fixtures/datapackages/primary-keys/simple/teams.csv new file mode 
100644 index 0000000..07ea407 --- /dev/null +++ b/src/test/resources/fixtures/datapackages/primary-keys/simple/teams.csv @@ -0,0 +1,4 @@ +id,name,city +1,Arsenal,London +1,Real,Madrid +1,Bayern,Munich \ No newline at end of file