diff --git a/src/Anonymization/Anonymizer/AnonymizerRegistry.php b/src/Anonymization/Anonymizer/AnonymizerRegistry.php index d4b9b8b8..11db9df7 100644 --- a/src/Anonymization/Anonymizer/AnonymizerRegistry.php +++ b/src/Anonymization/Anonymizer/AnonymizerRegistry.php @@ -6,6 +6,14 @@ use Composer\InstalledVersions; use MakinaCorpus\DbToolsBundle\Anonymization\Config\AnonymizerConfig; +use MakinaCorpus\DbToolsBundle\Anonymization\Pack\PackAnonymizer; +use MakinaCorpus\DbToolsBundle\Anonymization\Pack\PackEnumAnonymizer; +use MakinaCorpus\DbToolsBundle\Anonymization\Pack\PackEnumGeneratedAnonymizer; +use MakinaCorpus\DbToolsBundle\Anonymization\Pack\PackFileEnumAnonymizer; +use MakinaCorpus\DbToolsBundle\Anonymization\Pack\PackFileMultipleColumnAnonymizer; +use MakinaCorpus\DbToolsBundle\Anonymization\Pack\PackMultipleColumnAnonymizer; +use MakinaCorpus\DbToolsBundle\Anonymization\Pack\PackMultipleColumnGeneratedAnonymizer; +use MakinaCorpus\DbToolsBundle\Anonymization\Pack\PackRegistry; use MakinaCorpus\DbToolsBundle\Attribute\AsAnonymizer; use MakinaCorpus\QueryBuilder\DatabaseSession; @@ -32,15 +40,33 @@ class AnonymizerRegistry Core\StringPatternAnonymizer::class, ]; + private PackRegistry $packRegistry; + /** @var array */ private ?array $classes = null; + /** @var array */ private ?array $metadata = null; + + /** + * Paths where to lookup for custom anonymizers. + * + * @var array + */ private array $paths = []; - public function __construct(?array $paths = null) + /** + * Pack filenames where to lookup for PHP-less packs. + * + * @var array + */ + private array $packs = []; + + public function __construct(?array $paths = null, ?array $packs = null) { $this->addPath($paths ?? []); + $this->addPack($packs ?? []); + $this->packRegistry = new PackRegistry(); } /** @@ -51,6 +77,14 @@ public function addPath(array $paths): void $this->paths = \array_unique(\array_merge($this->paths, $paths)); } + /** + * Add PHP-less configuration file pack. 
+     */
+    public function addPack(array $packs): void
+    {
+        $this->packs = \array_unique(\array_merge($this->packs, $packs));
+    }
+
     /**
      * Get all registered anonymizers classe names.
      *
@@ -72,10 +106,19 @@ public function createAnonymizer(
         Context $context,
         DatabaseSession $databaseSession,
     ): AbstractAnonymizer {
-        $className = $this->getAnonymizerClass($name);
-
-        $ret = new $className($config->table, $config->targetName, $databaseSession, $context, $config->options);
-        \assert($ret instanceof AbstractAnonymizer);
+        if ($this->packRegistry->hasPack($name)) {
+            $ret = $this->createAnonymizerFromPack(
+                $this->packRegistry->getPackAnonymizer($name),
+                $config,
+                $context,
+                $databaseSession,
+            );
+        } else {
+            $className = $this->getAnonymizerClass($name);
+
+            $ret = new $className($config->table, $config->targetName, $databaseSession, $context, $config->options);
+            \assert($ret instanceof AbstractAnonymizer);
+        }
 
         if ($ret instanceof WithAnonymizerRegistry) {
             $ret->setAnonymizerRegistry($this);
@@ -84,6 +127,105 @@ public function createAnonymizer(
         return $ret;
     }
 
+    /**
+     * Create anonymizer instance from pack.
+     *
+     * Dispatches on the concrete pack anonymizer class and instantiates the
+     * matching core anonymizer with user and pack options merged.
+     *
+     * @throws \LogicException
+     *   When the given pack anonymizer type is not implemented yet.
+     */
+    private function createAnonymizerFromPack(
+        PackAnonymizer $packAnonymizer,
+        AnonymizerConfig $config,
+        Context $context,
+        DatabaseSession $databaseSession
+    ): AbstractAnonymizer {
+        // Merge incoming user options with options from the pack.
+        // Pack given options will override the user one.
+        $options = $config->options->with($packAnonymizer->options->all());
+
+        // Anonymizer from pack factory. Hardcoded for now.
+        if ($packAnonymizer instanceof PackEnumAnonymizer) {
+            return new Core\StringAnonymizer(
+                $config->table,
+                $config->targetName,
+                $databaseSession,
+                $context,
+                // @todo Convert data to an array if an iterable was
+                //   here. Later, change getSample() signature of
+                //   AbstractEnumAnonymizer to accept any iterable.
+                $options->with([
+                    'sample' => \is_array($packAnonymizer->data) ? $packAnonymizer->data : \iterator_to_array($packAnonymizer->data),
+                ]),
+            );
+        }
+
+        if ($packAnonymizer instanceof PackMultipleColumnAnonymizer) {
+            return new Core\MultipleColumnAnonymizer(
+                $config->table,
+                $config->targetName,
+                $databaseSession,
+                $context,
+                // @todo Convert data to an array if an iterable was
+                //   here. Later, change getSample() signature of
+                //   AbstractEnumAnonymizer to accept any iterable.
+                $options->with([
+                    'columns' => $packAnonymizer->columns,
+                    'sample' => \is_array($packAnonymizer->data) ? $packAnonymizer->data : \iterator_to_array($packAnonymizer->data),
+                ]),
+            );
+        }
+
+        if ($packAnonymizer instanceof PackEnumGeneratedAnonymizer) {
+            if (1 !== \count($packAnonymizer->pattern)) {
+                // @todo
+                throw new \LogicException("Not implemented yet: pattern anonymizer does not support multiple patterns yet.");
+            }
+
+            return new Core\StringPatternAnonymizer(
+                $config->table,
+                $config->targetName,
+                $databaseSession,
+                $context,
+                $options->with([
+                    'pattern' => $packAnonymizer->pattern[0],
+                ]),
+            );
+        }
+
+        if ($packAnonymizer instanceof PackMultipleColumnGeneratedAnonymizer) {
+            // @todo
+            throw new \LogicException("Not implemented yet: missing arbitrary column generator anonymizer.");
+        }
+
+        if ($packAnonymizer instanceof PackFileEnumAnonymizer) {
+            return new Core\FileEnumAnonymizer(
+                $config->table,
+                $config->targetName,
+                $databaseSession,
+                $context,
+                $options->with(['source' => $packAnonymizer->filename]),
+            );
+        }
+
+        if ($packAnonymizer instanceof PackFileMultipleColumnAnonymizer) {
+            return new Core\FileMultipleColumnAnonymizer(
+                $config->table,
+                $config->targetName,
+                $databaseSession,
+                $context,
+                $options->with([
+                    'columns' => $packAnonymizer->columns,
+                    'source' => $packAnonymizer->filename,
+                ]),
+            );
+        }
+
+        throw new \LogicException(\sprintf("Pack anonymizer with class '%s' is not implement yet.", \get_class($packAnonymizer)));
+    }
+
     /**
      * Get anonymizer metadata.
*/ @@ -173,6 +308,12 @@ private function initialize(): void } } } + + if ($this->packs) { + foreach ($this->packs as $filename) { + $this->packRegistry->addPack($filename); + } + } } /** @@ -214,8 +355,10 @@ private function locatePacks(): void $path = $directory . '/src/Anonymizer/'; if (\is_dir($path)) { $this->addPath([$path]); + } elseif (\file_exists($path . '/db_tools.pack.yaml')) { + $this->addPack([$path . '/db_tools.pack.yaml']); } else { - \trigger_error(\sprintf("Anonymizers pack '%s' in '%s' as no 'src/Anonymizer/' directory and is thus not usable.", $package, $directory), \E_USER_ERROR); + \trigger_error(\sprintf("Anonymizers pack '%s' in '%s' as no 'src/Anonymizer/' directory nor 'db_tools.pack.yaml' file and is thus not usable.", $package, $directory), \E_USER_ERROR); } } } diff --git a/src/Anonymization/Anonymizer/Core/FileMultipleColumnAnonymizer.php b/src/Anonymization/Anonymizer/Core/FileMultipleColumnAnonymizer.php index f97a77a1..b199e05f 100644 --- a/src/Anonymization/Anonymizer/Core/FileMultipleColumnAnonymizer.php +++ b/src/Anonymization/Anonymizer/Core/FileMultipleColumnAnonymizer.php @@ -29,6 +29,8 @@ character (default is ','). - 'file_skip_header': when reading any file, set this to true to skip the first line (default is false). + All other options are key-value pairs, keys are column names as defined in the + 'columns' option, values are targetted database column names to anonymize. TXT )] class FileMultipleColumnAnonymizer extends AbstractMultipleColumnAnonymizer diff --git a/src/Anonymization/Anonymizer/Core/StringPatternAnonymizer.php b/src/Anonymization/Anonymizer/Core/StringPatternAnonymizer.php index fdb9f99a..b556b23b 100644 --- a/src/Anonymization/Anonymizer/Core/StringPatternAnonymizer.php +++ b/src/Anonymization/Anonymizer/Core/StringPatternAnonymizer.php @@ -110,12 +110,10 @@ public function createAnonymizeExpression(Update $update): Expression // so we need to create as many instances as we have ranges. 
$childAnonymizer = $this->getAnonymizer( 'integer', - new Options( - [ + new Options([ 'min' => $part->start, 'max' => $part->stop, - ], - ) + ]), ); \assert($childAnonymizer instanceof AbstractSingleColumnAnonymizer); diff --git a/src/Anonymization/Pack/Pack.php b/src/Anonymization/Pack/Pack.php new file mode 100644 index 00000000..7e6237fd --- /dev/null +++ b/src/Anonymization/Pack/Pack.php @@ -0,0 +1,212 @@ + */ + private array $anonymizers; + + public function __construct( + public readonly string $id, + public readonly string $directory, + ) {} + + /** @internal Add anonymizer during pack definition parsing. */ + public function addPackAnonymizer(PackAnonymizer $anonymizer): void + { + $this->anonymizers[$anonymizer->id] = $anonymizer; + } + + /** Get anonymizer description. */ + public function getPackAnonymizer(string $id): PackAnonymizer + { + return $this->anonymizers[$id] ?? throw new ConfigurationException(\sprintf("Anonymizer '%s.%s' does not exist.", $this->id, $id)); + } + + /** + * Create pack from file. + */ + public static function fromFile(string $filename): Pack + { + $input = match (FileReader::getFileExtension($filename)) { + 'yaml' => Yaml::parseFile($filename), + default => throw new ConfigurationException("Unsupported pack file type."), + }; + + return self::fromArray($input, \dirname($filename)); + } + + /** + * Create pack from array input. 
+ */ + public static function fromArray(array $input, string $directory): Pack + { + if (empty($input['name']) || !\is_string($input['name'])) { + throw new ConfigurationException(\sprintf("Missing 'name' property in pack description in folder: %s.", $directory)); + } + if (empty($input['data']) || !\is_array($input['data'])) { + throw new ConfigurationException(\sprintf("Missing or empty 'data' property in pack description in folder: %s.", $directory)); + } + + $ret = new Pack($input['name'], $directory); + + foreach ($input['data'] as $id => $anonymizerInput) { + $ret->addPackAnonymizer(self::createAnonymizerFromArray($ret->id, $directory, $id, $anonymizerInput)); + } + + return $ret; + } + + /** + * Create pack anonymizer from array input. + */ + private static function createAnonymizerFromArray(string $packId, string $directory, string $id, array $input): PackAnonymizer + { + $completeId = $packId . '.' . $id; + + $options = new Options(); + + $fileOptions = [ + 'file_csv_enclosure' => $input['file_csv_enclosure'] ?? '"', + 'file_csv_escape' => $input['file_csv_escape'] ?? '\\', + 'file_csv_separator' => $input['file_csv_separator'] ?? ',', + 'file_skip_header' => $input['file_skip_header'] ?? false, + ]; + + // First parse user documentation. + $description = null; + if (!empty($input['description'])) { + if (!\is_string($input['description'])) { + throw new ConfigurationException(\sprintf("Anonymizer '%s' property 'description' must be a string value.", $completeId)); + } + $description = $input['description']; + } + + // Discriminate column based anonymizers versus single value enums. 
+ $columns = null; + if (!empty($input['columns'])) { + if (!\is_array($input['columns'])) { + throw new ConfigurationException(\sprintf("Anonymizer '%s' property 'columns' must be an array of string values.", $completeId)); + } + $columns = []; + foreach ($input['columns'] as $column) { + if (null !== $column && !\is_string($column)) { + throw new ConfigurationException(\sprintf("Anonymizer '%s' property 'columns' must be an array of string values.", $completeId)); + } + $columns[] = $column; + } + } + + // Plain raw data cannot be patterns, simply read from file. + if (isset($input['data'])) { + if (isset($input['pattern'])) { + throw new ConfigurationException(\sprintf("Anonymizer '%s' cannot have both 'data' and 'pattern' property.", $completeId)); + } + + if (\is_string($input['data'])) { + $filename = \rtrim($directory, '/') . '/' . \ltrim($input['data'], '/'); + if ($columns) { + return new PackFileMultipleColumnAnonymizer( + $completeId, + $description, + $options->with($fileOptions), + $columns, + $filename, + ); + } + + return new PackFileEnumAnonymizer( + $completeId, + $description, + $options->with($fileOptions), + $filename, + ); + } + + if (\is_array($input['data'])) { + if ($columns) { + return new PackMultipleColumnAnonymizer( + $completeId, + $description, + $options, + $columns, + $input['data'], + ); + } + + return new PackEnumAnonymizer( + $completeId, + $description, + $options, + $input['data'], + ); + } + + throw new ConfigurationException(\sprintf("Anonymizer '%s' property 'data' must be a string file path or an array of values.", $completeId)); + } + + // Pattern data, using string patterns. 
+ if (isset($input['pattern'])) { + if ($columns) { + if (!\is_array($input['pattern'])) { + throw new ConfigurationException(\sprintf("Anonymizer '%s' property 'pattern' must be array of values.", $completeId)); + } + + if (\count($input['pattern']) !== \count($columns)) { + throw new ConfigurationException(\sprintf("Anonymizer '%s' property 'pattern' value count must be the same as the 'columns' value count.", $completeId)); + } + + // Normalize values from the "pattern" array, by creating + // StringPattern instances, also do input sanity cleanup + // by removing array keys, to match columns. + $patterns = []; + foreach ($input['pattern'] as $index => $value) { + if (null !== $value && !\is_string($value)) { + throw new ConfigurationException(\sprintf("Anonymizer '%s' property 'pattern' value at index #%s must be a string or null.", $completeId, $index)); + } + $patterns[] = $value ? new StringPattern($value, $packId) : null; + } + + return new PackMultipleColumnGeneratedAnonymizer( + $completeId, + $description, + $options, + $columns, + $patterns, + ); + } + + $patterns = []; + if (\is_string($input['pattern'])) { + $patterns[] = new StringPattern($input['pattern'], $packId); + } elseif (\is_array($input['pattern'])) { + foreach ($input['pattern'] as $index => $value) { + if (null !== $value && !\is_string($value)) { + throw new ConfigurationException(\sprintf("Anonymizer '%s' property 'pattern' value at index #%s must be a string or null.", $completeId, $index)); + } + $patterns[] = $value ? 
new StringPattern($value, $packId) : null; + } + } else { + throw new ConfigurationException(\sprintf("Anonymizer '%s' property 'pattern' must be a string or an array of string.", $completeId)); + } + + return new PackEnumGeneratedAnonymizer( + $completeId, + $description, + $options, + $patterns, + ); + } + + throw new ConfigurationException(\sprintf("Anonymizer '%s' must have one of 'data' or 'pattern' property.", $completeId)); + } +} diff --git a/src/Anonymization/Pack/PackAnonymizer.php b/src/Anonymization/Pack/PackAnonymizer.php new file mode 100644 index 00000000..e4b6efbe --- /dev/null +++ b/src/Anonymization/Pack/PackAnonymizer.php @@ -0,0 +1,16 @@ + $data + */ + public function __construct( + string $id, + ?string $description, + Options $options, + public readonly iterable $data, + ) { + parent::__construct($id, $description, $options); + } +} diff --git a/src/Anonymization/Pack/PackEnumGeneratedAnonymizer.php b/src/Anonymization/Pack/PackEnumGeneratedAnonymizer.php new file mode 100644 index 00000000..d731b0bf --- /dev/null +++ b/src/Anonymization/Pack/PackEnumGeneratedAnonymizer.php @@ -0,0 +1,28 @@ + $pattern + */ + public function __construct( + string $id, + ?string $description, + Options $options, + public readonly array $pattern, + ) { + parent::__construct($id, $description, $options); + } +} diff --git a/src/Anonymization/Pack/PackFileEnumAnonymizer.php b/src/Anonymization/Pack/PackFileEnumAnonymizer.php new file mode 100644 index 00000000..423c5e83 --- /dev/null +++ b/src/Anonymization/Pack/PackFileEnumAnonymizer.php @@ -0,0 +1,19 @@ + $columns + */ + public function __construct( + string $id, + ?string $description, + Options $options, + public readonly array $columns, + public readonly string $filename, + ) { + parent::__construct($id, $description, $options); + } +} diff --git a/src/Anonymization/Pack/PackMultipleColumnAnonymizer.php b/src/Anonymization/Pack/PackMultipleColumnAnonymizer.php new file mode 100644 index 00000000..3f99c009 --- 
/dev/null +++ b/src/Anonymization/Pack/PackMultipleColumnAnonymizer.php @@ -0,0 +1,24 @@ + $columns + * @param iterable> $data + */ + public function __construct( + string $id, + ?string $description, + Options $options, + public readonly array $columns, + public readonly iterable $data, + ) { + parent::__construct($id, $description, $options); + } +} diff --git a/src/Anonymization/Pack/PackMultipleColumnGeneratedAnonymizer.php b/src/Anonymization/Pack/PackMultipleColumnGeneratedAnonymizer.php new file mode 100644 index 00000000..bc5c0d5d --- /dev/null +++ b/src/Anonymization/Pack/PackMultipleColumnGeneratedAnonymizer.php @@ -0,0 +1,33 @@ + $columns + * @param array<\MakinaCorpus\DbToolsBundle\Anonymization\Anonymizer\Pattern\StringPattern|null> $patterns + * Count must match $columns count. + */ + public function __construct( + string $id, + ?string $description, + Options $options, + public readonly array $columns, + public readonly array $patterns, + ) { + parent::__construct($id, $description, $options); + + if (\count($columns) !== \count($patterns)) { + throw new ConfigurationException("Column count and pattern count must match."); + } + } +} diff --git a/src/Anonymization/Pack/PackRegistry.php b/src/Anonymization/Pack/PackRegistry.php new file mode 100644 index 00000000..ef71f009 --- /dev/null +++ b/src/Anonymization/Pack/PackRegistry.php @@ -0,0 +1,66 @@ +filenames)) { + // @todo Throw cannot initialize the same pack twice? + // Not sure it worthes it, it would have any side effect. 
+            return;
+        }
+
+        $pack = Pack::fromFile($filename);
+
+        // A pack id seen for the first time has no entry yet, hence the
+        // null default instead of a bare array read.
+        if ($previousFilename = $this->filenames[$pack->id] ?? null) {
+            throw new ConfigurationException(\sprintf(
+                "Pack '%s' was already registered by file '%s' while registering file '%s'",
+                $pack->id,
+                $previousFilename,
+                $filename,
+            ));
+        }
+
+        $this->packs[$pack->id] = $pack;
+        $this->filenames[$pack->id] = $filename;
+    }
+
+    /**
+     * Get pack by identifier.
+     *
+     * When a 'PACK.ANONYMIZER' identifier is given, only the part before
+     * the first dot is used as the pack identifier.
+     *
+     * @throws ConfigurationException
+     *   When no pack is registered under that identifier.
+     */
+    public function getPack(string $name): Pack
+    {
+        if ($pos = \strpos($name, '.')) {
+            // Keep what precedes the dot, i.e. the pack identifier.
+            $name = \substr($name, 0, $pos);
+        }
+
+        return $this->packs[$name] ?? throw new ConfigurationException(\sprintf("Pack '%s' does not exist.", $name));
+    }
+
+    /**
+     * Tell whether a pack is registered under the given identifier.
+     *
+     * Accepts the same identifier forms as getPack().
+     */
+    public function hasPack(string $name): bool
+    {
+        if ($pos = \strpos($name, '.')) {
+            // Keep what precedes the dot, i.e. the pack identifier.
+            $name = \substr($name, 0, $pos);
+        }
+
+        return \array_key_exists($name, $this->packs);
+    }
+
+    /** Get anonymizer description. */
+    public function getPackAnonymizer(string $name): PackAnonymizer
+    {
+        if (!\str_contains($name, '.')) {
+            throw new ConfigurationException("Identifier must be in the form 'PACK.ANONYMIZER'.");
+        }
+
+        list($packId, $anonymizerId) = \explode('.', $name, 2);
+
+        return $this->getPack($packId)->getPackAnonymizer($anonymizerId);
+    }
+}
diff --git a/tests/Resources/Anonymization/Pack/pack.sample.yaml b/tests/Resources/Anonymization/Pack/pack.sample.yaml
new file mode 100644
index 00000000..37b79ae4
--- /dev/null
+++ b/tests/Resources/Anonymization/Pack/pack.sample.yaml
@@ -0,0 +1,90 @@
+name: fr-fr
+data:
+  # Arbitrary data list. Whenever you use the "data" property name, it means
+  # that raw data you wrote will end up as a sample.
+  # Any null or empty string, or whitespace-only string will be ignored.
+  address_street_prefix:
+    data: [rue, avenue, impasse, voie, chemin, route]
+
+  # When "data" is a string, it is processed as a filename, relative to the
+  # directory this file is into.
+  address_street_name:
+    data: ./resources/address_street_names.txt
+
+  # You can use "pattern" instead of data, this will use the string pattern
+  # anonymizer instead. You can read documentation in the "String pattern"
+  # section under the "Core anonymizers" documentation page.
+  address_street:
+    pattern: "[0-2000] {street_prefix} {street_name}"
+
+  # If "pattern" is an array with multiple values, then one in the list
+  # will be randomly selected for each anonymized row.
+  address_secondary:
+    pattern:
+      - "Apt. [0-500]"
+      - "[0-9] étage"
+
+  # Arbitrary multiple column data list. Using the "columns" property sets
+  # the column names.
+  # Using the "data" property name allows you to directly write the raw data
+  # in this YAML file.
+  # When "data" is a string, it is processed as a filename, relative to the
+  # directory this file is in.
+  # File can only be a CSV file for now. Values in each line must match the
+  # same count and order as the "columns" property.
+  address_hexasmal:
+    # If you need to ignore a column in the original data file, you can
+    # set null as a column name: it will explicitly drop the column in
+    # the sample data.
+    # We are here using the "HexaSmal.csv" open data file from french
+    # public services (containing all official french locality names).
+    # This is why we are ignoring a useless (for us) column.
+    columns: [code_insee, locality, postal_code, null, dependent_locality]
+    data: ./resources/address/hexasmal.csv
+    # When reading a CSV file, you can use the following options.
+    # Following values are the default, setting it is optional.
+    file_csv_enclosure: '"'
+    file_csv_escape: '\\'
+    file_csv_separator: ','
+    # When reading any file, this will skip the first row.
+    file_skip_header: false
+
+  # Generated multiple column data list. Using "pattern" instead of "data"
+  # when "columns" is specified means that complete sample rows are being
+  # generated using string patterns.
+  # You can give only one alternative here.
+ address: + columns: [country, locality, region, postal_code, street_address, secondary_address] + pattern: + # String sans rien, alors cette string (par défaut un pattern, mais sans tokens dedans) + - FRANCE + # On peut référencer une valeur dans une colonne d'un autre sample + # Ici feinte, un coalesce (dependent_locality, locality) + - "{address_hexasmal.dependant_locality|address_hexasmal.locality}" + - REGION TODO + # Si on référence de nouveau une valeur d'un autre sample, ça prend + # une valeur dans le même row que précédemment (on conserve la cohérence). + # en SQL pur, ça peut se traduire par un JOIN bien écrit. + - "{address_hexasmal.postal_code}" + # Valeur dans une single data list + - "{address_street}" + - "{address_secondary}" + +# @todo +# - description des packs et des anonymizers +# - options documentation? probablement pas, vu que toutes les options seront les mêmes +# - change Anonymizer::anonymize() and move into a createExpression() method +# - anonymize() become $update->set($this->columnName, $this->createExpression()) +# - this allows using createExpression() out of SET clause + +# data: table de sample +# pattern: +# conversion en concat +# data depuis arbitraire: +# join +# data depuis généré: +# join sur table expression qui réplique l'élement généré +# random int: +# code de generation random int +# "|" (or) +# conversion en coalesce diff --git a/tests/Unit/Anonymization/Pack/PackRegistryTest.php b/tests/Unit/Anonymization/Pack/PackRegistryTest.php new file mode 100644 index 00000000..2cdb8cd1 --- /dev/null +++ b/tests/Unit/Anonymization/Pack/PackRegistryTest.php @@ -0,0 +1,9 @@ +getDirectory('pack.sample.yaml')); + + // @todo + self::expectNotToPerformAssertions(); + } +}