diff --git a/Console/Command/BatchingOptimizeCommand.php b/Console/Command/BatchingOptimizeCommand.php new file mode 100644 index 000000000..c257357ab --- /dev/null +++ b/Console/Command/BatchingOptimizeCommand.php @@ -0,0 +1,436 @@ + Lowest possible value (0.00 * standard deviation = 0), the recommended batch size will be almost equal to the strictly calculated maximum batch size + * 0.25 => Default value (0.25 * standard deviation), the recommended batch size will be close to the strictly calculated maximum batch size + * 3.00 => Highest possible value (3 * standard deviation), the recommended batch size will be greatly lower than the calculated maximum batch size + */ + protected const DEFAULT_MARGIN = 0.25; + + /** + * Min value accepted for the safety margin (--margin option) + */ + protected const MIN_MARGIN = 0; + + /** + * Max value accepted for the safety margin (--margin option) + */ + protected const MAX_MARGIN = 3; + + /** + * The sample size is the number of products fetched to determine the recommended batch size + * Can be updated by the --sample-size option + */ + protected const DEFAULT_SAMPLE_SIZE = 20; + + /** + * Max sample size accepted for the --sample-size option + */ + protected const MAX_SAMPLE_SIZE = 1000; + /** + * Name and shortcut of the CLI option overriding the sample size + */ + protected const OPTION_SAMPLE_SIZE = 'sample-size'; + protected const OPTION_SAMPLE_SIZE_SHORTCUT = 's'; + /** + * Name and shortcut of the CLI option overriding the safety margin + */ + protected const OPTION_MARGIN = 'margin'; + protected const OPTION_MARGIN_SHORTCUT = 'm'; + + /** + * Simple product types (should generate smaller product records) + */ + protected const PRODUCTS_SIMPLE_TYPES = [ + 'simple', + 'downloadable', + 'virtual', + 'giftcard' + ]; + + /** + * Complex product types (should generate bigger product records) + */ + protected const PRODUCTS_COMPLEX_TYPES = [ + 'configurable', + 'grouped', + 'bundle' + ]; + + /** + * Statistics computed by setStoreCounts(), indexed by store ID. Each entry holds the keys + * simple, complex, total, simple_percentage, complex_percentage, simple_sample_size, + * complex_sample_size, sample, sample_min and sample_max. + * + * @var array|null + */ + protected ?array $storeCounts = []; + /** + * Dependencies are declared as promoted constructor properties; + * $state, $storeNameFetcher and $name are also forwarded to the parent constructor. + */ + public function __construct( + protected AlgoliaConnector $algoliaConnector, + protected State $state, + protected StoreNameFetcher $storeNameFetcher, + protected StoreManagerInterface $storeManager, + protected 
IndexOptionsBuilder $indexOptionsBuilder, + protected ProductHelper $productHelper, + protected ConfigHelper $configHelper, + protected RecordBuilder $recordBuilder, + protected WriterInterface $configWriter, + ?string $name = null + ) { + parent::__construct($state, $storeNameFetcher, $name); + } + /** + * Extends the parent command prefix with the "batching:" segment + * + * @return string + */ + protected function getCommandPrefix(): string + { + return parent::getCommandPrefix() . 'batching:'; + } + /** + * Name of the command, appended after the prefix + * + * @return string + */ + protected function getCommandName(): string + { + return 'optimize'; + } + /** + * Human-readable description of the command + * + * @return string + */ + protected function getCommandDescription(): string + { + return "Performs catalog analysis and provides recommendation regarding optimal batching size for product indexing."; + } + /** + * Description of the store argument (store IDs are optional for this command) + * + * @return string + */ + protected function getStoreArgumentDescription(): string + { + return 'ID(s) for store(s) to optimize (optional), if no store is specified, all stores will be taken into account.'; + } + /** + * Declares the --sample-size and --margin options; both require a value when they are passed + * + * @return InputOption[] + */ + protected function getAdditionalDefinition(): array + { + return [ + new InputOption( + self::OPTION_SAMPLE_SIZE, + '-' . self::OPTION_SAMPLE_SIZE_SHORTCUT, + InputOption::VALUE_REQUIRED, + 'Sample size (number of products) - DEFAULT: ' . self::DEFAULT_SAMPLE_SIZE . ' - MAXIMUM: ' . self::MAX_SAMPLE_SIZE, + ), + new InputOption( + self::OPTION_MARGIN, + '-' . self::OPTION_MARGIN_SHORTCUT, + InputOption::VALUE_REQUIRED, + 'Safety margin - DEFAULT: ' . self::DEFAULT_MARGIN . ' - FROM ' . self::MIN_MARGIN . ' TO ' . self::MAX_MARGIN, + ) + ]; + } + + /** + * Validates the options, then analyses the requested stores (all stores when no store ID is given). + * Any \Exception raised on the way is written to the output and turned into a failure exit code. + * + * @param InputInterface $input + * @param OutputInterface $output + * @return int + * @throws NoSuchEntityException|LocalizedException + */ + protected function execute(InputInterface $input, OutputInterface $output): int + { + $this->input = $input; + $this->output = $output; + $this->setAreaCode(); + + $storeIds = $this->getStoreIds($input); + + try { + $this->validateOptions(); + $this->scanProductRecords($storeIds); + } catch (\Exception $e) { + $this->output->writeln('' . $e->getMessage() . 
'');
+            return Cli::RETURN_FAILURE;
+        }
+
+        return Cli::RETURN_SUCCESS;
+    }
+
+    /**
+     * Ensures sample size and margin options are valid.
+     *
+     * An option is validated as soon as it is present on the command line (i.e. not null), so values
+     * such as "0" or an empty string are rejected here instead of slipping through a truthiness test
+     * and failing later in the calculations.
+     *
+     * @return void
+     * @throws AlgoliaException when the sample size is not an integer between 1 and MAX_SAMPLE_SIZE,
+     *                          or when the margin is not a number between MIN_MARGIN and MAX_MARGIN
+     */
+    protected function validateOptions(): void
+    {
+        $sampleSize = $this->input->getOption(self::OPTION_SAMPLE_SIZE);
+        if (
+            $sampleSize !== null
+            && (
+                !ctype_digit((string) $sampleSize)
+                || (int) $sampleSize < 1
+                || (int) $sampleSize > self::MAX_SAMPLE_SIZE
+            )
+        ) {
+            throw new AlgoliaException(
+                'Sample size option should be an integer (between 1 and ' . self::MAX_SAMPLE_SIZE . ')'
+            );
+        }
+
+        $margin = $this->input->getOption(self::OPTION_MARGIN);
+        if (
+            $margin !== null
+            && (
+                !is_numeric($margin)
+                || (float) $margin > self::MAX_MARGIN
+                || (float) $margin < self::MIN_MARGIN
+            )
+        ) {
+            throw new AlgoliaException("Margin option should be a decimal value (between 0 and 3)");
+        }
+    }
+
+    /**
+     * Analyses the given stores, or every store when the list is empty.
+     *
+     * @param array $storeIds
+     * @return void
+     * @throws AlgoliaException
+     * @throws DiagnosticsException
+     * @throws LocalizedException
+     * @throws NoSuchEntityException
+     */
+    protected function scanProductRecords(array $storeIds = []): void
+    {
+        if (count($storeIds)) {
+            foreach ($storeIds as $storeId) {
+                // NOTE(review): IDs presumably originate from CLI arguments and may be numeric strings;
+                // the cast keeps the int-typed callee safe either way - confirm against getStoreIds()
+                $this->scanProductRecordsForStore((int) $storeId);
+            }
+        } else {
+            $this->scanProductRecordsForAllStores();
+        }
+    }
+
+    /**
+     * Analyses every store returned by the store manager.
+     *
+     * @return void
+     * @throws AlgoliaException
+     * @throws DiagnosticsException
+     * @throws LocalizedException
+     * @throws NoSuchEntityException
+     */
+    protected function scanProductRecordsForAllStores(): void
+    {
+        $storeIds = array_keys($this->storeManager->getStores());
+
+        foreach ($storeIds as $storeId) {
+            $this->scanProductRecordsForStore($storeId);
+        }
+    }
+
+    /**
+     * Prints the catalog statistics and the batch size recommendation for one store, then offers to
+     * save the recommended value in the store configuration.
+     *
+     * @param int $storeId
+     * @return void
+     * @throws AlgoliaException
+     * @throws DiagnosticsException
+     * @throws LocalizedException
+     * @throws NoSuchEntityException
+     */
+    protected function scanProductRecordsForStore(int 
$storeId): void + { + $storeName = $this->storeNameFetcher->getStoreName($storeId); + /* Nothing to analyse when indexing is disabled for this store */ + if (!$this->configHelper->isIndexingEnabled($storeId)) { + $this->output->writeln('Indexing is disabled for store ' . $storeName . ''); + return; + } + /* Statistics are computed only once per store */ + if (!isset($this->storeCounts[$storeId])) { + $this->setStoreCounts($storeId); + } + + $this->output->writeln(' '); + $this->output->writeln(' ====== Products for store ' . $storeName . ' ====== '); + $this->output->writeln('Simple Products: ' . $this->storeCounts[$storeId]['simple'] . ' (' . round($this->storeCounts[$storeId]['simple_percentage'], 2) . '% of total)'); + $this->output->writeln('Complex Products: ' . $this->storeCounts[$storeId]['complex'] . ' (' . round($this->storeCounts[$storeId]['complex_percentage'], 2) . '% of total)'); + + $this->output->writeln(' ============ '); + $this->output->writeln('Total: ' . $this->storeCounts[$storeId]['total'] . ' products'); + + $this->output->writeln(' ============ '); + /* Record sizes in bytes, keyed by SKU (as built by getProductsSizes()) */ + $sample = $this->storeCounts[$storeId]['sample']; + + if (count($sample) > 0) { + $this->output->writeln('Sample (' . count($sample) . ' products):'); + foreach ($sample as $sku => $size) { + $this->output->writeln(' - ' . $size . 'B (sku: ' . $sku . ')'); + } + } + + $this->output->writeln(' ============ '); + /* Average record size, rounded to whole bytes */ $sizeAverage = (int) round(MathHelper::getAverage($sample)); + $this->output->writeln('Min record size : ' . $this->storeCounts[$storeId]['sample_min'] . 'B'); + $this->output->writeln('Max record size : ' . $this->storeCounts[$storeId]['sample_max'] . 'B'); + $this->output->writeln('Average record size : ' . $sizeAverage . 'B'); + /* Upper bound: number of average-sized records fitting in the maximum batch size */ + $estimatedBatchCount = $this->getEstimatedMaxBatchCount($sizeAverage); + $this->output->writeln('Estimated Max batch count : ' . $estimatedBatchCount . ' records'); + /* Dispersion of the sampled sizes; getRecommendedBatchCount() weights it with the safety margin */ + $standardDeviation = MathHelper::getSampleStandardDeviation($sample); + $this->output->writeln('Standard Deviation : ' . $standardDeviation); + /* Use the default margin when the --margin option is not passed */ + $margin = $this->input->getOption(self::OPTION_MARGIN) ?? 
self::DEFAULT_MARGIN;
+        $this->output->writeln('Safety margin : ' . $margin);
+
+        // The option value is a string when it comes from the command line: cast it for the float parameter
+        $recommendedBatchCount = $this->getRecommendedBatchCount($sizeAverage, $standardDeviation, (float) $margin);
+        $this->output->writeln(' ============ ');
+        $this->output->writeln('Recommended batch count : ' . $recommendedBatchCount . ' records');
+        $this->output->writeln(' ');
+        $this->output->writeln('Important: Those numbers are estimates only. Indexing activity should be monitored after making changes to ensure batches are not exceeding the recommended size of 10 MB.');
+        $this->output->writeln(' ============ ');
+        $this->output->writeln(
+            'This will override your "Maximum number of records processed per indexing job" configuration to ' . $recommendedBatchCount . ' for store "' . $storeName . '".');
+        $this->output->writeln(' ');
+
+        if ($this->confirmOperation()) {
+            $this->configWriter->save(
+                ConfigHelper::NUMBER_OF_ELEMENT_BY_PAGE,
+                $recommendedBatchCount,
+                'stores',
+                $storeId
+            );
+        }
+    }
+
+    /**
+     * Computes and caches the statistics of a store in $this->storeCounts[$storeId]:
+     * product counts per family (simple / complex), their share of the total, the sample size
+     * allotted to each family (proportional to its share) and the sampled record sizes.
+     *
+     * @param int $storeId
+     * @return void
+     * @throws AlgoliaException when no product could be sampled (empty catalog or sample size of 0)
+     * @throws DiagnosticsException
+     * @throws LocalizedException
+     * @throws NoSuchEntityException
+     */
+    protected function setStoreCounts(int $storeId): void
+    {
+        $simpleProducts = $this->getProductsCollectionForStore($storeId, self::PRODUCTS_SIMPLE_TYPES);
+        $complexProducts = $this->getProductsCollectionForStore($storeId, self::PRODUCTS_COMPLEX_TYPES);
+
+        // NOTE(review): count() presumably loads the whole collection; confirm whether that is intended
+        $simpleCount = (int) $simpleProducts->count();
+        $complexCount = (int) $complexProducts->count();
+        $total = $simpleCount + $complexCount;
+
+        // Guard the divisions: an empty catalog yields 0% for both families
+        $simplePercentage = $total > 0 ? ($simpleCount * 100) / $total : 0;
+        $complexPercentage = $total > 0 ? ($complexCount * 100) / $total : 0;
+
+        // The option is a string when passed on the command line
+        $sampleSize = (int) ($this->input->getOption(self::OPTION_SAMPLE_SIZE) ?? self::DEFAULT_SAMPLE_SIZE);
+        $simpleSampleSize = (int) round($sampleSize * ($simplePercentage / 100));
+        $complexSampleSize = (int) round($sampleSize * ($complexPercentage / 100));
+
+        $simpleSample = $this->getProductsSizes($simpleProducts, $simpleSampleSize);
+        $complexSample = $this->getProductsSizes($complexProducts, $complexSampleSize);
+
+        // Array union instead of array_merge(): array_merge() renumbers integer keys, which would
+        // replace purely numeric SKUs by 0, 1, 2... in the sample listing
+        $sample = $simpleSample + $complexSample;
+
+        if ($sample === []) {
+            // min() / max() throw a ValueError on an empty array, and that error is not caught by
+            // the catch (\Exception) of execute(): fail with a regular, explicit exception instead
+            throw new AlgoliaException(
+                'No product could be sampled for store with ID ' . $storeId
+                . ': make sure the store contains indexable products and that the sample size is greater than 0.'
+            );
+        }
+
+        $this->storeCounts[$storeId] = [
+            'simple' => $simpleCount,
+            'complex' => $complexCount,
+            'total' => $total,
+            'simple_percentage' => $simplePercentage,
+            'complex_percentage' => $complexPercentage,
+            'simple_sample_size' => $simpleSampleSize,
+            'complex_sample_size' => $complexSampleSize,
+            'sample' => $sample,
+            'sample_min' => min($sample),
+            'sample_max' => max($sample),
+        ];
+    }
+
+    /**
+     * Generates a product collection with the same helper as the product indexer to get the exact amount of expected products in the Algolia index
+     *
+     * @param int $storeId
+     * @param array $productTypes product type IDs to keep; no type filter is applied when empty
+     * @return Collection
+     */
+    protected function getProductsCollectionForStore(int $storeId, array $productTypes = []): Collection
+    {
+        $onlyVisible = !$this->configHelper->includeNonVisibleProductsInIndex();
+        $collection = $this->productHelper->getProductCollectionQuery($storeId, null, $onlyVisible);
+        if (count($productTypes) > 0) {
+            $collection->addAttributeToFilter('type_id', ['in' => $productTypes]);
+        }
+
+        // Randomize the results to get a more "diverse" sample
+        $collection->getSelect()->orderRand();
+
+        return $collection;
+    }
+
+    /**
+     * Builds the records of at most $sampleSize products and returns their serialized size in bytes, keyed by SKU.
+     *
+     * @param Collection $products
+     * @param int $sampleSize
+     * @return array
+     * @throws LocalizedException
+     * @throws NoSuchEntityException
+     * @throws DiagnosticsException
+     * @throws AlgoliaException
+     */
+    protected function getProductsSizes(Collection $products, int $sampleSize): 
array
+    {
+        if ($sampleSize <= 0) {
+            // Nothing to sample: return before iterating over the collection
+            return [];
+        }
+
+        $stats = [];
+        $limit = 0;
+
+        foreach ($products as $product) {
+            if ($limit >= $sampleSize) {
+                break;
+            }
+
+            $serializedRecord = json_encode($this->recordBuilder->buildRecord($product));
+            if ($serializedRecord === false) {
+                // The record cannot be serialized: counting it as 0 byte would distort the statistics
+                continue;
+            }
+
+            // strlen() returns the length in bytes, the unit of the batch size limit
+            $stats[$product->getSku()] = strlen($serializedRecord);
+            $limit++;
+        }
+
+        return $stats;
+    }
+
+    /**
+     * Determines the maximum estimated batch count which will be considered as the upper boundary
+     * (number of average-sized records fitting in MAX_BATCH_SIZE_IN_BYTES, never lower than 1)
+     *
+     * @param int $averageSize average record size in bytes, must be greater than 0
+     * @return int
+     * @throws AlgoliaException when the average size is not positive (e.g. empty sample)
+     */
+    protected function getEstimatedMaxBatchCount(int $averageSize): int
+    {
+        if ($averageSize <= 0) {
+            // Avoids a DivisionByZeroError, which would not be caught by the catch (\Exception) of execute()
+            throw new AlgoliaException(
+                'Unable to estimate a batch count: the average record size must be greater than 0.'
+            );
+        }
+
+        return max(1, (int) round(self::MAX_BATCH_SIZE_IN_BYTES / $averageSize));
+    }
+
+    /**
+     * Provides a recommended batch count according to:
+     * - the average record size provided by the product sample
+     * - the standard deviation of the product sample
+     * - an arbitrary safety margin (MIN_MARGIN to MAX_MARGIN, i.e. 0 to 3) to allow the user to alter the strictness of the recommendation
+     *   (the lower the margin is, the closer it will be from the maximum batch count)
+     *
+     * The result is truncated (not rounded) and is never lower than 1.
+     *
+     * @param int $averageSize
+     * @param float $standardDeviation
+     * @param float $margin
+     * @return int
+     * @throws AlgoliaException when the weighted record size is not positive (e.g. empty sample)
+     */
+    protected function getRecommendedBatchCount(int $averageSize, float $standardDeviation, float $margin = self::DEFAULT_MARGIN): int
+    {
+        $weightedSize = $averageSize + $margin * $standardDeviation;
+
+        if ($weightedSize <= 0) {
+            // Same guard as in getEstimatedMaxBatchCount(): fail with a catchable exception
+            throw new AlgoliaException(
+                'Unable to recommend a batch count: the average record size must be greater than 0.'
+            );
+        }
+
+        return max(1, (int) (self::MAX_BATCH_SIZE_IN_BYTES / $weightedSize));
+    }
+}
diff --git a/Helper/MathHelper.php b/Helper/MathHelper.php
new file mode 100644
index 000000000..a38f3faca
--- /dev/null
+++ b/Helper/MathHelper.php
@@ -0,0 +1,39 @@
+assertEquals($expectedResult, MathHelper::getAverage($values));
+    }
+
+    /**
+     * @dataProvider standardDeviationProvider
+     */
+    public function testStandardDeviation($values, $expectedResult)
+    {
+        $this->assertEquals($expectedResult, MathHelper::getSampleStandardDeviation($values));
+    }
+
+    public static function averageProvider(): array
+    {
+        /** Tested with 
https://www.calculator.net/average-calculator.html */ + return [ + ['values' => [], 'expectedResult' => 0], + ['values' => [1, 3], 'expectedResult' => 2], + ['values' => ['foo' => 1, 'bar' => 3], 'expectedResult' => 2], + ['values' => [1, 9], 'expectedResult' => 5], + ['values' => [1, 2], 'expectedResult' => 1.5], + ['values' => [1, 2, 3], 'expectedResult' => 2], + ['values' => [1, 2, 4], 'expectedResult' => 2.33], + ['values' => [11253, 10025, 9521, 13250], 'expectedResult' => 11012.25], + ['values' => [10, 12, 23, 23, 16, 23, 21, 16], 'expectedResult' => 18], + ]; + } + /** + * Data sets for testStandardDeviation(): input values and the expected sample standard deviation, + * given with at most two decimals. + * + * @return array + */ + public static function standardDeviationProvider(): array + { + /** Expected values cross-checked with https://www.calculator.net/standard-deviation-calculator.html */ + return [ + ['values' => [], 'expectedResult' => 0.0], + ['values' => [1], 'expectedResult' => 0.0], + ['values' => [1, 1], 'expectedResult' => 0.0], + ['values' => [1, 3], 'expectedResult' => 1.41], + ['values' => [1, 4, 12], 'expectedResult' => 5.69], + ['values' => [3, 4, 6], 'expectedResult' => 1.53], + ['values' => [3, 4, 6, 8, 7, 11], 'expectedResult' => 2.88], + ['values' => [11253, 10025, 9521, 13250], 'expectedResult' => 1659.72], + ['values' => [10, 12, 23, 23, 16, 23, 21, 16], 'expectedResult' => 5.24], + ]; + } +} diff --git a/etc/di.xml b/etc/di.xml index efafc7619..54a985f5e 100755 --- a/etc/di.xml +++ b/etc/di.xml @@ -152,6 +152,7 @@ Algolia\AlgoliaSearch\Console\Command\ReplicaRebuildCommand Algolia\AlgoliaSearch\Console\Command\ReplicaDisableVirtualCommand Algolia\AlgoliaSearch\Console\Command\SynonymDeduplicateCommand + Algolia\AlgoliaSearch\Console\Command\BatchingOptimizeCommand Algolia\AlgoliaSearch\Console\Command\Indexer\IndexProductsCommand Algolia\AlgoliaSearch\Console\Command\Indexer\IndexCategoriesCommand @@ -200,6 +201,17 @@ + + + Algolia\AlgoliaSearch\Service\AlgoliaConnector\Proxy + Algolia\AlgoliaSearch\Service\StoreNameFetcher\Proxy + Algolia\AlgoliaSearch\Service\Product\IndexOptionsBuilder\Proxy + 
Algolia\AlgoliaSearch\Helper\Entity\ProductHelper\Proxy + Algolia\AlgoliaSearch\Service\Product\RecordBuilder\Proxy + Algolia\AlgoliaSearch\Helper\ConfigHelper\Proxy + + +