|
| 1 | +// Licensed to Elasticsearch B.V under one or more agreements. |
| 2 | +// Elasticsearch B.V licenses this file to you under the Apache 2.0 License. |
| 3 | +// See the LICENSE file in the project root for more information |
| 4 | + |
| 5 | +using System.Collections; |
| 6 | +using System.Security.Cryptography; |
| 7 | +using System.Text; |
| 8 | + |
| 9 | +namespace Elastic.Documentation.LegacyDocs; |
| 10 | + |
/// <summary>
/// A Bloom filter over strings: a fixed-size bit array plus a number of seeded SHA-256 hashes.
/// <see cref="Check"/> never returns <c>false</c> for an item that was added, but may return
/// <c>true</c> for an item that was not (a false positive).
/// Instances are created through <see cref="FromCollection"/> or <see cref="Load"/>.
/// </summary>
internal sealed class BloomFilter
{
    /// <summary>
    /// The bit array for the filter.
    /// </summary>
    private readonly BitArray _bitArray;

    /// <summary>
    /// The size of the bit array, in bits.
    /// </summary>
    private int Size => _bitArray.Length;

    /// <summary>
    /// The number of hash functions used per item.
    /// </summary>
    private int HashCount { get; }

    /// <summary>
    /// Creates an empty filter with explicit parameters. Used when restoring a persisted filter.
    /// </summary>
    /// <param name="size">The number of bits in the filter; must be greater than zero.</param>
    /// <param name="hashCount">The number of hash functions; must be greater than zero.</param>
    /// <exception cref="ArgumentOutOfRangeException">Either argument is zero or negative.</exception>
    private BloomFilter(int size, int hashCount)
    {
        if (size <= 0)
            throw new ArgumentOutOfRangeException(nameof(size), "Size must be greater than zero.");
        if (hashCount <= 0)
            throw new ArgumentOutOfRangeException(nameof(hashCount), "Hash count must be greater than zero.");

        _bitArray = new BitArray(size);
        HashCount = hashCount;
    }

    /// <summary>
    /// Initializes a new BloomFilter with optimal parameters based on expected items and false positive probability.
    /// </summary>
    /// <param name="expectedItems">The expected number of items to be stored; must be greater than zero.</param>
    /// <param name="falsePositiveProbability">The desired false positive probability (e.g., 0.01 for 1%); must be strictly between 0 and 1.</param>
    /// <exception cref="ArgumentOutOfRangeException">
    /// An argument is out of range, or the combination needs more bits than a bit array can hold.
    /// </exception>
    private BloomFilter(int expectedItems, double falsePositiveProbability)
    {
        if (expectedItems <= 0)
            throw new ArgumentOutOfRangeException(nameof(expectedItems), "Expected items must be greater than zero.");
        // Written as a negated "inside the range" test so that double.NaN is rejected as well;
        // NaN compares false against everything and would slip through a "<= 0 or >= 1" check.
        if (!(falsePositiveProbability > 0.0 && falsePositiveProbability < 1.0))
            throw new ArgumentOutOfRangeException(nameof(falsePositiveProbability), "False positive probability must be between 0 and 1.");

        var size = GetOptimalSize(expectedItems, falsePositiveProbability);
        var hashCount = GetOptimalHashCount(size, expectedItems);

        _bitArray = new BitArray(size);
        HashCount = hashCount;
    }

    /// <summary>
    /// Adds an item to the Bloom Filter by setting one bit per hash function.
    /// </summary>
    /// <param name="item">The item to add. The string will be UTF-8 encoded for hashing.</param>
    /// <exception cref="ArgumentNullException"><paramref name="item"/> is <c>null</c>.</exception>
    private void Add(string item)
    {
        ArgumentNullException.ThrowIfNull(item);

        var itemBytes = Encoding.UTF8.GetBytes(item);
        for (var seed = 0; seed < HashCount; seed++)
            _bitArray[GetHash(itemBytes, seed)] = true;
    }

    /// <summary>
    /// Checks if an item is possibly in the set.
    /// </summary>
    /// <param name="item">The item to check. The string will be UTF-8 encoded for hashing.</param>
    /// <returns>False if the item is definitely not in the set, True if it might be.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="item"/> is <c>null</c>.</exception>
    public bool Check(string item)
    {
        ArgumentNullException.ThrowIfNull(item);

        var itemBytes = Encoding.UTF8.GetBytes(item);
        for (var seed = 0; seed < HashCount; seed++)
        {
            // A single unset bit proves the item was never added.
            if (!_bitArray[GetHash(itemBytes, seed)])
                return false;
        }
        return true;
    }

    /// <summary>
    /// Maps the input to a bit index: SHA-256 over the data followed by the 4-byte seed,
    /// then the first four digest bytes reduced modulo the bit array length.
    /// </summary>
    /// <param name="data">The bytes of the item being hashed.</param>
    /// <param name="seed">The index of the hash function; appended to the data before hashing.</param>
    /// <returns>An index in the range [0, Size).</returns>
    private int GetHash(byte[] data, int seed)
    {
        var combinedBytes = new byte[data.Length + sizeof(int)];
        Buffer.BlockCopy(data, 0, combinedBytes, 0, data.Length);

        // The seed and the digest are always read/written little-endian so that a filter persisted
        // on one machine produces the same bit indexes on any other, regardless of CPU byte order.
        // On little-endian hosts this is byte-for-byte what BitConverter produces.
        System.Buffers.Binary.BinaryPrimitives.WriteInt32LittleEndian(combinedBytes.AsSpan(data.Length), seed);

        var hashBytes = SHA256.HashData(combinedBytes);
        var hashInt = System.Buffers.Binary.BinaryPrimitives.ReadInt32LittleEndian(hashBytes);

        // The remainder's magnitude is strictly less than the (positive) length,
        // so Math.Abs cannot overflow here even when hashInt is int.MinValue.
        return Math.Abs(hashInt % _bitArray.Length);
    }

    /// <summary>
    /// Creates a new BloomFilter from a collection of items.
    /// </summary>
    /// <param name="items">The collection of string items to add. May be empty.</param>
    /// <param name="falsePositiveProbability">The desired false positive probability; must be strictly between 0 and 1.</param>
    /// <returns>A new BloomFilter instance populated with the items.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="items"/> or one of its elements is <c>null</c>.</exception>
    /// <exception cref="ArgumentOutOfRangeException"><paramref name="falsePositiveProbability"/> is out of range.</exception>
    public static BloomFilter FromCollection(IEnumerable<string> items, double falsePositiveProbability)
    {
        ArgumentNullException.ThrowIfNull(items);

        // Materialize once: the count is needed for sizing before the items can be added.
        var itemList = new List<string>(items);

        // An empty collection is valid input and yields a filter that matches nothing.
        // It is sized for a single item because the sizing formulas need a positive count.
        var filter = new BloomFilter(Math.Max(itemList.Count, 1), falsePositiveProbability);
        foreach (var item in itemList)
            filter.Add(item);

        return filter;
    }

    // --- Persistence Methods ---

    /// <summary>
    /// Saves the Bloom Filter's state to a binary file, replacing any existing file.
    /// The format is: [4-byte Size int][4-byte HashCount int][bit array bytes]
    /// </summary>
    /// <param name="filePath">The path to the file.</param>
    public void Save(string filePath)
    {
        using var stream = File.Open(filePath, FileMode.Create);
        using var writer = new BinaryWriter(stream);

        // 1. Write the Size and HashCount as integers
        writer.Write(Size);
        writer.Write(HashCount);

        // 2. Write the bit array, packed eight bits per byte
        var bitArrayBytes = new byte[GetByteCount(Size)];
        _bitArray.CopyTo(bitArrayBytes, 0);
        writer.Write(bitArrayBytes);
    }

    /// <summary>
    /// Loads a Bloom Filter from a binary file written by <see cref="Save"/>.
    /// </summary>
    /// <param name="filePath">The path to the file containing the filter data.</param>
    /// <returns>A new BloomFilter instance.</returns>
    /// <exception cref="ArgumentOutOfRangeException">The stored size or hash count is not greater than zero.</exception>
    /// <exception cref="EndOfStreamException">The file ends before the header or the bit array is complete.</exception>
    public static BloomFilter Load(string filePath)
    {
        using var stream = File.OpenRead(filePath);
        using var reader = new BinaryReader(stream);

        // 1. Read metadata (Size and HashCount)
        var size = reader.ReadInt32();
        var hashCount = reader.ReadInt32();

        // 2. Create a new filter with the loaded parameters (this also validates them)
        var filter = new BloomFilter(size, hashCount);

        // 3. Read the bit array data
        var byteCount = GetByteCount(size);
        var bitArrayBytes = reader.ReadBytes(byteCount);

        // ReadBytes returns a shorter array when the stream ends early; fail with a clear
        // error instead of indexing past the end of the buffer below.
        if (bitArrayBytes.Length != byteCount)
            throw new EndOfStreamException($"Bloom filter file is truncated: expected {byteCount} bytes of bit data but found {bitArrayBytes.Length}.");

        // Restore each bit; bit i lives in byte i / 8 at position i % 8 (least significant bit first)
        for (var i = 0; i < size; i++)
        {
            if ((bitArrayBytes[i / 8] & (1 << (i % 8))) != 0)
                filter._bitArray[i] = true;
        }

        return filter;
    }

    /// <summary>
    /// Returns the number of bytes needed to hold the given number of bits.
    /// </summary>
    // The addition is done in 64-bit arithmetic so it cannot overflow for bit counts near int.MaxValue.
    private static int GetByteCount(int bitCount) => (int)(((long)bitCount + 7) / 8);

    // --- Optimal Parameter Calculation ---

    /// <summary>
    /// Calculates the optimal size of the bit array (m).
    /// Formula: m = - (n * log(p)) / (log(2)^2)
    /// </summary>
    /// <param name="n">The expected number of items.</param>
    /// <param name="p">The desired false positive probability.</param>
    /// <exception cref="ArgumentOutOfRangeException">The computed size does not fit in an <see cref="int"/>.</exception>
    private static int GetOptimalSize(int n, double p)
    {
        var bits = Math.Ceiling(-1 * (n * Math.Log(p)) / Math.Pow(Math.Log(2), 2));

        // Casting an out-of-range double to int is undefined, so reject it explicitly.
        if (bits > int.MaxValue)
            throw new ArgumentOutOfRangeException(nameof(n), "The expected item count and false positive probability require a bit array larger than the supported maximum.");

        return (int)bits;
    }

    /// <summary>
    /// Calculates the optimal number of hash functions (k).
    /// Formula: k = (m/n) * log(2)
    /// </summary>
    /// <param name="m">The size of the bit array, in bits.</param>
    /// <param name="n">The expected number of items.</param>
    private static int GetOptimalHashCount(int m, int n) => (int)Math.Ceiling((double)m / n * Math.Log(2));
}
0 commit comments