Skip to content

Commit 9ecb990

Browse files
authored
Add all versions in version dropdown and filter out non-existing URLs (#1367)
* Add all versions in version dropdown and filter out non-existing URLs * Fix test setup * Fix format * Fix test setup * test * Refactor GetHash method * Refactor naming * Naming refactor, cleanup and docs * Use existing filesystem * Update src/tooling/docs-assembler/legacy-url-mappings.yml * cleanup: Remove unused dependencies * format: brace styles * Refactor: Make BloomFilter more generic
1 parent 01f24f9 commit 9ecb990

22 files changed

+561
-100
lines changed

Directory.Packages.props

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -70,4 +70,4 @@
7070
</PackageVersion>
7171
<PackageVersion Include="xunit.v3" Version="2.0.2" />
7272
</ItemGroup>
73-
</Project>
73+
</Project>

docs-builder.sln

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -107,6 +107,10 @@ Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Elastic.ApiExplorer.Tests",
107107
EndProject
108108
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Elastic.Documentation.Site", "src\Elastic.Documentation.Site\Elastic.Documentation.Site.csproj", "{89B83007-71E6-4B57-BA78-2544BFA476DB}"
109109
EndProject
110+
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Elastic.Documentation.LegacyDocs", "src\Elastic.Documentation.LegacyDocs\Elastic.Documentation.LegacyDocs.csproj", "{111E7029-BB29-4039-9B45-04776798A8DD}"
111+
EndProject
112+
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Elastic.Documentation.LegacyDocs.Tests", "tests\Elastic.Documentation.LegacyDocs.Tests\Elastic.Documentation.LegacyDocs.Tests.csproj", "{164F55EC-9412-4CD4-81AD-3598B57632A6}"
113+
EndProject
110114
Global
111115
GlobalSection(SolutionConfigurationPlatforms) = preSolution
112116
Debug|Any CPU = Debug|Any CPU
@@ -184,6 +188,14 @@ Global
184188
{89B83007-71E6-4B57-BA78-2544BFA476DB}.Debug|Any CPU.Build.0 = Debug|Any CPU
185189
{89B83007-71E6-4B57-BA78-2544BFA476DB}.Release|Any CPU.ActiveCfg = Release|Any CPU
186190
{89B83007-71E6-4B57-BA78-2544BFA476DB}.Release|Any CPU.Build.0 = Release|Any CPU
191+
{111E7029-BB29-4039-9B45-04776798A8DD}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
192+
{111E7029-BB29-4039-9B45-04776798A8DD}.Debug|Any CPU.Build.0 = Debug|Any CPU
193+
{111E7029-BB29-4039-9B45-04776798A8DD}.Release|Any CPU.ActiveCfg = Release|Any CPU
194+
{111E7029-BB29-4039-9B45-04776798A8DD}.Release|Any CPU.Build.0 = Release|Any CPU
195+
{164F55EC-9412-4CD4-81AD-3598B57632A6}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
196+
{164F55EC-9412-4CD4-81AD-3598B57632A6}.Debug|Any CPU.Build.0 = Debug|Any CPU
197+
{164F55EC-9412-4CD4-81AD-3598B57632A6}.Release|Any CPU.ActiveCfg = Release|Any CPU
198+
{164F55EC-9412-4CD4-81AD-3598B57632A6}.Release|Any CPU.Build.0 = Release|Any CPU
187199
EndGlobalSection
188200
GlobalSection(NestedProjects) = preSolution
189201
{4D198E25-C211-41DC-9E84-B15E89BD7048} = {BE6011CC-1200-4957-B01F-FCCA10C5CF5A}
@@ -212,5 +224,7 @@ Global
212224
{C883AC18-7C6A-482E-A9D7-C44DF8633425} = {BE6011CC-1200-4957-B01F-FCCA10C5CF5A}
213225
{0331559E-4ED1-4A56-9C35-3EAD4D7E696D} = {67B576EE-02FA-4F9B-94BC-3630BC09ECE5}
214226
{89B83007-71E6-4B57-BA78-2544BFA476DB} = {BE6011CC-1200-4957-B01F-FCCA10C5CF5A}
227+
{111E7029-BB29-4039-9B45-04776798A8DD} = {BE6011CC-1200-4957-B01F-FCCA10C5CF5A}
228+
{164F55EC-9412-4CD4-81AD-3598B57632A6} = {67B576EE-02FA-4F9B-94BC-3630BC09ECE5}
215229
EndGlobalSection
216230
EndGlobal
Lines changed: 187 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,187 @@
1+
// Licensed to Elasticsearch B.V under one or more agreements.
2+
// Elasticsearch B.V licenses this file to you under the Apache 2.0 License.
3+
// See the LICENSE file in the project root for more information
4+
5+
using System.Collections;
6+
using System.Security.Cryptography;
7+
using System.Text;
8+
9+
namespace Elastic.Documentation.LegacyDocs;
10+
11+
/// <summary>
/// A Bloom filter over strings, backed by a fixed-size bit array and <see cref="HashCount"/> seeded SHA-256 hashes per item.
/// <see cref="Check"/> returns false only for items that were never added; it may return true for items that were not added.
/// </summary>
internal sealed class BloomFilter
{
	/// <summary>
	/// The bit array for the filter.
	/// </summary>
	private readonly BitArray _bitArray;

	/// <summary>
	/// The size of the bit array, in bits.
	/// </summary>
	private int Size => _bitArray.Length;

	/// <summary>
	/// The number of hash functions (seeds) applied to every item.
	/// </summary>
	private int HashCount { get; }

	/// <summary>
	/// Creates an empty filter with an explicit bit count and hash count. Used by <see cref="Load"/>.
	/// </summary>
	/// <param name="size">Number of bits in the filter; must be greater than zero.</param>
	/// <param name="hashCount">Number of hash functions; must be greater than zero.</param>
	/// <exception cref="ArgumentOutOfRangeException">Either argument is zero or negative.</exception>
	private BloomFilter(int size, int hashCount)
	{
		if (size <= 0)
			throw new ArgumentOutOfRangeException(nameof(size), "Size must be greater than zero.");
		if (hashCount <= 0)
			throw new ArgumentOutOfRangeException(nameof(hashCount), "Hash count must be greater than zero.");

		_bitArray = new BitArray(size);
		HashCount = hashCount;
	}

	/// <summary>
	/// Initializes a new BloomFilter with optimal parameters based on expected items and false positive probability.
	/// </summary>
	/// <param name="expectedItems">The expected number of items to be stored.</param>
	/// <param name="falsePositiveProbability">The desired false positive probability (e.g., 0.01 for 1%).</param>
	/// <exception cref="ArgumentOutOfRangeException">
	/// <paramref name="expectedItems"/> is not positive, or <paramref name="falsePositiveProbability"/> is not strictly between 0 and 1.
	/// </exception>
	private BloomFilter(int expectedItems, double falsePositiveProbability)
	{
		if (expectedItems <= 0)
			throw new ArgumentOutOfRangeException(nameof(expectedItems), "Expected items must be greater than zero.");
		if (falsePositiveProbability is <= 0.0 or >= 1.0)
			throw new ArgumentOutOfRangeException(nameof(falsePositiveProbability), "False positive probability must be between 0 and 1.");

		var size = GetOptimalSize(expectedItems, falsePositiveProbability);
		var hashCount = GetOptimalHashCount(size, expectedItems);

		_bitArray = new BitArray(size);
		HashCount = hashCount;
	}

	/// <summary>
	/// Adds an item to the Bloom Filter.
	/// </summary>
	/// <param name="item">The item to add. The string will be UTF-8 encoded for hashing.</param>
	private void Add(string item)
	{
		var itemBytes = Encoding.UTF8.GetBytes(item);
		for (var i = 0; i < HashCount; i++)
		{
			var hash = GetHash(itemBytes, i);
			_bitArray[hash] = true;
		}
	}

	/// <summary>
	/// Checks if an item is possibly in the set.
	/// </summary>
	/// <param name="item">The item to check. The string will be UTF-8 encoded for hashing.</param>
	/// <returns>False if the item is definitely not in the set, True if it might be.</returns>
	public bool Check(string item)
	{
		var itemBytes = Encoding.UTF8.GetBytes(item);
		for (var i = 0; i < HashCount; i++)
		{
			var hash = GetHash(itemBytes, i);
			if (!_bitArray[hash])
				return false;
		}
		return true;
	}

	/// <summary>
	/// Maps the input data and a seed to a bit index in the range [0, <see cref="Size"/>).
	/// The data followed by the 4-byte seed is hashed with SHA-256 and the first 4 bytes of the digest are reduced modulo the bit count.
	/// </summary>
	/// <remarks>
	/// The seed and the digest prefix are converted with an explicit little-endian byte order instead of
	/// <see cref="BitConverter"/>, whose byte order follows the host architecture. The filter is persisted to disk,
	/// so the computed bit positions must not depend on the machine that builds or reads the file.
	/// On little-endian hosts the result is identical to the previous BitConverter-based computation.
	/// </remarks>
	private int GetHash(byte[] data, int seed)
	{
		var combinedBytes = new byte[data.Length + sizeof(int)];
		Buffer.BlockCopy(data, 0, combinedBytes, 0, data.Length);
		System.Buffers.Binary.BinaryPrimitives.WriteInt32LittleEndian(combinedBytes.AsSpan(data.Length), seed);
		var hashBytes = SHA256.HashData(combinedBytes);
		var hashInt = System.Buffers.Binary.BinaryPrimitives.ReadInt32LittleEndian(hashBytes);
		// The remainder's magnitude is always smaller than the bit count, so Math.Abs cannot overflow here.
		return Math.Abs(hashInt % _bitArray.Length);
	}

	/// <summary>
	/// Creates a new BloomFilter from a collection of items.
	/// </summary>
	/// <param name="items">The collection of string items to add. It is enumerated once and copied.</param>
	/// <param name="falsePositiveProbability">The desired false positive probability.</param>
	/// <returns>A new BloomFilter instance populated with the items.</returns>
	/// <exception cref="ArgumentOutOfRangeException">
	/// <paramref name="items"/> is empty, or <paramref name="falsePositiveProbability"/> is not strictly between 0 and 1.
	/// </exception>
	public static BloomFilter FromCollection(IEnumerable<string> items, double falsePositiveProbability)
	{
		// Materialize first: the item count is needed to size the filter before any item is added.
		var itemList = new List<string>(items);
		var filter = new BloomFilter(itemList.Count, falsePositiveProbability);
		foreach (var item in itemList)
			filter.Add(item);

		return filter;
	}

	// --- Persistence Methods ---

	/// <summary>
	/// Saves the Bloom Filter's state to a binary file.
	/// The format is: [4-byte Size int][4-byte HashCount int][bit array bytes]
	/// </summary>
	/// <param name="filePath">The path to the file. An existing file is overwritten.</param>
	public void Save(string filePath)
	{
		using var stream = File.Open(filePath, FileMode.Create);
		using var writer = new BinaryWriter(stream);
		// 1. Write the Size and HashCount as integers
		writer.Write(Size);
		writer.Write(HashCount);

		// 2. Write the bit array
		var bitArrayBytes = new byte[GetByteCount(Size)];
		_bitArray.CopyTo(bitArrayBytes, 0);
		writer.Write(bitArrayBytes);
	}

	/// <summary>
	/// Loads a Bloom Filter from a binary file written by <see cref="Save"/>.
	/// </summary>
	/// <param name="filePath">The path to the file containing the filter data.</param>
	/// <returns>A new BloomFilter instance.</returns>
	/// <exception cref="ArgumentOutOfRangeException">The stored size or hash count is zero or negative.</exception>
	/// <exception cref="InvalidDataException">The file contains fewer bit-array bytes than its header announces.</exception>
	public static BloomFilter Load(string filePath)
	{
		using var stream = File.OpenRead(filePath);
		using var reader = new BinaryReader(stream);
		// 1. Read metadata (Size and HashCount)
		var size = reader.ReadInt32();
		var hashCount = reader.ReadInt32();

		// 2. Create a new filter with the loaded parameters (this also validates them)
		var filter = new BloomFilter(size, hashCount);

		// 3. Read the bit array data
		var byteCount = GetByteCount(size);
		var bitArrayBytes = reader.ReadBytes(byteCount);

		// ReadBytes returns a shorter array when the stream ends early; fail with a clear
		// message instead of an IndexOutOfRangeException in the loop below.
		if (bitArrayBytes.Length != byteCount)
			throw new InvalidDataException($"Bloom filter file '{filePath}' is truncated: expected {byteCount} bytes of bit data but found {bitArrayBytes.Length}.");

		// Re-initialize the internal BitArray with the loaded data (bit i lives in byte i / 8, bit position i % 8)
		for (var i = 0; i < size; i++)
		{
			if ((bitArrayBytes[i / 8] & (1 << (i % 8))) != 0)
				filter._bitArray[i] = true;
		}

		return filter;
	}

	/// <summary>
	/// Number of bytes needed to hold <paramref name="bitCount"/> bits, rounded up.
	/// Written as (n - 1) / 8 + 1 so that it cannot overflow for bit counts close to <see cref="int.MaxValue"/>.
	/// </summary>
	/// <param name="bitCount">A positive number of bits.</param>
	private static int GetByteCount(int bitCount) => ((bitCount - 1) / 8) + 1;

	// --- Optimal Parameter Calculation ---

	/// <summary>
	/// Calculates the optimal size of the bit array (m).
	/// Formula: m = - (n * log(p)) / (log(2)^2)
	/// </summary>
	private static int GetOptimalSize(int n, double p) => (int)Math.Ceiling(-1 * (n * Math.Log(p)) / Math.Pow(Math.Log(2), 2));

	/// <summary>
	/// Calculates the optimal number of hash functions (k).
	/// Formula: k = (m/n) * log(2)
	/// </summary>
	private static int GetOptimalHashCount(int m, int n) => (int)Math.Ceiling((double)m / n * Math.Log(2));
}
Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
<Project Sdk="Microsoft.NET.Sdk">
2+
3+
<PropertyGroup>
4+
<TargetFramework>net9.0</TargetFramework>
5+
<OutputType>Library</OutputType>
6+
<ImplicitUsings>enable</ImplicitUsings>
7+
<Nullable>enable</Nullable>
8+
<RootNamespace>Elastic.Documentation.LegacyDocs</RootNamespace>
9+
</PropertyGroup>
10+
11+
<ItemGroup>
12+
<PackageReference Include="System.IO.Abstractions" />
13+
</ItemGroup>
14+
15+
<ItemGroup>
16+
<ProjectReference Include="..\Elastic.Documentation.Configuration\Elastic.Documentation.Configuration.csproj" />
17+
</ItemGroup>
18+
19+
<ItemGroup>
20+
<EmbeddedResource Include="legacy-pages.bloom.bin" />
21+
</ItemGroup>
22+
23+
</Project>
Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
// Licensed to Elasticsearch B.V under one or more agreements.
2+
// Elasticsearch B.V licenses this file to you under the Apache 2.0 License.
3+
// See the LICENSE file in the project root for more information
4+
5+
using System.IO.Abstractions;
6+
using Elastic.Documentation.Configuration;
7+
8+
namespace Elastic.Documentation.LegacyDocs;
9+
10+
/// <summary>
/// Answers whether a path is present in the legacy docs by consulting a Bloom filter stored in
/// <c>legacy-pages.bloom.bin</c>, and can regenerate that file from an <see cref="IPagesProvider"/>.
/// </summary>
/// <param name="fs">File system abstraction used to resolve the Bloom filter binary's file info.</param>
public class LegacyPageChecker(IFileSystem fs)
{
	// Location of the Bloom filter binary, resolved below Paths.WorkingDirectoryRoot.
	private readonly string _bloomFilterBinaryPath = Path.Combine(Paths.WorkingDirectoryRoot.FullName, "src", "Elastic.Documentation.LegacyDocs", "legacy-pages.bloom.bin");

	// Loaded on first use by PathExists and then reused.
	private BloomFilter? _bloomFilter;

	/// <summary>
	/// Checks the Bloom filter for <paramref name="path"/>, loading the filter from disk on the first call.
	/// </summary>
	/// <param name="path">The path to look up.</param>
	/// <returns>False if the path is definitely not in the filter, true if it might be.</returns>
	public bool PathExists(string path)
	{
		var filter = _bloomFilter ??= LoadBloomFilter();
		return filter.Check(path);
	}

	/// <summary>
	/// Resolves the binary through the injected file system and loads the filter unless one is already cached.
	/// </summary>
	private BloomFilter LoadBloomFilter()
	{
		var binaryFile = fs.FileInfo.New(_bloomFilterBinaryPath);
		return _bloomFilter ??= BloomFilter.Load(binaryFile.FullName);
	}

	/// <summary>
	/// Builds a Bloom filter (false positive probability 0.001) from every page the provider returns,
	/// prints the number of distinct pages to the console, and writes the filter to the binary path.
	/// </summary>
	/// <param name="pagesProvider">Source of the page paths to store in the filter.</param>
	public void GenerateBloomFilterBinary(IPagesProvider pagesProvider)
	{
		var providedPages = pagesProvider.GetPages();
		// Reuse the provider's array when it already is one; otherwise materialize the sequence once.
		var pageArray = providedPages is string[] alreadyArray ? alreadyArray : providedPages.ToArray();
		var distinctPages = new HashSet<string>(pageArray);

		var filter = BloomFilter.FromCollection(pageArray, 0.001);
		Console.WriteLine(distinctPages.Count);
		filter.Save(_bloomFilterBinaryPath);
	}
}
Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
// Licensed to Elasticsearch B.V under one or more agreements.
2+
// Elasticsearch B.V licenses this file to you under the Apache 2.0 License.
3+
// See the LICENSE file in the project root for more information
4+
5+
namespace Elastic.Documentation.LegacyDocs;
6+
7+
/// <summary>
/// A source of page paths.
/// </summary>
public interface IPagesProvider
{
	/// <summary>
	/// Returns the page paths known to this provider.
	/// </summary>
	/// <returns>A sequence of page paths.</returns>
	IEnumerable<string> GetPages();
}
11+
12+
/// <summary>
/// Gets pages from a local checked-out elastic/built-docs repository
/// </summary>
/// <param name="gitRepositoryPath">Path to the root of the local repository checkout; pages are read from its <c>html/en</c> directory.</param>
public class LocalPagesProvider(string gitRepositoryPath) : IPagesProvider
{
	/// <summary>
	/// Enumerates every <c>*.html</c> file below <c>html/en</c> (recursively) and maps each one to
	/// <c>/guide/</c> followed by its path relative to <c>html</c>, using forward slashes.
	/// </summary>
	/// <returns>A lazily evaluated sequence of page paths.</returns>
	public IEnumerable<string> GetPages()
	{
		var htmlRoot = Path.Combine(gitRepositoryPath, "html");
		var englishRoot = Path.Combine(gitRepositoryPath, "html", "en");

		return Directory
			.EnumerateFiles(englishRoot, "*.html", SearchOption.AllDirectories)
			// Normalize Windows separators so the result is the same on every platform.
			.Select(file => "/guide/" + Path.GetRelativePath(htmlRoot, file).Replace('\\', '/'));
	}
}
Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
# Legacy Docs
2+
3+
## Legacy Page Checker
4+
5+
The legacy page checker is a tool that checks whether a URL exists in the legacy docs system (https://www.elastic.co/guide).
6+
It uses a checked-in bloom filter file, loaded into memory, to check whether a URL exists.
7+
8+
### How to create or update the bloom filter file
9+
10+
The bloom filter file is created by running the following command:
11+
12+
```
13+
dotnet run --project src/tooling/docs-assembler -- legacy-docs create-bloom-bin --built-docs-dir /path/to/elastic/built-docs
14+
```
15+
16+
1. The `--built-docs-dir` option is the path to the locally checked-out [elastic/built-docs](https://github.com/elastic/built-docs) repository.
817 KB
Binary file not shown.

src/Elastic.Documentation/Legacy/ILegacyUrlMapper.cs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,9 +4,9 @@
44

55
namespace Elastic.Documentation.Legacy;
66

7-
public record LegacyPageMapping(string Url, string Version)
7+
public record LegacyPageMapping(string RawUrl, string Version, bool Exists)
88
{
9-
public override string ToString() => Url.Replace("/current/", $"/{Version}/");
9+
public override string ToString() => RawUrl.Replace("/current/", $"/{Version}/");
1010
};
1111

1212
public interface ILegacyUrlMapper

src/Elastic.Markdown/Elastic.Markdown.csproj

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828
</ItemGroup>
2929

3030
<ItemGroup>
31+
<ProjectReference Include="..\Elastic.Documentation.LegacyDocs\Elastic.Documentation.LegacyDocs.csproj" />
3132
<ProjectReference Include="..\Elastic.Documentation.Site\Elastic.Documentation.Site.csproj" />
3233
<ProjectReference Include="..\Elastic.Documentation.LinkIndex\Elastic.Documentation.LinkIndex.csproj" />
3334
<ProjectReference Include="..\Elastic.Documentation\Elastic.Documentation.csproj" />

0 commit comments

Comments
 (0)