Skip to content

Commit 30530fc

Browse files
feat: expose CosineSimilarity with robust safeguards
1 parent 3859704 commit 30530fc

File tree

4 files changed

+100
-44
lines changed

4 files changed

+100
-44
lines changed

src/SemanticChunker.NET.Tests/SemanticChunkerNET.Tests.csproj

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -14,12 +14,12 @@
1414
<PrivateAssets>all</PrivateAssets>
1515
<IncludeAssets>runtime; build; native; contentfiles; analyzers; buildtransitive</IncludeAssets>
1616
</PackageReference>
17-
<PackageReference Include="Microsoft.NET.Test.Sdk" Version="17.14.1" />
18-
<PackageReference Include="Microsoft.SemanticKernel" Version="1.61.0" />
19-
<PackageReference Include="OllamaApiFacade" Version="1.3.0" />
17+
<PackageReference Include="Microsoft.NET.Test.Sdk" Version="18.0.1" />
18+
<PackageReference Include="Microsoft.SemanticKernel" Version="1.67.1" />
19+
<PackageReference Include="OllamaApiFacade" Version="1.3.1" />
2020
<PackageReference Include="Shouldly" Version="4.3.0" />
2121
<PackageReference Include="xunit" Version="2.9.3" />
22-
<PackageReference Include="xunit.runner.visualstudio" Version="3.1.3">
22+
<PackageReference Include="xunit.runner.visualstudio" Version="3.1.5">
2323
<PrivateAssets>all</PrivateAssets>
2424
<IncludeAssets>runtime; build; native; contentfiles; analyzers; buildtransitive</IncludeAssets>
2525
</PackageReference>

src/SemanticChunker.NET.Tests/SemanticChunkerTests.cs

Lines changed: 50 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -183,7 +183,7 @@ public async Task E5Model_CosineSimilarities_AreWithinExpectedRange()
183183
{
184184
for (var j = i + 1; j < inputs.Length; j++)
185185
{
186-
similarities.Add(CosineSimilarity(embeddings[i].Vector.ToArray(),
186+
similarities.Add(SemanticChunker.CosineSimilarity(embeddings[i].Vector.ToArray(),
187187
embeddings[j].Vector.ToArray()));
188188
}
189189
}
@@ -220,6 +220,55 @@ public async Task CreateChunksAsync_TwoSimilarSentences_ReturnsOneOrTwoChunksCon
220220
}
221221
}
222222

223+
[Fact]
224+
public void Cosine_IdenticalVectors_IsOne()
225+
{
226+
var v = new float[] { 1, 2, 3, 4 };
227+
SemanticChunker.CosineSimilarity(v, v).ShouldBe(1.0, 1e-12);
228+
}
229+
230+
[Fact]
231+
public void Cosine_OppositeVectors_IsMinusOne()
232+
{
233+
var a = new float[] { 1, 0, -2 };
234+
var b = new float[] { -1, 0, 2 };
235+
SemanticChunker.CosineSimilarity(a, b).ShouldBe(-1.0, 1e-12);
236+
}
237+
238+
[Fact]
239+
public void Cosine_Orthogonal_IsZero()
240+
{
241+
var a = new float[] { 1, 0 };
242+
var b = new float[] { 0, 5 };
243+
SemanticChunker.CosineSimilarity(a, b).ShouldBe(0.0, 1e-12);
244+
}
245+
246+
[Fact]
247+
public void Cosine_ZeroVector_ReturnsZero()
248+
{
249+
var a = new float[] { 0, 0, 0 };
250+
var b = new float[] { 1, 2, 3 };
251+
SemanticChunker.CosineSimilarity(a, b).ShouldBe(0.0, 1e-12);
252+
}
253+
254+
[Fact]
255+
public void Cosine_DifferentLengths_Throws()
256+
{
257+
var a = new float[] { 1, 2, 3 };
258+
var b = new float[] { 1, 2 };
259+
Should.Throw<ArgumentException>(() => SemanticChunker.CosineSimilarity(a, b));
260+
}
261+
262+
[Fact]
263+
public void Cosine_IsBoundedInMinusOneToOne()
264+
{
265+
var rnd = new Random(42);
266+
var a = Enumerable.Range(0, 1000).Select(_ => (float)rnd.NextDouble()).ToArray();
267+
var b = Enumerable.Range(0, 1000).Select(_ => (float)rnd.NextDouble()).ToArray();
268+
var cos = SemanticChunker.CosineSimilarity(a, b);
269+
cos.ShouldBeInRange(-1.0, 1.0);
270+
}
271+
223272
private static double AnalyzeSemanticCoherence(IList<Chunk> chunks)
224273
{
225274
var maxCoherence = 0.0;
@@ -245,23 +294,4 @@ private static double AnalyzeSemanticCoherence(IList<Chunk> chunks)
245294

246295
return maxCoherence;
247296
}
248-
249-
private static double CosineSimilarity(float[] vectorA, float[] vectorB)
250-
{
251-
if (vectorA.Length != vectorB.Length)
252-
throw new ArgumentException("Vektoren müssen gleiche Länge haben");
253-
254-
double dotProduct = 0.0;
255-
double normA = 0.0;
256-
double normB = 0.0;
257-
258-
for (var i = 0; i < vectorA.Length; i++)
259-
{
260-
dotProduct += vectorA[i] * vectorB[i];
261-
normA += vectorA[i] * vectorA[i];
262-
normB += vectorB[i] * vectorB[i];
263-
}
264-
265-
return dotProduct / (Math.Sqrt(normA) * Math.Sqrt(normB));
266-
}
267297
}

src/SemanticChunker.NET/SemanticChunker.cs

Lines changed: 44 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -100,6 +100,50 @@ public async Task<IList<Chunk>> CreateChunksAsync(string text, CancellationToken
100100
return await AssembleChunksAsync(sentences, breakpoints, minChunkChars, cancellationToken);
101101
}
102102

103+
/// <summary>
104+
/// Calculates the cosine similarity between two vectors of equal length.
105+
/// </summary>
106+
/// <param name="vectorA">The first vector.</param>
107+
/// <param name="vectorB">The second vector.</param>
108+
/// <returns>
109+
/// A value between -1 and 1 representing the cosine similarity:
110+
/// 1 indicates identical direction, 0 indicates orthogonality, and -1 indicates opposite direction.
111+
/// Returns 0 if either vector has zero magnitude.
112+
/// </returns>
113+
/// <exception cref="ArgumentNullException">
114+
/// Thrown if <paramref name="vectorA"/> or <paramref name="vectorB"/> is null.
115+
/// </exception>
116+
/// <exception cref="ArgumentException">
117+
/// Thrown if the vectors do not have the same length.
118+
/// </exception>
119+
public static double CosineSimilarity(IReadOnlyList<float> vectorA, IReadOnlyList<float> vectorB)
120+
{
121+
if (vectorA is null) throw new ArgumentNullException(nameof(vectorA));
122+
if (vectorB is null) throw new ArgumentNullException(nameof(vectorB));
123+
if (vectorA.Count != vectorB.Count) throw new ArgumentException("Vectors must have the same length.");
124+
125+
double dot = 0, na = 0, nb = 0;
126+
127+
for (int i = 0; i < vectorA.Count; i++)
128+
{
129+
var ai = vectorA[i];
130+
var bi = vectorB[i];
131+
dot += ai * bi;
132+
na += ai * ai;
133+
nb += bi * bi;
134+
}
135+
136+
var denom = Math.Sqrt(na) * Math.Sqrt(nb);
137+
if (denom == 0) return 0;
138+
139+
var cos = dot / denom;
140+
141+
if (cos > 1) return 1;
142+
if (cos < -1) return -1;
143+
144+
return cos;
145+
}
146+
103147
private static IList<string> SplitIntoSentences(string text)
104148
{
105149
var result = new List<string>();
@@ -235,8 +279,6 @@ private static double ThresholdFromTargetCount(IReadOnlyList<double> distances,
235279
return Percentile(distances, percentile);
236280
}
237281

238-
// ---------- Statistics helpers ----------
239-
240282
private static double Percentile(IReadOnlyList<double> sequence, double p)
241283
{
242284
double[] sorted = sequence.OrderBy(v => v).ToArray();
@@ -270,20 +312,4 @@ private static double[] Gradient(IReadOnlyList<double> sequence)
270312

271313
return g;
272314
}
273-
274-
private static double CosineSimilarity(IReadOnlyList<float> a, IReadOnlyList<float> b)
275-
{
276-
double dot = 0;
277-
double magnitudeA = 0;
278-
double magnitudeB = 0;
279-
280-
for (int i = 0; i < a.Count; i++)
281-
{
282-
dot += a[i] * b[i];
283-
magnitudeA += a[i] * a[i];
284-
magnitudeB += b[i] * b[i];
285-
}
286-
287-
return dot / (Math.Sqrt(magnitudeA) * Math.Sqrt(magnitudeB));
288-
}
289315
}

src/SemanticChunker.NET/SemanticChunkerNET.csproj

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88

99
<PropertyGroup>
1010
<PackageId>SemanticChunker.NET</PackageId>
11-
<Version>1.0.1</Version>
11+
<Version>1.1.0</Version>
1212
<Authors>Gregor Biswanger</Authors>
1313
<Description>
1414
SemanticChunker.NET delivers automatic Semantic Chunking for Retrieval‑Augmented Generation in .NET. The library splits long documents into embedding‑aware, context‑preserving chunks that fit your LLM’s token budget. Compatible with Microsoft.Extensions.AI and Semantic Kernel, featuring four breakpoint strategies, target‑chunk mode, multilingual sentence detection and token‑limit safety.
@@ -36,7 +36,7 @@
3636

3737
<ItemGroup>
3838
<PackageReference Include="ICU4N" Version="60.1.0-alpha.438" />
39-
<PackageReference Include="Microsoft.Extensions.AI.Abstractions" Version="9.7.1" />
39+
<PackageReference Include="Microsoft.Extensions.AI.Abstractions" Version="10.0.0" />
4040
</ItemGroup>
4141

4242
<ItemGroup>

0 commit comments

Comments
 (0)