Skip to content

Commit bc0d8ec

Browse files
authored
Merge pull request #55 from Alokzh/Jensen-Shannon-Implementation
Jensen shannon implementation
2 parents 9c7f505 + d954322 commit bc0d8ec

File tree

2 files changed

+235
-0
lines changed

2 files changed

+235
-0
lines changed
Lines changed: 120 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,120 @@
1+
"
I am the SUnit test case for AIJensenShannonDistance.

Every test sends distanceBetween:and: to the instance that setUp stores in the jensenShannonDistance instance variable.
"
Class {
	#name : 'AIJensenShannonDistanceTest',
	#superclass : 'TestCase',
	#instVars : [
		'jensenShannonDistance'
	],
	#category : 'AI-EditDistances-Tests',
	#package : 'AI-EditDistances-Tests'
}
10+
11+
{ #category : 'running' }
AIJensenShannonDistanceTest >> setUp [
	"Run the inherited set-up, then create the AIJensenShannonDistance instance the tests use."

	super setUp.
	jensenShannonDistance := AIJensenShannonDistance new
]
16+
17+
{ #category : 'tests' }
AIJensenShannonDistanceTest >> testDistanceCaseSensitive [
	"Upper- and lower-case letters are distinct characters, so 'abc' and 'ABC' share nothing and sit at the maximum distance.
	The result is produced by Float division, ln and sqrt on probabilities of 1/3, so it is compared with a tolerance rather than with exact Float equality."

	self
		assert: (jensenShannonDistance distanceBetween: 'abc' and: 'ABC')
		closeTo: 1.0
]
22+
23+
{ #category : 'tests' }
AIJensenShannonDistanceTest >> testDistanceCompletelyDifferentStrings [
	"Strings that share no character are at the maximum distance.
	The expected 1.0 is the outcome of Float division, ln and sqrt (with probabilities such as 1/3), so it is compared with a tolerance rather than with exact Float equality."

	self
		assert: (jensenShannonDistance distanceBetween: 'AAA' and: 'BBB')
		closeTo: 1.0.

	self
		assert: (jensenShannonDistance distanceBetween: 'ABC' and: 'XYZ')
		closeTo: 1.0
]
30+
31+
{ #category : 'tests' }
AIJensenShannonDistanceTest >> testDistanceEmptyStrings [
	"Two empty strings are at distance zero."

	| distance |
	distance := jensenShannonDistance distanceBetween: '' and: ''.
	self assert: distance equals: 0.0
]
36+
37+
{ #category : 'tests' }
AIJensenShannonDistanceTest >> testDistanceIdenticalStrings [
	"A string compared with an equal string is at distance zero."

	self
		assert: (jensenShannonDistance distanceBetween: 'ABC' and: 'ABC')
		equals: 0.0;
		assert: (jensenShannonDistance distanceBetween: 'HELLO' and: 'HELLO')
		equals: 0.0
]
44+
45+
{ #category : 'tests' }
AIJensenShannonDistanceTest >> testDistanceOneEmptyString [
	"When exactly one argument is empty the distance is 1.0, whichever side the empty string is on."

	self
		assert: (jensenShannonDistance distanceBetween: '' and: 'ABC')
		equals: 1.0;
		assert: (jensenShannonDistance distanceBetween: 'ABC' and: '')
		equals: 1.0
]
52+
53+
{ #category : 'tests' }
AIJensenShannonDistanceTest >> testDistancePartialOverlap [
	"Strings sharing some, but not all, characters fall strictly between the two extremes."

	| threeOfFourShared twoOfThreeShared |
	threeOfFourShared := jensenShannonDistance distanceBetween: 'ABCD' and: 'ABCE'.
	self assert: threeOfFourShared equals: 0.5.

	twoOfThreeShared := jensenShannonDistance distanceBetween: 'ABC' and: 'ACE'.
	self assert: (twoOfThreeShared between: 0.4 and: 0.8)
]
63+
64+
{ #category : 'tests' }
AIJensenShannonDistanceTest >> testDistanceSameDistribution [
	"Strings of different lengths but with the same relative character frequencies are at distance zero."

	self
		assert: (jensenShannonDistance distanceBetween: 'AAA' and: 'AAAA')
		equals: 0.0;
		assert: (jensenShannonDistance distanceBetween: 'AB' and: 'AABB')
		equals: 0.0
]
70+
71+
{ #category : 'tests' }
AIJensenShannonDistanceTest >> testDistanceSymmetric [
	"Swapping the two arguments must not change the distance."

	| forward backward |
	forward := jensenShannonDistance distanceBetween: 'HELLO' and: 'WORLD'.
	backward := jensenShannonDistance distanceBetween: 'WORLD' and: 'HELLO'.
	self assert: forward equals: backward
]
79+
80+
{ #category : 'tests' }
AIJensenShannonDistanceTest >> testDistanceValidRange [
	"The distance of partially overlapping strings lies within [0, 1]."

	| distance |
	distance := jensenShannonDistance distanceBetween: 'AABB' and: 'BBCC'.
	self assert: (distance >= 0.0 and: [ distance <= 1.0 ])
]
87+
88+
{ #category : 'tests' }
AIJensenShannonDistanceTest >> testDistanceWithLargeStrings [
	"Exercise longer inputs: the full alphabet against itself, then two 28-character strings that differ only in their last run of four characters."

	| left right |
	left := 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'.
	right := 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'.
	self
		assert: (jensenShannonDistance distanceBetween: left and: right)
		equals: 0.0.

	left := 'AAAABBBBCCCCDDDDEEEEFFFFGGGG'.
	right := 'AAAABBBBCCCCDDDDEEEEFFFFHHHH'.
	self assert:
		((jensenShannonDistance distanceBetween: left and: right)
			 between: 0.2
			 and: 0.4)
]
102+
103+
{ #category : 'tests' }
AIJensenShannonDistanceTest >> testDistanceWithNumericStrings [
	"Digit characters are handled like any other character.
	The identical case yields an exact 0.0. The disjoint case is computed through Float division, ln and sqrt on probabilities of 1/3, so it is compared with a tolerance rather than with exact Float equality."

	self
		assert: (jensenShannonDistance distanceBetween: '123' and: '123')
		equals: 0.0.

	self
		assert: (jensenShannonDistance distanceBetween: '123' and: '456')
		closeTo: 1.0
]
114+
115+
{ #category : 'tests' }
AIJensenShannonDistanceTest >> testDistanceWithSpecialCharacters [
	"Punctuation and symbol characters are handled like any other character.
	The disjoint case uses strings made only of special characters; the digit-only pair is covered by testDistanceWithNumericStrings.
	The 1.0 result is computed through Float division, ln and sqrt, so it is compared with a tolerance."

	self
		assert: (jensenShannonDistance distanceBetween: 'A@#' and: 'A@#')
		equals: 0.0.

	self
		assert: (jensenShannonDistance distanceBetween: '!@#' and: '$%^')
		closeTo: 1.0
]
Lines changed: 115 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,115 @@
1+
"
2+
Jensen-Shannon Distance Algorithm
3+
The Jensen-Shannon distance is a metric derived from the Jensen-Shannon divergence, which measures the similarity between two probability distributions. It is symmetric, always finite,
4+
and produces values between 0 and 1, where 0 indicates identical character distributions (not necessarily identical strings: 'AB' and 'AABB' are at distance 0) and 1 indicates maximum dissimilarity.
5+
6+
For string comparison:
7+
1. We convert strings to probability distributions
8+
2. Calculate the Jensen-Shannon divergence between these distributions
9+
3. Take the square root to get the distance
10+
11+
The Jensen-Shannon distance is calculated as:
12+
JS_Distance(P, Q) = sqrt(JS_Divergence(P, Q))
13+
14+
where the Jensen-Shannon Divergence is:
15+
JS_Divergence(P, Q) = 1/2 * KL(P || M) + 1/2 * KL(Q || M)
16+
17+
where:
18+
- M = 1/2 * (P + Q) is the pointwise mean of the distributions
19+
- KL is the Kullback-Leibler divergence
20+
21+
Examples:
22+
jsDistance := AIJensenShannonDistance new.
23+
jsDistance distanceBetween: 'ABCD' and: 'ABCE'. ""Returns: a value close to 0.5""
24+
jsDistance distanceBetween: 'AAAA' and: 'AAAA'. ""Returns: 0.0""
25+
jsDistance distanceBetween: 'AAAA' and: 'BBBB'. ""Returns: 1.0""
26+
"
27+
Class {
	#name : 'AIJensenShannonDistance',
	#superclass : 'AIAbstractEditDistance',
	#category : 'AI-EditDistances-Distances',
	#package : 'AI-EditDistances',
	#tag : 'Distances'
}
34+
35+
{ #category : 'api' }
AIJensenShannonDistance >> distanceBetween: firstString and: secondString [
	"Answer the Jensen-Shannon distance between the character-frequency distributions of firstString and secondString, as a Float within [0, 1].
	0.0 means both strings have the same character distribution; 1.0 is the maximum dissimilarity.
	Two empty strings answer 0.0. If exactly one string is empty the answer is 1.0."

	| firstDist secondDist jsDivergence |
	(firstString isEmpty or: [ secondString isEmpty ]) ifTrue: [
		^ (firstString isEmpty and: [ secondString isEmpty ])
			  ifTrue: [ 0.0 ]
			  ifFalse: [ 1.0 ] ].

	firstDist := self getProbabilityDistribution: firstString.
	secondDist := self getProbabilityDistribution: secondString.
	jsDivergence := self
		                jensenShannonDivergence: firstDist
		                and: secondDist.

	"The base-2 divergence is mathematically within [0, 1], but Float rounding can push it marginally past either bound. Clamp both ends so that sqrt is defined and the answer stays inside the documented range."
	^ ((jsDivergence max: 0.0) min: 1.0) sqrt
]
54+
55+
{ #category : 'private' }
AIJensenShannonDistance >> getProbabilityDistribution: aString [
	"Answer a Dictionary mapping each distinct character of aString to its relative frequency (occurrences / aString size) as a Float.
	An empty string answers an empty Dictionary."

	| charCount dist total |
	charCount := Dictionary new.
	"at:ifAbsent: takes a block for the absent case"
	aString do: [ :char |
		charCount at: char put: (charCount at: char ifAbsent: [ 0 ]) + 1 ].

	"The string length is the same for every character, so convert it once"
	total := aString size asFloat.
	dist := Dictionary new.
	charCount keysAndValuesDo: [ :char :count |
		dist at: char put: count asFloat / total ].

	^ dist
]
71+
72+
{ #category : 'private' }
AIJensenShannonDistance >> jensenShannonDivergence: firstDist and: secondDist [
	"Answer the Jensen-Shannon divergence of two probability distributions, each given as a Dictionary from key to probability:
		1/2 * KL(firstDist || M) + 1/2 * KL(secondDist || M)
	where M is the pointwise mean of the two distributions over the union of their keys."

	| allKeys midpointDist firstKL secondKL |
	allKeys := Set new.
	allKeys
		addAll: firstDist keys;
		addAll: secondDist keys.

	midpointDist := Dictionary new.
	allKeys do: [ :key |
		| p1 p2 |
		"A key missing from one distribution has probability zero there"
		p1 := firstDist at: key ifAbsent: [ 0.0 ].
		p2 := secondDist at: key ifAbsent: [ 0.0 ].
		midpointDist at: key put: (p1 + p2) / 2.0 ].

	firstKL := self kullbackLeiblerDivergence: firstDist to: midpointDist.
	secondKL := self
		            kullbackLeiblerDivergence: secondDist
		            to: midpointDist.

	^ (firstKL + secondKL) / 2.0
]
96+
97+
{ #category : 'private' }
AIJensenShannonDistance >> kullbackLeiblerDivergence: pDist to: qDist [
	"Answer the Kullback-Leibler divergence of pDist relative to qDist, using base-2 logarithms.
	Both arguments are Dictionaries from key to probability. Every key of pDist that has a positive probability is looked up in qDist with at:, so qDist must contain it.
	Entries whose probability is not positive in either distribution contribute nothing to the sum."

	| lnOfTwo sum |
	lnOfTwo := 2 ln.
	sum := 0.0.

	pDist associationsDo: [ :entry |
		| p q |
		p := entry value.
		(p > 0 and: [ (q := qDist at: entry key) > 0 ]) ifTrue: [
			sum := sum + ((p * (p / q) ln) / lnOfTwo) ] ].

	^ sum
]

0 commit comments

Comments
 (0)