Skip to content

Commit 3977214

Browse files
authored
🎇 增加simhash算法以及修复cosine算法的一些bug (#4)
* 🎇 增加simhash算法以及修复cosine算法的一些bug * 🎇 代码覆盖率,simhash在文本长度比较大时覆盖率会大很多。
1 parent bc95bc9 commit 3977214

File tree

9 files changed

+228
-14
lines changed

9 files changed

+228
-14
lines changed

README.md

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -26,9 +26,10 @@ go get -u github.com/antlabs/strsim
2626
* 莱文斯坦-编辑距离(Levenshtein)
2727
* Hamming
2828
* Dice's coefficient
29-
* Jaro
30-
* JaroWinkler
31-
* Cosine similarity algorithm
29+
* Jaro
30+
* JaroWinkler
31+
* Cosine
32+
* Simhash
3233

3334
## 内容
3435
- [比较两个字符串相识度](#比较两个字符串相识度)
@@ -39,6 +40,9 @@ go get -u github.com/antlabs/strsim
3940
- [选择Dice's coefficient](#选择Dice's-coefficient)
4041
- [选择jaro](#选择jaro)
4142
- [选择Hamming](#选择Hamming)
43+
- [选择JaroWinkler](#选择JaroWinkler)
44+
- [选择Cosine](#选择Cosine)
45+
- [选择Simhash](#选择Simhash)
4246
## 比较两个字符串相识度
4347
```go
4448
strsim.Compare("中国人", "")
@@ -86,3 +90,9 @@ strsim.Compare("abc", "ab", strsim.Hamming())
8690
strsim.Compare("abc", "ab", strsim.Cosine())
8791
```
8892

93+
### 选择Simhash
94+
95+
```go
96+
strsim.Compare("abc", "ab", strsim.Simhash())
97+
```
98+

cosine_conf.go

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -5,12 +5,15 @@ import "github.com/antlabs/strsim/similarity"
55
// Cosine returns an OptionFunc that selects the cosine similarity algorithm.
66

77
func Cosine() OptionFunc {
8+
89
return OptionFunc(func(o *option) {
9-
h := &similarity.Cosine{}
10-
o.base64 = true
11-
o.cmp = h.CompareUtf8
12-
if o.ascii {
13-
o.cmp = h.CompareAscii
10+
if o.cmp == nil {
11+
l := similarity.Cosine{}
12+
o.base64 = true
13+
o.cmp = l.CompareUtf8
14+
if o.ascii {
15+
o.cmp = l.CompareAscii
16+
}
1417
}
1518
})
1619

jaro_winkler_conf.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ func JaroWinkler(matchWindow ...int) OptionFunc {
99
if len(matchWindow) > 0 {
1010
mw = matchWindow[0]
1111
}
12-
d := &similarity.Jaro{MatchWindow: mw}
12+
d := &similarity.JaroWinkler{MatchWindow: mw}
1313
o.cmp = d.CompareUtf8
1414
})
1515
}

prev_modify_test.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,8 @@ func Test_ModifyString(t *testing.T) {
2020

2121
o.ignore |= ignoreCase
2222
o.ignore |= ignoreSpace
23+
o.base64 = true
24+
2325
for _, v := range []testCase{
2426
{
2527
test: "hello world",

simhash_conf.go

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
package strsim
2+
3+
import "github.com/antlabs/strsim/similarity"
4+
5+
func Simhash() OptionFunc {
6+
return OptionFunc(func(o *option) {
7+
if o.cmp == nil {
8+
l := similarity.Simhash{}
9+
o.base64 = true
10+
o.cmp = l.CompareUtf8
11+
if o.ascii {
12+
o.cmp = l.CompareAscii
13+
}
14+
}
15+
})
16+
17+
}

similarity/Cosine.go

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -16,10 +16,13 @@ func (c Cosine) CompareAscii(s1, s2 string) float64 {
1616
func (c Cosine) CompareUtf8(utf8Str1, utf8Str2 string) float64 {
1717
l1 := utf8.RuneCountInString(utf8Str1)
1818
l2 := utf8.RuneCountInString(utf8Str2)
19-
dirts1 := make(map[string]int, l1)
20-
dirts2 := make(map[string]int, l2)
19+
//l1 := len(utf8Str1)
20+
//l2 := len(utf8Str2)
21+
l3 := utf8.RuneCountInString(base64Table)
22+
dirts1 := make(map[string]int, l3)
23+
dirts2 := make(map[string]int, l3)
2124
// 将base64Table转化成[]string
22-
base64 := StrToStrs(base64Table, utf8.RuneCountInString(base64Table))
25+
base64 := StrToStrs(base64Table, l3)
2326
// 遍历base64对dirts1和dirts2进行初始化
2427
for _, v := range base64 {
2528
dirts1[v] = 0

similarity/simhash.go

Lines changed: 114 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,114 @@
1+
package similarity
2+
3+
import (
4+
"hash/crc32"
5+
"strconv"
6+
"unicode/utf8"
7+
)
8+
9+
// Simhash is the receiver for the SimHash-based string comparison methods.
// It has no fields, so the zero value is ready to use.
type Simhash struct{}
11+
12+
func (s Simhash) CompareAscii(s1, s2 string) float64 {
13+
return s.CompareUtf8(s1, s2)
14+
15+
}
16+
func (s Simhash) CompareUtf8(utf8Str1, utf8Str2 string) float64 {
17+
// 字符串长度
18+
l1 := utf8.RuneCountInString(utf8Str1)
19+
l2 := utf8.RuneCountInString(utf8Str2)
20+
// 将字符串转换为字符数组
21+
s1s := StrToStrs4(utf8Str1, l1)
22+
s2s := StrToStrs4(utf8Str2, l2)
23+
// 计算每个字符在字符数组中出现的次数
24+
counts1 := make(map[string]int)
25+
counts2 := make(map[string]int)
26+
for _, s := range s1s {
27+
// 如果字符在字符数组中出现过,则计数加1
28+
if _, ok := counts1[s]; ok {
29+
counts1[s]++
30+
} else {
31+
// 如果字符在字符数组中没出现过,则计数设为1
32+
counts1[s] = 1
33+
}
34+
}
35+
for _, s := range s2s {
36+
if _, ok := counts2[s]; ok {
37+
counts2[s]++
38+
} else {
39+
counts2[s] = 1
40+
}
41+
}
42+
h1 := IntsToStr(Dimensionality(merge(hashcodeAndAdd(counts1))))
43+
h2 := IntsToStr(Dimensionality(merge(hashcodeAndAdd(counts2))))
44+
45+
// 计算h1, h2的汉明距离
46+
Hamming := Hamming{}
47+
//fmt.Printf("h1: %s\nh2: %s\n", h1, h2)
48+
49+
return Hamming.CompareUtf8(h1, h2)
50+
51+
}
52+
53+
// Dimensionality reduces every element of ins to a single bit, in place:
// strictly positive values become 1, everything else becomes 0.
//
// The slice passed in is modified and also returned.
func Dimensionality(ins []int) []int {
	for idx, val := range ins {
		bit := 0
		if val > 0 {
			bit = 1
		}
		ins[idx] = bit
	}
	return ins
}
65+
66+
// merge sums the rows of ins column by column and returns the totals.
//
// The result has the length of the first row. Rows shorter than the first row
// simply contribute to fewer columns; a row longer than the first row indexes
// past the result and panics, so callers are expected to pass rows of equal
// length.
//
// An empty ins yields nil instead of panicking on ins[0]; this is the case
// when the compared string produced no tokens at all.
func merge(ins [][]int) []int {
	if len(ins) == 0 {
		return nil
	}

	res := make([]int, len(ins[0]))
	for _, row := range ins {
		for j, v := range row {
			res[j] += v
		}
	}
	return res
}
77+
78+
// 计算hashcode并加权
79+
func hashcodeAndAdd(counts map[string]int) [][]int {
80+
// hashmap
81+
lens := len(counts)
82+
h1 := make([][]int, lens)
83+
// 计算counts1,counts2 中每个字符的hash值, 并且将出现的次数分为5个等级, 将每个字符的hash值与出现的次数等级相乘
84+
c1 := (lens - 1) * 4.0
85+
j := 0
86+
//for j := 0; j < lens; j++ {
87+
for k, v := range counts {
88+
////计算每一个字符串的hash
89+
//for i := 0; i < len(h1); i++ {
90+
// 出现的次数除以5
91+
c := strconv.FormatUint(uint64(crc32.ChecksumIEEE([]byte(k))), 2)
92+
// 将字符串转换为数字数组
93+
cs := Int32StrToInts(c)
94+
if v <= c1/5.0 {
95+
// 加权
96+
h1[j] = Add(cs, 1)
97+
} else if v <= c1/5.0*2 {
98+
// 加权
99+
h1[j] = Add(cs, 2)
100+
} else if v <= c1/5.0*3 {
101+
// 加权
102+
h1[j] = Add(cs, 3)
103+
} else if v <= c1/5.0*4 {
104+
// 加权
105+
h1[j] = Add(cs, 4)
106+
} else {
107+
// 加权
108+
h1[j] = Add(cs, 5)
109+
}
110+
j++
111+
}
112+
113+
return h1
114+
}

similarity/utils.go

Lines changed: 58 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ package similarity
33
import (
44
"encoding/base64"
55
"reflect"
6+
"strconv"
67
"unsafe"
78
)
89

@@ -41,8 +42,63 @@ func Base64Encode(s string) string {
4142
// StrToStrs splits s into single-character strings.
//
// The result always has exactly lenth elements (none when lenth is not
// positive). Element i holds the i-th character (rune) of s; when s has fewer
// than lenth characters the remaining elements are empty strings, and
// characters beyond lenth are ignored.
//
// Iterating by rune rather than by byte index keeps multi-byte characters
// intact and avoids indexing past the end of s when lenth is larger than the
// string. For ASCII input with lenth <= len(s) the result is the same as with
// byte indexing.
func StrToStrs(s string, lenth int) []string {
	if lenth <= 0 {
		return []string{}
	}

	base := make([]string, lenth)
	i := 0
	for _, r := range s {
		if i >= lenth {
			break
		}
		base[i] = string(r)
		i++
	}
	return base
}
50+
51+
// StrToStrs4 cuts the first lenth bytes of s into consecutive 4-byte pieces.
//
// When lenth is not a multiple of four, the final piece holds the remaining
// one to three bytes instead of slicing past the end. lenth is clamped to
// len(s), and a non-positive lenth yields an empty slice. For a lenth that is
// a multiple of four and not larger than len(s), the result is exactly one
// piece per four bytes.
//
// NOTE(review): the slicing is byte-based, so pieces only line up with
// characters for single-byte (e.g. base64/ASCII) input.
func StrToStrs4(s string, lenth int) []string {
	const size = 4

	if lenth > len(s) {
		lenth = len(s)
	}
	if lenth <= 0 {
		return []string{}
	}

	base := make([]string, 0, (lenth+size-1)/size)
	for start := 0; start < lenth; start += size {
		end := start + size
		if end > lenth {
			end = lenth
		}
		base = append(base, s[start:end])
	}
	return base
}
62+
63+
// Add applies weight to a bit vector and returns the weighted vector.
//
// Within the first 32 positions, an element equal to 1 becomes +weight and
// any other element becomes -weight. If bits has fewer than 32 elements, it
// is extended with +weight until it has 32. Elements beyond position 32 are
// left untouched.
//
// Existing elements are rewritten in place, so the caller's slice is
// modified; the returned slice must be used because padding may reallocate.
func Add(bits []int, weight int) []int {
	const width = 32

	// Length before any padding: only these positions hold real bits.
	n := len(bits)
	for i := 0; i < width; i++ {
		switch {
		case i >= n:
			bits = append(bits, weight)
		case bits[i] == 1:
			bits[i] = weight
		default:
			bits[i] = -weight
		}
	}
	return bits
}
80+
81+
// Int32StrToInts converts a string of binary digits into a slice of 32 ints.
//
// Position i of the result is 1 when byte i of ins is '1' and 0 when it is
// '0'; any other byte leaves the position at its zero value. Digits are
// copied from the left, so a string shorter than 32 bytes is padded with
// zeros on the right. A '0' or '1' at position 32 or later is out of range
// for the result and panics.
func Int32StrToInts(ins string) []int {
	bits := make([]int, 32)
	for pos := 0; pos < len(ins); pos++ {
		switch ins[pos] {
		case '1':
			bits[pos] = 1
		case '0':
			bits[pos] = 0
		}
	}
	return bits
}
95+
96+
// IntsToStr concatenates the base-10 representation of every element of ins
// into one string, with no separator. A nil or empty slice yields "".
//
// The digits are appended to a single byte buffer rather than built with
// repeated string concatenation, which would copy the growing result on every
// iteration.
func IntsToStr(ins []int) string {
	buf := make([]byte, 0, len(ins))
	for _, v := range ins {
		buf = strconv.AppendInt(buf, int64(v), 10)
	}
	return string(buf)
}

strsim_test.go

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,9 @@ func Test_Compare_Special(t *testing.T) {
1818
Jaro(),
1919
DiceCoefficient(1),
2020
Hamming(),
21+
Simhash(),
22+
Cosine(),
23+
JaroWinkler(),
2124
} {
2225
sim := Compare(v.arg1, v.arg2, o)
2326
assert.Equal(t, v.sim, sim)
@@ -39,6 +42,9 @@ func Test_FindBestMatchOne(t *testing.T) {
3942
DiceCoefficient(1),
4043
Jaro(),
4144
Default(),
45+
Simhash(),
46+
Cosine(),
47+
JaroWinkler(),
4248
} {
4349
m := FindBestMatchOne(d.key, d.best, o)
4450
assert.Equal(t, m.S, d.need)
@@ -54,6 +60,9 @@ func Test_FindBestMatch(t *testing.T) {
5460
DiceCoefficient(1),
5561
Jaro(),
5662
Default(),
63+
Simhash(),
64+
Cosine(),
65+
JaroWinkler(),
5766
} {
5867
m := FindBestMatch(d.key, d.best, o)
5968
assert.Equal(t, m.Match.S, d.need)

0 commit comments

Comments
 (0)