Skip to content

Commit 110af2a

Browse files
authored
Add custom geo data file (#172)
PR: #172
1 parent f50381a commit 110af2a

19 files changed

+36091
-574
lines changed

README.md

Lines changed: 22 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -99,7 +99,14 @@ Address(
9999
0.0
100100
```
101101

102-
## 1.3 自定义地址设置
102+
## 1.3 自定义地址文件设置
103+
104+
```kotlin
105+
// 文件生成方式见下文
106+
val geocoding = GeocodingX("region_2021.dat")
107+
```
108+
109+
## 1.4 自定义地址设置
103110

104111
```kotlin
105112
// 100000000000 代表中国的ID
@@ -128,20 +135,31 @@ Address(
128135
# 2. 说明
129136

130137
## 2.1 标准地址库
131-
项目目前采用的是 [淘宝物流4级地址][1] 的标准地址库, 也可以采用[国家的标准地址库][2] (对应的github库, [中国5级行政区域mysql库][3]).
138+
项目目前采用的是 [淘宝物流4级地址][1] 的标准地址库, 也可以采用`国家的标准地址库` (对应的github库, [中国5级行政区域mysql库][3]).
139+
140+
* [国家标准地址库2015][2]
141+
* [国家标准地址库2021](http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2021/)
132142

133143
### 导入中国5级行政区域mysql库注意事项
134144

135145
[参考文档](https://github.com/bitlap/geocoding/blob/master/src/test/java/org/bitlap/geocoding/region/README.md)
136146

137-
## 2.2 标准化
147+
## 2.2 标准地址库(兼容本项目)
148+
149+
| 标准库文件 | 描述 | 参考 | 感谢 |
150+
|----------------|-------------|-------------------------------------------------------------|--------------------------------------------------------------------------------------|
151+
| region_2021.dat | 国家标准地址库2021 | [ISSUE-163](https://github.com/bitlap/geocoding/issues/163) | [TsLenMo](https://github.com/TsLenMo), [weijiang.lin](https://github.com/linweijiang) |
152+
153+
使用方式:文件下载到`classpath`,使用自定义的`GeocodingX`类即可。
154+
155+
## 2.3 标准化
138156
1. 首先基于正则提取出道路、建筑物号等信息
139157
2. 省市区等匹配
140158
1. 将标准的地址库建立**倒排索引**
141159
2. 将文本从起始位置开始, 采用**最大长度优先**的方式匹配所有词条
142160
3. 对所有匹配结果进行标准行政区域从属关系校验
143161

144-
## 2.3 相似度计算
162+
## 2.4 相似度计算
145163
1. 对输入的两个地址进行标准化
146164
2. 对省市区等信息分配不同的权重
147165
3. 对道路号, 建筑号进行语义处理, 分配权重

pom.xml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66

77
<groupId>org.bitlap</groupId>
88
<artifactId>geocoding</artifactId>
9-
<version>1.2.0</version>
9+
<version>1.3.0</version>
1010

1111
<name>geocoding</name>
1212
<description>地理编码技术,提供地址标准化和相似度计算。</description>

src/main/java/org/bitlap/geocoding/Geocoding.kt

Lines changed: 26 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,6 @@ package org.bitlap.geocoding;
22

33
import org.bitlap.geocoding.core.Context
44
import org.bitlap.geocoding.model.Address
5-
import org.bitlap.geocoding.model.Address.Companion.build
65
import org.bitlap.geocoding.model.RegionEntity
76
import org.bitlap.geocoding.model.RegionType
87
import org.bitlap.geocoding.similarity.Document
@@ -16,62 +15,62 @@ import org.bitlap.geocoding.similarity.MatchedResult
1615
*/
1716
object Geocoding {
1817

18+
@JvmField
19+
val DEFAULT = GeocodingX()
20+
1921
/**
2022
* 地址的标准化, 将不规范的地址清洗成标准的地址格式
2123
*/
2224
@JvmStatic
2325
fun normalizing(address: String): Address? {
24-
return build(Context.getInterpreter().interpret(address))
26+
return DEFAULT.normalizing(address)
2527
}
2628

2729
/**
2830
* 将地址进行切分
2931
*/
3032
@JvmStatic
3133
fun analyze(address: String): Document? {
32-
val addr = normalizing(address) ?: return null
33-
return Context.getComputer().analyze(addr)
34+
return DEFAULT.analyze(address)
3435
}
3536
@JvmStatic
3637
fun analyze(address: Address?): Document? {
37-
address ?: return null
38-
return Context.getComputer().analyze(address)
38+
return DEFAULT.analyze(address)
3939
}
4040

4141
/**
4242
* 地址的相似度计算
4343
*/
4444
@JvmStatic
45-
fun similarity(addr1: String, addr2: String): Double {
46-
val compute = Context.getComputer().compute(
47-
normalizing(addr1),
48-
normalizing(addr2)
49-
)
50-
return compute.similarity
45+
fun similarity(address1: String, address2: String): Double {
46+
return DEFAULT.similarity(address1, address2)
5147
}
5248
@JvmStatic
53-
fun similarity(addr1: Address?, addr2: Address?): Double {
54-
val compute = Context.getComputer().compute(addr1, addr2)
55-
return compute.similarity
49+
fun similarity(address1: Address?, address2: Address?): Double {
50+
return DEFAULT.similarity(address1, address2)
5651
}
5752

5853
/**
5954
* 地址相似度计算, 包含匹配的所有结果
6055
*/
6156
@JvmStatic
62-
fun similarityWithResult(addr1: String, addr2: String): MatchedResult {
63-
return Context.getComputer().compute(
64-
normalizing(addr1),
65-
normalizing(addr2)
66-
)
57+
fun similarityWithResult(address1: String, address2: String): MatchedResult {
58+
return DEFAULT.similarityWithResult(address1, address2)
6759
}
6860
@JvmStatic
69-
fun similarityWithResult(addr1: Address?, addr2: Address?): MatchedResult {
70-
return Context.getComputer().compute(addr1, addr2)
61+
fun similarityWithResult(address1: Address?, address2: Address?): MatchedResult {
62+
return DEFAULT.similarityWithResult(address1, address2)
63+
}
64+
65+
/**
66+
* 深度优先匹配符合[text]的地址信息
67+
*/
68+
fun match(text: String): List<RegionEntity> {
69+
return DEFAULT.match(text)
7170
}
7271

7372
@JvmStatic
74-
fun getContext(): Context = Context
73+
fun getContext(): Context = DEFAULT.ctx
7574

7675
/**
7776
* 设置自定义地址
@@ -81,25 +80,11 @@ object Geocoding {
8180
* @param name 地址的名称
8281
* @param type 地址类型, [RegionType]
8382
* @param alias 地址的别名
83+
* @param replace 是否替换旧地址: 当除[id]之外的字段都相等时, 用新地址替换旧地址
8484
*/
8585
@JvmStatic
86-
fun addRegionEntry(id: Long, parentId: Long, name: String, type: RegionType = RegionType.Undefined, alias: String = "") {
87-
val persister = getContext().getPersister()
88-
persister.getRegion(parentId) ?: throw IllegalArgumentException("Parent Address is not exists, parentId is $parentId")
89-
if (name.isBlank()) {
90-
throw IllegalArgumentException("name should not be blank.")
91-
}
92-
// 构建 region 对象
93-
val region = RegionEntity()
94-
region.id = id
95-
region.parentId = parentId
96-
region.name = name
97-
region.alias = alias
98-
region.type = type
99-
// 1. Add to cache (id -> Region)
100-
persister.addRegionEntity(region)
101-
// 2. Build term index
102-
val indexBuilder = getContext().getInterpreter().getTermIndexBuilder()
103-
indexBuilder.indexRegions(listOf(region))
86+
fun addRegionEntry(id: Long, parentId: Long, name: String, type: RegionType = RegionType.Undefined, alias: String = "", replace: Boolean = true): Geocoding {
87+
DEFAULT.addRegionEntry(id, parentId, name, type, alias, replace)
88+
return this
10489
}
10590
}
Lines changed: 111 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,111 @@
1+
package org.bitlap.geocoding
2+
3+
import org.bitlap.geocoding.core.Context
4+
import org.bitlap.geocoding.model.Address
5+
import org.bitlap.geocoding.model.RegionEntity
6+
import org.bitlap.geocoding.model.RegionType
7+
import org.bitlap.geocoding.similarity.Document
8+
import org.bitlap.geocoding.similarity.MatchedResult
9+
10+
11+
/**
12+
* Create custom geocoding
13+
*/
14+
open class GeocodingX(val ctx: Context) {
15+
16+
constructor(): this(false)
17+
constructor(strict: Boolean): this("core/region.dat", strict)
18+
constructor(dataClassPath: String): this(dataClassPath, false)
19+
20+
/**
21+
* @param dataClassPath 自定义地址文档的classpath路径
22+
* @param strict 解析模式, 默认为false。当发现没有省和市,且匹配的父项数量等于1时,能成功匹配。
23+
* * true: 严格模式,当发现没有省和市,且匹配的父项数量大于0时,返回null
24+
* * false: 非严格模式,当发现没有省和市,且匹配的父项数量大于0时,匹配随机一项省和市
25+
*/
26+
constructor(dataClassPath: String, strict: Boolean): this(Context(dataClassPath, strict))
27+
28+
/**
29+
* 地址的标准化, 将不规范的地址清洗成标准的地址格式
30+
*/
31+
fun normalizing(address: String): Address? {
32+
return Address.build(ctx.interpreter.interpret(address))
33+
}
34+
35+
/**
36+
* 将地址进行切分
37+
*/
38+
fun analyze(address: String): Document? {
39+
val add = normalizing(address) ?: return null
40+
return ctx.computer.analyze(add)
41+
}
42+
fun analyze(address: Address?): Document? {
43+
address ?: return null
44+
return ctx.computer.analyze(address)
45+
}
46+
47+
/**
48+
* 地址的相似度计算
49+
*/
50+
fun similarity(address1: String, address2: String): Double {
51+
val compute = ctx.computer.compute(
52+
normalizing(address1),
53+
normalizing(address2)
54+
)
55+
return compute.similarity
56+
}
57+
fun similarity(address1: Address?, address2: Address?): Double {
58+
val compute = ctx.computer.compute(address1, address2)
59+
return compute.similarity
60+
}
61+
62+
/**
63+
* 地址相似度计算, 包含匹配的所有结果
64+
*/
65+
fun similarityWithResult(address1: String, address2: String): MatchedResult {
66+
return ctx.computer.compute(
67+
normalizing(address1),
68+
normalizing(address2)
69+
)
70+
}
71+
fun similarityWithResult(address1: Address?, address2: Address?): MatchedResult {
72+
return ctx.computer.compute(address1, address2)
73+
}
74+
75+
/**
76+
* 深度优先匹配符合[text]的地址信息
77+
*/
78+
fun match(text: String): List<RegionEntity> {
79+
val terms = ctx.interpreter.getTermIndexBuilder().fullMatch(text) ?: emptyList()
80+
return terms.mapNotNull { it.value }
81+
}
82+
83+
/**
84+
* 设置自定义地址
85+
*
86+
* @param id 地址的ID
87+
* @param parentId 地址的父ID, 必须存在
88+
* @param name 地址的名称
89+
* @param type 地址类型, [RegionType]
90+
* @param alias 地址的别名
91+
* @param replace 是否替换旧地址: 当除[id]之外的字段都相等时, 用新地址替换旧地址
92+
*/
93+
fun addRegionEntry(id: Long, parentId: Long, name: String, type: RegionType = RegionType.Undefined, alias: String = "", replace: Boolean = true) {
94+
ctx.persister.getRegion(parentId) ?: throw IllegalArgumentException("Parent Address is not exists, parentId is $parentId")
95+
if (name.isBlank()) {
96+
throw IllegalArgumentException("name should not be blank.")
97+
}
98+
// 构建 region 对象
99+
val region = RegionEntity()
100+
region.id = id
101+
region.parentId = parentId
102+
region.name = name
103+
region.alias = alias
104+
region.type = type
105+
// 1. Add to cache (id -> Region)
106+
ctx.persister.addRegionEntity(region)
107+
// 2. Build term index
108+
val indexBuilder = ctx.interpreter.getTermIndexBuilder()
109+
indexBuilder.indexRegions(listOf(region), replace)
110+
}
111+
}

src/main/java/org/bitlap/geocoding/core/Context.kt

Lines changed: 10 additions & 61 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@ package org.bitlap.geocoding.core
22

33
import org.bitlap.geocoding.core.impl.DefaultAddressInterpreter
44
import org.bitlap.geocoding.core.impl.DefaultAddressPersister
5-
import org.bitlap.geocoding.core.impl.DefaultRegoinCache
5+
import org.bitlap.geocoding.core.impl.DefaultRegionCache
66
import org.bitlap.geocoding.core.impl.RegionInterpreterVisitor
77
import org.bitlap.geocoding.core.impl.SimilarityComputer
88

@@ -12,66 +12,15 @@ import org.bitlap.geocoding.core.impl.SimilarityComputer
1212
* Created by IceMimosa
1313
* Date: 2017/1/12
1414
*/
15-
object Context {
15+
open class Context(
16+
val dataClassPath: String,
17+
val strict: Boolean,
18+
val persister: AddressPersister = DefaultAddressPersister(DefaultRegionCache(dataClassPath)),
19+
val visitor: TermIndexVisitor = RegionInterpreterVisitor(persister, strict),
20+
val interpreter: AddressInterpreter = DefaultAddressInterpreter(persister, visitor),
21+
val computer: Computer = SimilarityComputer(),
22+
) {
1623

17-
private var interpreter: AddressInterpreter? = null
18-
private var persister: AddressPersister? = null
19-
private var computer: Computer? = null
2024

21-
init {
22-
// region entity默认, 此处暂时直接实例化
23-
persister = DefaultAddressPersister(DefaultRegoinCache())
24-
// 实例化
25-
interpreter = DefaultAddressInterpreter()
26-
// 计算类
27-
computer = SimilarityComputer()
28-
}
2925

30-
// 获取 AddressInterpreter
31-
fun getInterpreter(): AddressInterpreter {
32-
interpreter ?: throw IllegalArgumentException("[Context] -> 地址解析服务类初始化失败.")
33-
return interpreter!!
34-
}
35-
36-
// 获取 AddressPersister
37-
fun getPersister(): AddressPersister {
38-
persister ?: throw IllegalArgumentException("[Context] -> 地址持久化服务类初始化失败.")
39-
return persister!!
40-
}
41-
42-
// 获取 visitor
43-
fun getVisitor(): TermIndexVisitor {
44-
return RegionInterpreterVisitor(getPersister())
45-
}
46-
47-
// 获取 计算类
48-
fun getComputer(): Computer {
49-
computer ?: throw IllegalArgumentException("[Context] -> 地址计算服务类初始化失败.")
50-
return computer!!
51-
}
52-
53-
54-
///////////////////////
55-
// Open API
56-
///////////////////////
57-
58-
59-
fun registInterpreter(interpreter: AddressInterpreter) {
60-
synchronized(this) {
61-
this.interpreter = interpreter
62-
}
63-
}
64-
65-
fun registPersister(persister: AddressPersister) {
66-
synchronized(this) {
67-
this.persister = persister
68-
}
69-
}
70-
71-
fun registComputer(computer: Computer) {
72-
synchronized(this) {
73-
this.computer = computer
74-
}
75-
}
76-
77-
}
26+
}

0 commit comments

Comments
 (0)