Skip to content

Commit 4d400ef

Browse files
authored
Merge pull request #363 from xr843/feat/address-search-amap-v2
feat(kg-map): address search + Amap v2 full coverage (+16K temples)
2 parents 2bd5d4f + 433713d commit 4d400ef

File tree

3 files changed

+374
-31
lines changed

3 files changed

+374
-31
lines changed
Lines changed: 165 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,165 @@
1+
"""Backfill province/city/district for CN monasteries via Amap reverse geocoding.
2+
3+
Targets: entity_type='monastery', country='CN', province IS NULL, has lat/lng.
4+
Uses Amap regeo API with WGS-84→GCJ-02 conversion.
5+
6+
Rate limit: 0.25s between requests. ~1600 entities ≈ 7 min.
7+
"""
8+
import argparse
9+
import asyncio
10+
import json
11+
import math
12+
import os
13+
import sys
14+
import time
15+
import urllib.parse
16+
import urllib.request
17+
18+
from sqlalchemy import text
19+
from sqlalchemy.ext.asyncio import AsyncSession, async_sessionmaker, create_async_engine
20+
21+
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
22+
23+
from app.config import settings
24+
25+
# Amap Web-service API key. The AMAP_KEY environment variable takes
# precedence so deployments do not have to rely on the value checked into
# the repository.
# NOTE(review): the fallback below is a credential committed to source
# control; rotate it and remove the literal once every caller sets AMAP_KEY.
AMAP_KEY = os.environ.get("AMAP_KEY") or "7971e9b134c4684c3b43b6e442475d0e"
# Reverse-geocoding endpoint queried by regeo().
AMAP_REGEO_URL = "https://restapi.amap.com/v3/geocode/regeo"
# User-Agent header sent with every request.
USER_AGENT = "FoJinBot/1.0"
28+
29+
30+
def wgs84_to_gcj02(lng, lat):
    """Convert a WGS-84 coordinate to GCJ-02.

    Args:
        lng: Longitude in degrees (WGS-84).
        lat: Latitude in degrees (WGS-84).

    Returns:
        A ``(lng, lat)`` tuple with the computed offset added to the input.
    """
    # Constants copied verbatim from the original implementation.
    # NOTE(review): presumably the ellipsoid semi-major axis (metres) and
    # squared eccentricity used by the public GCJ-02 algorithm — confirm.
    axis = 6378245.0
    ecc2 = 0.00669342162296594323

    # The offset terms are evaluated relative to the point (105, 35).
    rel_lng = lng - 105.0
    rel_lat = lat - 35.0
    raw_dlat = _transform_lat(rel_lng, rel_lat)
    raw_dlng = _transform_lon(rel_lng, rel_lat)

    lat_rad = lat / 180.0 * math.pi
    sin_lat = math.sin(lat_rad)
    w = 1 - ecc2 * sin_lat * sin_lat
    sqrt_w = math.sqrt(w)

    # Scale the raw offsets into degrees of latitude / longitude.
    delta_lat = (raw_dlat * 180.0) / ((axis * (1 - ecc2)) / (w * sqrt_w) * math.pi)
    delta_lng = (raw_dlng * 180.0) / (axis / sqrt_w * math.cos(lat_rad) * math.pi)
    return lng + delta_lng, lat + delta_lat
44+
45+
46+
def _transform_lat(x, y):
    """Return the raw latitude offset term for the WGS-84/GCJ-02 conversion.

    Args:
        x: Longitude relative to 105 degrees (as passed by the caller).
        y: Latitude relative to 35 degrees (as passed by the caller).

    Returns:
        The unscaled latitude offset; the caller converts it to degrees.
    """
    pi = math.pi
    polynomial = -100.0 + 2.0 * x + 3.0 * y + 0.2 * y * y + 0.1 * x * y + 0.2 * math.sqrt(abs(x))
    x_wave = (20.0 * math.sin(6.0 * x * pi) + 20.0 * math.sin(2.0 * x * pi)) * 2.0 / 3.0
    y_wave = (20.0 * math.sin(y * pi) + 40.0 * math.sin(y / 3.0 * pi)) * 2.0 / 3.0
    y_long_wave = (160.0 * math.sin(y / 12.0 * pi) + 320.0 * math.sin(y * pi / 30.0)) * 2.0 / 3.0
    # Summed left to right so the floating-point result matches an
    # accumulate-in-place formulation exactly.
    return polynomial + x_wave + y_wave + y_long_wave
52+
53+
54+
def _transform_lon(x, y):
    """Return the raw longitude offset term for the WGS-84/GCJ-02 conversion.

    Args:
        x: Longitude relative to 105 degrees (as passed by the caller).
        y: Latitude relative to 35 degrees (as passed by the caller).

    Returns:
        The unscaled longitude offset; the caller converts it to degrees.
    """
    pi = math.pi
    polynomial = 300.0 + x + 2.0 * y + 0.1 * x * x + 0.1 * x * y + 0.1 * math.sqrt(abs(x))
    fast_wave = (20.0 * math.sin(6.0 * x * pi) + 20.0 * math.sin(2.0 * x * pi)) * 2.0 / 3.0
    mid_wave = (20.0 * math.sin(x * pi) + 40.0 * math.sin(x / 3.0 * pi)) * 2.0 / 3.0
    slow_wave = (150.0 * math.sin(x / 12.0 * pi) + 300.0 * math.sin(x / 30.0 * pi)) * 2.0 / 3.0
    # Summed left to right so the floating-point result matches an
    # accumulate-in-place formulation exactly.
    return polynomial + fast_wave + mid_wave + slow_wave
60+
61+
62+
def regeo(lng_wgs, lat_wgs) -> dict | None:
    """Reverse-geocode a WGS-84 coordinate into province/city/district.

    The coordinate is converted to GCJ-02 and sent to the Amap regeo
    endpoint with ``extensions=base``.

    Args:
        lng_wgs: Longitude in degrees (WGS-84).
        lat_wgs: Latitude in degrees (WGS-84).

    Returns:
        ``{"province": ..., "city": ..., "district": ...}`` on success, where
        ``city`` falls back to the province name when Amap leaves it empty.
        ``None`` when Amap reports a non-"1" status, the response carries no
        address component, or the province is empty.

    Raises:
        Network and JSON decoding errors from ``urllib`` / ``json`` propagate
        to the caller.
    """
    lng_gcj, lat_gcj = wgs84_to_gcj02(lng_wgs, lat_wgs)
    query = urllib.parse.urlencode({
        "key": AMAP_KEY,
        "location": f"{lng_gcj:.6f},{lat_gcj:.6f}",
        "extensions": "base",
        "output": "json",
    })
    req = urllib.request.Request(
        f"{AMAP_REGEO_URL}?{query}",
        headers={"User-Agent": USER_AGENT},
    )
    with urllib.request.urlopen(req, timeout=10) as resp:
        data = json.loads(resp.read())

    if not isinstance(data, dict) or data.get("status") != "1":
        return None

    # Amap uses [] for empty fields (e.g. "city" in municipalities). Guard the
    # enclosing containers as well, so an empty marker at that level yields
    # None instead of an AttributeError on ``.get``.
    regeocode = data.get("regeocode")
    comp = regeocode.get("addressComponent") if isinstance(regeocode, dict) else None
    if not isinstance(comp, dict):
        return None

    def _field(key):
        # Anything that is not a string (notably []) counts as empty.
        value = comp.get(key, "")
        return value if isinstance(value, str) else ""

    province = _field("province")
    if not province:
        return None
    return {
        "province": province,
        "city": _field("city") or province,
        "district": _field("district"),
    }
90+
91+
92+
async def main():
    """Backfill province/city/district on CN monasteries that lack a province.

    Command-line flags:
        --dry-run  Query Amap and count results, but write nothing.
        --limit    Maximum number of entities to process (default 2000).

    Selects monastery entities that look Chinese (by ``country`` or
    ``geo_source``), have no province and have both coordinates, reverse-
    geocodes each one with ``regeo`` and merges the result into the JSON
    ``properties`` column. Changes are committed every 200 rows and once
    more at the end.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--dry-run", action="store_true")
    parser.add_argument("--limit", type=int, default=2000)
    args = parser.parse_args()

    print("=" * 60)
    print("FoJin — Backfill Address via Amap Reverse Geocoding")
    print("=" * 60)

    engine = create_async_engine(settings.database_url)
    sf = async_sessionmaker(engine, class_=AsyncSession, expire_on_commit=False)

    try:
        async with sf() as session:
            # Require longitude as well as latitude: a row with only one of
            # them cannot be geocoded and would just be counted as a failure.
            result = await session.execute(text("""
                SELECT id, name_zh,
                       (properties->>'latitude')::float AS lat,
                       (properties->>'longitude')::float AS lng
                FROM kg_entities
                WHERE entity_type = 'monastery'
                  AND (properties->>'country' = 'CN' OR properties->>'country' = '中国'
                       OR properties->>'geo_source' LIKE 'osm:CN%' OR properties->>'geo_source' LIKE 'osm:中国%'
                       OR properties->>'geo_source' LIKE 'osm_ext%')
                  AND (properties->>'province' IS NULL OR properties->>'province' = '')
                  AND properties->>'latitude' IS NOT NULL
                  AND properties->>'longitude' IS NOT NULL
                LIMIT :limit
            """), {"limit": args.limit})
            rows = result.fetchall()
            print(f"Found {len(rows)} monasteries to backfill")

            stats = {"updated": 0, "failed": 0, "skipped": 0}

            for i, (eid, name, lat, lng) in enumerate(rows):
                try:
                    # regeo() does blocking HTTP; run it in a worker thread
                    # so the event loop is not stalled.
                    addr = await asyncio.to_thread(regeo, lng, lat)
                except Exception as e:
                    print(f" ERR {eid} {name}: {e}")
                    stats["failed"] += 1
                    # Back off a little longer after an error.
                    await asyncio.sleep(1)
                else:
                    # Rate limit: 0.25s between requests.
                    await asyncio.sleep(0.25)
                    if not addr:
                        stats["skipped"] += 1
                    else:
                        if not args.dry_run:
                            props_patch = json.dumps(addr)
                            await session.execute(text("""
                                UPDATE kg_entities
                                SET properties = (properties::jsonb || cast(:patch as jsonb))::json
                                WHERE id = :id
                            """), {"id": eid, "patch": props_patch})
                        # Counted in dry-run mode too, as "would update".
                        stats["updated"] += 1

                # Periodic commit and progress line; runs for every row so a
                # failed or skipped row at a multiple of 200 does not skip it.
                if (i + 1) % 200 == 0:
                    if not args.dry_run:
                        await session.commit()
                    print(f" [{i+1}/{len(rows)}] updated: {stats['updated']}, failed: {stats['failed']}")

            if not args.dry_run:
                await session.commit()

        print(f"\n{'='*60}")
        print(f"Results: {stats}")
        print(f"{'Dry run' if args.dry_run else 'Committed'}")
        print("=" * 60)
    finally:
        # Release pooled connections even when the run aborts with an error.
        await engine.dispose()
162+
163+
164+
if __name__ == "__main__":
    # Script entry point: run the async backfill to completion.
    asyncio.run(main())
Lines changed: 186 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,186 @@
1+
"""Fetch Chinese Buddhist temples from Amap POI API — v2: search by city for full coverage.
2+
3+
V1 searched by province and hit 500-result caps in populous provinces.
4+
V2 searches by prefecture-level city (~340 cities) to avoid truncation.
5+
6+
Output: data/amap_temples_v2.json
7+
"""
8+
import json
import math
import os
import time
import urllib.parse
import urllib.request
13+
14+
# Amap Web-service API key. The AMAP_KEY environment variable takes
# precedence so deployments do not have to rely on the value checked into
# the repository.
# NOTE(review): the fallback below is a credential committed to source
# control; rotate it and remove the literal once every caller sets AMAP_KEY.
AMAP_KEY = os.environ.get("AMAP_KEY") or "7971e9b134c4684c3b43b6e442475d0e"
# POI keyword-search endpoint.
AMAP_URL = "https://restapi.amap.com/v3/place/text"
# Administrative-district lookup endpoint (used to list cities).
AMAP_DISTRICT_URL = "https://restapi.amap.com/v3/config/district"
# User-Agent header sent with every request.
USER_AGENT = "FoJinBot/1.0"
# Path of the JSON file written by main().
OUTPUT = "data/amap_temples_v2.json"

# Search keywords issued for every city.
KEYWORDS = ["寺", "庵", "禅寺", "佛寺", "佛教", "精舍", "佛堂"]

# A POI whose name contains any of these substrings is discarded.
SKIP_WORDS = [
    "清真", "教堂", "基督", "天主", "道观", "道教", "伊斯兰",
    "关帝", "妈祖", "城隍", "土地庙", "孔庙", "文庙",
    "殡仪", "墓", "陵园", "酒店", "宾馆", "饭店", "餐厅",
    "停车", "公厕", "超市", "药店", "医院", "学校",
    "公园", "广场", "商场", "写字楼", "小区", "花园",
]
27+
28+
29+
def _transform_lat(x, y):
    """Return the raw latitude offset term used by the GCJ-02 conversion.

    Args:
        x: Longitude relative to 105 degrees (as passed by the caller).
        y: Latitude relative to 35 degrees (as passed by the caller).

    Returns:
        The unscaled latitude offset; the caller converts it to degrees.
    """
    pi = math.pi
    polynomial = -100.0 + 2.0 * x + 3.0 * y + 0.2 * y * y + 0.1 * x * y + 0.2 * math.sqrt(abs(x))
    x_wave = (20.0 * math.sin(6.0 * x * pi) + 20.0 * math.sin(2.0 * x * pi)) * 2.0 / 3.0
    y_wave = (20.0 * math.sin(y * pi) + 40.0 * math.sin(y / 3.0 * pi)) * 2.0 / 3.0
    y_long_wave = (160.0 * math.sin(y / 12.0 * pi) + 320.0 * math.sin(y * pi / 30.0)) * 2.0 / 3.0
    # Summed left to right so the floating-point result matches an
    # accumulate-in-place formulation exactly.
    return polynomial + x_wave + y_wave + y_long_wave
35+
36+
37+
def _transform_lon(x, y):
    """Return the raw longitude offset term used by the GCJ-02 conversion.

    Args:
        x: Longitude relative to 105 degrees (as passed by the caller).
        y: Latitude relative to 35 degrees (as passed by the caller).

    Returns:
        The unscaled longitude offset; the caller converts it to degrees.
    """
    pi = math.pi
    polynomial = 300.0 + x + 2.0 * y + 0.1 * x * x + 0.1 * x * y + 0.1 * math.sqrt(abs(x))
    fast_wave = (20.0 * math.sin(6.0 * x * pi) + 20.0 * math.sin(2.0 * x * pi)) * 2.0 / 3.0
    mid_wave = (20.0 * math.sin(x * pi) + 40.0 * math.sin(x / 3.0 * pi)) * 2.0 / 3.0
    slow_wave = (150.0 * math.sin(x / 12.0 * pi) + 300.0 * math.sin(x / 30.0 * pi)) * 2.0 / 3.0
    # Summed left to right so the floating-point result matches an
    # accumulate-in-place formulation exactly.
    return polynomial + fast_wave + mid_wave + slow_wave
43+
44+
45+
def gcj02_to_wgs84(lng, lat):
    """Convert a GCJ-02 coordinate to WGS-84.

    The offset is evaluated at the input coordinate and subtracted from it.
    NOTE(review): this looks like the common single-pass approximation of the
    inverse rather than an exact inversion — confirm the accuracy is
    acceptable for the map layer.

    Args:
        lng: Longitude in degrees (GCJ-02).
        lat: Latitude in degrees (GCJ-02).

    Returns:
        A ``(lng, lat)`` tuple with the computed offset removed.
    """
    # Constants copied verbatim from the original implementation.
    # NOTE(review): presumably the ellipsoid semi-major axis (metres) and
    # squared eccentricity used by the public GCJ-02 algorithm — confirm.
    axis = 6378245.0
    ecc2 = 0.00669342162296594323

    # The offset terms are evaluated relative to the point (105, 35).
    rel_lng = lng - 105.0
    rel_lat = lat - 35.0
    raw_dlat = _transform_lat(rel_lng, rel_lat)
    raw_dlng = _transform_lon(rel_lng, rel_lat)

    lat_rad = lat / 180.0 * math.pi
    sin_lat = math.sin(lat_rad)
    w = 1 - ecc2 * sin_lat * sin_lat
    sqrt_w = math.sqrt(w)

    # Scale the raw offsets into degrees of latitude / longitude.
    delta_lat = (raw_dlat * 180.0) / ((axis * (1 - ecc2)) / (w * sqrt_w) * math.pi)
    delta_lng = (raw_dlng * 180.0) / (axis / sqrt_w * math.cos(lat_rad) * math.pi)
    return lng - delta_lng, lat - delta_lat
57+
58+
59+
def get_cities() -> list[dict]:
    """Fetch all prefecture-level cities from the Amap district API.

    Queries the district endpoint for "中国" with two levels of
    sub-districts (province, then city).

    Returns:
        A list of ``{"adcode", "name", "province"}`` dicts, one per city.
        A province that has no child districts in the response is emitted
        as its own city.

    Raises:
        RuntimeError: Amap answered with a non-"1" status. Failing here keeps
            the caller from running with an empty city list and overwriting
            the output file with nothing.
        Network and JSON decoding errors from ``urllib`` / ``json`` propagate.
    """
    params = urllib.parse.urlencode({
        "key": AMAP_KEY,
        "keywords": "中国",
        "subdistrict": 2,
        "extensions": "base",
    })
    url = f"{AMAP_DISTRICT_URL}?{params}"
    req = urllib.request.Request(url, headers={"User-Agent": USER_AGENT})
    with urllib.request.urlopen(req, timeout=30) as resp:
        data = json.loads(resp.read())

    if data.get("status") != "1":
        raise RuntimeError(
            f"Amap district API error: {data.get('info')} (infocode={data.get('infocode')})"
        )

    # "districts" may be missing or an empty list; both mean "no country row".
    countries = data.get("districts") or [{}]

    cities = []
    for province in countries[0].get("districts", []):
        pname = province.get("name", "")
        children = province.get("districts") or []
        for city in children:
            cities.append({
                "adcode": city.get("adcode", ""),
                "name": city.get("name", ""),
                "province": pname,
            })
        # Direct-administered municipalities: province itself is the city
        if not children:
            cities.append({
                "adcode": province.get("adcode", ""),
                "name": pname,
                "province": pname,
            })
    return cities
89+
90+
91+
def amap_search(keyword: str, city_code: str, page: int = 1) -> dict:
    """Fetch one page of Amap POI keyword-search results for a single city.

    Args:
        keyword: Search term sent as ``keywords``.
        city_code: Value sent as ``city``; results are restricted to it via
            ``citylimit=true``.
        page: 1-based page number; each page holds up to 25 results.

    Returns:
        The decoded JSON response body.

    Raises:
        Network and JSON decoding errors from ``urllib`` / ``json`` propagate.
    """
    query = {
        "key": AMAP_KEY,
        "keywords": keyword,
        "city": city_code,
        "citylimit": "true",
        "offset": 25,
        "page": page,
        "output": "json",
        "extensions": "base",
    }
    request = urllib.request.Request(
        f"{AMAP_URL}?{urllib.parse.urlencode(query)}",
        headers={"User-Agent": USER_AGENT},
    )
    with urllib.request.urlopen(request, timeout=15) as response:
        payload = response.read()
    return json.loads(payload)
106+
107+
108+
def _text(value, fallback=""):
    """Return *value* when it is a non-empty string, otherwise *fallback*.

    Amap encodes empty fields as ``[]``; this keeps those out of the output.
    """
    return value if isinstance(value, str) and value else fallback


def _poi_record(poi, city):
    """Turn one Amap POI into an output record, or None if it is unusable.

    A POI is dropped when it lacks an id, name or location, when its name
    contains one of SKIP_WORDS, or when its location cannot be parsed.
    """
    pid = _text(poi.get("id"))
    name = _text(poi.get("name"))
    location = _text(poi.get("location"))
    if not pid or not name or not location:
        return None
    if any(w in name for w in SKIP_WORDS):
        return None
    try:
        # "lng,lat" in GCJ-02; a wrong number of parts raises ValueError too.
        lng_gcj, lat_gcj = (float(x) for x in location.split(","))
        lng_wgs, lat_wgs = gcj02_to_wgs84(lng_gcj, lat_gcj)
    except ValueError:
        return None

    return {
        "amap_id": pid,
        "name": name,
        "latitude": round(lat_wgs, 7),
        "longitude": round(lng_wgs, 7),
        "address": _text(poi.get("address")),
        "province": _text(poi.get("pname"), city["province"]),
        "city": _text(poi.get("cityname"), city["name"]),
        "district": _text(poi.get("adname")),
        "type": _text(poi.get("type")),
        "typecode": _text(poi.get("typecode")),
    }


def main():
    """Collect temple POIs for every city and keyword and write them to OUTPUT.

    For each city returned by ``get_cities`` and each entry in KEYWORDS,
    pages through ``amap_search`` (25 results per page, at most 500 results),
    de-duplicates POIs by Amap id, converts coordinates to WGS-84 and dumps
    the records as a JSON list.
    """
    # Create the output directory up front: discovering that it is missing
    # only at the final write would throw away the whole crawl.
    os.makedirs(os.path.dirname(OUTPUT) or ".", exist_ok=True)

    print("Fetching city list...")
    cities = get_cities()
    print(f"Got {len(cities)} cities")

    all_pois: dict[str, dict] = {}
    total_requests = 0

    for ci, city in enumerate(cities):
        city_before = len(all_pois)
        for keyword in KEYWORDS:
            page = 1
            while page <= 20:
                try:
                    data = amap_search(keyword, city["adcode"], page)
                    total_requests += 1
                    time.sleep(0.25)
                except Exception as e:
                    print(f" ERR {city['name']}/{keyword}: {e}")
                    time.sleep(2)
                    break

                if data.get("status") != "1":
                    # Report API-level failures (e.g. quota exhausted) instead
                    # of silently dropping the rest of this keyword.
                    print(
                        f" ERR {city['name']}/{keyword}: "
                        f"status={data.get('status')} info={data.get('info')}"
                    )
                    break

                pois = data.get("pois", [])
                if not pois:
                    break

                for poi in pois:
                    record = _poi_record(poi, city)
                    if record is not None:
                        # First occurrence wins; later duplicates are ignored.
                        all_pois.setdefault(record["amap_id"], record)

                count = int(data.get("count", 0))
                # Stop once every reported result, or the 500-result cap,
                # has been covered.
                if page * 25 >= count or page * 25 >= 500:
                    break
                page += 1

        city_new = len(all_pois) - city_before
        if (ci + 1) % 20 == 0 or ci == len(cities) - 1:
            print(f"[{ci+1}/{len(cities)}] {city['province']}/{city['name']}: +{city_new} (total: {len(all_pois)}, reqs: {total_requests})")

    # Save
    result = list(all_pois.values())
    with open(OUTPUT, "w", encoding="utf-8") as f:
        json.dump(result, f, ensure_ascii=False, indent=2)
    print(f"\n{'='*60}")
    print(f"Total: {len(result)} POIs, {total_requests} API requests")
    print(f"Saved to {OUTPUT}")
    print(f"{'='*60}")
183+
184+
185+
if __name__ == "__main__":
    # Script entry point: run the crawl when executed directly.
    main()

0 commit comments

Comments
 (0)