diff --git a/README.md b/README.md index 9872868..0030827 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,7 @@ **503 sources. 30 languages. 30 countries. 23,500+ full-text volumes. One search.** -Aggregating the world's Buddhist digital heritage — 10,500+ texts with 23,500+ volumes of full content in Pali, Classical Chinese, Tibetan, and Sanskrit from 503 data sources — with CBETA-style reading, AI-powered Q&A (RAG + reranking + citations + data source recommendations), knowledge graph with 31K+ entities and 28K+ relations (including 23K teacher-student lineage chains), 32 dictionaries with 748K entries across 6 languages, timeline visualization, collections, citations, annotations, bookmarks, and multi-language parallel reading. +Aggregating the world's Buddhist digital heritage — 10,500+ texts with 23,500+ volumes of full content in Pali, Classical Chinese, Tibetan, and Sanskrit from 503 data sources — with CBETA-style reading, AI-powered Q&A with 8 Buddhist master personas (RAG + tradition-scoped retrieval + citations), knowledge graph with 31K+ entities and 28K+ relations visualized on a 50K-entity Deck.GL geo map, 32 dictionaries with 748K entries across 6 languages, timeline visualization, activity feed, collections, citations, annotations, bookmarks, and multi-language parallel reading. 
[Live Demo](https://fojin.app)  ·  [API Docs](https://fojin.app/docs)  ·  [中文文档](./docs/README_zh.md)  ·  [Discussions](https://github.com/xr843/fojin/discussions)  ·  [Discord](https://discord.gg/76SZeuJekq)  ·  [Report Bug](https://github.com/xr843/fojin/issues) @@ -37,6 +37,9 @@ Buddhist texts are scattered across hundreds of databases worldwide — CBETA, S | Discover similar texts | **Semantic similarity** powered by 678K+ embedding vectors (pgvector + HNSW) | | View original manuscripts | **IIIF manuscript viewer** connected to BDRC and more | | Ask questions about texts | **AI Q&A** ("XiaoJin") with RAG, reranking, clickable citations, and follow-up suggestions | +| Learn from a specific master | **Master Persona Mode** — 8 historical Buddhist masters, each with tradition-specific RAG scope | +| Explore Buddhist geography | **Knowledge Graph Map** — 50K+ geo entities, monastery locations, lineage arcs on Deck.GL | +| Track source updates | **Activity Feed** — real-time updates from 503 data sources | | Explore history visually | **Timeline & Dashboard** — dynasty charts, translation trends, category analytics | | Save and organize | **Collections, bookmarks, annotations** for personal study | | Cite in research | **Citation export** (BibTeX, RIS, APA) for academic use | @@ -158,6 +161,39 @@ Ask questions in natural language. XiaoJin answers based on canonical Buddhist t

AI Q&A answering about Xuanzang's disciples

+### Master Persona Mode (法师模式) + +Select a specific Buddhist master to receive answers in their teaching style, grounded in their tradition's core scriptures. 8 historical masters available: + +| Master | Tradition | Core Teachings | +|--------|-----------|----------------| +| 智顗 Zhiyi | 天台宗 | 一念三千、三谛圆融、五时八教、止观双修 | +| 慧能 Huineng | 禅宗 | 直指人心、见性成佛、无念无相无住 | +| 玄奘 Xuanzang | 法相唯识宗 | 八识、三性、五位百法、转识成智 | +| 法藏 Fazang | 华严宗 | 法界缘起、四法界、十玄门、六相圆融 | +| 鸠摩罗什 Kumarajiva | 三论宗/中观 | 八不中道、缘起性空、不二法门 | +| 印光 Yinguang | 净土宗 | 信愿行、持名念佛、敦伦尽分 | +| 蕅益 Ouyi | 天台/净土·跨宗派 | 教宗天台行归净土、六信、性相融会 | +| 虚云 Xuyun | 禅宗·五宗兼嗣 | 参话头、起疑情、老实修行 | + +Each master has a 100-150 line enriched system prompt with lineage, core doctrines, speaking style, teaching methods, key allusions, and terminology table. When a master is selected, RAG retrieval is **scoped to their core scriptures** (e.g., selecting Zhiyi only searches 《摩诃止观》《法华玄义》 etc.), providing more precise citations. + +Powered by [Master-skill](https://github.com/xr843/Master-skill) — the open-source Buddhist master AI persona framework. + +### Knowledge Graph Map (知识图谱地图) + +Visualize 50,000+ geo-enabled Buddhist entities on an interactive world map — monasteries, historical places, persons, and schools. Built with Deck.GL + MapLibre. + +- **Entity types**: Monasteries (green), Places (purple), Persons (red), Schools (blue) +- **Lineage arcs**: Toggle 8,000+ teacher-student lineage relations as animated arcs on the map +- **Chinese-only filter**: Quickly filter to show only Chinese-named entities +- **Entity search**: Find entities by name with simplified/traditional Chinese conversion (OpenCC) +- **Interactive tooltips**: Hover to see metadata, country flags, and source attribution + +### Activity Feed (佛学动态) + +Track real-time updates from 503 data sources — new texts added, translation releases, manuscript scans, and schema changes. Includes academic content aggregation and platform-wide activity summary. 
+ ### Similar Passages Discovery When reading any text, the sidebar automatically finds semantically similar passages from other texts using pgvector cosine similarity. Discover cross-textual parallels, related commentaries, and thematic connections across the entire canon. @@ -207,12 +243,12 @@ FoJin aggregates data from major Buddhist digital projects worldwide. Sources ar | Layer | Technology | |-------|-----------| -| Frontend | React 18, TypeScript, Vite, Ant Design 5, Zustand, TanStack Query, D3.js | +| Frontend | React 18, TypeScript, Vite, Ant Design 5, Zustand, TanStack Query, D3.js, Deck.GL + MapLibre (geo map) | | Backend | FastAPI, SQLAlchemy (async), Pydantic v2, SSE streaming | | Database | PostgreSQL 15 + pgvector (HNSW index) + pg_trgm | | Search | Elasticsearch 8 (ICU tokenizer) | | Cache | Redis 7 | -| AI | RAG (678K+ text vectors + 503 source vectors, BGE-M3 embeddings, HNSW) + multi-provider LLM (OpenAI/DashScope/DeepSeek/SiliconFlow) | +| AI | RAG (678K+ vectors, BGE-M3, HNSW) + 8 master personas + multi-provider LLM (OpenAI/Anthropic/DeepSeek/DashScope/Gemini/+10 more) | | Deploy | Docker Compose, Nginx (gzip, security headers), Cloudflare CDN | | CI | GitHub Actions (lint, test, security scan) | @@ -315,6 +351,9 @@ See [CONTRIBUTING.md](CONTRIBUTING.md) for detailed guidelines. 
- [x] Nanshan Vinaya Dictionary (3,200+ Buddhist precept terms) - [x] CBETA full-text import — Taishō (T) + Xuzangjing (X): 3,600+ texts, 143M characters, 432K embedding vectors - [x] Dictionary expansion — 32 dictionaries, 748K entries (DPD, Apte, Mahāvyutpatti, Buddhadatta, Pentaglot, buddhaspace 7 dicts) +- [x] Master Persona Mode — 8 Buddhist masters with tradition-scoped RAG (powered by [Master-skill](https://github.com/xr843/Master-skill)) +- [x] Knowledge Graph Map — 50K+ geo entities, Deck.GL + MapLibre, lineage arcs +- [x] Activity Feed — real-time source update tracking, academic feeds - [ ] Topic ontology browsing page - [ ] Cross-lingual search (query in Chinese, find Sanskrit/Pali/Tibetan results) - [ ] Open data export (JSON/CSV for researchers) @@ -344,6 +383,7 @@ FoJin is built on the generous work of the global Buddhist digital humanities co ## Related Projects +- [Master-skill](https://github.com/xr843/Master-skill) — Buddhist master AI persona framework (powers FoJin's master mode) - [The Open Buddhist University](https://buddhistuniversity.net) — Free courses, books, and encyclopaedia for Buddhist studies --- diff --git a/backend/scripts/backfill_address_regeo.py b/backend/scripts/backfill_address_regeo.py new file mode 100644 index 0000000..d658f41 --- /dev/null +++ b/backend/scripts/backfill_address_regeo.py @@ -0,0 +1,165 @@ +"""Backfill province/city/district for CN monasteries via Amap reverse geocoding. + +Targets: entity_type='monastery', country='CN', province IS NULL, has lat/lng. +Uses Amap regeo API with WGS-84→GCJ-02 conversion. + +Rate limit: 0.25s between requests. ~1600 entities ≈ 7 min. 
+"""
+import argparse
+import asyncio
+import json
+import math
+import os
+import sys
+import urllib.parse
+import urllib.request
+
+from sqlalchemy import text
+from sqlalchemy.ext.asyncio import AsyncSession, async_sessionmaker, create_async_engine
+
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+from app.config import settings
+
+# The Amap key is a credential and must not live in version control: it is read
+# from the AMAP_KEY environment variable. Any key that was ever committed to the
+# repository should be revoked and replaced.
+AMAP_KEY = os.environ.get("AMAP_KEY", "")
+AMAP_REGEO_URL = "https://restapi.amap.com/v3/geocode/regeo"
+USER_AGENT = "FoJinBot/1.0"
+
+# Pause after a successful request / after a failed request (seconds).
+REQUEST_INTERVAL = 0.25
+ERROR_BACKOFF = 1.0
+# Commit after this many updates so an interrupted run keeps most of its work.
+COMMIT_EVERY = 200
+
+
+def wgs84_to_gcj02(lng, lat):
+    """Convert a WGS-84 coordinate to GCJ-02.
+
+    Args:
+        lng: Longitude in degrees (WGS-84).
+        lat: Latitude in degrees (WGS-84).
+
+    Returns:
+        ``(lng, lat)`` with the GCJ-02 offset added. No bounds check is applied:
+        the offset is computed for any input coordinate.
+    """
+    # Ellipsoid constants used by the GCJ-02 offset formula.
+    a = 6378245.0
+    ee = 0.00669342162296594323
+
+    dlat = _transform_lat(lng - 105.0, lat - 35.0)
+    dlng = _transform_lon(lng - 105.0, lat - 35.0)
+    radlat = lat / 180.0 * math.pi
+    magic = math.sin(radlat)
+    magic = 1 - ee * magic * magic
+    sqrtmagic = math.sqrt(magic)
+    dlat = (dlat * 180.0) / ((a * (1 - ee)) / (magic * sqrtmagic) * math.pi)
+    dlng = (dlng * 180.0) / (a / sqrtmagic * math.cos(radlat) * math.pi)
+    return lng + dlng, lat + dlat
+
+
+def _transform_lat(x, y):
+    """Latitude term of the GCJ-02 offset; callers pass ``lng - 105``, ``lat - 35``."""
+    ret = -100.0 + 2.0 * x + 3.0 * y + 0.2 * y * y + 0.1 * x * y + 0.2 * math.sqrt(abs(x))
+    ret += (20.0 * math.sin(6.0 * x * math.pi) + 20.0 * math.sin(2.0 * x * math.pi)) * 2.0 / 3.0
+    ret += (20.0 * math.sin(y * math.pi) + 40.0 * math.sin(y / 3.0 * math.pi)) * 2.0 / 3.0
+    ret += (160.0 * math.sin(y / 12.0 * math.pi) + 320.0 * math.sin(y * math.pi / 30.0)) * 2.0 / 3.0
+    return ret
+
+
+def _transform_lon(x, y):
+    """Longitude term of the GCJ-02 offset; callers pass ``lng - 105``, ``lat - 35``."""
+    ret = 300.0 + x + 2.0 * y + 0.1 * x * x + 0.1 * x * y + 0.1 * math.sqrt(abs(x))
+    ret += (20.0 * math.sin(6.0 * x * math.pi) + 20.0 * math.sin(2.0 * x * math.pi)) * 2.0 / 3.0
+    ret += (20.0 * math.sin(x * math.pi) + 40.0 * math.sin(x / 3.0 * math.pi)) * 2.0 / 3.0
+    ret += (150.0 * math.sin(x / 12.0 * math.pi) + 300.0 * math.sin(x / 30.0 * math.pi)) * 2.0 / 3.0
+    return ret
+
+
+def regeo(lng_wgs, lat_wgs) -> dict | None:
+    """Reverse-geocode a WGS-84 coordinate with the Amap regeo API.
+
+    The coordinate is shifted to GCJ-02 before the request. The call blocks on
+    network I/O; errors raised by ``urlopen`` or ``json.loads`` propagate to the
+    caller.
+
+    Returns:
+        A dict with ``province``, ``city`` and ``district`` (``city`` falls back
+        to the province when Amap reports none), or ``None`` when the API status
+        is not ``"1"`` or no province was returned.
+    """
+    lng_gcj, lat_gcj = wgs84_to_gcj02(lng_wgs, lat_wgs)
+    params = urllib.parse.urlencode({
+        "key": AMAP_KEY,
+        "location": f"{lng_gcj:.6f},{lat_gcj:.6f}",
+        "extensions": "base",
+        "output": "json",
+    })
+    url = f"{AMAP_REGEO_URL}?{params}"
+    req = urllib.request.Request(url, headers={"User-Agent": USER_AGENT})
+    with urllib.request.urlopen(req, timeout=10) as resp:
+        data = json.loads(resp.read())
+    if data.get("status") != "1":
+        return None
+    regeocode = data.get("regeocode")
+    comp = regeocode.get("addressComponent") if isinstance(regeocode, dict) else None
+    if not isinstance(comp, dict):
+        return None
+    # Amap returns [] for empty fields in municipalities; treat any non-string as empty.
+    province, city, district = (
+        value if isinstance(value, str) else ""
+        for value in (comp.get("province"), comp.get("city"), comp.get("district"))
+    )
+    if not province:
+        return None
+    return {"province": province, "city": city or province, "district": district}
+
+
+async def main():
+    """Backfill address fields for monasteries that have coordinates but no province.
+
+    Flags: ``--dry-run`` performs the lookups without writing; ``--limit`` caps
+    the number of rows fetched. Requires the ``AMAP_KEY`` environment variable.
+    """
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--dry-run", action="store_true")
+    parser.add_argument("--limit", type=int, default=2000)
+    args = parser.parse_args()
+
+    if not AMAP_KEY:
+        sys.exit("AMAP_KEY environment variable is not set")
+
+    print("=" * 60)
+    print("FoJin — Backfill Address via Amap Reverse Geocoding")
+    print("=" * 60)
+
+    engine = create_async_engine(settings.database_url)
+    sf = async_sessionmaker(engine, class_=AsyncSession, expire_on_commit=False)
+    stats = {"updated": 0, "failed": 0, "skipped": 0}
+
+    try:
+        async with sf() as session:
+            # Both coordinates must be present: a NULL longitude would otherwise
+            # reach regeo() and fail there on every run.
+            result = await session.execute(text("""
+                SELECT id, name_zh,
+                       (properties->>'latitude')::float AS lat,
+                       (properties->>'longitude')::float AS lng
+                FROM kg_entities
+                WHERE entity_type = 'monastery'
+                  AND (properties->>'country' = 'CN' OR properties->>'country' = '中国'
+                       OR properties->>'geo_source' LIKE 'osm:CN%' OR properties->>'geo_source' LIKE 'osm:中国%'
+                       OR properties->>'geo_source' LIKE 'osm_ext%')
+                  AND (properties->>'province' IS NULL OR properties->>'province' = '')
+                  AND properties->>'latitude' IS NOT NULL
+                  AND properties->>'longitude' IS NOT NULL
+                LIMIT :limit
+            """), {"limit": args.limit})
+            rows = result.fetchall()
+            print(f"Found {len(rows)} monasteries to backfill")
+
+            for i, (eid, name, lat, lng) in enumerate(rows):
+                try:
+                    # regeo() blocks on urllib; run it off the event loop thread.
+                    addr = await asyncio.to_thread(regeo, lng, lat)
+                    await asyncio.sleep(REQUEST_INTERVAL)
+                except Exception as e:
+                    print(f"  ERR {eid} {name}: {e}")
+                    stats["failed"] += 1
+                    await asyncio.sleep(ERROR_BACKOFF)
+                    continue
+
+                if not addr:
+                    stats["skipped"] += 1
+                    continue
+
+                if not args.dry_run:
+                    props_patch = json.dumps(addr)
+                    await session.execute(text("""
+                        UPDATE kg_entities
+                        SET properties = (properties::jsonb || cast(:patch as jsonb))::json
+                        WHERE id = :id
+                    """), {"id": eid, "patch": props_patch})
+
+                stats["updated"] += 1
+
+                # Keyed on the number of updates rather than the row index, so a
+                # batch commit is not missed when a row is skipped or fails.
+                if stats["updated"] % COMMIT_EVERY == 0:
+                    if not args.dry_run:
+                        await session.commit()
+                    print(f"  [{i+1}/{len(rows)}] updated: {stats['updated']}, failed: {stats['failed']}")
+
+            if not args.dry_run:
+                await session.commit()
+    finally:
+        # Release pooled connections even when the run aborts part-way.
+        await engine.dispose()
+
+    print(f"\n{'='*60}")
+    print(f"Results: {stats}")
+    print(f"{'Dry run' if args.dry_run else 'Committed'}")
+    print("=" * 60)
+
+
+if __name__ == "__main__":
+    asyncio.run(main())
diff --git a/backend/scripts/fetch_amap_temples_v2.py b/backend/scripts/fetch_amap_temples_v2.py
new file mode 100644
index 0000000..fa82153
--- /dev/null
+++ b/backend/scripts/fetch_amap_temples_v2.py
@@ -0,0 +1,186 @@
+"""Fetch Chinese Buddhist temples from Amap POI API — v2: search by city for full coverage.
+
+V1 searched by province and hit 500-result caps in populous provinces.
+V2 searches by prefecture-level city (~340 cities) to avoid truncation.
+
+Output: data/amap_temples_v2.json
+
+Requires the AMAP_KEY environment variable.
+"""
+import json
+import math
+import os
+import time
+import urllib.parse
+import urllib.request
+
+# The Amap key is a credential and must not live in version control: it is read
+# from the AMAP_KEY environment variable. Any key that was ever committed to the
+# repository should be revoked and replaced.
+AMAP_KEY = os.environ.get("AMAP_KEY", "")
+AMAP_URL = "https://restapi.amap.com/v3/place/text"
+AMAP_DISTRICT_URL = "https://restapi.amap.com/v3/config/district"
+USER_AGENT = "FoJinBot/1.0"
+OUTPUT = "data/amap_temples_v2.json"
+
+# Results requested per page, and the highest result index main() will page to.
+PAGE_SIZE = 25
+MAX_RESULTS = 500
+
+KEYWORDS = ["寺", "庵", "禅寺", "佛寺", "佛教", "精舍", "佛堂"]
+
+# POIs whose name contains any of these substrings are dropped.
+SKIP_WORDS = ["清真", "教堂", "基督", "天主", "道观", "道教", "伊斯兰",
+              "关帝", "妈祖", "城隍", "土地庙", "孔庙", "文庙",
+              "殡仪", "墓", "陵园", "酒店", "宾馆", "饭店", "餐厅",
+              "停车", "公厕", "超市", "药店", "医院", "学校",
+              "公园", "广场", "商场", "写字楼", "小区", "花园"]
+
+
+def _transform_lat(x, y):
+    """Latitude term of the GCJ-02 offset; callers pass ``lng - 105``, ``lat - 35``."""
+    ret = -100.0 + 2.0 * x + 3.0 * y + 0.2 * y * y + 0.1 * x * y + 0.2 * math.sqrt(abs(x))
+    ret += (20.0 * math.sin(6.0 * x * math.pi) + 20.0 * math.sin(2.0 * x * math.pi)) * 2.0 / 3.0
+    ret += (20.0 * math.sin(y * math.pi) + 40.0 * math.sin(y / 3.0 * math.pi)) * 2.0 / 3.0
+    ret += (160.0 * math.sin(y / 12.0 * math.pi) + 320.0 * math.sin(y * math.pi / 30.0)) * 2.0 / 3.0
+    return ret
+
+
+def _transform_lon(x, y):
+    """Longitude term of the GCJ-02 offset; callers pass ``lng - 105``, ``lat - 35``."""
+    ret = 300.0 + x + 2.0 * y + 0.1 * x * x + 0.1 * x * y + 0.1 * math.sqrt(abs(x))
+    ret += (20.0 * math.sin(6.0 * x * math.pi) + 20.0 * math.sin(2.0 * x * math.pi)) * 2.0 / 3.0
+    ret += (20.0 * math.sin(x * math.pi) + 40.0 * math.sin(x / 3.0 * math.pi)) * 2.0 / 3.0
+    ret += (150.0 * math.sin(x / 12.0 * math.pi) + 300.0 * math.sin(x / 30.0 * math.pi)) * 2.0 / 3.0
+    return ret
+
+
+def gcj02_to_wgs84(lng, lat):
+    """Convert a GCJ-02 coordinate back to WGS-84 (single-step approximation).
+
+    The forward offset is evaluated at the GCJ-02 point itself and subtracted,
+    so the result is close to, but not an exact inverse of, the forward
+    transform.
+
+    Returns:
+        ``(lng, lat)`` in degrees.
+    """
+    # Ellipsoid constants used by the GCJ-02 offset formula.
+    a = 6378245.0
+    ee = 0.00669342162296594323
+    dlat = _transform_lat(lng - 105.0, lat - 35.0)
+    dlng = _transform_lon(lng - 105.0, lat - 35.0)
+    radlat = lat / 180.0 * math.pi
+    magic = math.sin(radlat)
+    magic = 1 - ee * magic * magic
+    sqrtmagic = math.sqrt(magic)
+    dlat = (dlat * 180.0) / ((a * (1 - ee)) / (magic * sqrtmagic) * math.pi)
+    dlng = (dlng * 180.0) / (a / sqrtmagic * math.cos(radlat) * math.pi)
+    return lng - dlng, lat - dlat
+
+
+def _as_text(value) -> str:
+    """Return ``value`` when it is a string, else ``""`` (Amap sends ``[]`` for empty fields)."""
+    return value if isinstance(value, str) else ""
+
+
+def get_cities() -> list[dict]:
+    """Fetch all prefecture-level cities from Amap district API.
+
+    Returns:
+        One ``{"adcode", "name", "province"}`` dict per second-level district.
+        A province that lists no sub-districts is returned as its own city.
+
+    Raises:
+        RuntimeError: if the API reports a non-success status (for example an
+            invalid key). Failing here avoids a crawl over zero cities that
+            would end by writing an empty output file.
+    """
+    params = urllib.parse.urlencode({
+        "key": AMAP_KEY,
+        "keywords": "中国",
+        "subdistrict": 2,
+        "extensions": "base",
+    })
+    url = f"{AMAP_DISTRICT_URL}?{params}"
+    req = urllib.request.Request(url, headers={"User-Agent": USER_AGENT})
+    with urllib.request.urlopen(req, timeout=30) as resp:
+        data = json.loads(resp.read())
+
+    if data.get("status") != "1":
+        raise RuntimeError(f"Amap district API error: {data.get('info', 'unknown')}")
+
+    # An empty "districts" list would make [0] raise IndexError; treat it as no cities.
+    countries = data.get("districts") or []
+    if not countries:
+        return []
+
+    cities = []
+    for province in countries[0].get("districts", []):
+        pname = province.get("name", "")
+        for city in province.get("districts", []):
+            cities.append({
+                "adcode": city.get("adcode", ""),
+                "name": city.get("name", ""),
+                "province": pname,
+            })
+        # Direct-administered municipalities: province itself is the city
+        if not province.get("districts"):
+            cities.append({
+                "adcode": province.get("adcode", ""),
+                "name": pname,
+                "province": pname,
+            })
+    return cities
+
+
+def amap_search(keyword: str, city_code: str, page: int = 1) -> dict:
+    """Run one Amap place-text query restricted to ``city_code``.
+
+    Returns the decoded JSON response for the requested page (``PAGE_SIZE``
+    results per page). Network and JSON errors propagate to the caller.
+    """
+    params = urllib.parse.urlencode({
+        "key": AMAP_KEY,
+        "keywords": keyword,
+        "city": city_code,
+        "citylimit": "true",
+        "offset": PAGE_SIZE,
+        "page": page,
+        "output": "json",
+        "extensions": "base",
+    })
+    url = f"{AMAP_URL}?{params}"
+    req = urllib.request.Request(url, headers={"User-Agent": USER_AGENT})
+    with urllib.request.urlopen(req, timeout=15) as resp:
+        return json.loads(resp.read())
+
+
+def main():
+    """Crawl every city/keyword combination and write the deduplicated POIs to OUTPUT."""
+    if not AMAP_KEY:
+        raise SystemExit("AMAP_KEY environment variable is not set")
+
+    # Create the output directory before crawling, so a long run cannot be lost
+    # to a missing directory when the results are finally written.
+    os.makedirs(os.path.dirname(OUTPUT) or ".", exist_ok=True)
+
+    print("Fetching city list...")
+    cities = get_cities()
+    print(f"Got {len(cities)} cities")
+
+    all_pois: dict[str, dict] = {}
+    total_requests = 0
+
+    for ci, city in enumerate(cities):
+        city_before = len(all_pois)
+        for keyword in KEYWORDS:
+            page = 1
+            while page <= 20:
+                try:
+                    data = amap_search(keyword, city["adcode"], page)
+                    total_requests += 1
+                    time.sleep(0.25)
+                except Exception as e:
+                    # Best effort: give up on this keyword for this city and move on.
+                    print(f"  ERR {city['name']}/{keyword}: {e}")
+                    time.sleep(2)
+                    break
+
+                if data.get("status") != "1":
+                    break
+
+                pois = data.get("pois", [])
+                if not pois:
+                    break
+
+                for poi in pois:
+                    pid = poi.get("id", "")
+                    name = poi.get("name", "")
+                    location = poi.get("location", "")
+                    if not pid or not name or not location:
+                        continue
+                    if any(w in name for w in SKIP_WORDS):
+                        continue
+                    # The same POI can match several keywords; keep the first hit.
+                    if pid in all_pois:
+                        continue
+                    try:
+                        lng_gcj, lat_gcj = [float(x) for x in location.split(",")]
+                        lng_wgs, lat_wgs = gcj02_to_wgs84(lng_gcj, lat_gcj)
+                    except (ValueError, IndexError):
+                        continue
+
+                    # Empty Amap fields may arrive as [] rather than ""; normalise
+                    # them so the JSON output holds strings only, and fall back to
+                    # the searched city when the POI carries no province/city.
+                    all_pois[pid] = {
+                        "amap_id": pid,
+                        "name": name,
+                        "latitude": round(lat_wgs, 7),
+                        "longitude": round(lng_wgs, 7),
+                        "address": _as_text(poi.get("address")),
+                        "province": _as_text(poi.get("pname")) or city["province"],
+                        "city": _as_text(poi.get("cityname")) or city["name"],
+                        "district": _as_text(poi.get("adname")),
+                        "type": _as_text(poi.get("type")),
+                        "typecode": _as_text(poi.get("typecode")),
+                    }
+
+                # A missing or non-numeric count ends paging for this keyword.
+                try:
+                    count = int(data.get("count", 0))
+                except (TypeError, ValueError):
+                    count = 0
+                if page * PAGE_SIZE >= count or page * PAGE_SIZE >= MAX_RESULTS:
+                    break
+                page += 1
+
+        city_new = len(all_pois) - city_before
+        if (ci + 1) % 20 == 0 or ci == len(cities) - 1:
+            print(f"[{ci+1}/{len(cities)}] {city['province']}/{city['name']}: +{city_new} (total: {len(all_pois)}, reqs: {total_requests})")
+
+    # Save
+    result = list(all_pois.values())
+    with open(OUTPUT, "w", encoding="utf-8") as f:
+        json.dump(result, f, ensure_ascii=False, indent=2)
+    print(f"\n{'='*60}")
+    print(f"Total: {len(result)} POIs, {total_requests} API requests")
+    print(f"Saved to {OUTPUT}")
+    print(f"{'='*60}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/frontend/src/pages/KGMapPage.tsx b/frontend/src/pages/KGMapPage.tsx
index 9de169d..1484d82 100644
--- a/frontend/src/pages/KGMapPage.tsx
+++ b/frontend/src/pages/KGMapPage.tsx
@@ -19,13 +19,6 @@ const ENTITY_TYPE_OPTIONS = [
 const s2t = OpenCC.Converter({ from: "cn", to: "tw" });
 const t2s = OpenCC.Converter({ from: "tw", to: "cn" });
 
-const TYPE_LABEL: Record = {
-  monastery: "寺院",
-  place: "地点",
-  person: "人物",
-  school: "宗派",
-};
-
 const TYPE_CSS_COLORS: Record = {
   person: "#dc2626",
   monastery: "#22c55e",
@@ -78,46 +71,45 @@ export default function KGMapPage() {
   const searchOptions = 
useMemo(() => { const q = searchQuery.trim().toLowerCase(); if (q.length < 1) return []; - // Generate both simplified and traditional variants of the query const qSimp = t2s(q); const qTrad = s2t(q); const queries = Array.from(new Set([q, qSimp, qTrad])); + // Split query into tokens for multi-part matching + const tokenize = (s: string): string[][] => { + const parts = s.split(/\s+/).filter(Boolean); + if (parts.length > 1) return [parts]; + const combos: string[][] = [[s]]; + for (let i = 1; i < s.length; i++) { + combos.push([s.slice(0, i), s.slice(i)]); + } + return combos; + }; + const allTokenSets = queries.flatMap(tokenize); const pool = geoData?.entities ?? []; const matches: KGGeoEntity[] = []; for (const e of pool) { const zh = (e.name_zh || "").toLowerCase(); const en = (e.name_en || "").toLowerCase(); const addr = [e.province || "", e.city || "", e.district || ""].join("").toLowerCase(); - const full = zh + addr; - const hit = queries.some((qv) => full.includes(qv) || en.includes(qv)); + const full = zh + " " + en + " " + addr; + const hit = allTokenSets.some((tokens) => + tokens.every((t) => full.includes(t)), + ); if (hit) { matches.push(e); if (matches.length >= 30) break; } } + const addr = (e: KGGeoEntity) => + [e.province, e.city, e.district].filter(Boolean).join(" "); return matches.map((e) => ({ value: String(e.id), label: ( -
- - {e.name_zh} - {(e.province || e.city || e.district) ? ( - - {[e.province, e.city, e.district].filter(Boolean).join("")} - - ) : e.name_en ? ( - {e.name_en} - ) : null} - - {TYPE_LABEL[e.entity_type] || e.entity_type} +
+ + {e.name_zh} + + {addr(e) || e.name_en || ""}
), @@ -187,7 +179,7 @@ export default function KGMapPage() { onSearch={setSearchQuery} onChange={setSearchQuery} onSelect={handleSearchSelect} - placeholder="搜索(名称/地址,如:福建崇恩)" + placeholder="搜索(名称/地址)" allowClear style={{ width: 280, marginLeft: "auto" }} popupMatchSelectWidth={380}