Skip to content

Commit 4cffb21

Browse files
Merge pull request #15 from JYinherit/main
Arabic words similarity check
2 parents f782263 + e1e1375 commit 4cffb21

File tree

3 files changed

+373
-1
lines changed

3 files changed

+373
-1
lines changed

.gitignore

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,3 +43,6 @@ app.*.map.json
4343
/android/app/debug
4444
/android/app/profile
4545
/android/app/release
46+
47+
# Gemini
48+
.gemini/

lib/funcs/utili.dart

Lines changed: 369 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,8 @@ import 'package:dio/dio.dart';
1010
import 'package:flutter/material.dart';
1111
import 'package:just_audio/just_audio.dart';
1212
import 'package:provider/provider.dart';
13+
import 'dart:math';
14+
import 'package:bk_tree/bk_tree.dart';
1315

1416
/// 下载文件到指定的目录
1517
///
@@ -219,4 +221,370 @@ Map<K, V> deepMerge<K, V>(Map<K, V> base, Map<K, V> overlay) {
219221
}
220222
});
221223
return result;
222-
}
224+
}
225+
226+
227+
/// Coarse part-of-speech tag used to tell word categories apart.
enum ArabicPOS {
  /// Verbs.
  verb,

  /// Nouns, which here also covers adjectives, participles and verbal nouns.
  noun,

  /// No tag could be determined, e.g. the word was too short or matched no
  /// pattern.
  unknown,
}
233+
234+
/// Immutable outcome of analyzing a single word.
class AnalysisResult {
  /// Creates a result from its [root], [patternName] and [pos].
  ///
  /// The constructor is `const` because every field is final; existing
  /// non-const call sites keep working unchanged.
  const AnalysisResult(this.root, this.patternName, this.pos);

  /// The extracted root.
  ///
  /// Three letters when a pattern matched; the "Too Short" and fallback paths
  /// of `ArabicStemmer.analyze` may produce other lengths.
  final String root;

  /// Name of the morphological pattern that matched.
  final String patternName;

  /// Part-of-speech tag associated with the matched pattern.
  final ArabicPOS pos;
}
244+
245+
/// Library-private description of one morphological pattern.
class _RootPattern {
  /// Compiles [pattern] into [regex] and stores the descriptive fields.
  _RootPattern(
    this.name,
    String pattern,
    this.pos, {
    this.groups = const [1, 2, 3],
  }) : regex = RegExp(pattern);

  /// Human-readable pattern name, e.g. "Form X (Past)".
  final String name;

  /// Regular expression a normalized word is matched against.
  final RegExp regex;

  /// Capture-group indices that hold the root letters R1, R2 and R3, in that
  /// order.
  final List<int> groups;

  /// Part of speech assigned to words that match this pattern.
  final ArabicPOS pos;
}
259+
260+
/// A pattern-based root extractor (stemmer) for Arabic words.
///
/// A word is normalized, matched against an ordered library of morphological
/// patterns, and reduced to the three root letters captured by the first
/// pattern that matches. Words matching no pattern fall back to simple affix
/// stripping. Sharing a root is what the similarity helpers in this file use
/// to relate derived words to each other.
class ArabicStemmer {
  /// Marks removed by [normalize]: the listed combining-mark ranges plus the
  /// tatweel (U+0640).
  static final _diacritics = RegExp(r'[\u064B-\u065F\u0640\u0670\u06D6-\u06ED]');

  /// Alef variants that [normalize] folds into a bare Alef. Compiled once
  /// instead of on every call.
  static final _alefVariants = RegExp(r'[أإآ]');

  /// Pattern library. Order is significant: [analyze] returns on the first
  /// match, so longer / more specific patterns are listed before shorter /
  /// more general ones.
  ///
  /// NOTE(review): five-letter words starting with 'انت' hit "Form VII (Past)"
  /// before "Form VIII (Past)"; confirm that priority is intended.
  static final List<_RootPattern> _patterns = [
    // --- Form X (استفعل) ---
    _RootPattern("Form X (Past)", r'^است(.)(.)(.)$', ArabicPOS.verb),
    _RootPattern("Form X (Present)", r'^يست(.)(.)(.)$', ArabicPOS.verb),
    _RootPattern("Form X (Participle)", r'^مست(.)(.)(.)$', ArabicPOS.noun),

    // --- Instrumental (Mif'aal - مفعال) ---
    // e.g., Miftah (مفتاح) -> F-T-H
    // Shape: Meem + R1 + R2 + Alef + R3
    _RootPattern("Instrumental (Mif'aal)", r'^م(.)(.)ا(.)$', ArabicPOS.noun),

    // --- Form I Passive (مفعول) ---
    // e.g., Maktub (مكتوب)
    // Shape: Meem + R1 + R2 + Waw + R3
    _RootPattern("Form I (Passive)", r'^م(.)(.)و(.)$', ArabicPOS.noun),

    // --- Form VII (انفعل) ---
    _RootPattern("Form VII (Past)", r'^ان(.)(.)(.)$', ArabicPOS.verb),
    _RootPattern("Form VII (Present)", r'^ين(.)(.)(.)$', ArabicPOS.verb),
    _RootPattern("Form VII (Participle)", r'^من(.)(.)(.)$', ArabicPOS.noun),

    // --- Form VIII (افتعل) ---
    _RootPattern("Form VIII (Past)", r'^ا(.)ت(.)(.)$', ArabicPOS.verb),
    _RootPattern("Form VIII (Present)", r'^ي(.)ت(.)(.)$', ArabicPOS.verb),
    _RootPattern("Form VIII (Participle)", r'^م(.)ت(.)(.)$', ArabicPOS.noun),

    // --- Form VI (تفاعل) ---
    _RootPattern("Form VI (Past)", r'^ت(.)ا(.)(.)$', ArabicPOS.verb),
    _RootPattern("Form VI (Present)", r'^يت(.)ا(.)(.)$', ArabicPOS.verb),
    _RootPattern("Form VI (Participle)", r'^مت(.)ا(.)(.)$', ArabicPOS.noun),

    // --- Form III (فاعل) ---
    // The active-participle shape is usually nominal but can also be a
    // past-tense verb; it is tagged as a noun for now.
    _RootPattern("Form III/I-Active", r'^(.)ا(.)(.)$', ArabicPOS.noun),
    _RootPattern("Form III (Present)", r'^ي(.)ا(.)(.)$', ArabicPOS.verb),
    _RootPattern("Form III (Participle)", r'^م(.)ا(.)(.)$', ArabicPOS.noun),

    // --- Form V (تفعّل) ---
    _RootPattern("Form V (Past)", r'^ت(.)(.)(.)$', ArabicPOS.verb),
    _RootPattern("Form V (Present)", r'^يت(.)(.)(.)$', ArabicPOS.verb),
    _RootPattern("Form V (Participle)", r'^مت(.)(.)(.)$', ArabicPOS.noun),

    // --- Masdar Form II/V (Taf'aal) ---
    _RootPattern("Masdar (Taf'aal)", r'^ت(.)(.)ا(.)$', ArabicPOS.noun),

    // --- Elative/Comparative (Af'al - أفعل) ---
    // e.g., Akbar (أكبر) -> K-B-R
    // After normalization: Alef + R1 + R2 + R3
    // Note: this also covers Form IV Past (Af'ala - أكرم).
    _RootPattern("Comparative (Af'al)", r'^ا(.)(.)(.)$', ArabicPOS.noun),

    // --- Elative Fem (Fu'la - فعلى) ---
    // e.g., Kubra (كبرى) -> K-B-R
    // After normalization: R1 + R2 + R3 + Alef (from Alef Maqsura).
    // Must be exactly four letters ending in Alef.
    _RootPattern("Comparative Fem (Fu'la)", r'^(.)(.)(.)ا$', ArabicPOS.noun),

    // --- Form IV (Participle) ---
    _RootPattern("Form IV (Participle)", r'^م(.)(.)(.)$', ArabicPOS.noun),

    // --- Default Form I Present (Yaf'alu) ---
    _RootPattern("Form I (Present)", r'^ي(.)(.)(.)$', ArabicPOS.verb),
  ];

  /// Returns [text] pre-processed into the canonical form the patterns expect.
  ///
  /// Removes diacritics and tatweel, folds the Alef variants and Alef Maqsura
  /// into a bare Alef, deletes every Ta Marbuta, and trims surrounding
  /// whitespace. An empty input yields an empty string.
  String normalize(String text) {
    if (text.isEmpty) return "";

    final normalized = text
        // Strip vowel marks and tatweel.
        .replaceAll(_diacritics, '')
        // Unify the different Alef forms.
        .replaceAll(_alefVariants, 'ا')
        // Treat Alef Maqsura as Alef.
        .replaceAll('ى', 'ا')
        // Ta Marbuta is deleted (not mapped to 'ه') so that e.g.
        // 'مكتبة' becomes 'مكتب'.
        .replaceAll('ة', '');

    return normalized.trim();
  }

  /// Analyzes [word] and returns its root, the matched pattern and a POS tag.
  ///
  /// Normalized words of two letters or fewer are returned as-is with the
  /// pattern name "Too Short". Otherwise the first matching entry of the
  /// pattern library wins; if none matches, the affix-stripped stem is
  /// returned under the name "Fallback/Form I" with [ArabicPOS.unknown].
  AnalysisResult analyze(String word) {
    final stem = normalize(word);

    if (stem.length <= 2) {
      return AnalysisResult(stem, "Too Short", ArabicPOS.unknown);
    }

    for (final pattern in _patterns) {
      final match = pattern.regex.firstMatch(stem);
      if (match == null) continue;

      // Every pattern captures exactly the groups it lists, so these are
      // non-null whenever the pattern matched.
      final r1 = match.group(pattern.groups[0])!;
      final r2 = match.group(pattern.groups[1])!;
      final r3 = match.group(pattern.groups[2])!;
      return AnalysisResult(r1 + r2 + r3, pattern.name, pattern.pos);
    }

    return AnalysisResult(
      _fallbackStripping(stem),
      "Fallback/Form I",
      ArabicPOS.unknown,
    );
  }

  /// Returns only the root string of [analyze] for [word].
  String extractRoot(String word) {
    return analyze(word).root;
  }

  /// Fallback: simplifies [stem] by stripping common prefixes and suffixes.
  ///
  /// Stems of three letters or fewer are returned unchanged. Without that
  /// guard the unconditional article/conjunction stripping below could eat
  /// root letters or even produce an empty root (the stem 'وال' would lose
  /// 'و' and then 'ال'). Longer stems are processed exactly as before.
  String _fallbackStripping(String stem) {
    if (stem.length <= 3) return stem;

    var s = stem;

    // Conjunction fused with the article: drop the conjunction first, the
    // article is handled by the next rule.
    if (s.startsWith('وال') || s.startsWith('فال')) s = s.substring(1);
    if (s.startsWith('لل') || s.startsWith('ال')) s = s.substring(2);
    if (s.length > 3 && (s.startsWith('و') || s.startsWith('ف'))) {
      s = s.substring(1);
    }

    if (s.length > 4) {
      if (s.endsWith('ات') || s.endsWith('ون') || s.endsWith('ين')) {
        s = s.substring(0, s.length - 2);
      } else if (s.endsWith('ي')) {
        s = s.substring(0, s.length - 1);
      }
      // A trailing 'ه' is intentionally not stripped: Ta Marbuta is no longer
      // mapped to 'ه', so a remaining 'ه' is either a native letter or a
      // pronoun suffix and needs more careful handling.
    }

    return s;
  }
}
394+
395+
/// Returns the Levenshtein edit distance between [s] and [t].
///
/// The distance is the minimum number of single-character insertions,
/// deletions or substitutions needed to turn one string into the other.
/// Characters are compared via `String` indexing, i.e. per UTF-16 code unit.
int getLevenshtein(String s, String t) {
  if (s == t) return 0;
  if (s.isEmpty) return t.length;
  if (t.isEmpty) return s.length;

  final width = t.length + 1;
  // Two rolling rows of the DP table: the finished row above and the row
  // currently being filled.
  var previous = List<int>.generate(width, (col) => col);
  var current = List<int>.filled(width, 0);

  for (var row = 0; row < s.length; row++) {
    current[0] = row + 1;
    for (var col = 0; col < t.length; col++) {
      final substitution = previous[col] + (s[row] == t[col] ? 0 : 1);
      final deletion = previous[col + 1] + 1;
      final insertion = current[col] + 1;
      current[col + 1] = min(insertion, min(deletion, substitution));
    }
    // The row just filled becomes the reference row for the next iteration.
    final spare = previous;
    previous = current;
    current = spare;
  }

  return previous[t.length];
}
418+
419+
/// Library-private stemmer instance shared by [getArabicWordsSimilarity] and
/// [BKSearch.searchWithTiers].
final _arabicStemmer = ArabicStemmer();
420+
421+
/// Ranks [candidate] as a confusable item for [target].
///
/// Returned tiers:
/// * 1 — same root, same known part of speech (highest quality; tests sense
///   discrimination)
/// * 2 — root at distance 1, same known part of speech (tests spelling)
/// * 3 — same root, different or unknown part of speech (easier)
/// * 4 — root at distance 1, different or unknown part of speech (visual
///   distractor)
/// * 0 — unrelated (root distance greater than 1)
int _calculateTier(AnalysisResult target, AnalysisResult candidate) {
  final distance = getLevenshtein(target.root, candidate.root);

  // Anything further than one edit apart is not considered related.
  if (distance > 1) return 0;

  // "Same POS" only counts when the tag is actually known.
  final samePos =
      target.pos != ArabicPOS.unknown && target.pos == candidate.pos;

  if (distance == 0) {
    return samePos ? 1 : 3;
  }
  return samePos ? 2 : 4;
}
447+
448+
/// Returns how similar two Arabic words are, as an edit distance.
///
/// Both [wordA] and [wordB] are reduced to their roots first; the result is
/// the Levenshtein distance between those roots, so a smaller value means the
/// words are more similar.
int getArabicWordsSimilarity(String wordA, String wordB) => getLevenshtein(
      _arabicStemmer.extractRoot(wordA),
      _arabicStemmer.extractRoot(wordB),
    );
458+
459+
/// Fast similar-word lookup backed by a BK-tree over extracted roots.
class VocabularyOptimizer {
  final _stemmer = ArabicStemmer();

  /// Tree over the indexed roots; null until [build] has seen at least one
  /// word with a non-empty root.
  BKTree? _bkTree;

  /// Each indexed root mapped to the original words that reduced to it.
  final Map<String, Set<String>> _rootToWordsMap = {};

  /// Rebuilds the index from [words], discarding any previous contents.
  void build(List<String> words) {
    _rootToWordsMap.clear();
    // Also drop the old tree: otherwise a rebuild that yields no roots would
    // leave a stale index from the previous build in place.
    _bkTree = null;

    for (final word in words) {
      _addWordToMap(word);
    }

    // The tree is keyed and valued by the root string itself.
    final rootMap = {for (var r in _rootToWordsMap.keys) r: r};
    if (rootMap.isNotEmpty) {
      _bkTree = BKTree(rootMap, getLevenshtein);
    }
  }

  /// Files [word] under its extracted root; words with an empty root are
  /// ignored.
  void _addWordToMap(String word) {
    final root = _stemmer.extractRoot(word);
    if (root.isEmpty) return;

    _rootToWordsMap.putIfAbsent(root, () => <String>{}).add(word);
  }

  /// Returns every indexed word whose root is within [maxDistance] edits of
  /// the root of [word].
  ///
  /// Returns an empty list when nothing has been indexed or when [word] has
  /// an empty root.
  List<String> findSimilarWords(String word, {int maxDistance = 1}) {
    final tree = _bkTree;
    if (tree == null) return [];

    final queryRoot = _stemmer.extractRoot(word);
    if (queryRoot.isEmpty) return [];

    final results = tree.search(queryHash: queryRoot, tolerance: maxDistance);

    final resultWords = <String>[];
    for (final match in results) {
      // Each hit is handled as a map whose first key is the matched root.
      if (match is Map && match.isNotEmpty) {
        final root = match.keys.first as String;
        final words = _rootToWordsMap[root];
        if (words != null) {
          resultWords.addAll(words);
        }
      }
    }
    return resultWords;
  }
}
510+
511+
/// Static entry point to a single shared [VocabularyOptimizer].
///
/// Usage:
/// 1. Initialize: `BKSearch.init(['ktb', 'maktaba', ...]);`
/// 2. Search: `var results = BKSearch.search('kitab');`
class BKSearch {
  // Private constructor: this class is never instantiated.
  BKSearch._();

  // The one optimizer instance behind all static calls.
  static final VocabularyOptimizer _optimizer = VocabularyOptimizer();
  static bool _isInitialized = false;

  /// Builds the search index from [allWords]. Must be called before
  /// searching.
  ///
  /// Typically called at app start-up or when the vocabulary is loaded.
  /// Calls after the first successful one are ignored.
  static void init(List<String> allWords) {
    if (_isInitialized) return; // Avoid building the index twice.

    // Logged through debugPrint, like the warnings in the search methods.
    debugPrint("正在构建 BK-Tree 搜索索引,词库大小: ${allWords.length}...");
    final stopwatch = Stopwatch()..start();

    _optimizer.build(allWords);

    stopwatch.stop();
    _isInitialized = true;
    debugPrint("BK-Tree 索引构建完成,耗时: ${stopwatch.elapsedMilliseconds}ms");
  }

  /// Plain search: returns all words similar to [query].
  ///
  /// [threshold] is the tolerated edit distance between roots; the default of
  /// 1 allows a one-character difference. Returns an empty list (and logs a
  /// warning) when [init] has not been called.
  static List<String> search(String query, {int threshold = 1}) {
    if (!_isInitialized) {
      debugPrint("警告: BKSearch 尚未初始化,请先调用 init()");
      return [];
    }
    return _optimizer.findSimilarWords(query, maxDistance: threshold);
  }

  /// Tiered search for words confusable with [targetWord].
  ///
  /// Returns a map from tier (1-4) to the words in that tier:
  /// * Tier 1: same root + same part of speech (highest quality)
  /// * Tier 2: near root (distance 1) + same part of speech
  /// * Tier 3: same root + different part of speech
  /// * Tier 4: near root (distance 1) + different/unknown part of speech
  ///
  /// All four keys are always present. When [init] has not been called, every
  /// list is empty and a warning is logged.
  static Map<int, List<String>> searchWithTiers(String targetWord) {
    if (!_isInitialized) {
      debugPrint("警告: BKSearch 尚未初始化,无法执行分级搜索");
      return {1: [], 2: [], 3: [], 4: []};
    }

    // 1. Analyze the target word.
    final targetAnalysis = _arabicStemmer.analyze(targetWord);

    // 2. Use the index to fetch candidates whose root is within distance 1,
    //    so only those need the finer classification below.
    final candidates = _optimizer.findSimilarWords(targetWord, maxDistance: 1);

    final Map<int, List<String>> result = {
      1: [],
      2: [],
      3: [],
      4: [],
    };

    // 3. Classify each candidate into its tier.
    for (final candidateStr in candidates) {
      if (candidateStr == targetWord) continue; // Skip the target itself.

      final candidateAnalysis = _arabicStemmer.analyze(candidateStr);

      final tier = _calculateTier(targetAnalysis, candidateAnalysis);
      if (tier > 0) {
        result[tier]!.add(candidateStr);
      }
    }

    return result;
  }

  /// Whether [init] has completed.
  static bool get isReady => _isInitialized;
}

pubspec.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,7 @@ dependencies:
5656
# Use with the CupertinoIcons class for iOS style icons.
5757
cupertino_icons: ^1.0.8
5858
dart_pubspec_licenses: ^3.0.14
59+
bk_tree: ^0.1.2
5960

6061
dev_dependencies:
6162
flutter_test:

0 commit comments

Comments
 (0)