Skip to content
Merged
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
335 changes: 335 additions & 0 deletions lib/funcs/utili.dart
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ import 'package:dio/dio.dart';
import 'package:flutter/material.dart';
import 'package:just_audio/just_audio.dart';
import 'package:provider/provider.dart';
import 'dart:math';

/// 下载文件到指定的目录
///
Expand Down Expand Up @@ -253,4 +254,338 @@ Map<K, V> deepMerge<K, V>(Map<K, V> base, Map<K, V> overlay) {
}
});
return result;
}
/// The outcome of analysing one Arabic word with [ArabicStemmer].
class AnalysisResult {
  /// The extracted root.
  ///
  /// Three letters when a morphological pattern matched; otherwise the
  /// normalized (and possibly affix-stripped) word itself.
  final String root;

  /// The name of the pattern, or fallback label, that produced [root].
  final String patternName;

  /// Creates a result pairing [root] with the [patternName] that produced it.
  ///
  /// The constructor is `const` because every field is final, which lets
  /// callers build compile-time constant results.
  const AnalysisResult(this.root, this.patternName);
}

/// Internal helper describing a single morphological pattern.
class _RootPattern {
  /// Builds a pattern called [name] whose regular expression is compiled from
  /// [source]; [groups] defaults to the first three capture groups.
  _RootPattern(this.name, String source, {this.groups = const [1, 2, 3]})
      : regex = RegExp(source);

  /// Human-readable label of the pattern, e.g. "Form X (Past)".
  final String name;

  /// Compiled regular expression a word is matched against.
  final RegExp regex;

  /// Capture-group indices locating the root letters (R1, R2, R3) inside a
  /// match of [regex].
  final List<int> groups;
}

/// A pattern-based root extractor (stemmer) for Arabic words.
///
/// A word is first normalized, then matched against an ordered library of
/// morphological templates; the first template that matches supplies the
/// three root letters. Words that match no template fall back to a
/// conservative prefix/suffix stripping step. Reducing derived words to a
/// shared root is what makes them comparable for similarity purposes.
class ArabicStemmer {
  /// Fewest letters that must remain after a prefix is removed in the
  /// fallback path, so a short word is never stripped down to a fragment.
  static const int _minStemLength = 3;

  // Characters removed during normalization: U+064B-U+065F combining marks,
  // U+0640 (tatweel), U+0670 (superscript alef) and the U+06D6-U+06ED range.
  static final RegExp _diacritics =
      RegExp(r'[\u064B-\u065F\u0640\u0670\u06D6-\u06ED]');

  // Hamza/madda-carrying Alef forms that are folded into a bare Alef.
  // Compiled once instead of on every normalize() call.
  static final RegExp _alefVariants = RegExp(r'[أإآ]');

  // Pattern library. Order matters: longer / more specific templates come
  // first, shorter / more generic ones last, and the first match wins.
  static final List<_RootPattern> _patterns = [
    // --- Form X (استفعل) ---
    _RootPattern("Form X (Past)", r'^است(.)(.)(.)$'),
    _RootPattern("Form X (Present)", r'^يست(.)(.)(.)$'),
    _RootPattern("Form X (Participle)", r'^مست(.)(.)(.)$'),

    // --- Instrumental (Mif'aal - مفعال) ---
    // e.g. Miftah (مفتاح) -> F-T-H
    // Regex: Meem + R1 + R2 + Alef + R3
    _RootPattern("Instrumental (Mif'aal)", r'^م(.)(.)ا(.)$'),

    // --- Form I Passive (مفعول) ---
    // e.g. Maktub (مكتوب)
    // Regex: Meem + R1 + R2 + Waw + R3
    _RootPattern("Form I (Passive)", r'^م(.)(.)و(.)$'),

    // --- Form VII (انفعل) ---
    _RootPattern("Form VII (Past)", r'^ان(.)(.)(.)$'),
    _RootPattern("Form VII (Present)", r'^ين(.)(.)(.)$'),
    _RootPattern("Form VII (Participle)", r'^من(.)(.)(.)$'),

    // --- Form VIII (افتعل) ---
    _RootPattern("Form VIII (Past)", r'^ا(.)ت(.)(.)$'),
    _RootPattern("Form VIII (Present)", r'^ي(.)ت(.)(.)$'),
    _RootPattern("Form VIII (Participle)", r'^م(.)ت(.)(.)$'),

    // --- Form VI (تفاعل) ---
    _RootPattern("Form VI (Past)", r'^ت(.)ا(.)(.)$'),
    _RootPattern("Form VI (Present)", r'^يت(.)ا(.)(.)$'),
    _RootPattern("Form VI (Participle)", r'^مت(.)ا(.)(.)$'),

    // --- Form III (فاعل) ---
    _RootPattern("Form III/I-Active", r'^(.)ا(.)(.)$'),
    _RootPattern("Form III (Present)", r'^ي(.)ا(.)(.)$'),
    _RootPattern("Form III (Participle)", r'^م(.)ا(.)(.)$'),

    // --- Form V (تفعّل) ---
    _RootPattern("Form V (Past)", r'^ت(.)(.)(.)$'),
    _RootPattern("Form V (Present)", r'^يت(.)(.)(.)$'),
    _RootPattern("Form V (Participle)", r'^مت(.)(.)(.)$'),

    // --- Masdar Form II/V (Taf'aal) ---
    _RootPattern("Masdar (Taf'aal)", r'^ت(.)(.)ا(.)$'),

    // --- Elative/Comparative (Af'al - أفعل) ---
    // e.g. Akbar (أكبر) -> K-B-R
    // After normalization: Alef + R1 + R2 + R3
    // Note: this also covers Form IV Past (Af'ala - أكرم)
    _RootPattern("Comparative (Af'al)", r'^ا(.)(.)(.)$'),

    // --- Elative Fem (Fu'la - فعلى) ---
    // e.g. Kubra (كبرى) -> K-B-R
    // After normalization: R1 + R2 + R3 + Alef (from Alef Maqsura)
    // Must be exactly four letters ending in Alef
    _RootPattern("Comparative Fem (Fu'la)", r'^(.)(.)(.)ا$'),

    // --- Form IV (Participle) ---
    _RootPattern("Form IV (Participle)", r'^م(.)(.)(.)$'),

    // --- Default Form I Present (Yaf'alu) ---
    _RootPattern("Form I (Present)", r'^ي(.)(.)(.)$'),
  ];

  /// Pre-processes and normalizes an Arabic word.
  ///
  /// Removes diacritics, folds the Alef variants and Alef Maqsura into a bare
  /// Alef, deletes every Ta Marbuta and trims surrounding whitespace.
  /// Returns an empty string for empty input.
  String normalize(String text) {
    if (text.isEmpty) return "";
    return text
        // Drop all vowel marks and related signs.
        .replaceAll(_diacritics, '')
        // Unify the different written forms of Alef.
        .replaceAll(_alefVariants, 'ا')
        // Fold Alef Maqsura into Alef.
        .replaceAll('ى', 'ا')
        // Ta Marbuta is deleted outright (it used to be mapped to 'ه') so
        // that e.g. 'مكتبة' becomes 'مكتب'.
        .replaceAll('ة', '')
        .trim();
  }

  /// Analyses [word] and returns its root together with the matched pattern.
  ///
  /// Normalized words of two letters or fewer are returned unchanged with the
  /// label "Too Short". When no pattern matches, the result of the
  /// affix-stripping fallback is returned with the label "Fallback/Form I".
  AnalysisResult analyze(String word) {
    final stem = normalize(word);
    if (stem.length <= 2) return AnalysisResult(stem, "Too Short");

    // Walk the library in priority order; the first matching pattern wins.
    for (final pattern in _patterns) {
      final match = pattern.regex.firstMatch(stem);
      if (match != null) {
        final r1 = match.group(pattern.groups[0])!;
        final r2 = match.group(pattern.groups[1])!;
        final r3 = match.group(pattern.groups[2])!;
        return AnalysisResult(r1 + r2 + r3, pattern.name);
      }
    }

    // No template matched: fall back to stripping common affixes.
    return AnalysisResult(_fallbackStripping(stem), "Fallback/Form I");
  }

  /// Extracts the root of [word], returning only the root string.
  String extractRoot(String word) {
    return analyze(word).root;
  }

  /// Fallback: simplifies [stem] by removing common prefixes and suffixes.
  ///
  /// The definite article (optionally preceded by the conjunction و / ف) is
  /// removed only when at least [_minStemLength] letters remain. Without that
  /// guard a three-letter word that merely starts with "ال" (such as the
  /// normalized form of "ألم") would collapse to a single letter, and "وال"
  /// would collapse to an empty string.
  String _fallbackStripping(String stem) {
    var s = stem;

    if ((s.startsWith('وال') || s.startsWith('فال')) &&
        s.length - 3 >= _minStemLength) {
      // Conjunction + article.
      s = s.substring(3);
    } else if ((s.startsWith('لل') || s.startsWith('ال')) &&
        s.length - 2 >= _minStemLength) {
      // Article on its own (or fused with the preposition Lam).
      s = s.substring(2);
    }
    if (s.length > 3 && (s.startsWith('و') || s.startsWith('ف'))) {
      s = s.substring(1);
    }

    if (s.length > 4) {
      if (s.endsWith('ات') || s.endsWith('ون') || s.endsWith('ين')) {
        s = s.substring(0, s.length - 2);
      } else if (s.endsWith('ي')) {
        s = s.substring(0, s.length - 1);
      }
      // A trailing 'ه' (Ha) is deliberately not removed here: 'ة' is no
      // longer converted to 'ه', so any 'ه' is either a genuine root letter
      // or a pronoun suffix and must be treated with care.
    }

    return s;
  }
}

/// Computes the Levenshtein edit distance between [s] and [t].
///
/// The distance is the minimum number of single-unit edits (insertions,
/// deletions or substitutions) needed to turn one string into the other.
/// Comparison is done per UTF-16 code unit.
int getLevenshtein(String s, String t) {
  if (s == t) return 0;
  if (s.isEmpty) return t.length;
  if (t.isEmpty) return s.length;

  final source = s.codeUnits;
  final target = t.codeUnits;
  final width = target.length;

  // Two rolling rows of the dynamic-programming table.
  var previous = List<int>.generate(width + 1, (k) => k);
  var current = List<int>.filled(width + 1, 0);

  for (var row = 0; row < source.length; row++) {
    current[0] = row + 1;
    for (var col = 0; col < width; col++) {
      final insertion = current[col] + 1;
      final deletion = previous[col + 1] + 1;
      final substitution =
          previous[col] + (source[row] == target[col] ? 0 : 1);
      current[col + 1] = min(insertion, min(deletion, substitution));
    }
    // The row just filled becomes the reference row for the next pass.
    final spare = previous;
    previous = current;
    current = spare;
  }
  return previous[width];
}

/// Shared stemmer instance used by [getArabicWordsSimilarity].
final ArabicStemmer _arabicStemmer = ArabicStemmer();

/// Measures how similar two Arabic words are, as an edit distance.
///
/// [wordA] and [wordB] are each reduced to a root by the shared stemmer, and
/// the two roots are then compared with [getLevenshtein].
///
/// Returns the Levenshtein distance between the roots: the smaller the value,
/// the more similar the words.
int getArabicWordsSimilarity(String wordA, String wordB) => getLevenshtein(
      _arabicStemmer.extractRoot(wordA),
      _arabicStemmer.extractRoot(wordB),
    );

// Fast similar-word search built on a BK-tree.

/// A single node of a [BKTree].
class _BKTreeNode {
  /// Creates a childless node holding [term].
  _BKTreeNode(this.term);

  /// The string stored at this node.
  final String term;

  /// Child nodes, keyed by the edit distance between the child's term and
  /// [term].
  final Map<int, _BKTreeNode> children = <int, _BKTreeNode>{};
}

/// A BK-tree of strings that uses [getLevenshtein] as its distance metric.
class BKTree {
  _BKTreeNode? _root;

  /// Inserts [term] into the tree.
  ///
  /// The first term becomes the root. A term at distance zero from a node
  /// already on its insertion path is treated as a duplicate and ignored.
  void add(String term) {
    final root = _root;
    if (root == null) {
      _root = _BKTreeNode(term);
      return;
    }

    _BKTreeNode node = root;
    while (true) {
      final distance = getLevenshtein(node.term, term);
      if (distance == 0) return; // Term already in tree.

      final next = node.children[distance];
      if (next == null) {
        // Free slot for this distance: attach the new term here.
        node.children[distance] = _BKTreeNode(term);
        return;
      }
      node = next;
    }
  }

  /// Returns every stored term whose distance to [term] is at most
  /// [maxDistance].
  ///
  /// Only children whose edge distance lies within `distance ± maxDistance`
  /// of the current node are visited. Returns an empty list for an empty
  /// tree.
  List<String> search(String term, int maxDistance) {
    final root = _root;
    if (root == null) return [];

    final matches = <String>[];
    final pending = <_BKTreeNode>[root];

    while (pending.isNotEmpty) {
      final node = pending.removeLast();
      final distance = getLevenshtein(node.term, term);

      if (distance <= maxDistance) {
        matches.add(node.term);
      }

      final lower = distance - maxDistance;
      final upper = distance + maxDistance;

      for (final edge in node.children.entries) {
        if (edge.key < lower || edge.key > upper) continue;
        pending.add(edge.value);
      }
    }
    return matches;
  }
}
/// Indexes a vocabulary by stemmed root so that words with similar roots can
/// be looked up quickly.
///
/// Each distinct root is stored once in a [BKTree]; a side map records which
/// original words produced each root.
class VocabularyOptimizer {
  final _stemmer = ArabicStemmer();

  // Not final: build() replaces the tree so that a rebuild starts empty.
  BKTree _tree = BKTree();

  // Root -> every word that stems to that root.
  final Map<String, Set<String>> _rootToWordsMap = {};

  /// Rebuilds the index from scratch using [words].
  ///
  /// Both the root map and the BK-tree are reset. Clearing only the map would
  /// leave roots from a previous build in the tree, where every later search
  /// would still have to visit them.
  void build(List<String> words) {
    _rootToWordsMap.clear();
    _tree = BKTree();
    for (final word in words) {
      addWord(word);
    }
  }

  /// Adds a single [word] to the index.
  ///
  /// Words whose extracted root is empty are ignored. A root is inserted into
  /// the tree only the first time it is seen.
  void addWord(String word) {
    final root = _stemmer.extractRoot(word);
    if (root.isEmpty) return;

    final bucket = _rootToWordsMap[root];
    if (bucket != null) {
      bucket.add(word);
      return;
    }
    _rootToWordsMap[root] = {word};
    _tree.add(root);
  }

  /// Finds all indexed words whose root is within [maxDistance] edits of the
  /// root of [word].
  ///
  /// Returns an empty list when the root of [word] is empty.
  List<String> findSimilarWords(String word, {int maxDistance = 1}) {
    final queryRoot = _stemmer.extractRoot(word);
    if (queryRoot.isEmpty) return [];

    // Look up similar roots in the tree, then expand each root to its words.
    return [
      for (final root in _tree.search(queryRoot, maxDistance))
        ...?_rootToWordsMap[root],
    ];
  }
}
/// Static facade over one shared [VocabularyOptimizer].
///
/// Usage:
/// 1. Initialise: `BKSearch.init(['ktb', 'maktaba', ...]);`
/// 2. Search: `var results = BKSearch.search('kitab');`
/// 3. Insert: `BKSearch.insert('newWord');`
class BKSearch {
  // Private constructor: the class is never instantiated.
  BKSearch._();

  // The single shared index.
  static final VocabularyOptimizer _optimizer = VocabularyOptimizer();
  static bool _isInitialized = false;

  /// Builds the search index from [allWords]. Must be called before [search].
  ///
  /// Typically called at app start-up or when the vocabulary is loaded.
  /// Calls made after the first successful one are ignored.
  static void init(List<String> allWords) {
    if (_isInitialized) return; // Avoid rebuilding the index.
    // Logged through debugPrint, like the warning in search(), rather than
    // through a bare print().
    debugPrint("正在构建 BK-Tree 搜索索引,词库大小: ${allWords.length}...");
    final stopwatch = Stopwatch()..start();

    _optimizer.build(allWords);

    stopwatch.stop();
    _isInitialized = true;
    debugPrint("BK-Tree 索引构建完成,耗时: ${stopwatch.elapsedMilliseconds}ms");
  }

  /// Returns the indexed words similar to [query].
  ///
  /// [query] is the word typed by the user. [threshold] is the tolerated edit
  /// distance between roots and defaults to 1. Logs a warning and returns an
  /// empty list when [init] has not been called yet.
  static List<String> search(String query, {int threshold = 1}) {
    if (!_isInitialized) {
      debugPrint("警告: BKSearch 尚未初始化,请先调用 init()");
      return [];
    }
    return _optimizer.findSimilarWords(query, maxDistance: threshold);
  }

  /// Adds a single [word] to the shared index.
  ///
  /// Words inserted before [init] are lost, because [init] rebuilds the index
  /// from its own word list.
  static void insert(String word) {
    _optimizer.addWord(word);
  }

  /// Whether [init] has completed.
  static bool get isReady => _isInitialized;
}