tag_select/search.py at main · YILING0013/tag_select · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
# search.py
# -*- coding: utf-8 -*-
from flask import Flask, jsonify, request, Blueprint
import json
import re
import jieba
import Levenshtein as lev

search_blueprint = Blueprint('search', __name__)

# 加载 JSON 数据
def load_json(file_path):
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            return json.load(file)
    except Exception as e:
        print(f"Error loading JSON file: {e}")
        return None

data = load_json('translations_converted.json')
if data is None:
    raise Exception("Failed to load JSON data. Please check the file path and format.")

# 预先对所有键和值进行分词，提升模糊搜索的性能
segmented_data = {}
for key, value in data.items():
    segmented_data[key] = {
         "key_words": list(jieba.cut(str(key))),
         "value_words": list(jieba.cut(str(value)))
    }

# 正则匹配搜索函数
def search_keywords(data, query, max_results):
    results = []
    # 构建正则模式，将查询的每个字符之间用 .* 连接
    pattern = '.*'.join(map(re.escape, query))
    regex = re.compile(pattern, re.IGNORECASE)
    for key, value in data.items():
        # 将键和值转换为字符串进行匹配
        if regex.search(str(key)) or regex.search(str(value)):
            results.append({key: value})
            if len(results) >= max_results:
                break
    return results

# 精确匹配搜索函数
def exact_search(data, query, max_results):
    results = []
    query_lower = query.lower()
    for key, value in data.items():
        if str(key).lower() == query_lower or str(value).lower() == query_lower:
            results.append({key: value})
            if len(results) >= max_results:
                break
    return results

# 模糊匹配搜索函数
def fuzzy_search(data, query, max_distance, max_results):
    results = []
    query_words = list(jieba.cut(query))
    for key, value in data.items():
        seg = segmented_data[key]
        key_words = seg["key_words"]
        value_words = seg["value_words"]
        # 当查询中所有词在键或值中均有匹配时，认为匹配成功
        key_match = all(any(lev.distance(qw, kw) <= max_distance for kw in key_words) for qw in query_words)
        value_match = all(any(lev.distance(qw, vw) <= max_distance for vw in value_words) for qw in query_words)
        if key_match or value_match:
            results.append({key: value})
            if len(results) >= max_results:
                break
    return results

# 限制返回最大数量不超过300
def limit_max_results(max_results):
    if max_results is None or max_results > 300:
        return 300
    return max_results

@search_blueprint.route('/regular_expression', methods=['GET'])
def regular_expression_api():
    query = request.args.get('query')
    max_results = request.args.get('max_results', type=int)
    if not query:
        return jsonify({"error": "No query provided"}), 400
    max_results = limit_max_results(max_results)
    results = search_keywords(data, query, max_results)
    return jsonify(results)

@search_blueprint.route('/fuzzy_search', methods=['GET'])
def fuzzy_search_api():
    query = request.args.get('query')
    max_results = request.args.get('max_results', type=int)
    if not query:
        return jsonify({"error": "No query provided"}), 400
    max_results = limit_max_results(max_results)
    # max_distance 可根据需求调整
    results = fuzzy_search(data, query, max_distance=1, max_results=max_results)
    return jsonify(results)

@search_blueprint.route('/exact_search', methods=['GET'])
def exact_search_api():
    query = request.args.get('query')
    max_results = request.args.get('max_results', type=int)
    if not query:
        return jsonify({"error": "No query provided"}), 400
    max_results = limit_max_results(max_results)
    results = exact_search(data, query, max_results)
    return jsonify(results)