Skip to content

Commit 48d1f33

Browse files
authored
Merge pull request #23 from MrIbrahem/update
Update
2 parents 150f15b + e0e6048 commit 48d1f33

File tree

7 files changed

+243
-33
lines changed

7 files changed

+243
-33
lines changed

python/src/app.py

Lines changed: 28 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
from pyx import logs_bot_new
99
from pyx.wd_data_bots import wd_data_P11038
1010
from pyx.sparql_bots import sparql_bot
11-
from pyx.sparql_bots.render import render_duplicate_by_category, render_sparql_P11038_grouped
11+
from pyx.sparql_bots.render import render_duplicate_by_category, render_duplicate, render_sparql_P11038_grouped
1212
from pyx.bots.not_in_db_bot import get_not_in_db
1313

1414

@@ -55,6 +55,14 @@ def jsonify(data : dict, **kwargs) -> str:
5555
return Response(response=response_json, content_type="application/json; charset=utf-8")
5656

5757

58+
@app.route("/api/duplicate2", methods=["GET"])
def duplicate2_api():
    """Serve the duplicate-lemma groups as JSON.

    Query parameters:
        limit: optional integer forwarded to ``render_duplicate``.  Defaults
            to 0, which is ``render_duplicate``'s own default, so requests
            without the parameter behave as before.

    Returns:
        The response built by ``jsonify`` from the grouped data, with
        ``sparql_exec_time`` and ``len_result`` passed as extra keyword
        arguments.
    """
    # Read `limit` the same way the /duplicate2.html route does, so both
    # endpoints can be tuned from the query string.
    limit = request.args.get('limit', 0, type=int)
    # ---
    data, sparql_exec_time = render_duplicate(limit)
    # ---
    return jsonify(data, sparql_exec_time=sparql_exec_time, len_result=len(data))
64+
65+
5866
@app.route("/api/wd_data_count", methods=["GET"])
5967
def wd_data_api_count():
6068
# ---
@@ -154,10 +162,28 @@ def P11038_wd():
154162
)
155163

156164

165+
@app.route("/duplicate2.html", methods=["GET"])
def duplicate2():
    """Render the ``duplicate2.html`` page.

    Reads the optional ``limit`` query argument (integer, default 10000),
    hands it to ``render_duplicate`` and passes the returned data together
    with the SPARQL execution time to the template.
    """
    max_rows = request.args.get('limit', 10000, type=int)
    grouped, elapsed = render_duplicate(max_rows)
    # The template receives the timing wrapped in a one-key dict.
    return render_template(
        "duplicate2.html",
        result=grouped,
        time_tab={"sparql_exec_time": elapsed},
    )
181+
182+
157183
@app.route("/duplicate.html", methods=["GET"])
158184
def duplicate():
159185
# ---
160-
limit = request.args.get('limit', 10000, type=int)
186+
limit = request.args.get('limit', 50000, type=int)
161187
# ---
162188
data, sparql_exec_time = render_duplicate_by_category(limit)
163189
# ---

python/src/pyx/sparql_bots/render.py

Lines changed: 50 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,20 @@
88
from collections import defaultdict
99
from . import sparql_bot
1010

11+
# Arabic display labels keyed by lexical-category Q-id.  Looked up with
# ``.get(category, "")`` so an unknown category yields an empty label.
categoryLabels = {
    "Q24905": "فعل",  # verb
    "Q111029": "جذر",  # root
    "Q1084": "اسم",  # noun
    "Q34698": "صفة",  # adjective
    "Q147276": "اسم علم",  # proper noun
    "Q4833830": "حرف جر",  # preposition
    "Q9788": "حرف",  # particle / letter
    "Q36484": "حرف ربط",  # conjunction
    "Q468801": "ضمير شخصي",  # personal pronoun
    "Q63116": "اسم عدد",  # numeral
}
23+
# ---
24+
1125

1226
def split_data_by_category_list(data):
1327
# ---
@@ -42,7 +56,7 @@ def split_data_by_category_dict(data):
4256
if category not in split_by_category:
4357
split_by_category[category] = {
4458
'category': category,
45-
'categoryLabel': item['categoryLabel'],
59+
'categoryLabel': item.get('categoryLabel') or categoryLabels.get(category, ""),
4660
'members': {}
4761
}
4862
# ---
@@ -84,7 +98,7 @@ def render_sparql_P11038_grouped(limit=0, group_it=False):
8498
return tab_P11038, sparql_exec_time
8599

86100

87-
def find_duplicates(members):
101+
def duplicates_work(members):
88102
# ---
89103
duplicates = defaultdict(list)
90104
# ---
@@ -115,11 +129,44 @@ def render_duplicate_by_category(limit):
115129
# ---
116130
for cat, tab in split_by_category.items():
117131
# ---
118-
members = find_duplicates(tab["members"])
132+
members = duplicates_work(tab["members"])
119133
# ---
120134
if members:
121135
tab["lemmas"] = members
122136
# ---
123137
new[cat] = tab
124138
# ---
125139
return new, sparql_exec_time
140+
141+
142+
def render_duplicate(limit=0):
    """Fetch duplicate-lemma rows and regroup them for display.

    Args:
        limit: maximum number of rows to request.  A positive value is
            forwarded to ``sparql_bot.find_duplicates``; ``0`` (the
            default), a negative value or ``None`` leaves that function's
            own default limit in effect.

    Returns:
        tuple: ``(groups, sparql_exec_time)``.  ``groups`` maps each
        ``lemma_fixed`` value to a dict with the keys ``lemma``,
        ``category``, ``categoryLabel`` and ``members``; ``members`` is a
        list of ``{"lemma": ..., "item": ...}`` dicts.
    """
    # `limit` used to be accepted but never forwarded, so every caller got
    # find_duplicates' default limit regardless of what it asked for.
    if limit and limit > 0:
        result, sparql_exec_time, _err = sparql_bot.find_duplicates(limit)
    else:
        result, sparql_exec_time, _err = sparql_bot.find_duplicates()
    # ---
    new = {}
    # ---
    # Example row:
    # { "lemma_fixed": "تذكير", "category": "Q1084", "items": "L1457168, L1457168", "lemmas": "تذكير, تَذْكِير" }
    for tab in result:
        # ---
        lemma_fixed = tab['lemma_fixed']
        category = tab['category']
        # ---
        # NOTE(review): groups are keyed by lemma_fixed only.  If the query
        # returns the same lemma_fixed under two categories, the second
        # row's members are appended to the first row's entry - confirm
        # whether that merge is intended.
        group = new.setdefault(lemma_fixed, {
            "lemma": lemma_fixed,
            "category": category,
            "categoryLabel": categoryLabels.get(category, ""),
            'members': [],
        })
        # ---
        # "lemmas" and "items" are parallel comma-separated lists.
        lemmas = tab['lemmas'].split(",")
        items = tab['items'].split(",")
        # ---
        group['members'].extend(
            {"lemma": lemma.strip(), "item": item.strip()}
            for lemma, item in zip(lemmas, items)
        )
    # ---
    return new, sparql_exec_time

python/src/pyx/sparql_bots/sparql_bot.py

Lines changed: 18 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@
1818
endpoint_url = 'https://query.wikidata.org/sparql'
1919

2020

21-
def safe_sparql_query(query):
21+
def safe_sparql_query(query, time_out=10):
2222

2323
if query in sparql_cache and "nocahe" not in sys.argv:
2424
err_bot.log_error("SPARQL Cache Hit", f"Query retrieved from cache: {query}")
@@ -31,7 +31,7 @@ def safe_sparql_query(query):
3131
sparql.setQuery(query)
3232
# ---
3333
sparql.setReturnFormat(JSON)
34-
sparql.setTimeout(10)
34+
sparql.setTimeout(time_out)
3535
# ---
3636
data = sparql.query().convert()
3737
# ---
@@ -61,11 +61,11 @@ def safe_sparql_query(query):
6161
return {}, "SPARQL Unknown Error"
6262

6363

64-
def get_results(query):
64+
def get_results(query, time_out=10, get_err=False):
6565
# ---
6666
now = time.time()
6767
# ---
68-
data, err = safe_sparql_query(query)
68+
data, err = safe_sparql_query(query, time_out=time_out)
6969
# ---
7070
# تنسيق النتائج
7171
result = []
@@ -95,6 +95,9 @@ def get_results(query):
9595
# ---
9696
print(f"SPARQL sparql_exec_time: {sparql_exec_time}")
9797
# ---
98+
if get_err:
99+
return result, sparql_exec_time, err
100+
# ---
98101
return result, sparql_exec_time
99102

100103

@@ -236,23 +239,26 @@ def count_arabic_with_P11038():
236239
return count, sparql_exec_time
237240

238241

239-
def find_duplicates(LIMIT=100):
    """Query for lexemes whose lemmas collide after stripping marks.

    The query selects lexemes with language ``wd:Q13955``, removes every
    character in the ranges U+064B-U+065F and U+066A-U+06EF from the lemma
    (bound as ``?lemma_fixed``), groups by ``?lemma_fixed`` and lexical
    category, and keeps only groups with more than one lexeme.

    Args:
        LIMIT: maximum number of groups to return.  Values that are not
            positive integers (``0``, negatives, ``None``, non-numeric)
            mean "no LIMIT clause".

    Returns:
        tuple: ``(data, sparql_exec_time, err)`` exactly as returned by
        ``get_results(..., get_err=True)``.
    """
    sparql_query = """
        SELECT ?lemma_fixed ?category
            (GROUP_CONCAT(strafter(str(?1_item),"/entity/"); separator=", ") AS ?items)
            (GROUP_CONCAT(?lemma; separator=", ") AS ?lemmas)
        WHERE {
            #service <https://qlever.cs.uni-freiburg.de/api/wikidata> {
            ?1_item dct:language wd:Q13955;
                    wikibase:lemma ?lemma;
                    wikibase:lexicalCategory ?category.
            BIND(REPLACE(STR(?lemma), "[\u064B-\u065F\u066A-\u06EF]", "") AS ?lemma_fixed)
            #}
        }
        GROUP BY ?lemma_fixed ?category
        HAVING(COUNT(?1_item) > 1)
    """
    # ---
    # Coerce before interpolating: only a plain integer may reach the query
    # text, and None / junk must not raise in the comparison below.
    try:
        limit = int(LIMIT)
    except (TypeError, ValueError):
        limit = 0
    # ---
    if limit > 0:
        sparql_query += f" LIMIT {limit}"
    # ---
    # This aggregate query gets a longer timeout than get_results' default.
    data, sparql_exec_time, err = get_results(sparql_query, time_out=35, get_err=True)
    # ---
    return data, sparql_exec_time, err

python/src/static/js/lexemes/compare.js

Lines changed: 2 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,7 @@ function convertDataNew(data) {
6767
}
6868

6969
async function get_wdresult(qids) {
70+
// ---
7071
const sparqlQuery = qids_data_query(qids);
7172
// ---
7273
add_sparql_url(sparqlQuery);
@@ -243,11 +244,7 @@ async function render_tables_container(data) {
243244
});
244245
}
245246

246-
async function load_compare() {
247-
// ---
248-
let qids = get_param_from_window_location("qids", "");
249-
// ---
250-
qids = qids.split(",");
247+
async function load_compare(qids) {
251248
// ---
252249
let data = await get_qids_data(qids);
253250
// ---

python/src/templates/compare.php

Lines changed: 28 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -23,14 +23,8 @@
2323
<span class="text-2xl font-bold text-center h2">
2424
مقارنة المفردات:
2525
</span>
26-
({%for qid in qids %}
27-
{%if not loop.first%} - {%endif%}
28-
<a href="https://www.wikidata.org/entity/{{qid}}" target="_blank">
29-
<span class="fs-5">
30-
<span find-label="{{qid}}" find-label-both="true">{{qid}}</span>
31-
</span>
32-
</a>
33-
{%endfor%})
26+
<span id="qids_span">
27+
</span>
3428
</div>
3529
<div class="col-md-2 col-sm-2 mb-2 mb-md-0">
3630
<a href="#" target="_blank" id="sparql_url" class="btn btn-outline-primary disabled" role="button">
@@ -60,7 +54,32 @@
6054
<script src="/static/js/toggleView_compare.js"></script>
6155
6256
<script>
63-
document.addEventListener('DOMContentLoaded', () => load_compare());
57+
/**
 * Page entry point: reads the comma-separated `qids` URL parameter, renders
 * one Wikidata link per id into #qids_span, then calls load_compare(qids).
 *
 * The ids come straight from the URL, so the links are built as DOM nodes
 * (textContent / setAttribute) rather than by assigning an HTML string to
 * innerHTML; this keeps markup in the parameter from being parsed as HTML.
 */
async function start() {
    // ---
    let qids = get_param_from_window_location("qids", "") || "";
    // ---
    qids = qids ? qids.split(",") : [];
    // ---
    const container = document.getElementById("qids_span");
    if (container) {
        // Drop any previous content; with no ids this renders just "()".
        container.textContent = "";
        container.append("(");
        qids.forEach((qid, index) => {
            if (index > 0) {
                container.append(" - ");
            }
            const link = document.createElement("a");
            link.href = "https://www.wikidata.org/entity/" + encodeURIComponent(qid);
            link.target = "_blank";

            const wrapper = document.createElement("span");
            wrapper.className = "fs-5";

            // Same find-label attributes as the previous markup.
            const label = document.createElement("span");
            label.setAttribute("find-label", qid);
            label.setAttribute("find-label-both", "true");
            label.textContent = qid;

            wrapper.appendChild(label);
            link.appendChild(wrapper);
            container.append(link);
        });
        container.append(")");
    }

    // --- invoke the loader ---
    await load_compare(qids);

}
81+
82+
document.addEventListener('DOMContentLoaded', () => start());
6483
</script>
6584
6685
{% endblock %}

python/src/templates/duplicate.html

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -99,13 +99,14 @@ <h2 class="mb-4" id="header_main">
9999
],
100100
pending: true,
101101
lang: "ar",
102-
paging: true,
102+
paging: false,
103+
info: false,
103104
searching: true,
104105
responsive: {
105106
details: true
106107
// display: $.fn.dataTable.Responsive.display.modal()
107108
},
108-
pageLength: 100
109+
pageLength: 1000
109110

110111
})
111112
});

0 commit comments

Comments
 (0)