Skip to content

Commit 4641f5a

Browse files
authored
Merge pull request #149 from openzim/indexData
Wrap indexData
2 parents 66dac18 + a649a61 commit 4641f5a

File tree

5 files changed

+237
-15
lines changed

5 files changed

+237
-15
lines changed

libzim/libwrapper.cpp

Lines changed: 70 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -71,14 +71,24 @@ ObjWrapper::~ObjWrapper()
7171
template<typename Output>
7272
Output _callMethodOnObj(PyObject *obj, const std::string& methodName, std::string& error);
7373

74+
template<>
75+
bool _callMethodOnObj(PyObject *obj, const std::string& methodName, std::string& error) {
76+
return bool_cy_call_fct(obj, methodName, &error);
77+
}
78+
7479
template<>
7580
std::string _callMethodOnObj(PyObject *obj, const std::string& methodName, std::string& error) {
7681
return string_cy_call_fct(obj, methodName, &error);
7782
}
7883

7984
template<>
8085
uint64_t _callMethodOnObj(PyObject *obj, const std::string& methodName, std::string& error) {
81-
return int_cy_call_fct(obj, methodName, &error);
86+
return uint64_cy_call_fct(obj, methodName, &error);
87+
}
88+
89+
template<>
90+
uint32_t _callMethodOnObj(PyObject *obj, const std::string& methodName, std::string& error) {
91+
return uint32_cy_call_fct(obj, methodName, &error);
8292
}
8393

8494
template<>
@@ -92,12 +102,24 @@ _callMethodOnObj(PyObject *obj, const std::string& methodName, std::string& erro
92102
return std::unique_ptr<zim::writer::ContentProvider>(contentprovider_cy_call_fct(obj, methodName, &error));
93103
}
94104

105+
template<>
106+
std::shared_ptr<zim::writer::IndexData>
107+
_callMethodOnObj(PyObject *obj, const std::string& methodName, std::string& error) {
108+
return std::shared_ptr<zim::writer::IndexData>(indexdata_cy_call_fct(obj, methodName, &error));
109+
}
110+
95111
template<>
96112
zim::writer::Hints
97113
_callMethodOnObj(PyObject *obj, const std::string& methodName, std::string& error) {
98114
return hints_cy_call_fct(obj, methodName, &error);
99115
}
100116

117+
template<>
118+
zim::writer::IndexData::GeoPosition
119+
_callMethodOnObj(PyObject *obj, const std::string& methodName, std::string& error) {
120+
return geoposition_cy_call_fct(obj, methodName, &error);
121+
}
122+
101123
// This C++ function calls a Python method on a Python object.
102124
// It checks that we are in a valid state and handles any potential error coming from Python.
103125
template<typename Output>
@@ -130,6 +152,44 @@ zim::Blob ContentProviderWrapper::feed()
130152
return callMethodOnObj<zim::Blob>(m_obj, "feed");
131153
}
132154

155+
156+
/*
157+
################################
158+
# Index Data Wrapper #
159+
################################
160+
*/
161+
162+
bool IndexDataWrapper::hasIndexData() const
163+
{
164+
return callMethodOnObj<bool>(m_obj, "has_indexdata");
165+
}
166+
167+
std::string IndexDataWrapper::getTitle() const
168+
{
169+
return callMethodOnObj<std::string>(m_obj, "get_title");
170+
}
171+
172+
std::string IndexDataWrapper::getContent() const
173+
{
174+
return callMethodOnObj<std::string>(m_obj, "get_content");
175+
}
176+
177+
std::string IndexDataWrapper::getKeywords() const
178+
{
179+
return callMethodOnObj<std::string>(m_obj, "get_keywords");
180+
}
181+
182+
uint32_t IndexDataWrapper::getWordCount() const
183+
{
184+
return callMethodOnObj<std::uint32_t>(m_obj, "get_wordcount");
185+
}
186+
187+
// Returns the GeoPosition produced by the wrapped Python object's `get_geoposition` method.
zim::writer::IndexData::GeoPosition IndexDataWrapper::getGeoPosition() const
{
  using GeoPosition = zim::writer::IndexData::GeoPosition;
  GeoPosition position = callMethodOnObj<GeoPosition>(m_obj, "get_geoposition");
  return position;
}
191+
192+
133193
/*
134194
#########################
135195
# WriterItem #
@@ -161,6 +221,15 @@ WriterItemWrapper::getContentProvider() const
161221
return callMethodOnObj<std::unique_ptr<zim::writer::ContentProvider>>(m_obj, "get_contentprovider");
162222
}
163223

224+
std::shared_ptr<zim::writer::IndexData>
225+
WriterItemWrapper::getIndexData() const
226+
{
227+
if (!obj_has_attribute(m_obj, "get_indexdata")) {
228+
return zim::writer::Item::getIndexData();
229+
}
230+
return callMethodOnObj<std::shared_ptr<zim::writer::IndexData>>(m_obj, "get_indexdata");
231+
}
232+
164233
zim::writer::Hints WriterItemWrapper::getHints() const
165234
{
166235
return callMethodOnObj<zim::writer::Hints>(m_obj, "get_hints");

libzim/libwrapper.h

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -305,6 +305,7 @@ class WriterItemWrapper : public zim::writer::Item, private ObjWrapper
305305
std::string getTitle() const override;
306306
std::string getMimeType() const override;
307307
std::unique_ptr<zim::writer::ContentProvider> getContentProvider() const override;
308+
std::shared_ptr<zim::writer::IndexData> getIndexData() const override;
308309
zim::writer::Hints getHints() const override;
309310
};
310311

@@ -317,6 +318,19 @@ class ContentProviderWrapper : public zim::writer::ContentProvider, private ObjW
317318
zim::Blob feed() override;
318319
};
319320

321+
// Adapter exposing a Python object as a zim::writer::IndexData.
// Each getter is implemented in libwrapper.cpp by calling the matching
// Python method (has_indexdata, get_title, get_content, get_keywords,
// get_wordcount, get_geoposition) on the wrapped object.
class IndexDataWrapper: public zim::writer::IndexData, private ObjWrapper
{
  public:
    // `explicit` prevents a raw PyObject* from being silently converted
    // into a wrapper.
    explicit IndexDataWrapper(PyObject *obj) : ObjWrapper(obj) {}
    ~IndexDataWrapper() = default;
    bool hasIndexData() const override;
    std::string getTitle() const override;
    std::string getContent() const override;
    std::string getKeywords() const override;
    uint32_t getWordCount() const override;
    IndexData::GeoPosition getGeoPosition() const override;
};
333+
320334

321335
// Small helpers
322336

libzim/libzim.pyx

Lines changed: 85 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -40,15 +40,15 @@ import pathlib
4040
import sys
4141
import traceback
4242
from types import ModuleType
43-
from typing import Dict, Generator, Iterator, List, Set, Union
43+
from typing import Dict, Generator, Iterator, List, Optional, Set, Tuple, Union
4444
from uuid import UUID
4545

4646
from cpython.buffer cimport PyBUF_WRITABLE
4747
from cpython.ref cimport PyObject
4848

4949
from cython.operator import preincrement
5050

51-
from libc.stdint cimport uint64_t
51+
from libc.stdint cimport uint32_t, uint64_t
5252
from libcpp cimport bool
5353
from libcpp.map cimport map
5454
from libcpp.memory cimport shared_ptr
@@ -87,6 +87,10 @@ cdef object call_method(object obj, string method):
8787
# object to the correct cpp type.
8888
# Will be used by cpp side to call python method.
8989
cdef public api:
90+
bool obj_has_attribute(object obj, string attribute) with gil:
    """Tell whether `obj` has an attribute named `attribute` (decoded as UTF-8)"""
    attribute_name = attribute.decode('UTF-8')
    return hasattr(obj, attribute_name)
93+
9094
string string_cy_call_fct(object obj, string method, string *error) with gil:
9195
"""Lookup and execute a pure virtual method on object returning a string"""
9296
try:
@@ -122,25 +126,57 @@ cdef public api:
122126

123127
return NULL
124128

125-
# currently have no virtual method returning a bool (was should_index/compress)
126-
# bool bool_cy_call_fct(object obj, string method, string *error) with gil:
127-
# """Lookup and execute a pure virtual method on object returning a bool"""
128-
# try:
129-
# func = getattr(obj, method.decode('UTF-8'))
130-
# return func()
131-
# except Exception as e:
132-
# error[0] = traceback.format_exc().encode('UTF-8')
133-
# return False
134-
135-
uint64_t int_cy_call_fct(object obj, string method, string *error) with gil:
136-
"""Lookup and execute a pure virtual method on object returning an int"""
129+
zim.IndexData* indexdata_cy_call_fct(object obj, string method, string *error) with gil:
    """Lookup and execute a method on object returning an IndexData

    Returns NULL when the Python method returns None. On exception, the
    formatted traceback is stored in `error` and NULL is returned."""
    try:
        indexData = call_method(obj, method)
        # Compare against None explicitly: a valid IndexData object may be
        # falsy (e.g. if it defines __bool__ or __len__) and must not be
        # mistaken for "no index data".
        if indexData is None:
            return NULL
        return new zim.IndexDataWrapper(<PyObject*>indexData)
    except Exception:
        error[0] = traceback.format_exc().encode('UTF-8')

    return NULL
141+
142+
bool bool_cy_call_fct(object obj, string method, string *error) with gil:
    """Call `method` on `obj` and hand its result back as a bool

    On exception, the formatted traceback is stored in `error` and False
    is returned."""
    try:
        result = call_method(obj, method)
        return result
    except Exception:
        error[0] = traceback.format_exc().encode('UTF-8')
        return False
150+
151+
uint64_t uint64_cy_call_fct(object obj, string method, string *error) with gil:
    """Call `method` on `obj` and hand its result back as a uint64_t

    On exception, the formatted traceback is stored in `error` and 0 is
    returned."""
    try:
        result = call_method(obj, method)
        return <uint64_t> result
    except Exception:
        error[0] = traceback.format_exc().encode('UTF-8')
        return 0
143159

160+
uint32_t uint32_cy_call_fct(object obj, string method, string *error) with gil:
    """Call `method` on `obj` and hand its result back as a uint32_t

    On exception, the formatted traceback is stored in `error` and 0 is
    returned."""
    try:
        result = call_method(obj, method)
        return <uint32_t> result
    except Exception:
        error[0] = traceback.format_exc().encode('UTF-8')
        return 0
168+
169+
zim.GeoPosition geoposition_cy_call_fct(object obj, string method, string *error) with gil:
    """Call `method` on `obj` and hand its result back as a GeoPosition

    A truthy result is indexed as result[0], result[1] and returned as
    GeoPosition(True, result[0], result[1]). A falsy result, or an exception
    (whose formatted traceback is stored in `error`), yields
    GeoPosition(False, 0, 0)."""
    try:
        position = call_method(obj, method)
        if not position:
            return zim.GeoPosition(False, 0, 0)
        return zim.GeoPosition(True, position[0], position[1])
    except Exception:
        error[0] = traceback.format_exc().encode('UTF-8')

    return zim.GeoPosition(False, 0, 0)
179+
144180
map[zim.HintKeys, uint64_t] convertToCppHints(dict hintsDict):
145181
"""C++ Hints from Python dict"""
146182
cdef map[zim.HintKeys, uint64_t] ret;
@@ -439,6 +475,40 @@ class FileProvider(ContentProvider):
439475
yield WritingBlob(res)
440476
res = fh.read(bsize)
441477

478+
class IndexData:
    """Stub describing custom index data; meant to be subclassed.

    Return an instance of a subclass from Item.get_indexdata()"""

    __module__ = writer_module_name

    def has_indexdata(self) -> bool:
        """Whether this IndexData actually holds data (False in this stub)"""
        return False

    def get_title(self) -> str:
        """Title to index; must be overridden"""
        raise NotImplementedError("get_title must be implemented.")

    def get_content(self) -> str:
        """Content to index; must be overridden"""
        raise NotImplementedError("get_content must be implemented.")

    def get_keywords(self) -> str:
        """Keywords used to index the item; must be overridden.

        Expected to be a single string of space-separated keywords"""
        raise NotImplementedError("get_keywords must be implemented.")

    def get_wordcount(self) -> int:
        """Number of words in the content; must be overridden"""
        raise NotImplementedError("get_wordcount must be implemented.")

    def get_geoposition(self) -> Optional[Tuple[float, float]]:
        """GeoPosition used to index the item.

        Either a (latitude, longitude) tuple or None; this stub returns None"""
        return None
511+
442512

443513
class BaseWritingItem:
444514
"""Item stub to override
@@ -529,6 +599,7 @@ writer_public_objects = [
529599
ContentProvider,
530600
FileProvider,
531601
StringProvider,
602+
IndexData,
532603
pascalize
533604
]
534605
writer = create_module(writer_module_name, writer_module_doc, writer_public_objects)

libzim/zim.pxd

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,14 @@ cdef extern from "zim/writer/item.h" namespace "zim::writer":
4545
COMPRESS
4646
FRONT_ARTICLE
4747

48+
cdef cppclass IndexData:
49+
pass
50+
51+
# Declaration of zim::writer::IndexData::GeoPosition for Cython.
cdef extern from "zim/writer/item.h" namespace "zim::writer::IndexData":
    cdef cppclass GeoPosition:
        # Default constructor.
        GeoPosition()
        # Three-argument constructor taking a bool and two doubles.
        GeoPosition(bool, double, double)
55+
4856
cdef extern from "zim/writer/contentProvider.h" namespace "zim::writer":
4957
cdef cppclass ContentProvider:
5058
pass
@@ -91,6 +99,8 @@ cdef extern from "libwrapper.h":
9199
ContentProviderWrapper(PyObject* obj) except +
92100
cdef cppclass WriterItemWrapper:
93101
WriterItemWrapper(PyObject* obj) except +
102+
cdef cppclass IndexDataWrapper(IndexData):
103+
IndexDataWrapper(PyObject* obj) except +
94104

95105
Compression comp_from_int(int)
96106

tests/test_libzim_creator.py

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
Creator,
2121
FileProvider,
2222
Hint,
23+
IndexData,
2324
Item,
2425
StringProvider,
2526
)
@@ -639,6 +640,63 @@ def test_hints_values(fpath):
639640
)
640641

641642

643+
@pytest.mark.parametrize(
    "indexData, customContent, search_expected",
    [
        (None, "", [("standard", 1), ("home", 0), ("computer", 0)]),
        (False, "", [("standard", 1), ("home", 0), ("computer", 0)]),
        (True, "home", [("standard", 1), ("home", 1), ("computer", 0)]),
        (True, "computer", [("standard", 1), ("home", 0), ("computer", 1)]),
        (True, "standard", [("standard", 2), ("home", 0), ("computer", 0)]),
    ],
)
def test_custom_indexdata(
    fpath, lipsum_item, lipsum, indexData, customContent, search_expected
):
    """Check search results when an item supplies (or omits) custom IndexData.

    `indexData` is None (get_indexdata returns None) or the value that the
    custom IndexData reports from has_indexdata(); `customContent` is the
    text it exposes through get_content()."""

    class CustomIndexData(IndexData):
        def has_indexdata(self):
            return indexData

        def get_title(self):
            return ""

        def get_content(self):
            return customContent

        def get_keywords(self):
            return ""

        def get_wordcount(self):
            return 1

    custom_item = StaticItem(
        path=HOME_PATH + "custom", content=lipsum, mimetype="text/html"
    )
    # The class itself is used as the callable: calling it builds an instance.
    custom_item.get_indexdata = (
        (lambda: None) if indexData is None else CustomIndexData
    )

    with Creator(fpath).config_indexing(True, "eng") as creator:
        creator.add_item(lipsum_item)
        creator.add_item(custom_item)

    searcher = Searcher(Archive(fpath))
    for search_query, expected in search_expected:
        results = searcher.search(Query().set_query(search_query))
        assert results.getEstimatedMatches() == expected
689+
690+
691+
def test_indexdata_interface():
    """The IndexData stub reports no data and leaves most getters unimplemented."""
    stub = IndexData()
    assert stub.has_indexdata() is False
    for getter_name in ("get_title", "get_content", "get_keywords", "get_wordcount"):
        getter = getattr(stub, getter_name)
        with pytest.raises(NotImplementedError):
            getter()
    assert stub.get_geoposition() is None
698+
699+
642700
def test_reimpfeed(fpath):
643701
class AContentProvider:
644702
def __init__(self):

0 commit comments

Comments
 (0)