Do not copy the article content. Return a memoryview.

mgautierfr · mgautierfr · commit 469ca75d2a8b · 2020-05-06T10:58:55.000+02:00
diff --git a/libzim/__init__.py b/libzim/__init__.py
@@ -16,8 +16,3 @@
 #
 # You should have received a copy of the GNU General Public License
 # along with this program. If not, see <http://www.gnu.org/licenses/>.
-
-
-from libzim_wrapper import Blob
-
-__all__ = ["Blob"]
diff --git a/libzim/libzim_wrapper.pyx b/libzim/libzim_wrapper.pyx
@@ -22,6 +22,7 @@ cimport libzim.libzim_wrapper as clibzim
 
 from cython.operator import dereference, preincrement
 from cpython.ref cimport PyObject
+from cpython.buffer cimport PyBUF_WRITABLE
 
 from libc.stdint cimport uint64_t
 from libcpp.string cimport string
@@ -30,14 +31,11 @@ from libcpp.memory cimport shared_ptr, make_shared, unique_ptr
 
 import datetime
 
-
-
-
 #########################
 #         Blob          #
 #########################
 
-cdef class Blob:
+cdef class WritingBlob:
     cdef clibzim.Blob* c_blob
     cdef bytes ref_content
 
@@ -52,6 +50,50 @@ cdef class Blob:
         if self.c_blob != NULL:
             del self.c_blob
 
+cdef Py_ssize_t itemsize = 1
+
+cdef class ReadingBlob:
+    cdef clibzim.Blob c_blob
+    cdef Py_ssize_t size
+    cdef int view_count
+
+    cdef __setup(self, clibzim.Blob blob):
+        """Assigns an internal pointer to the wrapped C++ article object.
+
+        Parameters
+        ----------
+        *art : Article
+            Pointer to a C++ (zim::) article object
+        """
+        # Set new internal C zim.ZimArticle article
+        self.c_blob = blob
+        self.size = blob.size()
+        self.view_count = 0
+
+    def __dealloc__(self):
+        if self.view_count:
+            raise RuntimeError("Blob has views")
+
+    def __getbuffer__(self, Py_buffer *buffer, int flags):
+        if flags&PyBUF_WRITABLE:
+            raise BufferError("Cannot create writable memoryview on readonly data")
+        buffer.obj = self
+        buffer.buf = <void*>self.c_blob.data()
+        buffer.len = self.size
+        buffer.readonly = 1
+        buffer.format = 'c'
+        buffer.internal = NULL                  # see References
+        buffer.itemsize = itemsize
+        buffer.ndim = 1
+        buffer.shape = &self.size
+        buffer.strides = &itemsize
+        buffer.suboffsets = NULL                # for pointer arrays only
+
+        self.view_count += 1
+
+    def __releasebuffer__(self, Py_buffer *buffer):
+        self.view_count -= 1
+
 
 #------ Helper for pure virtual methods --------
 
@@ -76,7 +118,7 @@ cdef public api:
 
     clibzim.Blob blob_cy_call_fct(object obj, string method, int *error) with gil:
         """Lookup and execute a pure virtual method on ZimArticle returning a Blob"""
-        cdef Blob blob
+        cdef WritingBlob blob
 
         func = get_article_method_from_object(obj, method, error)
         blob = func()
@@ -198,12 +240,17 @@ cdef class ReadArticle:
         Creates a python ZimArticle from a C++ zim.Article article.
     """
     cdef clibzim.Article c_article
+    cdef ReadingBlob _blob
+    cdef bool _haveBlob
 
     #def __eq__(self, other):
     #    if isinstance(other, ZimArticle):
     #        return (self.longurl == other.longurl) and (self.content == other.content) and (self.is_redirect == other.is_redirect)
     #    return False
 
+    def __cinit__(self):
+        self._haveBlob = False
+
     cdef __setup(self, clibzim.Article art):
         """Assigns an internal pointer to the wrapped C++ article object.
 
@@ -214,6 +261,7 @@ cdef class ReadArticle:
         """
         # Set new internal C zim.ZimArticle article
         self.c_article = art
+        self._blob = None
 
 
 
@@ -248,9 +296,11 @@ cdef class ReadArticle:
     @property
     def content(self):
         """Get the article's content"""
-        cdef clibzim.Blob blob = self.c_article.getData(<int> 0)
-        data =  blob.data()[:blob.size()]
-        return data
+        if not self._haveBlob:
+            self._blob = ReadingBlob()
+            self._blob.__setup(self.c_article.getData(<int> 0))
+            self._haveBlob = True
+        return memoryview(self._blob)
 
     @property
     def longurl(self):
diff --git a/libzim/writer.py b/libzim/writer.py
@@ -22,7 +22,7 @@
 from collections import defaultdict
 
 import libzim_wrapper
-from libzim_wrapper import Blob
+from libzim_wrapper import WritingBlob as Blob
 
 __all__ = ["Article", "Blob", "Creator"]
 
diff --git a/tests/test_libzim_file_reader.py b/tests/test_libzim_file_reader.py
@@ -1,22 +1,33 @@
-import pytest
+import gc
 from pathlib import Path
 
-DATA_DIR = Path(__file__).parent
+import pytest
 
 from libzim.reader import File
 
+DATA_DIR = Path(__file__).parent
+
+
+ZIMFILES = [
+    {
+        'filename': str(DATA_DIR/"wikipedia_es_physics_mini.zim"),
+        'checksum': u"99ea7a5598c6040c4f50b8ac0653b703",
+        'namespaces': u"-AIMX",
+        'article_count': 22027,
+        'main_page_url': u"A/index",
+    }
+]
+
+
+
+
+@pytest.fixture(params=ZIMFILES)
+def zimdata(request):
+    return request.param
+
 @pytest.fixture
-def reader_data():
-    return (
-        File(str(DATA_DIR/"wikipedia_es_physics_mini.zim")),
-        {
-            'filename': str(DATA_DIR/"wikipedia_es_physics_mini.zim"),
-            'checksum': u"99ea7a5598c6040c4f50b8ac0653b703",
-            'namespaces': u"-AIMX",
-            'article_count': 22027,
-            'main_page_url': u"A/index"
-        }
-    )
+def reader(zimdata):
+    return File(zimdata['filename'])
 
 
 @pytest.fixture
@@ -25,45 +36,71 @@ def article_data():
         'url': u"A/Albert_Einstein",
         'title': u"Albert Einstein",
         'mimetype':u"text/html",
-        'article_id': 663
+        'article_id': 663,
+        'size': 17343
     }
 
 
-def test_zim_filename(reader_data):
-    reader, data = reader_data
-    for k, v in data.items():
+def test_zim_filename(reader, zimdata):
+    for k, v in zimdata.items():
         assert getattr(reader, k) == v
 
-def test_zim_read(reader_data, article_data):
-    reader, _ = reader_data
+def test_zim_read(reader, article_data):
     article = reader.get_article(article_data['url'])
 
     assert article.longurl == article_data['url']
     assert article.title == article_data['title']
     assert article.url == article_data['url'][2:]
     assert article.mimetype == article_data['mimetype']
+    assert isinstance(article.content, memoryview)
+    assert len(article.content) == article_data['size']
 
-def test_get_article_by_id(reader_data, article_data):
-    reader, _ = reader_data
+def test_content_ref_keep(reader):
+    """Get the memoryview on a content and loose the reference on the article.
+       We try to load a lot of other articles to detect possible use of dandling pointer
+    """
+    content =None
+    def get_content():
+        nonlocal content
+        article = reader.get_article(u"A/Albert_Einstein")
+        assert isinstance(article.content, memoryview)
+        content = article.content
+    get_content() # Now we have a content but no reference to the article.
+    gc.collect()
+    # Load a lot of content
+    for i in range(0, reader.article_count, 2):
+        article = reader.get_article_by_id(i)
+        if not article.is_redirect:
+            c = article.content
+    # Check everything is ok
+    assert len(content) == 17343
+    assert bytes(content[:100]) == b'<!DOCTYPE html>\n<html class="client-js"><head>\n  <meta charset="UTF-8">\n  <title>Albert Einstein</ti'
+
+def test_get_article_by_id(reader, article_data):
+    return
     article = reader.get_article_by_id(article_data['article_id'])
 
     assert article.longurl == article_data['url']
     assert article.title == article_data['title']
     assert article.url == article_data['url'][2:]
     assert article.mimetype == article_data['mimetype']
 
-def test_namespace_count(reader_data):
-    reader, _ = reader_data
+def test_namespace_count(reader):
     namespaces = reader.namespaces
     num_articles = sum(reader.get_namespaces_count(ns) for ns in namespaces)
     assert reader.article_count == num_articles
 
-def test_suggest(reader_data):
-    reader, _ = reader_data
+def test_suggest(reader):
     results =  reader.suggest(u"Einstein")
     assert u"A/Albert_Einstein" in list(results)
 
-def test_search(reader_data):
-    reader, _ = reader_data
+def test_search(reader):
     results = reader.search(u"Einstein")
     assert len(list(results)) == 10
+
+
+def test_get_wrong_article(reader):
+    with pytest.raises(RuntimeError):
+        reader.get_article_by_id(reader.article_count + 100)
+    with pytest.raises(RuntimeError):
+        reader.get_article("A/I_do_not_exists")

Original file line number	Diff line number	Diff line change
`@@ -16,8 +16,3 @@`
`16`	`16`	`#`
`17`	`17`	`# You should have received a copy of the GNU General Public License`
`18`	`18`	`# along with this program. If not, see <http://www.gnu.org/licenses/>.`
`19`		`-`
`20`		`-`
`21`		`-from libzim_wrapper import Blob`
`22`		`-`
`23`		`-__all__ = ["Blob"]`