Skip to content

Commit 94882c2

Browse files
author
renaud gaudin
committed
Cleaner writer API
- fixed, added or updated docstrings - added type annotations to ease self-discovery - made `mandatory_metadata_ok` a property - added a `_closed` boolean prop in creator to record finalization status - fixed the deletion issue after a contextmanager auto-close (using `self._closed`) - simplified public API by moving single-use methods: - `_get_counter_string()` and `write_metadata()` to `close()` - `_update_article_counter()` to `add_article()` - `pascalize()` to `update_metadata()` - `write_metadata()` to automatically convert Date from datetime.datetime as well
1 parent 58d777f commit 94882c2

File tree

1 file changed

+123
-70
lines changed

1 file changed

+123
-70
lines changed

libzim/writer.py

Lines changed: 123 additions & 70 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,16 @@
1+
""" libzim writer module
2+
- Creator to create ZIM files
3+
- Article to store ZIM articles metadata
4+
- Blob to store ZIM article content
5+
Usage:
6+
with Creator(pathlib.Path("myfile.zim"), main_page="welcome.html") as zf:
7+
article = MyArticleSubclass(
8+
url="A/welcome.html",
9+
title="My Title",
10+
content=Blob("My content"))
11+
zf.add_article(article)
12+
zf.update_metadata(tags="new;demo") """
13+
114
# This file is part of python-libzim
215
# (see https://github.com/libzim/python-libzim)
316
#
@@ -19,7 +32,7 @@
1932

2033
import pathlib
2134
import datetime
22-
from collections import defaultdict
35+
import collections
2336

2437
from .wrapper import Creator as _Creator
2538
from .wrapper import WritingBlob as Blob
@@ -28,113 +41,132 @@
2841

2942

3043
class Article:
44+
""" Article stub to override
45+
46+
Pass a subclass of it to Creator.add_article() """
47+
3148
def __init__(self):
3249
self._blob = None
3350

34-
def get_url(self):
51+
def get_url(self) -> str:
52+
""" Full URL of article including namespace """
3553
raise NotImplementedError
3654

37-
def get_title(self):
55+
def get_title(self) -> str:
56+
""" Article title. Might be indexed and used in suggestions """
3857
raise NotImplementedError
3958

40-
def is_redirect(self):
59+
def is_redirect(self) -> bool:
60+
""" Whether this redirects to another article (cf. redirect_url) """
4161
raise NotImplementedError
4262

43-
def get_mime_type(self):
63+
def get_mime_type(self) -> str:
64+
""" MIME-type of the article's content. A/ namespace reserved for text/html """
4465
raise NotImplementedError
4566

46-
def get_filename(self):
67+
def get_filename(self) -> str:
68+
""" Filename to get content from. Blank string "" if not used """
4769
raise NotImplementedError
4870

49-
def should_compress(self):
71+
def should_compress(self) -> bool:
72+
""" Whether the article's content should be compressed or not """
5073
raise NotImplementedError
5174

52-
def should_index(self):
75+
def should_index(self) -> bool:
76+
""" Whether the article's content should be indexed or not """
5377
raise NotImplementedError
5478

55-
def redirect_url(self):
79+
def redirect_url(self) -> str:
80+
""" Full URL including namespace of another article """
5681
raise NotImplementedError
5782

58-
def _get_data(self):
83+
def _get_data(self) -> Blob:
84+
""" Internal data-retrieval with a cache to the content's pointer
85+
86+
You don't need to override this """
5987
if self._blob is None:
6088
self._blob = self.get_data()
6189
return self._blob
6290

63-
def get_data(self):
91+
def get_data(self) -> Blob:
92+
""" Blob containing the complete content of the article """
6493
raise NotImplementedError
6594

66-
def __repr__(self):
95+
def __repr__(self) -> str:
6796
return f"{self.__class__.__name__}(url={self.get_url()}, title={self.get_title()})"
6897

6998

7099
class MetadataArticle(Article):
71-
def __init__(self, url, metadata_content):
100+
""" Simple Article sub-class for key-value articles on M/ metadata namespace """
101+
102+
def __init__(self, url: str, metadata_content: str):
72103
Article.__init__(self)
73104
self.url = url
74105
self.metadata_content = metadata_content
75106

76-
def is_redirect(self):
107+
def is_redirect(self) -> bool:
77108
return False
78109

79-
def get_url(self):
110+
def get_url(self) -> str:
80111
return f"M/{self.url}"
81112

82-
def get_title(self):
113+
def get_title(self) -> str:
83114
return ""
84115

85-
def get_mime_type(self):
116+
def get_mime_type(self) -> str:
86117
return "text/plain"
87118

88-
def get_filename(self):
119+
def get_filename(self) -> str:
89120
return ""
90121

91-
def should_compress(self):
122+
def should_compress(self) -> bool:
92123
return True
93124

94-
def should_index(self):
125+
def should_index(self) -> bool:
95126
return False
96127

97-
def get_data(self):
128+
def get_data(self) -> Blob:
98129
return Blob(self.metadata_content)
99130

100131

101-
def pascalize(keyword):
102-
""" Converts python case to pascal case. example: long_description-> LongDescription """
103-
return "".join(keyword.title().split("_"))
104-
105-
106132
class Creator:
107-
"""
108-
A class to represent a Zim Creator.
109-
110-
Attributes
111-
----------
112-
*c_creator : zim.Creator
113-
a pointer to the C++ Creator object
114-
_finalized : bool
115-
flag if the creator was finalized
116-
_filename : pathlib.Path
117-
Zim file path
118-
_main_page : str
119-
Zim file main page
120-
_index_language : str
121-
Zim file Index language
122-
_min_chunk_size : str
123-
Zim file minimum chunk size
124-
_article_counter
125-
Zim file article counter
126-
_metadata
127-
Zim file metadata
128-
"""
129-
130-
def __init__(self, filename, main_page, index_language="eng", min_chunk_size=2048):
133+
""" Zim Creator.
134+
135+
Attributes
136+
----------
137+
*_creatorWrapper : wrapper.ZimCreatorWrapper
138+
a pointer to the C++ Creator object wrapper
139+
filename : pathlib.Path
140+
Zim file path
141+
main_page : str
142+
Zim file main page (without namespace)
143+
language : str
144+
Zim file Index language
145+
_article_counter
146+
Zim file article counter
147+
_metadata
148+
Zim file metadata """
149+
150+
def __init__(
151+
self, filename: pathlib.Path, main_page: str, index_language: str = "eng", min_chunk_size: int = 2048,
152+
):
153+
""" Creates a ZIM Creator
154+
155+
Parameters
156+
----------
157+
filename : Path to create the ZIM file at
158+
main_page: ZIM file main article URL (without namespace, must be in A/)
159+
index_language: content language to inform indexer with (ISO-639-3)
160+
min_chunk_size: minimum size of chunks for compression """
161+
131162
self._creatorWrapper = _Creator(str(filename), main_page, index_language, min_chunk_size)
132163
self.filename = pathlib.Path(filename)
133164
self.main_page = main_page
134165
self.language = index_language
135166
self._metadata = {}
136-
self._article_counter = defaultdict(int)
167+
self._article_counter = collections.defaultdict(int)
137168
self.update_metadata(date=datetime.date.today(), language=index_language)
169+
self._closed = False
138170

139171
def __enter__(self):
140172
return self
@@ -143,38 +175,59 @@ def __exit__(self, *args):
143175
self.close()
144176

145177
def __del__(self):
146-
self.close()
147-
148-
def add_article(self, article):
178+
if not self._closed:
179+
self.close()
180+
181+
def add_article(self, article: Article):
182+
""" Adds an article to the Creator.
183+
184+
Parameters
185+
----------
186+
article : Zim writer Article
187+
The article to add to the file
188+
Raises
189+
------
190+
RuntimeError
191+
If the ZimCreator was already finalized """
149192
self._creatorWrapper.add_article(article)
150193
if not article.is_redirect():
151-
self._update_article_counter(article)
152-
153-
def _update_article_counter(self, article):
154-
# default dict update
155-
self._article_counter[article.get_mime_type().strip()] += 1
194+
# update article counter
195+
self._article_counter[article.get_mime_type().strip()] += 1
156196

157197
def update_metadata(self, **kwargs: str):
158198
""" Updates Creator metadata for ZIM, supplied as keyword arguments """
159-
new_metadata = {pascalize(k): v for k, v in kwargs.items()}
160-
self._metadata.update(new_metadata)
161199

162-
def write_metadata(self):
200+
def pascalize(keyword: str):
201+
""" Converts python case to pascal case.
202+
203+
example: long_description -> LongDescription """
204+
return "".join(keyword.title().split("_"))
205+
206+
self._metadata.update({pascalize(k): v for k, v in kwargs.items()})
207+
208+
def close(self):
209+
""" Finalizes and writes added articles to the file
210+
211+
Raises
212+
------
213+
RuntimeError
214+
If the ZimCreator was already finalized """
215+
if self._closed:
216+
raise RuntimeError("Creator already closed")
217+
218+
# Store _metadata dict as MetadataArticle
163219
for key, value in self._metadata.items():
164-
if key == "Date" and isinstance(value, datetime.date):
220+
if key == "Date" and isinstance(value, (datetime.date, datetime.datetime)):
165221
value = value.strftime("%Y-%m-%d")
166222
article = MetadataArticle(key, value)
167223
self._creatorWrapper.add_article(article)
168224

169-
article = MetadataArticle("Counter", self._get_counter_string())
225+
counter_str = ";".join([f"{k}={v}" for (k, v) in self._article_counter.items()])
226+
article = MetadataArticle("Counter", counter_str)
170227
self._creatorWrapper.add_article(article)
171228

172-
def _get_counter_string(self):
173-
return ";".join(["%s=%s" % (k, v) for (k, v) in self._article_counter.items()])
174-
175-
def close(self):
176-
self.write_metadata()
177229
self._creatorWrapper.finalize()
230+
self._closed = True
178231

179-
def __repr__(self):
232+
def __repr__(self) -> str:
180233
return f"Creator(filename={self.filename})"

0 commit comments

Comments
 (0)