Commit 9ca1090

Author: Andrei Neagu
Message: simplify outputs parsing
Parent: da30d13

File tree: 1 file changed, +15 −22 lines

packages/service-library/src/servicelib/archiving_utils/_interface_7zip.py

Lines changed: 15 additions & 22 deletions
@@ -31,7 +31,7 @@
 _FILE_COUNT_RE: Final[str] = r" (\d+)\s*files"
 _PROGRESS_PERCENT_RE: Final[str] = r" (?:100|\d?\d)% "
 _ALL_DONE_RE: Final[str] = r"Everything is Ok"
-# NOTE: the size of `chunk_to_emit` should not be too big nor too small otherwise it might skip some updates
+# NOTE: the size of `chunk_to_emit` should, in theory, be enough to contain anything the above regexes can match
 _DEFAULT_CHUNK_SIZE: Final[NonNegativeInt] = 20
 
 
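Why the chunk size matters for the regexes above: a progress token such as " 42% " can be split across two reads, in which case neither chunk matches on its own, but the emitted window (previous chunk + new chunk) does. A minimal sketch; the sample strings are invented for illustration and are not verbatim 7z output:

import re
from typing import Final

_PROGRESS_PERCENT_RE: Final[str] = r" (?:100|\d?\d)% "

# two consecutive small reads; the " 42% " token straddles the boundary
first_chunk = "...copying data  4"
second_chunk = "2% some/file.bin ..."

assert re.search(_PROGRESS_PERCENT_RE, first_chunk) is None
assert re.search(_PROGRESS_PERCENT_RE, second_chunk) is None

# the lookbehind window used by _output_reader recovers the split token
window = first_chunk + second_chunk
assert re.search(_PROGRESS_PERCENT_RE, window) is not None
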
@@ -115,45 +115,36 @@ async def _output_reader(
     output_handlers: list[Callable[[str], Awaitable[None]]] | None,
     chunk_size: NonNegativeInt = _DEFAULT_CHUNK_SIZE,
 ) -> str:
-    command_output = ""
+    # NOTE: we do not read line by line but chunk by chunk, otherwise we would miss progress updates;
+    # the key is to read the smallest possible chunks of data so that progress can be parsed properly
+    if output_handlers is None:
+        output_handlers = []
 
-    # Initialize buffer to store lookbehind window
-    lookbehind_buffer = ""
+    command_output = ""
 
-    undecodable_chunk: bytes | None = None
+    lookbehind_buffer = ""  # stores the last chunk read
 
     while True:
         read_chunk = await stream.read(chunk_size)
         if not read_chunk:
             # Process remaining buffer if any
-            if lookbehind_buffer and output_handlers:
+            if lookbehind_buffer:
                 await asyncio.gather(
                     *[handler(lookbehind_buffer) for handler in output_handlers]
                 )
             break
 
-        try:
-            if undecodable_chunk:
-                chunk = (undecodable_chunk + read_chunk).decode("utf-8")
-                undecodable_chunk = None
-            else:
-                chunk = read_chunk.decode("utf-8")
-        except UnicodeDecodeError:
-            undecodable_chunk = read_chunk
-            continue
+        # errors="replace": avoids getting stuck when the output cannot be decoded as utf-8
+        chunk = read_chunk.decode("utf-8", errors="replace")
 
         command_output += chunk
 
         # Combine lookbehind buffer with new chunk
         chunk_to_emit = lookbehind_buffer + chunk
-
-        if output_handlers:
-            await asyncio.gather(
-                *[handler(chunk_to_emit) for handler in output_handlers]
-            )
-
         # Keep last window_size characters for next iteration
-        lookbehind_buffer = chunk_to_emit[-chunk_size:]
+        lookbehind_buffer = chunk_to_emit[-len(chunk) :]
+
+        await asyncio.gather(*[handler(chunk_to_emit) for handler in output_handlers])
 
     return command_output
 
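The new reading strategy can be exercised standalone. The sketch below drains fabricated output from an asyncio.StreamReader in small chunks and extracts percent tokens from the overlapping windows; the sample text is invented for the example, and consumers must tolerate seeing the same token twice because consecutive windows overlap:

import asyncio
import re

_PROGRESS_PERCENT_RE = r" (?:100|\d?\d)% "

async def main() -> None:
    # simulate a subprocess stdout that we drain in 20-byte chunks
    stream = asyncio.StreamReader()
    stream.feed_data(b"...extracting  42% name.bin ... 100% ")
    stream.feed_eof()

    seen: list[str] = []
    lookbehind_buffer = ""

    while read_chunk := await stream.read(20):
        # errors="replace" keeps the loop alive even if a multibyte
        # character is split across two chunks (it decodes to U+FFFD)
        chunk = read_chunk.decode("utf-8", errors="replace")
        chunk_to_emit = lookbehind_buffer + chunk
        seen += [m.strip() for m in re.findall(_PROGRESS_PERCENT_RE, chunk_to_emit)]
        lookbehind_buffer = chunk_to_emit[-len(chunk) :]

    # overlapping windows can repeat a token, so track the highest percent seen
    print(max(int(p.rstrip("%")) for p in seen))  # -> 100

asyncio.run(main())
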
@@ -258,6 +249,7 @@ async def unarchive_dir(
         num_steps=1, description=IDStr(f"extracting {archive_to_extract.name}")
     )
 
+    # get archive information
     archive_info_parser = ArchiveInfoParser()
     await _run_cli_command(
         f"7z l {archive_to_extract}",
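For reference, the `7z l` listing fed to ArchiveInfoParser is what _FILE_COUNT_RE is meant to scan for the total number of files. A hedged sketch against a made-up summary line, since the exact 7z listing format is not part of this diff:

import re
from typing import Final

_FILE_COUNT_RE: Final[str] = r" (\d+)\s*files"

# illustrative tail of a `7z l` listing; real output may differ
listing = "2024-01-01 10:00:00      1234567      654321  19 files, 4 folders"

match = re.search(_FILE_COUNT_RE, listing)
total_files = int(match.group(1)) if match else 0
print(total_files)  # -> 19
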
@@ -279,6 +271,7 @@ async def unarchive_dir(
         )
     )
 
+    # extract archive
     async def progress_handler(byte_progress: NonNegativeInt) -> None:
         if tqdm_progress.update(byte_progress) and log_cb:
             with log_catch(_logger, reraise=False):
 
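Downstream, progress_handler receives byte increments and forwards them to tqdm. One way to bridge the percent tokens parsed from the emitted windows into monotonic byte deltas — a hypothetical helper for illustration, not the repo's actual parser:

import re
from typing import Final

_PROGRESS_PERCENT_RE: Final[str] = r" (?:100|\d?\d)% "

class PercentToBytes:
    # hypothetical helper: turns percent tokens parsed from the emitted
    # windows into incremental byte counts suitable for tqdm.update()
    def __init__(self, total_bytes: int) -> None:
        self._total_bytes = total_bytes
        self._reported_bytes = 0

    def consume(self, text: str) -> int:
        percents = [
            int(token.strip().rstrip("%"))
            for token in re.findall(_PROGRESS_PERCENT_RE, text)
        ]
        if not percents:
            return 0
        # overlapping windows may repeat a token; taking the max keeps this safe
        target = max(percents) * self._total_bytes // 100
        delta = max(0, target - self._reported_bytes)
        self._reported_bytes += delta
        return delta

converter = PercentToBytes(total_bytes=1_000)
print(converter.consume("...  42% ..."))  # -> 420
print(converter.consume("...  42% ..."))  # -> 0 (repeated window, nothing new)
print(converter.consume("... 100% ..."))  # -> 580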