 _FILE_COUNT_RE: Final[str] = r" (\d+)\s*files"
 _PROGRESS_PERCENT_RE: Final[str] = r" (?:100|\d?\d)% "
 _ALL_DONE_RE: Final[str] = r"Everything is Ok"
-# NOTE: the size of `chunk_to_emit` should not be too big nor too small, otherwise it might skip some updates
+# NOTE: in theory, `chunk_to_emit` should contain everything that the above regexes can capture
 _DEFAULT_CHUNK_SIZE: Final[NonNegativeInt] = 20


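As a side note, the three regexes above are meant to pick file counts, progress percentages, and the completion marker out of 7z's textual output. A quick self-contained check, using illustrative strings merely shaped like 7z output (assumed, not captured from a real run):

```python
import re

_FILE_COUNT_RE = r" (\d+)\s*files"
_PROGRESS_PERCENT_RE = r" (?:100|\d?\d)% "
_ALL_DONE_RE = r"Everything is Ok"

# illustrative lines assumed to resemble 7z output, not taken from the PR
match = re.search(_FILE_COUNT_RE, "1062107  7 files, 2 folders")
assert match and match.group(1) == "7"
assert re.search(_PROGRESS_PERCENT_RE, " 42% 12 + some/file.txt")
assert re.search(_ALL_DONE_RE, "Everything is Ok")
```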
@@ -115,45 +115,36 @@ async def _output_reader(
     output_handlers: list[Callable[[str], Awaitable[None]]] | None,
     chunk_size: NonNegativeInt = _DEFAULT_CHUNK_SIZE,
 ) -> str:
-    command_output = ""
+    # NOTE: we read chunk by chunk instead of line by line, otherwise we would miss progress updates;
+    # the key is to read the smallest possible chunks of data so that the progress can be properly parsed
+    if output_handlers is None:
+        output_handlers = []

-    # Initialize buffer to store lookbehind window
-    lookbehind_buffer = ""
+    command_output = ""

-    undecodable_chunk: bytes | None = None
+    lookbehind_buffer = ""  # store the last chunk

     while True:
         read_chunk = await stream.read(chunk_size)
         if not read_chunk:
             # Process remaining buffer, if any
-            if lookbehind_buffer and output_handlers:
+            if lookbehind_buffer:
                 await asyncio.gather(
                     *[handler(lookbehind_buffer) for handler in output_handlers]
                 )
             break

-        try:
-            if undecodable_chunk:
-                chunk = (undecodable_chunk + read_chunk).decode("utf-8")
-                undecodable_chunk = None
-            else:
-                chunk = read_chunk.decode("utf-8")
-        except UnicodeDecodeError:
-            undecodable_chunk = read_chunk
-            continue
+        # `errors="replace"`: avoids getting stuck when the output cannot be decoded as utf-8
+        chunk = read_chunk.decode("utf-8", errors="replace")

         command_output += chunk

         # Combine lookbehind buffer with new chunk
         chunk_to_emit = lookbehind_buffer + chunk
-
-        if output_handlers:
-            await asyncio.gather(
-                *[handler(chunk_to_emit) for handler in output_handlers]
-            )
-
-        # Keep last window_size characters for next iteration
-        lookbehind_buffer = chunk_to_emit[-chunk_size:]
+        # Keep the last chunk for the next iteration
+        lookbehind_buffer = chunk_to_emit[-len(chunk):]
+
+        await asyncio.gather(*[handler(chunk_to_emit) for handler in output_handlers])

     return command_output

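To see why the overlapping `chunk_to_emit` window matters: a pattern such as `_PROGRESS_PERCENT_RE` can straddle a read boundary, and emitting raw chunks alone would lose the match. A minimal standalone sketch of the lookbehind idea (`emit_with_lookbehind` is a hypothetical helper, not part of this PR):

```python
import re

_PROGRESS_PERCENT_RE = r" (?:100|\d?\d)% "


def emit_with_lookbehind(chunks: list[str]) -> list[str]:
    # hypothetical helper, not in the PR: emit overlapping windows so that
    # regex matches spanning chunk boundaries are not lost
    emitted: list[str] = []
    lookbehind = ""
    for chunk in chunks:  # assumes non-empty chunks, as stream.read() yields
        window = lookbehind + chunk
        emitted.append(window)
        lookbehind = window[-len(chunk):]  # i.e. exactly the last chunk
    if lookbehind:
        emitted.append(lookbehind)  # final flush, mirroring the EOF branch above
    return emitted


# " 42% " is split across two reads; the overlapping window recovers it
windows = emit_with_lookbehind(["... 4", "2% ..."])
assert any(re.search(_PROGRESS_PERCENT_RE, w) for w in windows)
```

Note that consecutive windows overlap, so a handler can see the same text twice; downstream parsers therefore need to tolerate duplicate matches.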
@@ -258,6 +249,7 @@ async def unarchive_dir(
         num_steps=1, description=IDStr(f"extracting {archive_to_extract.name}")
     )

+    # get archive information
     archive_info_parser = ArchiveInfoParser()
     await _run_cli_command(
         f"7z l {archive_to_extract}",
@@ -279,6 +271,7 @@ async def unarchive_dir(
             )
         )

+    # extract archive
     async def progress_handler(byte_progress: NonNegativeInt) -> None:
         if tqdm_progress.update(byte_progress) and log_cb:
             with log_catch(_logger, reraise=False):
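For orientation, the visible flow in `unarchive_dir` is: list the archive (`7z l`) to learn its contents and size, then extract while forwarding byte deltas to `tqdm_progress` via `progress_handler`. A rough sketch of how percentage output could be turned into the byte deltas such a handler consumes (`make_percent_to_bytes_parser` and its wiring are hypothetical, assuming 7z-style " 42% " lines; the PR's actual parsing lives elsewhere):

```python
import re
from collections.abc import Callable

_PROGRESS_PERCENT_RE = re.compile(r" (?:100|\d?\d)% ")


def make_percent_to_bytes_parser(
    total_bytes: int, emit_bytes: Callable[[int], None]
) -> Callable[[str], None]:
    # hypothetical sketch: convert 7z-style percentage output into the
    # byte deltas that a handler like `progress_handler` above consumes
    reported = 0

    def parse(text: str) -> None:
        nonlocal reported
        for match in _PROGRESS_PERCENT_RE.finditer(text):
            percent = int(match.group().strip().rstrip("%"))
            current = total_bytes * percent // 100
            if current > reported:
                emit_bytes(current - reported)
                reported = current

    return parse


# usage (assumed wiring): feed decoded output windows, emit byte deltas
parser = make_percent_to_bytes_parser(1_000, print)
parser("extracting  42% 12 + some/file.txt")  # prints 420
parser("extracting  80% ...")                 # prints 380
```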