11# Copyright (C) 2015-2020 OpenIO SAS, as part of OpenIO SDS
2- # Copyright (C) 2021 OVH SAS
2+ # Copyright (C) 2021-2022 OVH SAS
33#
44# This library is free software; you can redistribute it and/or
55# modify it under the terms of the GNU Lesser General Public
3434from oio .common .http import HeadersDict , parse_content_range , \
3535 ranges_from_http_header , headers_from_object_metadata
3636from oio .common .logger import get_logger
37+ from oio .common .storage_method import ECDriverError
3738from oio .common .utils import fix_ranges , monotonic_time
3839
3940
@@ -341,7 +342,7 @@ def put_in_queue(fragment_iterator, queue):
341342 ec_start = monotonic_time ()
342343 try :
343344 segment = self .storage_method .driver .decode (data )
344- except exceptions . ECError :
345+ except ECDriverError :
345346 # something terrible happened
346347 self .logger .exception (
347348 "ERROR decoding fragments (reqid=%s)" , self .reqid )
@@ -754,7 +755,7 @@ def __init__(self, sysmeta, meta_chunk, global_checksum, storage_method,
754755 self .meta_chunk = meta_chunk
755756 self .global_checksum = global_checksum
756757 # Unlike plain replication, we cannot use the checksum returned
757- # by rawx services, whe have to compute the checksum client-side.
758+ # by rawx services, we have to compute the checksum client-side.
758759 self .checksum = hashlib .new (self .chunk_checksum_algo or 'md5' )
759760 self .connection_timeout = connection_timeout or io .CONNECTION_TIMEOUT
760761 self .write_timeout = write_timeout or io .CHUNK_TIMEOUT
@@ -1090,13 +1091,14 @@ def stream(self):
10901091class ECRebuildHandler (object ):
def __init__(self, meta_chunk, missing, storage_method,
             connection_timeout=None, read_timeout=None,
             read_all_available_sources=False, **kwargs):
    """
    Prepare the rebuild of one chunk of an erasure-coded metachunk.

    :param meta_chunk: stored as-is on the handler
    :param missing: identifier of the fragment to rebuild, handed to the
        EC driver when reconstructing
    :param storage_method: object whose `driver` attribute is the EC
        driver used to check and reconstruct fragments
    :param connection_timeout: falls back to `io.CONNECTION_TIMEOUT`
        when not set (or falsy)
    :param read_timeout: falls back to `io.CHUNK_TIMEOUT`
        when not set (or falsy)
    :param read_all_available_sources: when true, the rebuild reads from
        all the sources which responded instead of only the first
        `nb_data` ones
    :keyword logger: logger to use instead of the module-level one
    """
    # Falsy timeouts (None, 0) are replaced by the module defaults.
    if not connection_timeout:
        connection_timeout = io.CONNECTION_TIMEOUT
    if not read_timeout:
        read_timeout = io.CHUNK_TIMEOUT

    self.meta_chunk = meta_chunk
    self.missing = missing
    self.storage_method = storage_method
    self.connection_timeout = connection_timeout
    self.read_timeout = read_timeout
    self.read_all_available_sources = read_all_available_sources
    # Extra keyword arguments are tolerated, only 'logger' is used.
    self.logger = kwargs.get('logger', LOGGER)
11001102
11011103 def _get_response (self , chunk , headers ):
11021104 resp = None
@@ -1175,7 +1177,10 @@ def rebuild(self):
11751177 self .logger .warning (
11761178 'Use chunk(s) without size information to rebuild a chunk' )
11771179
1178- rebuild_iter = self ._make_rebuild_iter (resps [:nb_data ])
1180+ if self .read_all_available_sources :
1181+ rebuild_iter = self ._make_rebuild_iter (resps )
1182+ else :
1183+ rebuild_iter = self ._make_rebuild_iter (resps [:nb_data ])
11791184 return assumed_chunk_size , rebuild_iter
11801185
11811186 def _make_rebuild_iter (self , resps ):
@@ -1197,18 +1202,39 @@ def frag_iter():
11971202 pile .spawn (_get_frag , resp )
11981203 try :
11991204 with Timeout (self .read_timeout ):
1200- frag = [frag for frag in pile ]
1205+ in_frags = [frag for frag in pile ]
12011206 except Timeout as to :
12021207 self .logger .error ('ERROR while rebuilding: %s' , to )
12031208 except Exception :
12041209 self .logger .exception ('ERROR while rebuilding' )
12051210 break
1206- if not all (frag ):
1211+ if not all (in_frags ):
12071212 break
1208- rebuilt_frag = self ._reconstruct (frag )
1213+ ok_frags = self ._filter_broken_fragments (in_frags )
1214+ rebuilt_frag = self ._reconstruct (ok_frags )
12091215 yield rebuilt_frag
12101216
12111217 return frag_iter ()
12121218
1213- def _reconstruct (self , frag ):
1214- return self .storage_method .driver .reconstruct (frag , [self .missing ])[0 ]
def _filter_broken_fragments(self, frags):
    """
    Keep only the fragments whose EC metadata can be read.

    Each fragment is handed to the EC driver's get_metadata(). When the
    driver raises ECDriverError, the fragment is reported through the
    logger (with its position in the input) and left out of the result.

    :param frags: iterable of fragments to check
    :returns: the list of fragments whose metadata is ok,
        in their original order
    """
    valid_frags = []
    # Metadata of the valid fragments, collected for the stripe-level
    # verification mentioned in the FIXME below (not used yet).
    stripe_md = []
    for position, fragment in enumerate(frags):
        try:
            metadata = self.storage_method.driver.get_metadata(fragment)
        except ECDriverError as err:
            self.logger.error(
                "Fragment %d in error, discarding it: %s", position, err)
            continue
        stripe_md.append(metadata)
        valid_frags.append(fragment)
    # FIXME(FVE): here we should call verify_stripe_metadata(stripe_md)
    # but it does not work and I don't know why.
    return valid_frags
1238+
def _reconstruct(self, frags):
    """
    Rebuild the missing fragment from the provided ones.

    :param frags: the fragments to give to the EC driver
    :returns: the first item returned by the driver's reconstruct(),
        which is asked to rebuild only `self.missing`
    """
    driver = self.storage_method.driver
    rebuilt = driver.reconstruct(frags, [self.missing])
    return rebuilt[0]
0 commit comments