Skip to content

Commit 68ff7d1

Browse files
committed
Merge branch '5.x' into 7.x
2 parents: 7938ed5 + ae90093; commit: 68ff7d1

File tree

15 files changed

+184 additions, -38 deletions (lines changed)

15 files changed

+184 additions, -38 deletions (lines changed)

bin/oio-blob-rebuilder

Lines changed: 7 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -2,6 +2,7 @@
22

33
# oio-blob-rebuilder.py
44
# Copyright (C) 2015-2019 OpenIO SAS, as part of OpenIO SDS
5+
# Copyright (C) 2022 OVH SAS
56
#
67
# This program is free software: you can redistribute it and/or modify
78
# it under the terms of the GNU Affero General Public License as
@@ -103,6 +104,11 @@ tube for broken chunks events.
103104
'elsewhere. This option is useful if the chunks you are '
104105
'rebuilding are not actually missing but are corrupted. '
105106
'(default=%s)' % BlobRebuilder.DEFAULT_TRY_CHUNK_DELETE)
107+
parser.add_argument(
108+
'--read-all-available-sources', action='store_true',
109+
help='For objects using erasure-coding, connect to all apparently '
110+
'available chunks, to have backups in case one of them is '
111+
'silently corrupt.')
106112
parser.add_argument(
107113
'--allow-frozen-container', action='store_true',
108114
help="Allow rebuilding a chunk in a frozen container.")
@@ -153,6 +159,7 @@ def main():
153159
# local
154160
conf['concurrency'] = args.concurrency
155161
conf['items_per_second'] = args.chunks_per_second
162+
conf['read_all_available_sources'] = args.read_all_available_sources
156163
conf['try_chunk_delete'] = args.delete_faulty_chunks
157164
# distributed
158165
conf['distributed_beanstalkd_worker_tube'] = args.distributed_tube

oio/api/ec.py

Lines changed: 37 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,5 @@
11
# Copyright (C) 2015-2020 OpenIO SAS, as part of OpenIO SDS
2-
# Copyright (C) 2021 OVH SAS
2+
# Copyright (C) 2021-2022 OVH SAS
33
#
44
# This library is free software; you can redistribute it and/or
55
# modify it under the terms of the GNU Lesser General Public
@@ -34,6 +34,7 @@
3434
from oio.common.http import HeadersDict, parse_content_range, \
3535
ranges_from_http_header, headers_from_object_metadata
3636
from oio.common.logger import get_logger
37+
from oio.common.storage_method import ECDriverError
3738
from oio.common.utils import fix_ranges, monotonic_time
3839

3940

@@ -341,7 +342,7 @@ def put_in_queue(fragment_iterator, queue):
341342
ec_start = monotonic_time()
342343
try:
343344
segment = self.storage_method.driver.decode(data)
344-
except exceptions.ECError:
345+
except ECDriverError:
345346
# something terrible happened
346347
self.logger.exception(
347348
"ERROR decoding fragments (reqid=%s)", self.reqid)
@@ -754,7 +755,7 @@ def __init__(self, sysmeta, meta_chunk, global_checksum, storage_method,
754755
self.meta_chunk = meta_chunk
755756
self.global_checksum = global_checksum
756757
# Unlike plain replication, we cannot use the checksum returned
757-
# by rawx services, whe have to compute the checksum client-side.
758+
# by rawx services, we have to compute the checksum client-side.
758759
self.checksum = hashlib.new(self.chunk_checksum_algo or 'md5')
759760
self.connection_timeout = connection_timeout or io.CONNECTION_TIMEOUT
760761
self.write_timeout = write_timeout or io.CHUNK_TIMEOUT
@@ -1090,13 +1091,14 @@ def stream(self):
10901091
class ECRebuildHandler(object):
10911092
def __init__(self, meta_chunk, missing, storage_method,
10921093
connection_timeout=None, read_timeout=None,
1093-
**_kwargs):
1094+
read_all_available_sources=False, **kwargs):
10941095
self.meta_chunk = meta_chunk
10951096
self.missing = missing
10961097
self.storage_method = storage_method
10971098
self.connection_timeout = connection_timeout or io.CONNECTION_TIMEOUT
10981099
self.read_timeout = read_timeout or io.CHUNK_TIMEOUT
1099-
self.logger = _kwargs.get('logger', LOGGER)
1100+
self.logger = kwargs.get('logger', LOGGER)
1101+
self.read_all_available_sources = read_all_available_sources
11001102

11011103
def _get_response(self, chunk, headers):
11021104
resp = None
@@ -1175,7 +1177,10 @@ def rebuild(self):
11751177
self.logger.warning(
11761178
'Use chunk(s) without size information to rebuild a chunk')
11771179

1178-
rebuild_iter = self._make_rebuild_iter(resps[:nb_data])
1180+
if self.read_all_available_sources:
1181+
rebuild_iter = self._make_rebuild_iter(resps)
1182+
else:
1183+
rebuild_iter = self._make_rebuild_iter(resps[:nb_data])
11791184
return assumed_chunk_size, rebuild_iter
11801185

11811186
def _make_rebuild_iter(self, resps):
@@ -1197,18 +1202,39 @@ def frag_iter():
11971202
pile.spawn(_get_frag, resp)
11981203
try:
11991204
with Timeout(self.read_timeout):
1200-
frag = [frag for frag in pile]
1205+
in_frags = [frag for frag in pile]
12011206
except Timeout as to:
12021207
self.logger.error('ERROR while rebuilding: %s', to)
12031208
except Exception:
12041209
self.logger.exception('ERROR while rebuilding')
12051210
break
1206-
if not all(frag):
1211+
if not all(in_frags):
12071212
break
1208-
rebuilt_frag = self._reconstruct(frag)
1213+
ok_frags = self._filter_broken_fragments(in_frags)
1214+
rebuilt_frag = self._reconstruct(ok_frags)
12091215
yield rebuilt_frag
12101216

12111217
return frag_iter()
12121218

1213-
def _reconstruct(self, frag):
1214-
return self.storage_method.driver.reconstruct(frag, [self.missing])[0]
1219+
def _filter_broken_fragments(self, frags):
1220+
"""
1221+
Try to read and check each fragment's EC metadata.
1222+
1223+
:returns: the list of fragments whose metadata is ok
1224+
"""
1225+
frag_md_list = []
1226+
ok_frags = []
1227+
for i, frag in enumerate(frags):
1228+
try:
1229+
frag_md = self.storage_method.driver.get_metadata(frag)
1230+
frag_md_list.append(frag_md)
1231+
ok_frags.append(frag)
1232+
except ECDriverError as err:
1233+
self.logger.error(
1234+
"Fragment %d in error, discarding it: %s", i, err)
1235+
# FIXME(FVE): here we should call verify_stripe_metadata(frag_md_list)
1236+
# but it does not work and I don't know why.
1237+
return ok_frags
1238+
1239+
def _reconstruct(self, frags):
1240+
return self.storage_method.driver.reconstruct(frags, [self.missing])[0]

oio/blob/operator.py

Lines changed: 5 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,5 @@
11
# Copyright (C) 2019 OpenIO SAS, as part of OpenIO SDS
2-
# Copyright (C) 2021 OVH SAS
2+
# Copyright (C) 2021-2022 OVH SAS
33
#
44
# This program is free software: you can redistribute it and/or modify
55
# it under the terms of the GNU Affero General Public License as
@@ -47,7 +47,8 @@ def __init__(self, conf, logger=None):
4747

4848
def rebuild(self, container_id, content_id, chunk_id_or_pos,
4949
rawx_id=None, try_chunk_delete=False,
50-
allow_frozen_container=True, allow_same_rawx=True):
50+
allow_frozen_container=True, allow_same_rawx=True,
51+
read_all_available_sources=False):
5152
"""
5253
Try to find the chunk in the metadata of the specified object,
5354
then rebuild it.
@@ -86,7 +87,8 @@ def rebuild(self, container_id, content_id, chunk_id_or_pos,
8687
chunk_id, service_id=rawx_id,
8788
allow_frozen_container=allow_frozen_container,
8889
allow_same_rawx=allow_same_rawx,
89-
chunk_pos=chunk_pos)
90+
chunk_pos=chunk_pos,
91+
read_all_available_sources=read_all_available_sources)
9092

9193
if try_chunk_delete:
9294
try:

oio/blob/rebuilder.py

Lines changed: 7 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,5 @@
11
# Copyright (C) 2019 OpenIO SAS, as part of OpenIO SDS
2-
# Copyright (C) 2021 OVH SAS
2+
# Copyright (C) 2021-2022 OVH SAS
33
#
44
# This program is free software: you can redistribute it and/or modify
55
# it under the terms of the GNU Affero General Public License as
@@ -38,6 +38,7 @@ class BlobRebuilder(Tool):
3838
DEFAULT_RDIR_TIMEOUT = 60.0
3939
DEFAULT_ALLOW_FROZEN_CT = False
4040
DEFAULT_ALLOW_SAME_RAWX = True
41+
DEFAULT_READ_ALL_AVAILABLE_SOURCES = False
4142
DEFAULT_TRY_CHUNK_DELETE = False
4243
DEFAULT_DRY_RUN = False
4344

@@ -255,6 +256,9 @@ def __init__(self, tool, queue_workers, queue_reply):
255256
'allow_frozen_container', self.tool.DEFAULT_ALLOW_FROZEN_CT))
256257
self.allow_same_rawx = true_value(self.tool.conf.get(
257258
'allow_same_rawx', self.tool.DEFAULT_ALLOW_SAME_RAWX))
259+
self.read_all_available_sources = true_value(self.tool.conf.get(
260+
'read_all_available_sources',
261+
self.tool.DEFAULT_READ_ALL_AVAILABLE_SOURCES))
258262
self.try_chunk_delete = true_value(self.tool.conf.get(
259263
'try_chunk_delete', self.tool.DEFAULT_TRY_CHUNK_DELETE))
260264
self.dry_run = true_value(self.tool.conf.get(
@@ -280,7 +284,8 @@ def _process_item(self, item):
280284
rawx_id=self.tool.rawx_id,
281285
try_chunk_delete=self.try_chunk_delete,
282286
allow_frozen_container=self.allow_frozen_container,
283-
allow_same_rawx=self.allow_same_rawx)
287+
allow_same_rawx=self.allow_same_rawx,
288+
read_all_available_sources=self.read_all_available_sources)
284289
except OioException as exc:
285290
if not isinstance(exc, OrphanChunk):
286291
raise RetryLater(exc)

oio/cli/admin/xcute/rawx.py

Lines changed: 8 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,5 @@
11
# Copyright (C) 2019-2020 OpenIO SAS, as part of OpenIO SDS
2-
# Copyright (C) 2021 OVH SAS
2+
# Copyright (C) 2021-2022 OVH SAS
33
#
44
# This program is free software: you can redistribute it and/or modify
55
# it under the terms of the GNU Affero General Public License as
@@ -51,6 +51,11 @@ def get_parser(self, prog_name):
5151
'rebuilt elsewhere. This option is useful if the chunks '
5252
'you are rebuilding are not actually missing but are '
5353
'corrupted.')
54+
parser.add_argument(
55+
'--read-all-available-sources', action='store_true',
56+
help='For objects using erasure-coding, connect to all apparently '
57+
'available chunks, to have backups in case one of them is '
58+
'silently corrupt.')
5459
parser.add_argument(
5560
'--allow-frozen-container', action='store_true',
5661
help='Allow rebuilding a chunk in a frozen container.')
@@ -74,6 +79,8 @@ def get_job_config(self, parsed_args):
7479
'rdir_timeout': parsed_args.rdir_timeout,
7580
'rawx_timeout': parsed_args.rawx_timeout,
7681
'dry_run': parsed_args.dry_run,
82+
'read_all_available_sources':
83+
parsed_args.read_all_available_sources,
7784
'try_chunk_delete': parsed_args.delete_faulty_chunks,
7885
'allow_frozen_container': parsed_args.allow_frozen_container,
7986
'set_incident_date': parsed_args.set_incident_date,

oio/cli/object/client.py

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,5 @@
11
# Copyright (C) 2017 OpenIO SAS, as part of OpenIO SDS
2+
# Copyright (C) 2022 OVH SAS
23
#
34
# This program is free software: you can redistribute it and/or modify
45
# it under the terms of the GNU Affero General Public License as
@@ -26,6 +27,7 @@ def make_client(instance):
2627
endpoint=instance.get_endpoint('storage'),
2728
namespace=instance.namespace,
2829
admin_mode=instance.admin_mode,
29-
perfdata=instance.cli_conf().get('perfdata')
30+
perfdata=instance.cli_conf().get('perfdata'),
31+
logger=instance.logger
3032
)
3133
return client

oio/content/content.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,5 @@
11
# Copyright (C) 2015-2019 OpenIO SAS, as part of OpenIO SDS
2-
# Copyright (C) 2020-2021 OVH SAS
2+
# Copyright (C) 2020-2022 OVH SAS
33
#
44
# This library is free software; you can redistribute it and/or
55
# modify it under the terms of the GNU Lesser General Public
@@ -238,7 +238,7 @@ def _create_object(self, **kwargs):
238238
**kwargs)
239239

240240
def rebuild_chunk(self, chunk_id, service_id=None, allow_same_rawx=False,
241-
chunk_pos=None, allow_frozen_container=False):
241+
chunk_pos=None, allow_frozen_container=False, **kwargs):
242242
raise NotImplementedError()
243243

244244
def create(self, stream, **kwargs):

oio/content/ec.py

Lines changed: 6 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,5 @@
11
# Copyright (C) 2015-2020 OpenIO SAS, as part of OpenIO SDS
2-
# Copyright (C) 2021 OVH SAS
2+
# Copyright (C) 2021-2022 OVH SAS
33
#
44
# This library is free software; you can redistribute it and/or
55
# modify it under the terms of the GNU Lesser General Public
@@ -29,7 +29,8 @@
2929
class ECContent(Content):
3030
def rebuild_chunk(self, chunk_id, service_id=None,
3131
allow_same_rawx=False, chunk_pos=None,
32-
allow_frozen_container=False):
32+
allow_frozen_container=False,
33+
read_all_available_sources=False):
3334
# Identify the chunk to rebuild
3435
candidates = self.chunks.filter(id=chunk_id)
3536
if service_id is not None:
@@ -71,7 +72,9 @@ def rebuild_chunk(self, chunk_id, service_id=None,
7172

7273
# Regenerate the lost chunk's data, from existing chunks
7374
handler = ECRebuildHandler(
74-
chunks.raw(), current_chunk.subpos, self.storage_method)
75+
chunks.raw(), current_chunk.subpos, self.storage_method,
76+
read_all_available_sources=read_all_available_sources,
77+
logger=self.logger)
7578
expected_chunk_size, stream = handler.rebuild()
7679

7780
# Actually create the spare chunk

oio/content/plain.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,5 @@
11
# Copyright (C) 2015-2020 OpenIO SAS, as part of OpenIO SDS
2-
# Copyright (C) 2021 OVH SAS
2+
# Copyright (C) 2021-2022 OVH SAS
33
#
44
# This library is free software; you can redistribute it and/or
55
# modify it under the terms of the GNU Lesser General Public
@@ -54,7 +54,7 @@ def create(self, stream, **kwargs):
5454

5555
def rebuild_chunk(self, chunk_id, service_id=None,
5656
allow_same_rawx=False, chunk_pos=None,
57-
allow_frozen_container=False):
57+
allow_frozen_container=False, **_kwargs):
5858
# Identify the chunk to rebuild
5959
candidates = self.chunks.filter(id=chunk_id)
6060
if service_id is not None:

oio/xcute/jobs/blob_rebuilder.py

Lines changed: 14 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,5 @@
11
# Copyright (C) 2019-2020 OpenIO SAS, as part of OpenIO SDS
2+
# Copyright (C) 2022 OVH SAS
23
#
34
# This library is free software; you can redistribute it and/or
45
# modify it under the terms of the GNU Lesser General Public
@@ -33,6 +34,8 @@ def __init__(self, conf, job_params, logger=None):
3334
self.rawx_timeout = job_params['rawx_timeout']
3435
self.allow_frozen_container = job_params['allow_frozen_container']
3536
self.allow_same_rawx = job_params['allow_same_rawx']
37+
self.read_all_available_sources = \
38+
job_params['read_all_available_sources']
3639
self.try_chunk_delete = job_params['try_chunk_delete']
3740
self.dry_run = job_params['dry_run']
3841

@@ -52,11 +55,12 @@ def process(self, task_id, task_payload, reqid=None):
5255
self.logger.debug('[reqid=%s] Rebuilding %s', reqid, chunk_id)
5356
try:
5457
chunk_size = self.chunk_operator.rebuild(
55-
container_id, content_id, chunk_id,
56-
rawx_id=self.service_id,
57-
try_chunk_delete=self.try_chunk_delete,
58-
allow_frozen_container=self.allow_frozen_container,
59-
allow_same_rawx=self.allow_same_rawx)
58+
container_id, content_id, chunk_id,
59+
rawx_id=self.service_id,
60+
try_chunk_delete=self.try_chunk_delete,
61+
allow_frozen_container=self.allow_frozen_container,
62+
allow_same_rawx=self.allow_same_rawx,
63+
read_all_available_sources=self.read_all_available_sources)
6064
except (ContentNotFound, OrphanChunk):
6165
return {'orphan_chunks': 1}
6266

@@ -74,6 +78,7 @@ class RawxRebuildJob(XcuteRdirJob):
7478
DEFAULT_TRY_CHUNK_DELETE = False
7579
DEFAULT_ALLOW_FROZEN_CT = False
7680
DEFAULT_DECLARE_INCIDENT_DATE = False
81+
DEFAULT_READ_ALL_AVAILABLE_SOURCES = False
7782

7883
@classmethod
7984
def sanitize_params(cls, job_params):
@@ -98,6 +103,10 @@ def sanitize_params(cls, job_params):
98103
job_params.get('allow_same_rawx'),
99104
cls.DEFAULT_ALLOW_SAME_RAWX)
100105

106+
sanitized_job_params['read_all_available_sources'] = boolean_value(
107+
job_params.get('read_all_available_sources'),
108+
cls.DEFAULT_READ_ALL_AVAILABLE_SOURCES)
109+
101110
sanitized_job_params['try_chunk_delete'] = boolean_value(
102111
job_params.get('try_chunk_delete'),
103112
cls.DEFAULT_TRY_CHUNK_DELETE)

0 commit comments

Comments
 (0)