@@ -25,7 +25,7 @@
 from .table_helpers import (_prepare_source_tables, _are_default_empty_table, _prepare_table_writer,
                             _remove_tables, DEFAULT_EMPTY_TABLE, _to_chunk_stream, _prepare_command_format)
 from .file_commands import _get_remote_temp_files_directory, _append_default_path_with_user_level
-from .parallel_reader import make_read_parallel_request
+from .parallel_reader import make_read_parallel_request, _slice_row_ranges_for_parallel_read
 from .schema import _SchemaRuntimeCtx, TableSchema, make_dataclass_from_table_schema
 from .stream import ItemStream, _ChunkStream
 from .ypath import TablePath, YPath, ypath_join
@@ -34,7 +34,6 @@
 import yt.yson as yson
 import yt.logger as logger
 
-import builtins
 from copy import deepcopy
 from datetime import timedelta
 import enum
@@ -449,35 +448,6 @@ def read_blob_table(table, part_index_column_name=None, data_column_name=None,
     return response
 
 
-def _slice_row_ranges_for_parallel_read(ranges, row_count, data_size, data_size_per_thread):
-    result = []
-    if row_count > 0:
-        row_size = data_size / float(row_count)
-    else:
-        row_size = 1
-
-    rows_per_thread = max(int(data_size_per_thread / row_size), 1)
-    for range in ranges:
-        if "exact" in range:
-            require("row_index" in range["exact"], lambda: YtError('Invalid YPath: "row_index" not found'))
-            lower_limit = range["exact"]["row_index"]
-            upper_limit = lower_limit + 1
-        else:
-            if "lower_limit" in range:
-                require("row_index" in range["lower_limit"], lambda: YtError('Invalid YPath: "row_index" not found'))
-            if "upper_limit" in range:
-                require("row_index" in range["upper_limit"], lambda: YtError('Invalid YPath: "row_index" not found'))
-
-            lower_limit = 0 if "lower_limit" not in range else range["lower_limit"]["row_index"]
-            upper_limit = row_count if "upper_limit" not in range else range["upper_limit"]["row_index"]
-
-        for start in builtins.range(lower_limit, upper_limit, rows_per_thread):
-            end = min(start + rows_per_thread, upper_limit)
-            result.append({"range" : (start, end)})
-
-    return result
-
-
 def _prepare_params_for_parallel_read(params, range):
     params["path"].attributes["ranges"] = [{"lower_limit": {"row_index": range["range"][0]},
                                             "upper_limit": {"row_index": range["range"][1]}}]
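The helper removed here now lives in `.parallel_reader` (see the import change above) and, judging by the updated call sites later in this diff, takes keyword arguments including `chunk_count` and `replication_factor` and returns a `(ranges, data_size_per_thread)` tuple. Below is a minimal sketch of that presumed shape that simply reuses the removed slicing logic; the parameter names are taken from the call sites, but the real implementation in `parallel_reader.py` is not part of this diff.

```python
# Sketch only: mirrors the removed logic with the new call-site signature.
# How the real helper in .parallel_reader uses chunk_count and replication_factor
# is not shown in this diff; this version accepts them but leaves them unused.
def _slice_row_ranges_for_parallel_read(ranges, row_count, chunk_count, data_size,
                                        replication_factor, data_size_per_thread):
    # Estimate the average row size so that each slice covers roughly
    # data_size_per_thread bytes of uncompressed data.
    row_size = data_size / float(row_count) if row_count > 0 else 1
    rows_per_thread = max(int(data_size_per_thread / row_size), 1)

    result = []
    for row_range in ranges:
        if "exact" in row_range:
            lower_limit = row_range["exact"]["row_index"]
            upper_limit = lower_limit + 1
        else:
            lower_limit = row_range.get("lower_limit", {}).get("row_index", 0)
            upper_limit = row_range.get("upper_limit", {}).get("row_index", row_count)
        # Cut [lower_limit, upper_limit) into slices of at most rows_per_thread rows.
        for start in range(lower_limit, upper_limit, rows_per_thread):
            result.append({"range": (start, min(start + rows_per_thread, upper_limit))})

    return result, data_size_per_thread
```

Slicing by row index keeps each thread independent: every slice becomes its own read request with a single-range `ranges` attribute, which is exactly what `_prepare_params_for_parallel_read` (kept unchanged as context above) builds.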
@@ -808,7 +778,7 @@ def _check_attributes_for_read_table(attributes, table, client):
 def _get_table_attributes(table, client):
     attributes = get(
         table + "/@",
-        attributes=["type", "chunk_count", "compressed_data_size", "dynamic", "row_count", "uncompressed_data_size"],
+        attributes=["type", "chunk_count", "compressed_data_size", "dynamic", "row_count", "replication_factor", "uncompressed_data_size"],
         client=client)
     return attributes
 
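The attribute filter passed to `get(table + "/@", ...)` now also requests `replication_factor`, which the reworked slicer consumes at the call sites below. For orientation, a hypothetical result of this call (every value is invented for illustration):

```python
# Hypothetical example of the mapping returned by _get_table_attributes after this
# change; the keys match the attribute filter above, the values are made up.
attributes = {
    "type": "table",
    "chunk_count": 12,
    "compressed_data_size": 3 * 1024 ** 2,
    "dynamic": False,
    "row_count": 1000000,
    "replication_factor": 3,
    "uncompressed_data_size": 10 * 1024 ** 2,
}
```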
@@ -861,11 +831,14 @@ def read_table(table, format=None, table_reader=None, control_attributes=None, u
             table.attributes["ranges"] = [
                 {"lower_limit": {"row_index": 0},
                  "upper_limit": {"row_index": attributes["row_count"]}}]
-        ranges = _slice_row_ranges_for_parallel_read(
-            table.attributes["ranges"],
-            attributes["row_count"],
-            attributes["uncompressed_data_size"],
-            get_config(client)["read_parallel"]["data_size_per_thread"])
+        ranges, _ = _slice_row_ranges_for_parallel_read(
+            ranges=table.attributes["ranges"],
+            row_count=attributes["row_count"],
+            chunk_count=attributes["chunk_count"],
+            data_size=attributes["uncompressed_data_size"],
+            replication_factor=attributes["replication_factor"],
+            data_size_per_thread=get_config(client)["read_parallel"]["data_size_per_thread"],
+        )
         response_parameters = get_value(response_parameters, {})
         if not ranges:
             response_parameters["start_row_index"] = 0
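Each element of `ranges` is later turned into the parameters of one parallel read request by `_prepare_params_for_parallel_read`, kept unchanged as context in an earlier hunk. A small illustration of that conversion; the table name is made up, and the private helper is assumed to be available in this module:

```python
from yt.wrapper.ypath import TablePath

# One slice produced by _slice_row_ranges_for_parallel_read.
row_slice = {"range": (1000, 2000)}

params = {"path": TablePath("//tmp/some_table")}
_prepare_params_for_parallel_read(params, row_slice)

# params["path"].attributes["ranges"] is now:
# [{"lower_limit": {"row_index": 1000}, "upper_limit": {"row_index": 2000}}]
```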
@@ -1229,12 +1202,14 @@ def _dump_file(table, output_file, output_path, enable_several_files, unordered,
             "upper_limit": {"row_index": attributes["row_count"]},
         }]
 
-    data_size_per_thread = get_config(client)["read_parallel"]["data_size_per_thread"]
-    ranges = _slice_row_ranges_for_parallel_read(
-        table.attributes["ranges"],
-        attributes["row_count"],
-        attributes["uncompressed_data_size"],
-        data_size_per_thread)
+    ranges, data_size_per_thread = _slice_row_ranges_for_parallel_read(
+        ranges=table.attributes["ranges"],
+        row_count=attributes["row_count"],
+        chunk_count=attributes["chunk_count"],
+        data_size=attributes["uncompressed_data_size"],
+        replication_factor=attributes["replication_factor"],
+        data_size_per_thread=get_config(client)["read_parallel"]["data_size_per_thread"],
+    )
 
     range_count = len(ranges)
     result_ranges = []