-
Notifications
You must be signed in to change notification settings - Fork 1.3k
[python] Filter manifest files by row id #6951
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,78 @@ | ||
| # Licensed to the Apache Software Foundation (ASF) under one | ||
| # or more contributor license agreements. See the NOTICE file | ||
| # distributed with this work for additional information | ||
| # regarding copyright ownership. The ASF licenses this file | ||
| # to you under the Apache License, Version 2.0 (the | ||
| # "License"); you may not use this file except in compliance | ||
| # with the License. You may obtain a copy of the License at | ||
| # | ||
| # http://www.apache.org/licenses/LICENSE-2.0 | ||
| # | ||
| # Unless required by applicable law or agreed to in writing, | ||
| # software distributed under the License is distributed on an | ||
| # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||
| # KIND, either express or implied. See the License for the | ||
| # specific language governing permissions and limitations | ||
| # under the License. | ||
| from typing import Optional | ||
|
|
||
| import portion | ||
|
|
||
|
|
||
| class Range: | ||
| """ | ||
| A range class based on the portion library for interval operations. | ||
|
|
||
| This class wraps portion.Interval to provide range operations like | ||
| intersection checking for row ID ranges. | ||
| """ | ||
|
|
||
| def __init__(self, start, end): | ||
| """ | ||
| Create a closed range [start, end]. | ||
|
|
||
| Args: | ||
| start: The start value of the range (inclusive) | ||
| end: The end value of the range (inclusive) | ||
| """ | ||
| self.start = start | ||
| self.end = end | ||
| # Create a closed interval [start, end] | ||
| self._interval = portion.closed(start, end) | ||
|
|
||
| @staticmethod | ||
| def intersection(range1: 'Range', range2: 'Range') -> Optional['Range']: | ||
| """ | ||
| Calculate the intersection of two ranges. | ||
|
|
||
| Args: | ||
| range1: The first Range object | ||
| range2: The second Range object | ||
|
|
||
| Returns: | ||
| A new Range object representing the intersection, or None if ranges don't overlap | ||
| """ | ||
| if range1 is None or range2 is None: | ||
| return None | ||
|
|
||
| # Calculate intersection using portion | ||
| intersect = range1._interval & range2._interval | ||
|
|
||
| # If intersection is empty, return None | ||
| if intersect.empty: | ||
| return None | ||
|
|
||
| # Extract the bounds from the intersection | ||
| # portion returns an Interval which may contain multiple atomic intervals | ||
| # For our use case, we expect a single atomic interval | ||
| if len(intersect) > 0: | ||
| atomic = list(intersect)[0] | ||
| return Range(atomic.lower, atomic.upper) | ||
|
|
||
| return None | ||
|
|
||
| def __repr__(self): | ||
| return f"Range({self.start}, {self.end})" | ||
|
|
||
| def __str__(self): | ||
| return f"[{self.start}, {self.end}]" |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -19,6 +19,7 @@ | |
| from collections import defaultdict | ||
| from typing import Callable, List, Optional, Dict, Set | ||
|
|
||
| from pypaimon.common.range import Range | ||
| from pypaimon.common.predicate import Predicate | ||
| from pypaimon.table.source.deletion_file import DeletionFile | ||
| from pypaimon.table.row.generic_row import GenericRow | ||
|
|
@@ -65,6 +66,7 @@ def __init__(self, table, predicate: Optional[Predicate], limit: Optional[int]): | |
| self.number_of_para_subtasks = None | ||
| self.start_row_of_this_subtask = None | ||
| self.end_row_of_this_subtask = None | ||
| self.row_ranges: List[Range] = None | ||
|
|
||
| self.only_read_real_buckets = True if options.bucket() == BucketMode.POSTPONE_BUCKET.value else False | ||
| self.data_evolution = options.data_evolution_enabled() | ||
|
|
@@ -129,7 +131,7 @@ def with_shard(self, idx_of_this_subtask, number_of_para_subtasks) -> 'FullStart | |
| self.number_of_para_subtasks = number_of_para_subtasks | ||
| return self | ||
|
|
||
| def with_row_range(self, start_row, end_row) -> 'FullStartingScanner': | ||
| def with_row_shard(self, start_row, end_row) -> 'FullStartingScanner': | ||
| if start_row >= end_row: | ||
| raise Exception("start_row must be less than end_row") | ||
| if self.idx_of_this_subtask is not None: | ||
|
|
@@ -138,6 +140,13 @@ def with_row_range(self, start_row, end_row) -> 'FullStartingScanner': | |
| self.end_row_of_this_subtask = end_row | ||
| return self | ||
|
|
||
| def with_row_ranges(self, row_ranges) -> 'FullStartingScanner': | ||
| """ | ||
| Filter manifest files by row id ranges. | ||
| """ | ||
| self.row_ranges = row_ranges | ||
| return self | ||
|
|
||
| def _append_only_filter_by_row_range(self, partitioned_files: defaultdict, | ||
| start_row: int, | ||
| end_row: int) -> (defaultdict, int, int): | ||
|
|
@@ -340,10 +349,42 @@ def _apply_push_down_limit(self, splits: List[Split]) -> List[Split]: | |
|
|
||
| def _filter_manifest_file(self, file: ManifestFileMeta) -> bool: | ||
| if not self.partition_key_predicate: | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. extract a method: _filter_manifest_by_partition_predicate. |
||
| return self._filter_manifest_by_row_ranges(file) | ||
| if not self.partition_key_predicate.test_by_simple_stats( | ||
| file.partition_stats, | ||
| file.num_added_files + file.num_deleted_files): | ||
| return False | ||
| else: | ||
| return self._filter_manifest_by_row_ranges(file) | ||
|
|
||
| def _filter_manifest_by_row_ranges(self, manifest: ManifestFileMeta) -> bool: | ||
| """ | ||
| Filter manifest file by row ranges. | ||
|
|
||
| Args: | ||
| manifest: The manifest file metadata to filter | ||
|
|
||
| Returns: | ||
| True if the manifest should be included, False otherwise | ||
| """ | ||
| if self.row_ranges is None: | ||
| return True | ||
|
|
||
| min_row_id = manifest.min_row_id | ||
| max_row_id = manifest.max_row_id | ||
|
|
||
| if min_row_id is None or max_row_id is None: | ||
| return True | ||
| return self.partition_key_predicate.test_by_simple_stats( | ||
| file.partition_stats, | ||
| file.num_added_files + file.num_deleted_files) | ||
|
|
||
| # Create a Range object for the manifest's row range | ||
| manifest_row_range = Range(min_row_id, max_row_id) | ||
|
|
||
| # Check if manifest range intersects with any expected range | ||
| for row_range in self.row_ranges: | ||
| if Range.intersection(row_range, manifest_row_range) is not None: | ||
| return True | ||
|
|
||
| return False | ||
|
|
||
| def _filter_manifest_entry(self, entry: ManifestEntry) -> bool: | ||
| if self.only_read_real_buckets and entry.bucket < 0: | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -72,10 +72,10 @@ def with_shard(self, idx_of_this_subtask, number_of_para_subtasks) -> 'TableScan | |
| self.starting_scanner.with_shard(idx_of_this_subtask, number_of_para_subtasks) | ||
| return self | ||
|
|
||
| def with_row_range(self, start_row, end_row) -> 'TableScan': | ||
| """ | ||
| Filter file entries by row range. The row_id corresponds to the row position of the | ||
| file in all file entries in table scan's partitioned_files. | ||
| """ | ||
| self.starting_scanner.with_row_range(start_row, end_row) | ||
| def with_row_shard(self, start_row, end_row) -> 'TableScan': | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. keep
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ok. |
||
| self.starting_scanner.with_row_shard(start_row, end_row) | ||
| return self | ||
|
|
||
| def with_row_ranges(self, row_ranges) -> 'TableScan': | ||
| self.starting_scanner.with_row_ranges(row_ranges) | ||
| return self | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
You introduced the ranges, so please remove
start_row_of_this_subtaskandend_row_of_this_subtaskin this class.