|
1 | 1 | import dataclasses |
| 2 | +import functools |
2 | 3 | import logging |
3 | 4 | import os |
4 | 5 | import pathlib |
@@ -182,6 +183,29 @@ def get_column_type( |
182 | 183 | # ) |
183 | 184 |
|
184 | 185 |
|
def _is_integer_name(name: str) -> bool:
    """Return True when ``int(name)`` accepts the folder name."""
    try:
        int(name)
    except ValueError:
        return False
    return True


def partitioned_folder_comparator(folder1: str, folder2: str) -> int:
    """Three-way comparison of two folder names, for use with ``functools.cmp_to_key``.

    When both names look like ``key=value`` and share the same key (e.g.
    ``year=2021`` and ``year=2022``), only the values are compared.

    Ordering rules, applied to the (possibly stripped) names:

    * two integer-like names are compared by numeric value, so ``"3"`` sorts
      before ``"12"``;
    * an integer-like name sorts before a non-integer name;
    * two non-integer names are compared as plain strings.

    Mixed integer/non-integer pairs are never compared as strings: doing so
    makes the comparison non-transitive ("2" < "10" numerically, "10" < "1a"
    and "1a" < "2" as strings), which makes the sort result depend on the
    input order.

    Args:
        folder1: First folder name.
        folder2: Second folder name.

    Returns:
        ``-1`` if ``folder1`` sorts before ``folder2``, ``1`` if it sorts
        after, and ``0`` if they compare equal.
    """
    # Drop the partition key only when both names share it; names with
    # different keys (e.g. "month=12" vs "year=0") are compared in full.
    if "=" in folder1 and "=" in folder2:
        key1, _, value1 = folder1.partition("=")
        key2, _, value2 = folder2.partition("=")
        if key1 == key2:
            folder1, folder2 = value1, value2

    numeric1 = _is_integer_name(folder1)
    numeric2 = _is_integer_name(folder2)

    if numeric1 != numeric2:
        # Exactly one name is an integer: integers come first.
        return -1 if numeric1 else 1

    if numeric1:
        left, right = int(folder1), int(folder2)
        return (left > right) - (left < right)

    return (folder1 > folder2) - (folder1 < folder2)
| 207 | + |
| 208 | + |
185 | 209 | @dataclasses.dataclass |
186 | 210 | class TableData: |
187 | 211 | display_name: str |
@@ -700,7 +724,12 @@ def get_dir_to_process(self, bucket_name: str, folder: str) -> str: |
700 | 724 | ) |
701 | 725 | iterator = peekable(iterator) |
702 | 726 | if iterator: |
703 | | - sorted_dirs = sorted(iterator, reverse=True) |
| 727 | + sorted_dirs = sorted( |
| 728 | + iterator, |
| 729 | + key=functools.cmp_to_key(partitioned_folder_comparator), |
| 730 | + reverse=True, |
| 731 | + ) |
| 732 | + |
704 | 733 | return self.get_dir_to_process( |
705 | 734 | bucket_name=bucket_name, folder=sorted_dirs[0] + "/" |
706 | 735 | ) |
@@ -786,7 +815,8 @@ def local_browser(self, path_spec: PathSpec) -> Iterable[Tuple[str, datetime, in |
786 | 815 | else: |
787 | 816 | logger.debug(f"Scanning files under local folder: {prefix}") |
788 | 817 | for root, dirs, files in os.walk(prefix): |
789 | | - dirs.sort() |
| 818 | + dirs.sort(key=functools.cmp_to_key(partitioned_folder_comparator)) |
| 819 | + |
790 | 820 | for file in sorted(files): |
791 | 821 | full_path = os.path.join(root, file) |
792 | 822 | yield full_path, datetime.utcfromtimestamp( |
|
0 commit comments