|
3 | 3 | import logging |
4 | 4 | import math |
5 | 5 | import os |
| 6 | +import random |
6 | 7 | from typing import Any, Dict, Generator, List, Optional, Tuple |
7 | 8 |
|
8 | 9 | import boto3 # type: ignore |
|
11 | 12 | import psycopg2 # type: ignore |
12 | 13 | import s3fs # type: ignore |
13 | 14 |
|
14 | | -logger: logging.Logger = logging.getLogger(__name__) |
| 15 | +from awswrangler import exceptions |
| 16 | + |
| 17 | +_logger: logging.Logger = logging.getLogger(__name__) |
15 | 18 |
|
16 | 19 |
|
17 | 20 | def ensure_session(session: Optional[boto3.Session] = None) -> boto3.Session: |
@@ -124,6 +127,8 @@ def chunkify(lst: List[Any], num_chunks: int = 1, max_length: Optional[int] = No |
124 | 127 | [[0, 1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12]] |
125 | 128 |
|
126 | 129 | """ |
| 130 | + if not lst: |
| 131 | + return [] |
127 | 132 | n: int = num_chunks if max_length is None else int(math.ceil((float(len(lst)) / float(max_length)))) |
128 | 133 | np_chunks = np.array_split(lst, n) |
129 | 134 | return [arr.tolist() for arr in np_chunks if len(arr) > 0] |
@@ -179,3 +184,52 @@ def get_region_from_subnet(subnet_id: str, boto3_session: Optional[boto3.Session |
179 | 184 | session: boto3.Session = ensure_session(session=boto3_session) |
180 | 185 | client_ec2: boto3.client = client(service_name="ec2", session=session) |
181 | 186 | return client_ec2.describe_subnets(SubnetIds=[subnet_id])["Subnets"][0]["AvailabilityZone"][:9] |
| 187 | + |
| 188 | + |
def extract_partitions_from_paths(
    path: str, paths: List[str]
) -> Tuple[Optional[Dict[str, str]], Optional[Dict[str, List[str]]]]:
    """Extract Hive-style partitions from Amazon S3 paths.

    Parameters
    ----------
    path : str
        Root S3 path; a trailing slash is appended if missing.
    paths : List[str]
        Object paths expected to live under ``path``, with partitions encoded
        as ``key=value`` directory names.

    Returns
    -------
    Tuple[Optional[Dict[str, str]], Optional[Dict[str, List[str]]]]
        ``(partition name -> "string" type, partition directory -> values)``,
        or ``(None, None)`` when no partitioned directory is found.

    Raises
    ------
    exceptions.InvalidArgumentValue
        If an object path does not contain the root path.
    exceptions.InvalidSchemaConvergence
        If two objects expose different partition schemas.
    """
    path = path if path.endswith("/") else f"{path}/"
    partitions_types: Dict[str, str] = {}
    partitions_values: Dict[str, List[str]] = {}
    for p in paths:
        if path not in p:
            raise exceptions.InvalidArgumentValue(f"Object {p} is not under the root path ({path}).")
        path_wo_filename: str = p.rpartition("/")[0] + "/"
        if path_wo_filename not in partitions_values:
            # BUGFIX: the original stripped f"{path}/" — a double trailing slash
            # that never matched — so the root prefix stayed in place and any
            # "key=value" segment inside the root path leaked into the schema.
            path_wo_prefix: str = p.replace(path, "", 1)
            dirs: List[str] = [x for x in path_wo_prefix.split("/") if (x != "") and ("=" in x)]
            if dirs:
                # maxsplit=1 keeps "=" characters inside partition values intact.
                values_tups: List[Tuple[str, str]] = [tuple(x.split("=", 1)) for x in dirs]  # type: ignore
                values_dics: Dict[str, str] = dict(values_tups)
                p_values: List[str] = list(values_dics.values())
                p_types: Dict[str, str] = {x: "string" for x in values_dics.keys()}
                if not partitions_types:
                    partitions_types = p_types
                if p_values:
                    partitions_types = p_types
                    partitions_values[path_wo_filename] = p_values
                elif p_types != partitions_types:
                    raise exceptions.InvalidSchemaConvergence(
                        f"At least two different partitions schema detected: {partitions_types} and {p_types}"
                    )
    if not partitions_types:
        return None, None
    return partitions_types, partitions_values
| 220 | + |
| 221 | + |
def list_sampling(lst: List[Any], sampling: float) -> List[Any]:
    """Return a random sample of ``lst`` sized by the ``sampling`` ratio.

    Parameters
    ----------
    lst : List[Any]
        Population to sample from.
    sampling : float
        Fraction of the list to keep; must satisfy ``0.0 < sampling <= 1.0``.

    Returns
    -------
    List[Any]
        Randomly chosen elements (at least one for a non-empty input).

    Raises
    ------
    exceptions.InvalidArgumentValue
        If ``sampling`` is outside ``(0.0, 1.0]``.
    """
    if sampling <= 0.0 or sampling > 1.0:
        raise exceptions.InvalidArgumentValue(f"Argument <sampling> must be [0.0 < value <= 1.0]. {sampling} received.")
    total: int = len(lst)
    if total == 0:
        return []
    # Round to the nearest count, then clamp into the inclusive range [1, total].
    k: int = max(1, min(total, int(round(total * sampling))))
    logger = logging.getLogger(__name__)
    logger.debug("_len: %s", total)
    logger.debug("sampling: %s", sampling)
    logger.debug("num_samples: %s", k)
    return random.sample(population=lst, k=k)
0 commit comments