|
| 1 | +import datetime |
| 2 | +import glob |
| 3 | +import os |
1 | 4 | import re |
2 | | -from typing import Any, Callable, Dict, Optional, Sequence, Tuple |
| 5 | +import zipfile |
| 6 | +from typing import Any, Callable, Dict, Iterator, List, Optional, Sequence, Tuple, Union |
3 | 7 | from .helper import string_sig |
4 | 8 | import pandas |
5 | 9 | from pandas.api.types import is_numeric_dtype |
6 | 10 |
|
7 | 11 |
|
| 12 | +def enumerate_csv_files( |
| 13 | + data: Union[ |
| 14 | + pandas.DataFrame, List[Union[str, Tuple[str, str]]], str, Tuple[str, str, str, str] |
| 15 | + ], |
| 16 | + verbose: int = 0, |
| 17 | +) -> Iterator[Union[pandas.DataFrame, str, Tuple[str, str, str, str]]]: |
| 18 | + """ |
| 19 | + Enumerates files considered for the aggregation. |
| 20 | + Only csv files are considered. |
| 21 | + If a zip file is given, the function digs into the zip files and |
| 22 | + loops over csv candidates. |
| 23 | +
|
| 24 | + :param data: dataframe with the raw data or a file or list of files |
| 25 | +
|
| 26 | + data can contains: |
| 27 | + * a dataframe |
| 28 | + * a string for a filename, zip or csv |
| 29 | + * a list of string |
| 30 | + * a tuple |
| 31 | + """ |
| 32 | + if not isinstance(data, list): |
| 33 | + data = [data] |
| 34 | + for itn, filename in enumerate(data): |
| 35 | + if isinstance(filename, pandas.DataFrame): |
| 36 | + if verbose: |
| 37 | + print(f"[enumerate_csv_files] data[{itn}] is a dataframe") |
| 38 | + yield filename |
| 39 | + continue |
| 40 | + |
| 41 | + if isinstance(filename, tuple): |
| 42 | + # A file in a zipfile |
| 43 | + if verbose: |
| 44 | + print(f"[enumerate_csv_files] data[{itn}] is {filename!r}") |
| 45 | + yield filename |
| 46 | + continue |
| 47 | + |
| 48 | + if os.path.exists(filename): |
| 49 | + ext = os.path.splitext(filename)[-1] |
| 50 | + if ext == ".csv": |
| 51 | + # We check the first line is ok. |
| 52 | + if verbose: |
| 53 | + print(f"[enumerate_csv_files] data[{itn}] is a csv file: {filename!r}]") |
| 54 | + with open(filename, "r", encoding="utf-8") as f: |
| 55 | + line = f.readline() |
| 56 | + if "~help" in line or (",CMD" not in line and ",DATE" not in line): |
| 57 | + continue |
| 58 | + dt = datetime.datetime.fromtimestamp(os.stat(filename).st_mtime) |
| 59 | + du = dt.strftime("%Y-%m-%d %H:%M:%S") |
| 60 | + yield (os.path.split(filename)[-1], du, filename, "") |
| 61 | + continue |
| 62 | + |
| 63 | + if ext == ".zip": |
| 64 | + if verbose: |
| 65 | + print(f"[enumerate_csv_files] data[{itn}] is a zip file: {filename!r}]") |
| 66 | + zf = zipfile.ZipFile(filename, "r") |
| 67 | + for ii, info in enumerate(zf.infolist()): |
| 68 | + name = info.filename |
| 69 | + ext = os.path.splitext(name)[-1] |
| 70 | + if ext != ".csv": |
| 71 | + continue |
| 72 | + if verbose: |
| 73 | + print( |
| 74 | + f"[enumerate_csv_files] data[{itn}][{ii}] is a csv file: {name!r}]" |
| 75 | + ) |
| 76 | + with zf.open(name) as f: |
| 77 | + line = f.readline() |
| 78 | + yield ( |
| 79 | + os.path.split(name)[-1], |
| 80 | + "%04d-%02d-%02d %02d:%02d:%02d" % info.date_time, |
| 81 | + name, |
| 82 | + filename, |
| 83 | + ) |
| 84 | + zf.close() |
| 85 | + continue |
| 86 | + |
| 87 | + raise AssertionError(f"Unexpected format {filename!r}, cannot read it.") |
| 88 | + |
| 89 | + # filename is a pattern. |
| 90 | + found = glob.glob(filename) |
| 91 | + if verbose and not found: |
| 92 | + print(f"[enumerate_csv_files] unable to find file in {filename!r}") |
| 93 | + for ii, f in enumerate(found): |
| 94 | + if verbose: |
| 95 | + print(f"[enumerate_csv_files] data[{itn}][{ii}] {f!r} from {filename!r}") |
| 96 | + yield from enumerate_csv_files(f, verbose=verbose) |
| 97 | + |
| 98 | + |
def open_dataframe(
    data: Union[str, Tuple[str, str, str, str], pandas.DataFrame],
) -> pandas.DataFrame:
    """
    Opens a filename.

    :param data: a dataframe, a filename, a tuple indicating the file is coming
        from a zip file
    :return: a dataframe

    The function handles three kinds of inputs:

    * a dataframe: returned unchanged
    * a string: read with :func:`pandas.read_csv`
    * a tuple: the third element is the csv name, the last element is
      the zip file holding it, or an empty value if the csv file is
      stored directly on disk

    Every dataframe loaded from a file receives a column ``RAWFILENAME``
    telling where the data comes from, ``<zip path>/<member name>``
    for a file stored in a zip file.

    Any other type raises ``ValueError``.
    """
    if isinstance(data, pandas.DataFrame):
        return data
    if isinstance(data, str):
        df = pandas.read_csv(data)
        df["RAWFILENAME"] = data
        return df
    if isinstance(data, tuple):
        if not data[-1]:
            # No zip file, the csv file is directly on disk.
            df = pandas.read_csv(data[2])
            df["RAWFILENAME"] = data[2]
            return df
        # Both the archive and the member are closed even if the parsing fails.
        with zipfile.ZipFile(data[-1]) as zf, zf.open(data[2]) as f:
            df = pandas.read_csv(f)
        df["RAWFILENAME"] = f"{data[-1]}/{data[2]}"
        return df

    raise ValueError(f"Unexpected value for data: {data!r}")
| 128 | + |
| 129 | + |
8 | 130 | class CubeViewDef: |
9 | 131 | """ |
10 | 132 | Defines how to compute a view. |
|
0 commit comments