Skip to content

Commit 65b3790

Browse files
committed
Refaktoriere Filesystem-API, behebe ls/lss-Fehler, sichere Metadaten-Handling und bump Version auf 0.22.0
- Ersetze Direktimport von fsspec durch fsspec_utils und passe FileSystem-Fabrik an.
- Refactor vieler Filesystem-Hilfsfunktionen (ls/lss, pyarrow-/write-Utilities, Konvertierungsroutinen); alte Implementierungen auskommentiert/geräumt.
- Behebe falsche Aufrufe von lss -> ls in BaseDataset und ParquetDataset (Dateiliste/Partitionserkennung).
- Vereinheitliche String-Quotes/Whitespace und sichere JSON-/brotli-Dekodierung im Metadaten-Reader/Writer.
- Korrigiere Pfad-Strip in ParquetMetadata (split("://")[-1]) für robuste Pfad-Verarbeitung.
- Aktualisiere pyproject.toml auf Version 0.22.0 und passe uv.lock entsprechend (fsspec_utils und Paketversion).
1 parent 5ce5f1c commit 65b3790

File tree

5 files changed

+619
-592
lines changed

5 files changed

+619
-592
lines changed

pydala/dataset.py

Lines changed: 15 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,7 @@ class BaseDataset:
5858
_partitioning (str | list[str]): The partitioning scheme.
5959
_format (str): The file format of the dataset.
6060
"""
61+
6162
def __init__(
6263
self,
6364
path: str,
@@ -142,7 +143,7 @@ def __init__(
142143
if partitioning is None:
143144
# try to infer partitioning
144145
try:
145-
if any(["=" in obj for obj in self.fs.lss(self._path)]):
146+
if any(["=" in obj for obj in self.fs.ls(self._path)]):
146147
partitioning = "hive"
147148
except FileNotFoundError as e:
148149
_ = e
@@ -152,7 +153,6 @@ def __init__(
152153
partitioning = None
153154
self._partitioning = partitioning
154155

155-
156156
try:
157157
self.load()
158158
except FileNotFoundError as e:
@@ -563,7 +563,7 @@ def filter(
563563
the method will automatically use DuckDB for filtering.
564564
565565
"""
566-
#if any([s in filter_expr for s in ["%", "like", "similar to", "*"]]):
566+
# if any([s in filter_expr for s in ["%", "like", "similar to", "*"]]):
567567
# use = "duckdb"
568568

569569
if use == "auto":
@@ -949,7 +949,7 @@ def load(
949949
None
950950
"""
951951
if not self.has_file_metadata_file:
952-
if len(self.fs.lss(self.path)) == 0:
952+
if len(self.fs.ls(self.path)) == 0:
953953
return
954954
else:
955955
update_metadata = True
@@ -1223,7 +1223,6 @@ def __init__(
12231223

12241224

12251225
class CSVDataset(PyarrowDataset):
1226-
12271226
"""A dataset implementation for CSV files.
12281227
12291228
This class provides specialized handling for CSV datasets using
@@ -1432,11 +1431,11 @@ def _compact_partition(
14321431
raise ValueError(f"Invalid partition name: {name}")
14331432
if not validate_partition_value(value):
14341433
raise ValueError(f"Invalid partition value: {value}")
1435-
1434+
14361435
escaped_name = escape_sql_identifier(name)
14371436
escaped_value = escape_sql_literal(value)
14381437
filter_parts.append(f"{escaped_name}={escaped_value}")
1439-
1438+
14401439
filter_ = " AND ".join(filter_parts)
14411440

14421441
scan = self.scan(filter_)
@@ -1569,32 +1568,30 @@ def _compact_by_timeperiod(
15691568
# Securely build timestamp filter to prevent SQL injection
15701569
if timestamp_column is None:
15711570
timestamp_column = self._timestamp_column
1572-
1571+
15731572
if timestamp_column is None:
15741573
raise ValueError("No timestamp column specified or found")
1575-
1574+
15761575
# Validate timestamp column name
1577-
if not re.match(r'^[a-zA-Z_][a-zA-Z0-9_]*$', timestamp_column):
1576+
if not re.match(r"^[a-zA-Z_][a-zA-Z0-9_]*$", timestamp_column):
15781577
raise ValueError(f"Invalid timestamp column name: {timestamp_column}")
1579-
1578+
15801579
# Format dates safely as ISO strings and build filter with proper escaping
15811580
start_date_str = start_date.isoformat()
15821581
end_date_str = end_date.isoformat()
15831582
filter_ = f"{timestamp_column} >= '{start_date_str}' AND {timestamp_column} < '{end_date_str}'"
1584-
1583+
15851584
# Sanitize the filter expression
15861585
filter_ = sanitize_filter_expression(filter_)
1587-
1586+
15881587
scan = self.scan(filter_)
1589-
1588+
15901589
if len(self.scan_files) == 1:
15911590
# Safely escape file path for metadata query
1592-
file_path = self.scan_files[0].replace(self._path, '').lstrip('/')
1591+
file_path = self.scan_files[0].replace(self._path, "").lstrip("/")
15931592
escaped_file_path = file_path.replace("'", "''")
15941593
date_diff = (
1595-
self.metadata_table.filter(
1596-
f"file_path='{escaped_file_path}'"
1597-
)
1594+
self.metadata_table.filter(f"file_path='{escaped_file_path}'")
15981595
.aggregate("max(AE_DATUM.max) - min(AE_DATUM.min)")
15991596
.fetchone()[0]
16001597
)

0 commit comments

Comments (0)