Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions .github/workflows/build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -68,10 +68,11 @@ jobs:
comment: 'Please commit the suggested changes from markdownlint.'
event: 'REQUEST_CHANGES'

- name: "Run Type Checking"
- name: "Run Type Checking (Optional to passing)"
if: always()
run: poe types

continue-on-error: true

- name: "Generate Docs"
if: matrix.python-version == '3.9' && github.ref == 'refs/heads/master'
run: poe docs --ci
Expand Down
5 changes: 5 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,11 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [Unreleased]

### Fixed
- Fixed handling of datetime columns in old pandas versions. (#609)

## [6.0.0] - 2025-11-26

### Added
Expand Down
14 changes: 7 additions & 7 deletions azure-kusto-data/azure/kusto/data/helpers.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,17 @@
import json
from functools import lru_cache
from pathlib import Path
from typing import TYPE_CHECKING, Union, Callable, Dict, Optional
from typing import TYPE_CHECKING, Any, Union, Callable, Optional

if TYPE_CHECKING:
import pandas as pd
from azure.kusto.data._models import KustoResultTable, KustoStreamingResultTable

# Alias for dataframe_from_result_table converter type
Converter = Dict[str, Union[str, Callable[[str, "pd.DataFrame"], "pd.Series"]]]
Converter = dict[str, Union[str, Callable[[str, "pd.DataFrame"], "pd.Series['Any']"]]]


def load_bundled_json(file_name: str) -> Dict:
def load_bundled_json(file_name: str) -> dict[Any, Any]:
filename = Path(__file__).absolute().parent.joinpath(file_name)
with filename.open("r", encoding="utf-8") as data:
return json.load(data)
Expand Down Expand Up @@ -118,18 +118,18 @@ def parse_float(frame, col):
return frame[col]


def parse_datetime(frame, col):
def parse_datetime(frame, col, force_version: Optional[str] = None):
# Pandas before version 2 doesn't support the "format" arg
import pandas as pd

args = {}
if pd.__version__.startswith("2."):
if (force_version or pd.__version__).startswith("2."):
args = {"format": "ISO8601", "utc": True}
else:
# if frame contains ".", replace "Z" with ".000Z"
# == False is not a mistake - that's the pandas way to do it
contains_dot = frame[col].str.contains(".")
frame.loc[not contains_dot, col] = frame.loc[not contains_dot, col].str.replace("Z", ".000Z")
contains_dot = frame[col].str.contains("\\.")
frame.loc[~contains_dot, col] = frame.loc[~contains_dot, col].str.replace("Z", ".000Z")
frame[col] = pd.to_datetime(frame[col], errors="coerce", **args)
return frame[col]

Expand Down
42 changes: 42 additions & 0 deletions azure-kusto-data/tests/test_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,3 +128,45 @@

assert df["Date"][0] == pandas.Timestamp(year=2023, month=12, day=12, hour=1, minute=59, second=59, microsecond=352000, tzinfo=datetime.timezone.utc)
assert df["Date"][1] == pandas.Timestamp(year=2023, month=12, day=12, hour=1, minute=54, second=44, tzinfo=datetime.timezone.utc)


def test_parse_datetime():
"""Test parse_datetime function with different pandas versions and datetime formats"""
from azure.kusto.data.helpers import parse_datetime

Check warning on line 135 in azure-kusto-data/tests/test_helpers.py

View workflow job for this annotation

GitHub Actions / build (3.11)

Type of "parse_datetime" is partially unknown   Type of "parse_datetime" is "(frame: Unknown, col: Unknown, force_version: str | None = None) -> Unknown" (reportUnknownVariableType)

Check warning on line 135 in azure-kusto-data/tests/test_helpers.py

View workflow job for this annotation

GitHub Actions / build (3.12)

Type of "parse_datetime" is partially unknown   Type of "parse_datetime" is "(frame: Unknown, col: Unknown, force_version: str | None = None) -> Unknown" (reportUnknownVariableType)

Check warning on line 135 in azure-kusto-data/tests/test_helpers.py

View workflow job for this annotation

GitHub Actions / build (3.10)

Type of "parse_datetime" is partially unknown   Type of "parse_datetime" is "(frame: Unknown, col: Unknown, force_version: str | None = None) -> Unknown" (reportUnknownVariableType)

Check warning on line 135 in azure-kusto-data/tests/test_helpers.py

View workflow job for this annotation

GitHub Actions / build (3.9)

Type of "parse_datetime" is partially unknown   Type of "parse_datetime" is "(frame: Unknown, col: Unknown, force_version: str | None = None) -> Unknown" (reportUnknownVariableType)

Check warning on line 135 in azure-kusto-data/tests/test_helpers.py

View workflow job for this annotation

GitHub Actions / build (3.13)

Type of "parse_datetime" is partially unknown   Type of "parse_datetime" is "(frame: Unknown, col: Unknown, force_version: str | None = None) -> Unknown" (reportUnknownVariableType)

# Test with pandas v2 behavior (force version 2)
df_v2 = pandas.DataFrame(
{
"date_with_ms": ["2023-12-12T01:59:59.352Z", "2023-12-12T01:54:44.123Z"],
"date_without_ms": ["2023-12-12T01:59:59Z", "2023-12-12T01:54:44Z"],
"mixed": ["2023-12-12T01:59:59.352Z", "2023-12-12T01:54:44Z"],
}
)

# Force pandas v2 behavior
result_v2 = parse_datetime(df_v2, "mixed", force_version="2.0.0")

Check warning on line 147 in azure-kusto-data/tests/test_helpers.py

View workflow job for this annotation

GitHub Actions / build (3.9)

Type of "result_v2" is partially unknown   Type of "result_v2" is "Series[Unknown]" (reportUnknownVariableType)
assert result_v2[0] == pandas.Timestamp(year=2023, month=12, day=12, hour=1, minute=59, second=59, microsecond=352000, tz="UTC")
assert result_v2[1] == pandas.Timestamp(year=2023, month=12, day=12, hour=1, minute=54, second=44, tz="UTC")
# Test with pandas v1 behavior (force version 1)

df_v1 = pandas.DataFrame(
{
"date_with_ms": ["2023-12-12T01:59:59.352Z", "2023-12-12T01:54:44.123Z"],
"date_without_ms": ["2023-12-12T01:59:59Z", "2023-12-12T01:54:44Z"],
"mixed": ["2023-12-12T01:59:59.352Z", "2023-12-12T01:54:44Z"],
}
)

# Force pandas v1 behavior - it should add .000 to dates without milliseconds
result_v1 = parse_datetime(df_v1, "mixed", force_version="1.5.3")

Check warning on line 161 in azure-kusto-data/tests/test_helpers.py

View workflow job for this annotation

GitHub Actions / build (3.9)

Type of "result_v1" is partially unknown   Type of "result_v1" is "Series[Unknown]" (reportUnknownVariableType)
assert result_v1[0] == pandas.Timestamp(year=2023, month=12, day=12, hour=1, minute=59, second=59, microsecond=352000, tz="UTC")
assert result_v1[1] == pandas.Timestamp(year=2023, month=12, day=12, hour=1, minute=54, second=44, tz="UTC")
# Test with actual pandas version (no force)
df_actual = pandas.DataFrame(
{
"mixed": ["2023-12-12T01:59:59.352Z", "2023-12-12T01:54:44Z"],
}
)
result_actual = parse_datetime(df_actual, "mixed")

Check warning on line 170 in azure-kusto-data/tests/test_helpers.py

View workflow job for this annotation

GitHub Actions / build (3.9)

Type of "result_actual" is partially unknown   Type of "result_actual" is "Series[Unknown]" (reportUnknownVariableType)
assert result_actual[0] == pandas.Timestamp(year=2023, month=12, day=12, hour=1, minute=59, second=59, microsecond=352000, tz="UTC")
assert result_actual[1] == pandas.Timestamp(year=2023, month=12, day=12, hour=1, minute=54, second=44, tz="UTC")
16 changes: 16 additions & 0 deletions azure-kusto-ingest/tests/test_e2e_ingest.py
Original file line number Diff line number Diff line change
Expand Up @@ -256,6 +256,7 @@ async def assert_rows_added(cls, expected: int, timeout=60):
assert actual == expected, "Row count expected = {0}, while actual row count = {1}".format(expected, actual)

@pytest.mark.asyncio
@pytest.mark.xdist_group(name="ingest_group")
async def test_csv_ingest_existing_table(self, is_managed_streaming):
csv_ingest_props = IngestionProperties(
self.test_db,
Expand All @@ -274,6 +275,7 @@ async def test_csv_ingest_existing_table(self, is_managed_streaming):
await self.assert_rows_added(20)

@pytest.mark.asyncio
@pytest.mark.xdist_group(name="ingest_group")
async def test_csv_ingest_ignore_first_record(self, is_managed_streaming):
csv_ingest_props = IngestionProperties(
self.test_db,
Expand All @@ -291,6 +293,7 @@ async def test_csv_ingest_ignore_first_record(self, is_managed_streaming):
await self.assert_rows_added(18)

@pytest.mark.asyncio
@pytest.mark.xdist_group(name="ingest_group")
async def test_json_ingest_existing_table(self):
json_ingestion_props = IngestionProperties(
self.test_db,
Expand All @@ -307,6 +310,7 @@ async def test_json_ingest_existing_table(self):
await self.assert_rows_added(4)

@pytest.mark.asyncio
@pytest.mark.xdist_group(name="ingest_group")
async def test_json_ingest_existing_table_no_mapping(self):
json_ingestion_props = IngestionProperties(
self.test_db,
Expand All @@ -322,6 +326,7 @@ async def test_json_ingest_existing_table_no_mapping(self):
await self.assert_rows_added(4)

@pytest.mark.asyncio
@pytest.mark.xdist_group(name="ingest_group")
async def test_ingest_complicated_props(self):
validation_policy = ValidationPolicy(
validation_options=ValidationOptions.ValidateCsvInputConstantColumns, validation_implications=ValidationImplications.Fail
Expand Down Expand Up @@ -350,6 +355,7 @@ async def test_ingest_complicated_props(self):
await self.assert_rows_added(4)

@pytest.mark.asyncio
@pytest.mark.xdist_group(name="ingest_group")
async def test_ingest_from_stream(self, is_managed_streaming):
validation_policy = ValidationPolicy(
validation_options=ValidationOptions.ValidateCsvInputConstantColumns, validation_implications=ValidationImplications.Fail
Expand Down Expand Up @@ -379,6 +385,7 @@ async def test_ingest_from_stream(self, is_managed_streaming):
await self.assert_rows_added(4)

@pytest.mark.asyncio
@pytest.mark.xdist_group(name="ingest_group")
async def test_json_ingestion_ingest_by_tag(self):
json_ingestion_props = IngestionProperties(
self.test_db,
Expand All @@ -397,6 +404,7 @@ async def test_json_ingestion_ingest_by_tag(self):
await self.assert_rows_added(0)

@pytest.mark.asyncio
@pytest.mark.xdist_group(name="ingest_group")
async def test_tsv_ingestion_csv_mapping(self):
tsv_ingestion_props = IngestionProperties(
self.test_db,
Expand All @@ -412,6 +420,7 @@ async def test_tsv_ingestion_csv_mapping(self):
await self.assert_rows_added(10)

@pytest.mark.asyncio
@pytest.mark.xdist_group(name="ingest_group")
async def test_ingest_blob(self):
if not self.test_blob:
pytest.skip("Provide blob SAS uri with 'dataset.csv'")
Expand All @@ -436,6 +445,7 @@ async def test_ingest_blob(self):
await self.assert_rows_added(10)

@pytest.mark.asyncio
@pytest.mark.xdist_group(name="ingest_group")
async def test_streaming_ingest_from_opened_file(self, is_managed_streaming):
ingestion_properties = IngestionProperties(database=self.test_db, table=self.test_table, data_format=DataFormat.CSV)

Expand All @@ -446,6 +456,7 @@ async def test_streaming_ingest_from_opened_file(self, is_managed_streaming):
await self.assert_rows_added(10, timeout=120)

@pytest.mark.asyncio
@pytest.mark.xdist_group(name="ingest_group")
async def test_streaming_ingest_from_csv_file(self):
ingestion_properties = IngestionProperties(database=self.test_db, table=self.test_table, flush_immediately=True, data_format=DataFormat.CSV)

Expand All @@ -455,6 +466,7 @@ async def test_streaming_ingest_from_csv_file(self):
await self.assert_rows_added(20, timeout=120)

@pytest.mark.asyncio
@pytest.mark.xdist_group(name="ingest_group")
async def test_streaming_ingest_from_json_file(self):
ingestion_properties = IngestionProperties(
database=self.test_db,
Expand All @@ -471,6 +483,7 @@ async def test_streaming_ingest_from_json_file(self):
await self.assert_rows_added(4, timeout=120)

@pytest.mark.asyncio
@pytest.mark.xdist_group(name="ingest_group")
async def test_streaming_ingest_from_csv_io_streams(self):
ingestion_properties = IngestionProperties(database=self.test_db, table=self.test_table, data_format=DataFormat.CSV)
byte_sequence = b'0,00000000-0000-0000-0001-020304050607,0,0,0,0,0,0,0,0,0,0,2014-01-01T01:01:01.0000000Z,Zero,"Zero",0,00:00:00,,null'
Expand All @@ -484,6 +497,7 @@ async def test_streaming_ingest_from_csv_io_streams(self):
await self.assert_rows_added(2, timeout=120)

@pytest.mark.asyncio
@pytest.mark.xdist_group(name="ingest_group")
async def test_streaming_ingest_from_json_io_streams(self):
ingestion_properties = IngestionProperties(
database=self.test_db,
Expand All @@ -505,6 +519,7 @@ async def test_streaming_ingest_from_json_io_streams(self):
await self.assert_rows_added(2, timeout=120)

@pytest.mark.asyncio
@pytest.mark.xdist_group(name="ingest_group")
async def test_streaming_ingest_from_dataframe(self):
from pandas import DataFrame

Expand Down Expand Up @@ -544,6 +559,7 @@ async def test_streaming_ingest_from_dataframe(self):
assert a.primary_results[0].rows[0]["xdynamicWithNulls"] == dynamic_value

@pytest.mark.asyncio
@pytest.mark.xdist_group(name="ingest_group")
async def test_streaming_ingest_from_blob(self, is_managed_streaming):
ingestion_properties = IngestionProperties(
database=self.test_db,
Expand Down
6 changes: 5 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ dev = [
"ruff>=0.12.9",
"pdoc>=15.0.4",
"basedpyright>=1.31.2",
"pandas-stubs>=2.2.2.240807",
]

[tool.uv.workspace]
Expand Down Expand Up @@ -80,4 +81,7 @@ args = { bump = { help = "Bump version [possible values: major, minor, patch, st

[tool.basedpyright]
pythonVersion = "3.9"
reportUnnecessaryTypeIgnoreComment = false
reportUnnecessaryTypeIgnoreComment = false
reportExplicitAny = false
reportAny = false
reportUnknownMemberType = false
55 changes: 51 additions & 4 deletions uv.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading