
Commit 6573d3a

Merge branch 'main' into wheels-released-numpy
2 parents 9bdf998 + 0ad2c0d commit 6573d3a

File tree: 10 files changed, +153 −26 lines


ci/code_checks.sh

Lines changed: 0 additions & 1 deletion
@@ -383,7 +383,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
         -i "pandas.tseries.offsets.Week.n GL08" \
         -i "pandas.tseries.offsets.Week.normalize GL08" \
         -i "pandas.tseries.offsets.Week.weekday GL08" \
-        -i "pandas.tseries.offsets.WeekOfMonth SA01" \
         -i "pandas.tseries.offsets.WeekOfMonth.is_on_offset GL08" \
         -i "pandas.tseries.offsets.WeekOfMonth.n GL08" \
         -i "pandas.tseries.offsets.WeekOfMonth.normalize GL08" \

pandas/_libs/tslibs/offsets.pyx

Lines changed: 11 additions & 0 deletions
@@ -3582,6 +3582,11 @@ cdef class WeekOfMonth(WeekOfMonthMixin):
     """
     Describes monthly dates like "the Tuesday of the 2nd week of each month".

+    This offset allows for generating or adjusting dates by specifying
+    a particular week and weekday within a month. The week is zero-indexed,
+    where 0 corresponds to the first week of the month, and weekday follows
+    a Monday=0 convention.
+
     Attributes
     ----------
     n : int, default 1
@@ -3602,6 +3607,12 @@ cdef class WeekOfMonth(WeekOfMonthMixin):
     - 5 is Saturday
     - 6 is Sunday.

+    See Also
+    --------
+    offsets.Week : Describes weekly frequency adjustments.
+    offsets.MonthEnd : Describes month-end frequency adjustments.
+    date_range : Generates a range of dates based on a specific frequency.
+
     Examples
     --------
     >>> ts = pd.Timestamp(2022, 1, 1)
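
As a quick illustration of the convention the new docstring text describes (a minimal sketch mirroring the Examples section that follows in the docstring):

```python
import pandas as pd

# week is zero-indexed and weekday uses Monday=0, so week=0, weekday=0 means
# "the Monday of the first week of the month".
ts = pd.Timestamp(2022, 1, 1)                     # a Saturday
ts + pd.offsets.WeekOfMonth(week=0, weekday=0)    # Timestamp('2022-01-03 00:00:00')
```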

pandas/core/computation/eval.py

Lines changed: 9 additions & 3 deletions
@@ -14,7 +14,10 @@
 from pandas.util._exceptions import find_stack_level
 from pandas.util._validators import validate_bool_kwarg

-from pandas.core.dtypes.common import is_extension_array_dtype
+from pandas.core.dtypes.common import (
+    is_extension_array_dtype,
+    is_string_dtype,
+)

 from pandas.core.computation.engines import ENGINES
 from pandas.core.computation.expr import (
@@ -345,10 +348,13 @@ def eval(
         parsed_expr = Expr(expr, engine=engine, parser=parser, env=env)

         if engine == "numexpr" and (
-            is_extension_array_dtype(parsed_expr.terms.return_type)
+            (
+                is_extension_array_dtype(parsed_expr.terms.return_type)
+                and not is_string_dtype(parsed_expr.terms.return_type)
+            )
             or getattr(parsed_expr.terms, "operand_types", None) is not None
             and any(
-                is_extension_array_dtype(elem)
+                (is_extension_array_dtype(elem) and not is_string_dtype(elem))
                 for elem in parsed_expr.terms.operand_types
             )
         ):
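
The reworked condition exempts string dtypes from the extension-array fallback path in `eval`. A minimal sketch of how the two predicates classify pandas' string dtype, using the public `pandas.api.types` equivalents of the imported helpers:

```python
import pandas as pd
from pandas.api.types import is_extension_array_dtype, is_string_dtype

dtype = pd.StringDtype()
# StringDtype is an extension dtype but also a string dtype, so the combined
# check "is_extension_array_dtype(...) and not is_string_dtype(...)" is False
# and string expressions no longer trip this numexpr guard.
print(is_extension_array_dtype(dtype))  # True
print(is_string_dtype(dtype))           # True
```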

pandas/core/computation/expr.py

Lines changed: 5 additions & 1 deletion
@@ -21,6 +21,8 @@

 from pandas.errors import UndefinedVariableError

+from pandas.core.dtypes.common import is_string_dtype
+
 import pandas.core.common as com
 from pandas.core.computation.ops import (
     ARITH_OPS_SYMS,
@@ -524,10 +526,12 @@ def _maybe_evaluate_binop(
         elif self.engine != "pytables":
             if (
                 getattr(lhs, "return_type", None) == object
+                or is_string_dtype(getattr(lhs, "return_type", None))
                 or getattr(rhs, "return_type", None) == object
+                or is_string_dtype(getattr(rhs, "return_type", None))
             ):
                 # evaluate "==" and "!=" in python if either of our operands
-                # has an object return type
+                # has an object or string return type
                 return self._maybe_eval(res, eval_in_python + maybe_eval_in_python)
         return res
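
Combined with the `eval.py` change above, `==`/`!=` comparisons involving string columns are now evaluated in Python even when the numexpr engine is requested, without the RuntimeWarning the test updates further below used to expect. A minimal sketch of the user-facing behaviour (assumes numexpr is installed):

```python
import pandas as pd

df = pd.DataFrame({"a": list("aabb"), "b": list("abab")})

# Both operands have a string/object return type, so the comparison is quietly
# dispatched to Python-level evaluation; no warning is expected.
res = df.query("a == b", engine="numexpr")
print(res)
```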

pandas/tests/extension/test_sparse.py

Lines changed: 5 additions & 0 deletions
@@ -340,11 +340,16 @@ def test_argmin_argmax_all_na(self, method, data, na_value):
         self._check_unsupported(data)
         super().test_argmin_argmax_all_na(method, data, na_value)

+    @pytest.mark.fails_arm_wheels
     @pytest.mark.parametrize("box", [pd.array, pd.Series, pd.DataFrame])
     def test_equals(self, data, na_value, as_series, box):
         self._check_unsupported(data)
         super().test_equals(data, na_value, as_series, box)

+    @pytest.mark.fails_arm_wheels
+    def test_equals_same_data_different_object(self, data):
+        super().test_equals_same_data_different_object(data)
+
     @pytest.mark.parametrize(
         "func, na_action, expected",
         [

pandas/tests/frame/test_query_eval.py

Lines changed: 6 additions & 18 deletions
@@ -4,8 +4,6 @@
 import numpy as np
 import pytest

-from pandas._config import using_string_dtype
-
 from pandas.errors import (
     NumExprClobberingError,
     UndefinedVariableError,
@@ -762,7 +760,6 @@ def test_inf(self, op, f, engine, parser):
         result = df.query(q, engine=engine, parser=parser)
         tm.assert_frame_equal(result, expected)

-    @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
     def test_check_tz_aware_index_query(self, tz_aware_fixture):
         # https://github.com/pandas-dev/pandas/issues/29463
         tz = tz_aware_fixture
@@ -775,6 +772,7 @@ def test_check_tz_aware_index_query(self, tz_aware_fixture):
         tm.assert_frame_equal(result, expected)

         expected = DataFrame(df_index)
+        expected.columns = expected.columns.astype(object)
         result = df.reset_index().query('"2018-01-03 00:00:00+00" < time')
         tm.assert_frame_equal(result, expected)

@@ -1072,7 +1070,7 @@ def test_query_with_string_columns(self, parser, engine):
         with pytest.raises(NotImplementedError, match=msg):
             df.query("a in b and c < d", parser=parser, engine=engine)

-    def test_object_array_eq_ne(self, parser, engine, using_infer_string):
+    def test_object_array_eq_ne(self, parser, engine):
         df = DataFrame(
             {
                 "a": list("aaaabbbbcccc"),
@@ -1081,14 +1079,11 @@ def test_object_array_eq_ne(self, parser, engine, using_infer_string):
                 "d": np.random.default_rng(2).integers(9, size=12),
             }
         )
-        warning = RuntimeWarning if using_infer_string and engine == "numexpr" else None
-        with tm.assert_produces_warning(warning):
-            res = df.query("a == b", parser=parser, engine=engine)
+        res = df.query("a == b", parser=parser, engine=engine)
         exp = df[df.a == df.b]
         tm.assert_frame_equal(res, exp)

-        with tm.assert_produces_warning(warning):
-            res = df.query("a != b", parser=parser, engine=engine)
+        res = df.query("a != b", parser=parser, engine=engine)
         exp = df[df.a != df.b]
         tm.assert_frame_equal(res, exp)

@@ -1128,15 +1123,13 @@ def test_query_with_nested_special_character(self, parser, engine):
         ],
     )
     def test_query_lex_compare_strings(
-        self, parser, engine, op, func, using_infer_string
+        self, parser, engine, op, func
     ):
         a = Series(np.random.default_rng(2).choice(list("abcde"), 20))
         b = Series(np.arange(a.size))
         df = DataFrame({"X": a, "Y": b})

-        warning = RuntimeWarning if using_infer_string and engine == "numexpr" else None
-        with tm.assert_produces_warning(warning):
-            res = df.query(f'X {op} "d"', engine=engine, parser=parser)
+        res = df.query(f'X {op} "d"', engine=engine, parser=parser)
         expected = df[func(df.X, "d")]
         tm.assert_frame_equal(res, expected)

@@ -1400,15 +1393,13 @@ def test_expr_with_column_name_with_backtick(self):
         expected = df[df["a`b"] < 2]
         tm.assert_frame_equal(result, expected)

-    @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
     def test_expr_with_string_with_backticks(self):
         # GH 59285
         df = DataFrame(("`", "`````", "``````````"), columns=["#backticks"])
         result = df.query("'```' < `#backticks`")
         expected = df["```" < df["#backticks"]]
         tm.assert_frame_equal(result, expected)

-    @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
     def test_expr_with_string_with_backticked_substring_same_as_column_name(self):
         # GH 59285
         df = DataFrame(("`", "`````", "``````````"), columns=["#backticks"])
@@ -1439,7 +1430,6 @@ def test_expr_with_column_names_with_special_characters(self, col1, col2, expr):
         expected = df[df[col1] < df[col2]]
         tm.assert_frame_equal(result, expected)

-    @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
     def test_expr_with_no_backticks(self):
         # GH 59285
         df = DataFrame(("aaa", "vvv", "zzz"), columns=["column_name"])
@@ -1483,15 +1473,13 @@ def test_expr_with_quote_opened_before_backtick_and_quote_is_unmatched(self):
         ):
             df.query("`column-name` < 'It`s that\\'s \"quote\" #hash")

-    @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
     def test_expr_with_quote_opened_before_backtick_and_quote_is_matched_at_end(self):
         # GH 59285
         df = DataFrame(("aaa", "vvv", "zzz"), columns=["column-name"])
         result = df.query("`column-name` < 'It`s that\\'s \"quote\" #hash'")
         expected = df[df["column-name"] < 'It`s that\'s "quote" #hash']
         tm.assert_frame_equal(result, expected)

-    @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
     def test_expr_with_quote_opened_before_backtick_and_quote_is_matched_in_mid(self):
         # GH 59285
         df = DataFrame(("aaa", "vvv", "zzz"), columns=["column-name"])

pandas/tests/series/indexing/test_setitem.py

Lines changed: 10 additions & 2 deletions
@@ -4,13 +4,17 @@
     datetime,
 )
 from decimal import Decimal
+import os

 import numpy as np
 import pytest

 from pandas._config import using_string_dtype

-from pandas.compat import HAS_PYARROW
+from pandas.compat import (
+    HAS_PYARROW,
+    WASM,
+)
 from pandas.compat.numpy import np_version_gte1p24
 from pandas.errors import IndexingError

@@ -1446,7 +1450,11 @@ def obj(self):
                 marks=pytest.mark.xfail(
                     (
                         not np_version_gte1p24
-                        or (np_version_gte1p24 and np._get_promotion_state() != "weak")
+                        or (
+                            np_version_gte1p24
+                            and os.environ.get("NPY_PROMOTION_STATE", "weak") != "weak"
+                        )
+                        or WASM
                     ),
                     reason="np.float32(1.1) ends up as 1.100000023841858, so "
                     "np_can_hold_element raises and we cast to float64",

pandas/tests/series/test_ufunc.py

Lines changed: 4 additions & 1 deletion
@@ -16,7 +16,10 @@ def ufunc(request):
     return request.param


-@pytest.fixture(params=[True, False], ids=["sparse", "dense"])
+@pytest.fixture(
+    params=[pytest.param(True, marks=pytest.mark.fails_arm_wheels), False],
+    ids=["sparse", "dense"],
+)
 def sparse(request):
     return request.param

pyproject.toml

Lines changed: 12 additions & 0 deletions
@@ -162,6 +162,14 @@ before-build = "bash {package}/scripts/cibw_before_build.sh"
 before-build = "pip install delvewheel && bash {package}/scripts/cibw_before_build.sh"
 repair-wheel-command = "delvewheel repair -w {dest_dir} {wheel}"

+[[tool.cibuildwheel.overrides]]
+select = "*-manylinux_aarch64*"
+test-command = """
+PANDAS_CI='1' python -c 'import pandas as pd; \
+pd.test(extra_args=["-m not clipboard and not single_cpu and not slow and not network and not db and not fails_arm_wheels", "-n 2", "--no-strict-data-files"]); \
+pd.test(extra_args=["-m not clipboard and single_cpu and not slow and not network and not db", "--no-strict-data-files"]);' \
+"""
+
 [[tool.cibuildwheel.overrides]]
 select = "*-musllinux*"
 before-test = "apk update && apk add musl-locales"
@@ -477,6 +485,10 @@ markers = [
   "clipboard: mark a pd.read_clipboard test",
   "arm_slow: mark a test as slow for arm64 architecture",
   "skip_ubsan: Tests known to fail UBSAN check",
+  # TODO: someone should investigate this ...
+  # these tests only fail in the wheel builder and don't fail in regular
+  # ARM CI
+  "fails_arm_wheels: Tests that fail in the ARM wheel build only",
 ]

 [tool.mypy]
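
The new `fails_arm_wheels` marker is deselected only in the aarch64 wheel `test-command` above; regular ARM CI still runs those tests. A hedged sketch of reproducing the same deselection locally via `pd.test` (marker expression trimmed for brevity):

```python
import pandas as pd

# Skip the tests known to fail only in the ARM wheel builds, mirroring the
# "-m not ... fails_arm_wheels" selection in the cibuildwheel override.
pd.test(extra_args=[
    "-m not fails_arm_wheels and not slow and not network and not db",
    "--no-strict-data-files",
])
```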

web/pandas/community/ecosystem.md

Lines changed: 91 additions & 0 deletions
@@ -367,6 +367,97 @@ pandas-gbq provides high performance reads and writes to and from
 these methods were exposed as `pandas.read_gbq` and `DataFrame.to_gbq`.
 Use `pandas_gbq.read_gbq` and `pandas_gbq.to_gbq`, instead.

+
+### [ArcticDB](https://github.com/man-group/ArcticDB)
+
+ArcticDB is a serverless DataFrame database engine designed for the Python Data Science ecosystem. ArcticDB enables you to store, retrieve, and process pandas DataFrames at scale. It is a storage engine designed for object storage and also supports local-disk storage using LMDB. ArcticDB requires zero additional infrastructure beyond a running Python environment and access to object storage and can be installed in seconds. Please find full documentation [here](https://docs.arcticdb.io/latest/).
+
+#### ArcticDB Terminology
+
+ArcticDB is structured to provide a scalable and efficient way to manage and retrieve DataFrames, organized into several key components:
+
+- `Object Store` Collections of libraries. Used to separate logical environments from each other. Analogous to a database server.
+- `Library` Contains multiple symbols which are grouped in a certain way (different users, markets, etc). Analogous to a database.
+- `Symbol` Atomic unit of data storage. Identified by a string name. Data stored under a symbol strongly resembles a pandas DataFrame. Analogous to tables.
+- `Version` Every modifying action (write, append, update) performed on a symbol creates a new version of that object.
+
+#### Installation
+
+To install, simply run:
+
+```console
+pip install arcticdb
+```
+
+To get started, we can import ArcticDB and instantiate it:
+
+```python
+import arcticdb as adb
+import numpy as np
+import pandas as pd
+# this will set up the storage using the local file system
+arctic = adb.Arctic("lmdb://arcticdb_test")
+```
+
+> **Note:** ArcticDB supports any S3 API compatible storage, including AWS. ArcticDB also supports Azure Blob storage.
+> ArcticDB also supports LMDB for local/file based storage - to use LMDB, pass an LMDB path as the URI: `adb.Arctic('lmdb://path/to/desired/database')`.
+
+#### Library Setup
+
+ArcticDB is geared towards storing many (potentially millions) of tables. Individual tables (DataFrames) are called symbols and are stored in collections called libraries. A single library can store many symbols. Libraries must first be initialized prior to use:
+
+```python
+lib = arctic.get_library('sample', create_if_missing=True)
+```
+
+#### Writing Data to ArcticDB
+
+Now we have a library set up, we can get to reading and writing data. ArcticDB has a set of simple functions for DataFrame storage. Let's write a DataFrame to storage.
+
+```python
+df = pd.DataFrame(
+    {
+        "a": list("abc"),
+        "b": list(range(1, 4)),
+        "c": np.arange(3, 6).astype("u1"),
+        "d": np.arange(4.0, 7.0, dtype="float64"),
+        "e": [True, False, True],
+        "f": pd.date_range("20130101", periods=3)
+    }
+)
+
+df
+df.dtypes
+```
+
+Write to ArcticDB.
+
+```python
+write_record = lib.write("test", df)
+```
+
+> **Note:** When writing pandas DataFrames, ArcticDB supports the following index types:
+>
+> - `pandas.Index` containing int64 (or the corresponding dedicated types Int64Index, UInt64Index)
+> - `RangeIndex`
+> - `DatetimeIndex`
+> - `MultiIndex` composed of above supported types
+>
+> The "row" concept in `head`/`tail` refers to the row number ('iloc'), not the value in the `pandas.Index` ('loc').
+
+#### Reading Data from ArcticDB
+
+Read the data back from storage:
+
+```python
+read_record = lib.read("test")
+read_record.data
+df.dtypes
+```
+
+ArcticDB also supports appending, updating, and querying data from storage to a pandas DataFrame. Please find more information [here](https://docs.arcticdb.io/latest/api/query_builder/).
+
+
 ## Out-of-core

 ### [Bodo](https://bodo.ai/)
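
The new ecosystem entry closes by noting that ArcticDB also supports appending, updating, and querying. A hedged sketch of the query part, continuing the entry's own LMDB example (the `QueryBuilder` usage is an assumption based on ArcticDB's documented query API):

```python
import arcticdb as adb

# Continues the entry's example: local LMDB storage, the "sample" library,
# and the "test" symbol written above.
arctic = adb.Arctic("lmdb://arcticdb_test")
lib = arctic.get_library("sample", create_if_missing=True)

# Push a simple filter down to storage instead of filtering in memory.
q = adb.QueryBuilder()
q = q[q["b"] > 1]                     # keep rows where column "b" > 1
filtered = lib.read("test", query_builder=q).data
print(filtered)
```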
