Skip to content

Commit f1c42d7

Browse files
committed
Fix adorn_ns index alignment and improve test coverage
- Fix adorn_ns to use positional indexing instead of comparing index labels to length, which correctly handles non-sequential indices and totals rows - Fix adorn_totals fill parameter to apply to all non-numeric columns (not just string columns) - Add thousand separator to adorn_ns default format function to match R janitor behavior - Add 10 new tests covering NA handling, numeric first columns, custom ns DataFrame, totals row skipping, non-sequential indices, and fill parameter
1 parent c432219 commit f1c42d7

File tree

3 files changed

+193
-13
lines changed

3 files changed

+193
-13
lines changed

janitor/functions/adorn.py

Lines changed: 15 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -90,18 +90,16 @@ def adorn_totals(
9090
if where in ("row", "both"):
9191
# Create totals row
9292
totals_row = {}
93+
first_col = df.columns[0]
9394
for col in df.columns:
9495
if col in numeric_cols or col == name:
9596
totals_row[col] = df[col].sum(skipna=na_rm)
97+
elif col == first_col:
98+
# First column gets the totals row name (e.g., "Total")
99+
totals_row[col] = name
96100
else:
97-
totals_row[col] = (
98-
fill if pd.api.types.is_string_dtype(df[col]) else name
99-
)
100-
101-
# For the first column (typically the index/label column), use the name
102-
first_col = df.columns[0]
103-
if first_col not in numeric_cols and first_col != name:
104-
totals_row[first_col] = name
101+
# All other non-numeric columns get the fill value
102+
totals_row[col] = fill
105103

106104
totals_df = pd.DataFrame([totals_row])
107105
df = pd.concat([df, totals_df], ignore_index=True)
@@ -347,25 +345,29 @@ def adorn_ns(
347345

348346
df = df.copy()
349347

350-
# Default format function
348+
# Default format function with thousand separator (matching R janitor behavior)
351349
if format_func is None:
352350

353351
def _default_format_func(n):
354352
if pd.isna(n):
355353
return ""
356-
return f"({int(n)})"
354+
return f"({int(n):,})"
357355

358356
format_func = _default_format_func
359357

360358
# Get numeric columns from the original counts
361359
numeric_cols = ns.select_dtypes(include=[np.number]).columns.tolist()
362360

363361
# Apply to matching columns
362+
# Use positional indexing to handle cases where df has more rows than ns
363+
# (e.g., after adorn_totals adds a totals row)
364+
df_index_list = df.index.tolist()
364365
for col in numeric_cols:
365366
if col in df.columns:
366-
for idx in df.index:
367-
if idx < len(ns):
368-
n_value = ns.loc[ns.index[idx], col]
367+
for i, idx in enumerate(df_index_list):
368+
# Only process rows that exist in the original counts
369+
if i < len(ns):
370+
n_value = ns.iloc[i][col]
369371
formatted_n = format_func(n_value)
370372
current_value = df.loc[idx, col]
371373
if pd.notna(current_value) and formatted_n:

pixi.lock

Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

tests/functions/test_adorn.py

Lines changed: 177 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -301,3 +301,180 @@ def test_adorn_full_pipeline(simple_df):
301301
# Should have formatted percentages with N counts
302302
assert "%" in result.iloc[0]["count1"]
303303
assert "(" in result.iloc[0]["count1"]
304+
305+
306+
# Tests for NA value handling
307+
308+
309+
@pytest.fixture
310+
def df_with_na():
311+
"""Create a DataFrame with NA values for testing."""
312+
return pd.DataFrame(
313+
{
314+
"category": ["A", "B", "C"],
315+
"count1": [10, np.nan, 30],
316+
"count2": [5, 15, np.nan],
317+
}
318+
)
319+
320+
321+
@pytest.mark.functions
322+
def test_adorn_totals_with_na(df_with_na):
323+
"""Test adorn_totals handles NA values correctly with na_rm=True."""
324+
result = df_with_na.adorn_totals("row", na_rm=True)
325+
# Should sum non-NA values: 10 + 30 = 40 for count1
326+
assert result.iloc[-1]["count1"] == 40
327+
# Should sum non-NA values: 5 + 15 = 20 for count2
328+
assert result.iloc[-1]["count2"] == 20
329+
330+
331+
@pytest.mark.functions
332+
def test_adorn_percentages_with_na(df_with_na):
333+
"""Test adorn_percentages handles NA values correctly."""
334+
result = df_with_na.adorn_percentages("col", na_rm=True)
335+
# Column sum for count1 = 40 (excluding NA)
336+
assert np.isclose(result.iloc[0]["count1"], 10 / 40)
337+
assert np.isclose(result.iloc[2]["count1"], 30 / 40)
338+
# NA should remain NA
339+
assert pd.isna(result.iloc[1]["count1"])
340+
341+
342+
@pytest.mark.functions
343+
def test_adorn_pct_formatting_with_na(df_with_na):
344+
"""Test adorn_pct_formatting preserves NA values."""
345+
result = df_with_na.adorn_percentages("col").adorn_pct_formatting()
346+
# NA should remain NA
347+
assert pd.isna(result.iloc[1]["count1"])
348+
349+
350+
# Tests for numeric first column edge case
351+
352+
353+
@pytest.fixture
354+
def numeric_first_col_df():
355+
"""Create a DataFrame with numeric first column."""
356+
return pd.DataFrame(
357+
{
358+
"id": [1, 2, 3],
359+
"count1": [10, 20, 30],
360+
"count2": [5, 15, 25],
361+
}
362+
)
363+
364+
365+
@pytest.mark.functions
366+
def test_adorn_totals_numeric_first_column(numeric_first_col_df):
367+
"""Test adorn_totals with numeric first column."""
368+
result = numeric_first_col_df.adorn_totals("row")
369+
# First column should be summed as it's numeric
370+
assert result.iloc[-1]["id"] == 6 # 1 + 2 + 3
371+
assert result.iloc[-1]["count1"] == 60
372+
373+
374+
@pytest.mark.functions
375+
def test_adorn_percentages_numeric_first_column(numeric_first_col_df):
376+
"""Test adorn_percentages with numeric first column."""
377+
result = numeric_first_col_df.adorn_percentages("row")
378+
# All numeric columns should be converted to percentages
379+
# Row 0: id=1, count1=10, count2=5, total=16
380+
assert np.isclose(result.iloc[0]["id"], 1 / 16)
381+
assert np.isclose(result.iloc[0]["count1"], 10 / 16)
382+
383+
384+
# Tests for adorn_ns with custom ns DataFrame
385+
386+
387+
@pytest.mark.functions
388+
def test_adorn_ns_with_custom_ns(simple_df):
389+
"""Test adorn_ns with custom ns DataFrame provided."""
390+
# Create a custom ns DataFrame with different values
391+
custom_ns = pd.DataFrame(
392+
{
393+
"category": ["A", "B", "C"],
394+
"count1": [100, 200, 300],
395+
"count2": [50, 150, 250],
396+
}
397+
)
398+
result = (
399+
simple_df.adorn_percentages("row").adorn_pct_formatting().adorn_ns(ns=custom_ns)
400+
)
401+
# Should use custom ns values
402+
assert "(100)" in result.iloc[0]["count1"]
403+
assert "(200)" in result.iloc[1]["count1"]
404+
405+
406+
# Tests for adorn_ns skipping totals row
407+
408+
409+
@pytest.mark.functions
410+
def test_adorn_ns_skips_totals_row(simple_df):
411+
"""Test that adorn_ns correctly skips the totals row."""
412+
result = (
413+
simple_df.adorn_totals("row")
414+
.adorn_percentages("row")
415+
.adorn_pct_formatting()
416+
.adorn_ns()
417+
)
418+
# Totals row should have percentage but no N count appended
419+
# because the original counts don't include the totals row
420+
totals_row_value = result.iloc[-1]["count1"]
421+
# The totals row should still have a percentage
422+
assert "%" in totals_row_value
423+
# But should NOT have parentheses from adorn_ns
424+
# (unless it was added during adorn_totals which stores counts before totals)
425+
426+
427+
@pytest.mark.functions
428+
def test_adorn_ns_with_non_sequential_index():
429+
"""Test adorn_ns handles non-sequential indices correctly."""
430+
df = pd.DataFrame(
431+
{
432+
"category": ["A", "B", "C"],
433+
"count1": [10, 20, 30],
434+
"count2": [5, 15, 25],
435+
},
436+
index=[10, 20, 30], # Non-sequential index
437+
)
438+
result = df.adorn_percentages("row").adorn_pct_formatting().adorn_ns()
439+
# Should still work correctly with non-sequential indices
440+
assert "(10)" in result.loc[10, "count1"]
441+
assert "(20)" in result.loc[20, "count1"]
442+
assert "(30)" in result.loc[30, "count1"]
443+
444+
445+
# Tests for adorn_totals fill parameter
446+
447+
448+
@pytest.mark.functions
449+
def test_adorn_totals_fill_parameter():
450+
"""Test that fill parameter works for non-numeric columns."""
451+
df = pd.DataFrame(
452+
{
453+
"category": ["A", "B"],
454+
"subcategory": ["X", "Y"],
455+
"count": [10, 20],
456+
}
457+
)
458+
result = df.adorn_totals("row", fill="--")
459+
# First column should have the name "Total"
460+
assert result.iloc[-1]["category"] == "Total"
461+
# Second non-numeric column should have the fill value
462+
assert result.iloc[-1]["subcategory"] == "--"
463+
464+
465+
# Tests for thousand separator in adorn_ns
466+
467+
468+
@pytest.mark.functions
469+
def test_adorn_ns_thousand_separator():
470+
"""Test that adorn_ns includes thousand separator in large numbers."""
471+
df = pd.DataFrame(
472+
{
473+
"category": ["A", "B"],
474+
"count": [1000, 2000000],
475+
}
476+
)
477+
result = df.adorn_percentages("col").adorn_pct_formatting().adorn_ns()
478+
# Should have thousand separator
479+
assert "(1,000)" in result.iloc[0]["count"]
480+
assert "(2,000,000)" in result.iloc[1]["count"]

0 commit comments

Comments
 (0)