Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
# readxl (development version)

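* `read_excel()`, `read_xls()`, and `read_xlsx()` gain an `extract_colors` argument that returns cell background colors as additional `_bg` columns, which is useful when categories are coded as background color (e.g., `bad_data <- read_excel(file, sheet = "bad", extract_colors = TRUE)`) (@pachadotdev).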

# readxl 1.4.5

This release contains no user-facing changes.
Expand Down
8 changes: 4 additions & 4 deletions R/cpp11.R
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
# Generated by cpp11: do not edit by hand

read_xls_ <- function(path, sheet_i, limits, shim, col_names, col_types, na, trim_ws, guess_max, progress) {
.Call(`_readxl_read_xls_`, path, sheet_i, limits, shim, col_names, col_types, na, trim_ws, guess_max, progress)
read_xls_ <- function(path, sheet_i, limits, shim, col_names, col_types, na, trim_ws, guess_max, progress, extract_colors) {
.Call(`_readxl_read_xls_`, path, sheet_i, limits, shim, col_names, col_types, na, trim_ws, guess_max, progress, extract_colors)
}

read_xlsx_ <- function(path, sheet_i, limits, shim, col_names, col_types, na, trim_ws, guess_max, progress) {
.Call(`_readxl_read_xlsx_`, path, sheet_i, limits, shim, col_names, col_types, na, trim_ws, guess_max, progress)
read_xlsx_ <- function(path, sheet_i, limits, shim, col_names, col_types, na, trim_ws, guess_max, progress, extract_colors) {
.Call(`_readxl_read_xlsx_`, path, sheet_i, limits, shim, col_names, col_types, na, trim_ws, guess_max, progress, extract_colors)
}

xls_sheets <- function(path) {
Expand Down
21 changes: 13 additions & 8 deletions R/read_excel.R
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,10 @@ NULL
#' @param .name_repair Handling of column names. Passed along to
#' [tibble::as_tibble()]. readxl's default is `.name_repair = "unique"`, which
#' ensures column names are not empty and are unique.
#' @param extract_colors Logical. If `TRUE`, extracts cell background colors
#' and returns them in additional character columns named with a "_bg"
#' suffix. Default is `FALSE`. A color column is added only for data columns
#' in which at least one cell has a background color; cells with no
#' background color are `NA` in that column.
#' @return A [tibble][tibble::tibble-package]
#' @seealso [cell-specification] for more details on targeting cells with the
#' `range` argument
Expand Down Expand Up @@ -122,7 +126,7 @@ read_excel <- function(path, sheet = NULL, range = NULL,
na = "", trim_ws = TRUE, skip = 0, n_max = Inf,
guess_max = min(1000, n_max),
progress = readxl_progress(),
.name_repair = "unique") {
.name_repair = "unique", extract_colors = FALSE) {
path <- check_file(path)
format <- check_format(path)
read_excel_(
Expand All @@ -132,7 +136,7 @@ read_excel <- function(path, sheet = NULL, range = NULL,
n_max = n_max, guess_max = guess_max,
progress = progress,
.name_repair = .name_repair,
format = format
format = format, extract_colors = extract_colors
)
}

Expand All @@ -147,7 +151,7 @@ read_xls <- function(path, sheet = NULL, range = NULL,
na = "", trim_ws = TRUE, skip = 0, n_max = Inf,
guess_max = min(1000, n_max),
progress = readxl_progress(),
.name_repair = "unique") {
.name_repair = "unique", extract_colors = FALSE) {
path <- check_file(path)
read_excel_(
path = path, sheet = sheet, range = range,
Expand All @@ -156,7 +160,7 @@ read_xls <- function(path, sheet = NULL, range = NULL,
n_max = n_max, guess_max = guess_max,
progress = progress,
.name_repair = .name_repair,
format = "xls"
format = "xls", extract_colors = extract_colors
)
}

Expand All @@ -167,7 +171,7 @@ read_xlsx <- function(path, sheet = NULL, range = NULL,
na = "", trim_ws = TRUE, skip = 0, n_max = Inf,
guess_max = min(1000, n_max),
progress = readxl_progress(),
.name_repair = "unique") {
.name_repair = "unique", extract_colors = FALSE) {
path <- check_file(path)
read_excel_(
path = path, sheet = sheet, range = range,
Expand All @@ -176,7 +180,7 @@ read_xlsx <- function(path, sheet = NULL, range = NULL,
n_max = n_max, guess_max = guess_max,
progress = progress,
.name_repair = .name_repair,
format = "xlsx"
format = "xlsx", extract_colors = extract_colors
)
}

Expand All @@ -186,7 +190,7 @@ read_excel_ <- function(path, sheet = NULL, range = NULL,
guess_max = min(1000, n_max),
progress = readxl_progress(),
.name_repair = NULL,
format) {
format, extract_colors = FALSE) {
if (format == "xls") {
sheets_fun <- xls_sheets
read_fun <- read_xls_
Expand All @@ -204,13 +208,14 @@ read_excel_ <- function(path, sheet = NULL, range = NULL,
guess_max <- check_guess_max(guess_max)
trim_ws <- check_bool(trim_ws, "trim_ws")
progress <- check_bool(progress, "progress")
extract_colors <- check_bool(extract_colors, "extract_colors")
set_readxl_names(
read_fun(
path = path, sheet_i = sheet,
limits = limits, shim = shim,
col_names = col_names, col_types = col_types,
na = na, trim_ws = trim_ws, guess_max = guess_max,
progress = progress
progress = progress, extract_colors = extract_colors
),
.name_repair = .name_repair
)
Expand Down
Binary file added inst/extdata/gapminder-2007.xlsx
Binary file not shown.
14 changes: 11 additions & 3 deletions man/read_excel.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

15 changes: 9 additions & 6 deletions src/Read.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,10 @@ cpp11::list read_this_(
std::vector<std::string> na,
bool trim_ws,
int guess_max = 1000,
bool progress = true) {
bool progress = true,
bool extract_colors = false) {
// Construct worksheet ----------------------------------------------
SheetView<T> ws(path, sheet_i, limits, shim, progress);
SheetView<T> ws(path, sheet_i, limits, shim, progress, extract_colors);

// catches empty sheets and sheets where requested rectangle contains no data
if (ws.nrow() == 0 && ws.ncol() == 0) {
Expand Down Expand Up @@ -78,8 +79,9 @@ cpp11::list read_xls_(
std::vector<std::string> na,
bool trim_ws,
int guess_max = 1000,
bool progress = true) {
return read_this_<Xls>(path, sheet_i, limits, shim, col_names, col_types, na, trim_ws, guess_max, progress);
bool progress = true,
bool extract_colors = false) {
return read_this_<Xls>(path, sheet_i, limits, shim, col_names, col_types, na, trim_ws, guess_max, progress, extract_colors);
}

[[cpp11::register]]
Expand All @@ -93,6 +95,7 @@ cpp11::list read_xlsx_(
std::vector<std::string> na,
bool trim_ws,
int guess_max = 1000,
bool progress = true) {
return read_this_<Xlsx>(path, sheet_i, limits, shim, col_names, col_types, na, trim_ws, guess_max, progress);
bool progress = true,
bool extract_colors = false) {
return read_this_<Xlsx>(path, sheet_i, limits, shim, col_names, col_types, na, trim_ws, guess_max, progress, extract_colors);
}
116 changes: 111 additions & 5 deletions src/SheetView.h
Original file line number Diff line number Diff line change
Expand Up @@ -40,10 +40,10 @@ class SheetView {

public:
SheetView(const std::string& path,
int sheet_i, cpp11::integers limits, bool shim, bool progress)
int sheet_i, cpp11::integers limits, bool shim, bool progress, bool extract_colors = false)
: spinner_(progress),
wb_(path),
cs_(wb_, sheet_i, limits, shim, spinner_)
cs_(wb_, sheet_i, limits, shim, spinner_, extract_colors)
{
}

Expand Down Expand Up @@ -124,11 +124,63 @@ class SheetView {
// base is row the data starts on **in the spreadsheet**
int base = cs_.cells_.begin()->row() + has_col_names;
int n = (xcell == cs_.cells_.end()) ? 0 : cs_.lastRow() - base + 1;
cpp11::writable::list cols(cs_.ncol());
cols.attr("names") = names;

// First pass: determine which columns have background colors if extract_colors is enabled
std::vector<bool> has_colors(cs_.ncol(), false);
int color_column_count = 0;

if (cs_.extract_colors_ && n > 0) {
typename std::vector<typename T::Cell>::iterator scan_cell = xcell;
while (scan_cell != cs_.cells_.end()) {
int col = scan_cell->col() - cs_.startCol();
if (col >= 0 && col < cs_.ncol() && types[col] != COL_SKIP) {
// Check static background color
std::string bg_color = scan_cell->getBackgroundColor(wb_.backgroundColors());
// Check conditional formatting color (for XLSX)
std::string cf_color = getConditionalFormattingColor(*scan_cell);

if (!bg_color.empty() || !cf_color.empty()) {
if (!has_colors[col]) {
has_colors[col] = true;
color_column_count++;
}
}
}
scan_cell++;
}
}

// Determine the number of columns - add only color columns that have actual colors
int total_cols = cs_.ncol() + color_column_count;
cpp11::writable::list cols(total_cols);

// Create column names - add "_bg" suffix only for columns with colors
cpp11::writable::strings col_names(total_cols);
std::vector<int> color_col_mapping(cs_.ncol(), -1); // maps data col to color col index
int next_color_col = cs_.ncol();

for (int j = 0; j < cs_.ncol(); ++j) {
col_names[j] = names[j];
if (cs_.extract_colors_ && has_colors[j]) {
std::string base_name = static_cast<std::string>(names[j]);
std::string color_name = base_name + "_bg";
col_names[next_color_col] = color_name;
color_col_mapping[j] = next_color_col;
next_color_col++;
}
}
cols.attr("names") = col_names;

// Create data columns
for (int j = 0; j < cs_.ncol(); ++j) {
cols[j] = makeCol(types[j], n);
}
// Create color columns only for columns that have colors
for (int j = 0; j < cs_.ncol(); ++j) {
if (cs_.extract_colors_ && has_colors[j]) {
cols[color_col_mapping[j]] = makeCol(COL_TEXT, n);
}
}

if (n == 0) {
return cols;
Expand Down Expand Up @@ -283,10 +335,64 @@ class SheetView {
}
}
}

// Extract background color if enabled and this column has colors
if (cs_.extract_colors_ && types[col] != COL_SKIP && color_col_mapping[col] != -1) {
cpp11::sexp color_column = cpp11::as_sexp(cols[color_col_mapping[col]]);
std::string bg_color = xcell->getBackgroundColor(wb_.backgroundColors());

// If no static background color, check conditional formatting
if (bg_color.empty()) {
bg_color = getConditionalFormattingColor(*xcell);
}

if (bg_color.empty()) {
SET_STRING_ELT(color_column, row, NA_STRING);
} else {
SET_STRING_ELT(color_column, row, Rf_mkCharCE(bg_color.c_str(), CE_UTF8));
}
}

xcell++;
}

return removeSkippedColumns(cols, names, types);
// Handle column filtering for color extraction
if (cs_.extract_colors_ && color_column_count > 0) {
// Create extended types vector for both data and color columns
std::vector<ColType> extended_types(total_cols);
cpp11::writable::strings extended_names(total_cols);

for (int j = 0; j < cs_.ncol(); ++j) {
extended_types[j] = types[j];
extended_names[j] = names[j];
}

// Add types for color columns (they follow same skip pattern as their data columns)
int color_idx = cs_.ncol();
for (int j = 0; j < cs_.ncol(); ++j) {
if (has_colors[j]) {
extended_types[color_idx] = types[j];
extended_names[color_idx] = col_names[color_idx];
color_idx++;
}
}

return removeSkippedColumns(cols, extended_names, extended_types);
} else {
return removeSkippedColumns(cols, names, types);
}
}

private:
// Generic fallback for conditional-formatting colors: the cell is ignored and
// an empty string (meaning "no color") is always returned. This header also
// provides an explicit specialization of this member for Xlsx.
std::string getConditionalFormattingColor(const typename T::Cell& /*cell*/) {
  return std::string();
}

};

// Explicit specialization of SheetView::getConditionalFormattingColor for
// Xlsx: instead of the generic empty-string fallback, delegate to the cell
// itself, handing it the conditional formats exposed by the cell set (cs_).
// The returned string is whatever XlsxCell reports for this cell.
template<>
inline std::string SheetView<Xlsx>::getConditionalFormattingColor(const XlsxCell& cell) {
return cell.getConditionalFormattingColor(cs_.conditionalFormats());
}
7 changes: 7 additions & 0 deletions src/XlsCell.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@

#include <iomanip>
#include <limits.h>
#include <map>

// Key reference for understanding the structure of the xls format is
// [MS-XLS]: Excel Binary File Format (.xls) Structure
Expand Down Expand Up @@ -337,4 +338,10 @@ class XlsCell {
}
}

// Background-color lookup for an xls cell. Color extraction is not
// implemented for the xls format yet, so the supplied map is ignored and the
// result is always an empty string. The method is a placeholder kept for
// template compatibility.
std::string getBackgroundColor(const std::map<int, std::string>& /*backgroundColors*/) const {
  return std::string();
}

};
5 changes: 3 additions & 2 deletions src/XlsCellSet.h
Original file line number Diff line number Diff line change
Expand Up @@ -29,10 +29,11 @@ class XlsCellSet {
public:

std::vector<XlsCell> cells_;
bool extract_colors_;

XlsCellSet(const XlsWorkBook wb, int sheet_i,
cpp11::integers limits, bool shim, Spinner spinner_)
: nominal_(limits)
cpp11::integers limits, bool shim, Spinner spinner_, bool extract_colors = false)
: nominal_(limits), extract_colors_(extract_colors)
{
if (sheet_i >= wb.n_sheets()) {
cpp11::stop("Can't retrieve sheet in position %d, only %d sheet(s) found.",
Expand Down
7 changes: 7 additions & 0 deletions src/XlsWorkBook.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,15 @@
#include "cpp11/r_string.hpp"
#include "cpp11/strings.hpp"

#include <map>

class XlsWorkBook {

// common to Xls[x]WorkBook
std::string path_;
bool is1904_;
std::set<int> dateFormats_;
std::map<int, std::string> backgroundColors_;
std::vector<std::string> stringTable_;

// kept as data + accessor in XlsWorkBook vs. member function in XlsxWorkBook
Expand Down Expand Up @@ -76,6 +79,10 @@ class XlsWorkBook {
return dateFormats_;
}

const std::map<int, std::string>& backgroundColors() const {
return backgroundColors_;
}

const std::vector<std::string>& stringTable() const {
return stringTable_;
}
Expand Down
Loading
Loading