Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,15 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## [Unreleased]

### Changed

- export `xlsx`: No more parallel document processing in one export step, as this caused the exporter to crash when more than one file was present
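- export `xlsx`: More informative warning messages (a span conflict warning now names the affected document and reports the size of the overlap)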

### Fixed

- import `xlsx`: Trim column names

## [0.44.1] - 2025-12-09

## [0.44.0] - 2025-12-08
Expand Down
1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ version = "0.44.1"
ansi_term = "0.12"
anyhow = "1.0"
bimap = "0.6.3"
chrono = "0.4.42"
clap = { version = "4.0", features = ["derive", "env"] }
console = "0.15"
csv = "1.3"
Expand Down
74 changes: 60 additions & 14 deletions src/exporter/xlsx.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@ use graphannis_core::{
util::join_qname,
};
use linked_hash_map::LinkedHashMap;
use rayon::iter::{IntoParallelRefIterator, ParallelIterator};
use serde::{Deserialize, Serialize};
use tempfile::NamedTempFile;

Expand Down Expand Up @@ -141,13 +140,20 @@ impl ExportXlsx {
};
let worksheet = if let Some(addr) = &self.update_datasheet {
match addr {
SheetAddress::Numeric(i) => workbook
.get_sheet_mut(&(*i - 1))
.ok_or(anyhow!("Sheet with index {i} does not exist."))?,
SheetAddress::Name(s) => workbook
.get_sheet_by_name_mut(s)
.ok_or(anyhow!("Sheet with name {s} does not exist."))?,
SheetAddress::Numeric(i) => {
workbook.remove_sheet(*i - 1).map_err(|e| anyhow!(e))?;
}
SheetAddress::Name(s) => {
workbook.remove_sheet_by_name(s).map_err(|e| anyhow!(e))?;
}
}
let sheet_name = format!(
"{doc_name}-{}",
chrono::Local::now().format("%Y-%m-%d-%H-%M-%S-%9f")
);
workbook
.new_sheet(&sheet_name)
.map_err(|e| anyhow!("Could not create new sheet with name `{sheet_name}`: {e}"))?
} else {
workbook
.get_sheet_mut(&0)
Expand Down Expand Up @@ -378,7 +384,7 @@ impl ExportXlsx {
for t in gs.get_outgoing_edges(span.node) {
let t = t?;
if let Some(row) = token_to_row.get(&t) {
spanned_rows.insert(row);
spanned_rows.insert(*row);
}
}
}
Expand All @@ -388,13 +394,14 @@ impl ExportXlsx {
if let Some(first) = first_row
&& let Some(last) = last_row
{
if spanned_rows.intersection(&written_rows).count() > 0 {
progress.warn(format!("Could not write span value {span_val} from row {first} to row {last} in column `{}`. A span already exists in at least of the affected rows.", span_anno_key.name))?;
let intersection_size = spanned_rows.intersection(&written_rows).count();
if intersection_size > 0 {
progress.warn(format!("Could not write span value {span_val} from row {first} to row {last} in column `{}` in document {}. A span already exists in at least of the affected rows. {intersection_size} node(s) overlap(s).", span_anno_key.name, worksheet.get_name()))?;
continue;
}
if *last - *first > 0 {
worksheet
.get_cell_mut((*column_index, **first))
.get_cell_mut((*column_index, *first))
.set_value(span_val);
let column_letter =
umya_spreadsheet::helper::coordinate::string_from_column_index(
Expand All @@ -404,7 +411,7 @@ impl ExportXlsx {
worksheet.add_merge_cells(range);
} else {
worksheet
.get_cell_mut((*column_index, **first))
.get_cell_mut((*column_index, *first))
.set_value(span_val);
}
written_rows.extend(spanned_rows);
Expand Down Expand Up @@ -447,7 +454,7 @@ impl Exporter for ExportXlsx {
std::fs::create_dir_all(output_path)?;

let results: anyhow::Result<Vec<_>> = document_names
.par_iter()
.iter()
.map(|(doc_name, doc_node_id)| {
self.export_document(doc_name, *doc_node_id, graph, output_path, &reporter)?;
reporter.worked(1)?;
Expand All @@ -470,6 +477,7 @@ mod tests {
path::{Path, PathBuf},
};

use graphannis::update::GraphUpdate;
use insta::assert_snapshot;
use sha2::{Digest, Sha256};
use tempfile::{TempDir, tempdir};
Expand All @@ -478,6 +486,7 @@ mod tests {
ExporterStep, ImporterStep, ReadFrom, WriteAs,
importer::{Importer, xlsx::ImportSpreadsheet},
test_util::compare_graphs,
util::example_generator,
};

use super::*;
Expand Down Expand Up @@ -996,7 +1005,7 @@ mod tests {
let wb = umya_spreadsheet::reader::xlsx::read(test_target);
assert!(wb.is_ok());
let book = wb.unwrap();
let sheet = book.get_sheet(&1).unwrap();
let sheet = book.get_sheet(&0).unwrap();
let merge_cells = sheet.get_merge_cells();
assert_eq!(1, merge_cells.len());
let merge_cell = merge_cells.get(0);
Expand All @@ -1016,4 +1025,41 @@ mod tests {
fnt.get_bold()
));
}

/// Exports an in-memory example corpus (built with
/// `create_corpus_structure_simple` and `create_multiple_segmentations` for
/// `root/doc1`) with a default `ExportXlsx` and checks that the produced
/// `doc1.xlsx` has no cell differences from the stored reference workbook.
///
/// Every fallible step uses `expect` instead of `assert!(.. .is_ok())`, so a
/// failing run prints the underlying error rather than a bare assertion failure.
#[test]
fn spans() {
    let mut graph = AnnotationGraph::with_default_graphstorages(true)
        .expect("could not create annotation graph");

    // Collect the example corpus as a single update and apply it in one go.
    let mut u = GraphUpdate::default();
    example_generator::create_corpus_structure_simple(&mut u);
    example_generator::create_multiple_segmentations(&mut u, "root/doc1");
    graph
        .apply_update(&mut u, |_| {})
        .expect("could not apply example corpus update");

    let exporter = ExportXlsx::default();
    let target_dir = tempdir().expect("could not create temporary output directory");
    exporter
        .export_corpus(
            &graph,
            target_dir.path(),
            crate::StepID {
                module_name: "test_export".to_string(),
                path: None,
            },
            None,
        )
        .expect("xlsx export failed");

    // Compare the exported workbook cell by cell against the reference file.
    assert!(
        sheets_diff::core::diff::Diff::new(
            "./tests/data/export/xlsx/span-target/doc1.xlsx",
            &target_dir.path().join("doc1.xlsx").to_string_lossy()
        )
        .diff()
        .cell_diffs
        .is_empty(),
        "exported doc1.xlsx differs from the reference workbook"
    );
}
}
12 changes: 6 additions & 6 deletions src/importer/xlsx.rs
Original file line number Diff line number Diff line change
Expand Up @@ -245,7 +245,7 @@ impl<'a> DatasheetMapper<'a> {
.reverse_col_map
.keys()
.chain(self.reverse_col_map.values())
.map(|e| e.as_str())
.map(|e| e.trim())
.collect_vec();
let mut merged_cells = self.collect_merged_cells(expected_names, progress)?;
let base_tokens = self.build_tokens(update, doc_node_name)?;
Expand Down Expand Up @@ -284,8 +284,8 @@ impl<'a> DatasheetMapper<'a> {
|| self.column_map.contains_key(&col_name);
let anno_key = if is_segmentation {
AnnoKey {
ns: ns.unwrap_or(name).into(), // prefer the namespace in the column header
name: name.into(),
ns: ns.unwrap_or(name.trim()).into(), // prefer the namespace in the column header
name: name.trim().into(),
}
} else if let Some(seg_name) = self
.reverse_col_map
Expand All @@ -294,8 +294,8 @@ impl<'a> DatasheetMapper<'a> {
.or(self.fallback.as_ref())
{
AnnoKey {
ns: ns.unwrap_or(seg_name).into(), // prefer the namespace in the column header
name: name.into(),
ns: ns.unwrap_or(seg_name.trim()).into(), // prefer the namespace in the column header
name: name.trim().into(),
}
} else {
let msg = format!(
Expand Down Expand Up @@ -479,7 +479,7 @@ impl<'a> DatasheetMapper<'a> {
if start_col != end_col {
if (*start_col.get_num()..=*end_col.get_num()).any(|c| {
if let Some(cell) = self.sheet.get_cell((c, 1)) {
col_names.contains(&cell.get_raw_value().to_string().as_str())
col_names.contains(&cell.get_raw_value().to_string().trim())
} else {
false
}
Expand Down
Binary file added tests/data/export/xlsx/span-target/doc1.xlsx
Binary file not shown.