Skip to content

Commit 6dc2389

Browse files
committed
Added support for typed dataframe conversions
1 parent d19a625 commit 6dc2389

File tree

12 files changed

+348
-16
lines changed

12 files changed

+348
-16
lines changed

Cargo.lock

Lines changed: 6 additions & 4 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3,17 +3,17 @@ members = ["cli-excel-rs", "crates/*", "py-excel-rs"]
33
resolver = "2"
44

55
[workspace.package]
6-
version = "0.5.1"
6+
version = "0.5.2"
77
authors = ["Carl Voller"]
88
edition = "2021"
99
homepage = "https://github.com/carlvoller/excel-rs"
1010
license = "MIT"
1111
repository = "https://github.com/carlvoller/excel-rs"
1212

1313
[workspace.dependencies]
14-
excel-rs-xlsx = { version = "0.5.1", path = "crates/excel-rs-xlsx", default-features = false }
15-
excel-rs-csv = { version = "0.5.1", path = "crates/excel-rs-csv", default-features = false }
16-
excel-rs-postgres = { version = "0.5.1", path = "crates/excel-rs-postgres", default-features = false }
14+
excel-rs-xlsx = { version = "0.5.2", path = "crates/excel-rs-xlsx", default-features = false }
15+
excel-rs-csv = { version = "0.5.2", path = "crates/excel-rs-csv", default-features = false }
16+
excel-rs-postgres = { version = "0.5.2", path = "crates/excel-rs-postgres", default-features = false }
1717

1818
[profile.release]
1919
opt-level = 3

benchmarks/test-py-excel-rs.py

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,15 @@
11
import py_excel_rs
2+
import datetime
3+
import pandas as pd
24

3-
f = open('organizations-1000000.csv', 'rb')
4-
xlsx = py_excel_rs.csv_to_xlsx(f.read())
5+
# f = open('organizations-1000000.csv', 'rb')
6+
# xlsx = py_excel_rs.csv_to_xlsx(f.read())
7+
8+
9+
# One-row frame with a datetime, a string, an integer and a float column, so the
# typed conversion path is exercised with should_infer_types=True.
column_names = ["Date", "hi", "number1", "float2"]
sample_rows = [[datetime.datetime.now(), "hello", 10, 10.888]]
frame = pd.DataFrame(sample_rows, columns=column_names)

workbook_bytes = py_excel_rs.df_to_xlsx(frame, should_infer_types=True)

# Persist the generated workbook next to the script.
with open('report.xlsx', 'wb') as report_file:
    report_file.write(workbook_bytes)

crates/excel-rs-xlsx/src/format.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -142,6 +142,7 @@ impl<W: Write + Seek> XlsxFormatter<W> {
142142
</cellStyleXfs>
143143
<cellXfs count="1">
144144
<xf numFmtId="0" fontId="0" fillId="0" borderId="0" xfId="0"/>
145+
<xf numFmtId="14" borderId="0" fillId="0" fontId="0" xfId="0"/>
145146
</cellXfs>
146147
<cellStyles count="1">
147148
<cellStyle name="Normal" xfId="0" builtinId="0"/>

crates/excel-rs-xlsx/src/lib.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
mod format;
33
pub mod workbook;
44
pub mod sheet;
5+
pub mod typed_sheet;
56

67
pub use workbook::WorkBook;
78

Lines changed: 204 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,204 @@
1+
use std::{
2+
collections::VecDeque,
3+
io::{Seek, Write},
4+
};
5+
6+
use anyhow::Result;
7+
use zip::{write::SimpleFileOptions, ZipWriter};
8+
9+
pub struct TypedSheet<'a, W: Write + Seek> {
10+
pub sheet_buf: &'a mut ZipWriter<W>,
11+
pub _name: String,
12+
// pub id: u16,
13+
// pub is_closed: bool,
14+
col_num_to_letter: Vec<Vec<u8>>,
15+
current_row_num: u32,
16+
}
17+
18+
impl<'a, W: Write + Seek> TypedSheet<'a, W> {
19+
pub fn new(name: String, id: u16, writer: &'a mut ZipWriter<W>) -> Self {
20+
let options = SimpleFileOptions::default()
21+
.compression_method(zip::CompressionMethod::Deflated)
22+
.compression_level(Some(1))
23+
.large_file(true);
24+
25+
writer
26+
.start_file(format!("xl/worksheets/sheet{}.xml", id), options)
27+
.ok();
28+
29+
// Writes Sheet Header
30+
writer.write(b"<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"yes\"?>\n<worksheet xmlns=\"http://schemas.openxmlformats.org/spreadsheetml/2006/main\" xmlns:r=\"http://schemas.openxmlformats.org/officeDocument/2006/relationships\">\n<sheetData>\n").ok();
31+
32+
TypedSheet {
33+
sheet_buf: writer,
34+
// id,
35+
_name: name,
36+
// is_closed: false,
37+
col_num_to_letter: Vec::with_capacity(64),
38+
current_row_num: 0,
39+
}
40+
}
41+
42+
// TOOD: Use ShortVec over Vec for cell ID
43+
pub fn write_row(&mut self, data: Vec<&[u8]>, types: &Vec<&str>) -> Result<()> {
44+
self.current_row_num += 1;
45+
46+
let mut final_vec = Vec::with_capacity(512 * data.len());
47+
48+
// TODO: Proper Error Handling
49+
let (row_in_chars_arr, digits) = self.num_to_bytes(self.current_row_num);
50+
51+
final_vec.write(b"<row r=\"")?;
52+
final_vec.write(&row_in_chars_arr[9 - digits..])?;
53+
final_vec.write(b"\">")?;
54+
55+
let mut col = 0;
56+
if self.current_row_num == 1 {
57+
for datum in data {
58+
let (ref_id, pos) = self.ref_id(col, (row_in_chars_arr, digits))?;
59+
60+
final_vec.write(b"<c r=\"")?;
61+
final_vec.write(&ref_id.as_slice()[0..pos])?;
62+
final_vec.write(b"\" t=\"str\"><v>")?;
63+
64+
let (mut chars, chars_pos) = self.escape_in_place(datum);
65+
let mut current_pos = 0;
66+
for char_pos in chars_pos {
67+
final_vec.write(&datum[current_pos..char_pos])?;
68+
final_vec.write(chars.pop_front().unwrap())?;
69+
current_pos = char_pos + 1;
70+
}
71+
72+
final_vec.write(&datum[current_pos..])?;
73+
final_vec.write(b"</v></c>")?;
74+
75+
col += 1;
76+
}
77+
} else {
78+
for datum in data {
79+
let (ref_id, pos) = self.ref_id(col, (row_in_chars_arr, digits))?;
80+
81+
let col_type = *types.get(col).unwrap_or(&"s");
82+
83+
final_vec.write(b"<c r=\"")?;
84+
final_vec.write(&ref_id.as_slice()[0..pos])?;
85+
final_vec.write(b"\" t=\"")?;
86+
final_vec.write(col_type.as_bytes())?;
87+
final_vec.write(b"\"><v>")?;
88+
89+
let (mut chars, chars_pos) = self.escape_in_place(datum);
90+
let mut current_pos = 0;
91+
for char_pos in chars_pos {
92+
final_vec.write(&datum[current_pos..char_pos])?;
93+
final_vec.write(chars.pop_front().unwrap())?;
94+
current_pos = char_pos + 1;
95+
}
96+
97+
final_vec.write(&datum[current_pos..])?;
98+
final_vec.write(b"</v></c>")?;
99+
100+
col += 1;
101+
}
102+
}
103+
104+
final_vec.write(b"</row>")?;
105+
106+
self.sheet_buf.write(&final_vec)?;
107+
108+
Ok(())
109+
}
110+
111+
fn escape_in_place(&self, bytes: &[u8]) -> (VecDeque<&[u8]>, VecDeque<usize>) {
112+
let mut special_chars: VecDeque<&[u8]> = VecDeque::new();
113+
let mut special_char_pos: VecDeque<usize> = VecDeque::new();
114+
let len = bytes.len();
115+
for x in 0..len {
116+
let _ = match bytes[x] {
117+
b'<' => {
118+
special_chars.push_back(b"&lt;".as_slice());
119+
special_char_pos.push_back(x);
120+
}
121+
b'>' => {
122+
special_chars.push_back(b"&gt;".as_slice());
123+
special_char_pos.push_back(x);
124+
}
125+
b'\'' => {
126+
special_chars.push_back(b"&apos;".as_slice());
127+
special_char_pos.push_back(x);
128+
}
129+
b'&' => {
130+
special_chars.push_back(b"&amp;".as_slice());
131+
special_char_pos.push_back(x);
132+
}
133+
b'"' => {
134+
special_chars.push_back(b"&quot;".as_slice());
135+
special_char_pos.push_back(x);
136+
}
137+
_ => (),
138+
};
139+
}
140+
141+
(special_chars, special_char_pos)
142+
}
143+
144+
pub fn close(&mut self) -> Result<()> {
145+
self.sheet_buf.write(b"\n</sheetData>\n</worksheet>\n")?;
146+
Ok(())
147+
}
148+
149+
fn num_to_bytes(&self, n: u32) -> ([u8; 9], usize) {
150+
// Convert from number to string manually
151+
let mut row_in_chars_arr: [u8; 9] = [0; 9];
152+
let mut row = n;
153+
let mut char_pos = 8;
154+
let mut digits = 0;
155+
while row > 0 {
156+
row_in_chars_arr[char_pos] = b'0' + (row % 10) as u8;
157+
row = row / 10;
158+
char_pos -= 1;
159+
digits += 1;
160+
}
161+
162+
(row_in_chars_arr, digits)
163+
}
164+
165+
fn ref_id(&mut self, col: usize, row: ([u8; 9], usize)) -> Result<([u8; 12], usize)> {
166+
let mut final_arr: [u8; 12] = [0; 12];
167+
let letter = self.col_to_letter(col);
168+
169+
let mut pos: usize = 0;
170+
for c in letter {
171+
final_arr[pos] = *c;
172+
pos += 1;
173+
}
174+
175+
let (row_in_chars_arr, digits) = row;
176+
177+
for i in 0..digits {
178+
final_arr[pos] = row_in_chars_arr[(8 - digits) + i + 1];
179+
pos += 1;
180+
}
181+
182+
Ok((final_arr, pos))
183+
}
184+
185+
fn col_to_letter(&mut self, col: usize) -> &[u8] {
186+
if self.col_num_to_letter.len() < col + 1 as usize {
187+
let mut result = Vec::with_capacity(2);
188+
let mut col = col as i16;
189+
190+
loop {
191+
result.push(b'A' + (col % 26) as u8);
192+
col = col / 26 - 1;
193+
if col < 0 {
194+
break;
195+
}
196+
}
197+
198+
result.reverse();
199+
self.col_num_to_letter.push(result);
200+
}
201+
202+
&self.col_num_to_letter[col]
203+
}
204+
}

crates/excel-rs-xlsx/src/workbook.rs

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ use anyhow::Result;
44
use zip::ZipWriter;
55

66
use super::sheet::Sheet;
7+
use super::typed_sheet::TypedSheet;
78

89
pub struct WorkBook<W: Write + Seek> {
910
formatter: XlsxFormatter<W>,
@@ -24,6 +25,11 @@ impl<W: Write + Seek> WorkBook<W> {
2425
self.num_of_sheets += 1;
2526
Sheet::new(name, self.num_of_sheets, &mut self.formatter.zip_writer)
2627
}
28+
29+
pub fn get_typed_worksheet(&mut self, name: String) -> TypedSheet<W> {
30+
self.num_of_sheets += 1;
31+
TypedSheet::new(name, self.num_of_sheets, &mut self.formatter.zip_writer)
32+
}
2733

2834
pub fn finish(self) -> Result<W> {
2935
let result = self.formatter.finish(self.num_of_sheets)?;

py-excel-rs/Cargo.toml

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,9 +12,10 @@ description = "python ffi for excel-rs"
1212
excel-rs-xlsx = { workspace = true }
1313
excel-rs-csv = { workspace = true }
1414
excel-rs-postgres = { workspace = true }
15-
pyo3 = { version = "0.21", features = ["extension-module"] }
15+
pyo3 = { version = "0.21", features = ["chrono", "extension-module"] }
1616
numpy = "0.21"
17+
chrono = "0.4.38"
1718

1819
[lib]
1920
name = "excel_rs"
20-
crate-type = ["cdylib"]
21+
crate-type = ["cdylib"]

py-excel-rs/py_excel_rs/df_to_xlsx.py

Lines changed: 24 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,36 @@
11
import pandas as pd
22
import numpy as np
3+
from enum import Enum
34

45
from py_excel_rs import _excel_rs
56

7+
from pandas.api.types import is_datetime64_any_dtype as is_datetime
8+
from pandas.api.types import is_numeric_dtype as is_numeric
9+
10+
class CellTypes(Enum):
    """Cell type tags passed to the typed XLSX writer, one per DataFrame column."""

    # The embedded quotes close the type attribute and add a style attribute (s="1").
    Date = "n\" s=\"1"
    String = "str"
    Number = "n"
    # Same value as String, so Enum treats Formula as an alias of String.
    Formula = "str"
    Boolean = "b"
16+
617
def csv_to_xlsx(buf: bytes) -> bytes:
    """Convert CSV bytes to XLSX bytes by delegating to ``_excel_rs.csv_to_xlsx``."""
    xlsx_bytes = _excel_rs.csv_to_xlsx(buf)
    return xlsx_bytes
819

9-
def df_to_xlsx(df: pd.DataFrame) -> bytes:
20+
def df_to_xlsx(df: pd.DataFrame, should_infer_types: bool = False) -> bytes:
    """Serialize a DataFrame to XLSX bytes.

    The column labels become the first row, followed by the frame's values as
    Python objects. With ``should_infer_types`` set, each column is tagged as
    Date, Number or String from its dtype and the typed writer is used;
    otherwise the untyped writer is used.
    """
    py_list = np.vstack((df.keys().to_numpy(), df.to_numpy(dtype='object')))

    if not should_infer_types:
        return _excel_rs.py_2d_to_xlsx(py_list)

    # NOTE(review): pandas' is_numeric_dtype appears to report True for bool columns
    # as well, so they would be tagged Number here while CellTypes.Boolean stays
    # unused -- confirm this is intended.
    type_tags = [
        (
            CellTypes.Date if is_datetime(dtype)
            else CellTypes.Number if is_numeric(dtype)
            else CellTypes.String
        ).value
        for dtype in df.dtypes
    ]
    return _excel_rs.typed_py_2d_to_xlsx(py_list, type_tags)
1235

1336
def pg_to_xlsx(query: str, conn_string: str) -> bytes:

py-excel-rs/pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ build-backend = "maturin"
44

55
[project]
66
name = "py-excel-rs"
7-
version = "0.5.1"
7+
version = "0.5.2"
88
description = "Some performant utility functions to convert common data structures to XLSX"
99
dependencies = ["pandas", "numpy"]
1010
requires-python = ">=3.7"

0 commit comments

Comments
 (0)