Skip to content

Commit dee09ee

Browse files
committed
Performance improvements for large worksheets
1 parent 1f17f40 commit dee09ee

File tree

2 files changed

+90
-24
lines changed

2 files changed

+90
-24
lines changed

project.clj

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,10 @@
1-
(defproject org.clojars.mjdowney/excel-clj "1.1.2"
1+
(defproject org.clojars.mjdowney/excel-clj "1.2.0"
22
:description "Generate Excel documents & PDFs from Clojure data."
33
:url "https://github.com/matthewdowney/excel-clj"
44
:license {:name "Eclipse Public License"
55
:url "http://www.eclipse.org/legal/epl-v10.html"}
66
:dependencies [[org.clojure/clojure "1.10.0"]
7+
[com.taoensso/tufte "2.0.1"]
78
[rhizome "0.2.9"]
89
[org.apache.poi/poi-ooxml "4.0.0"]
910
[org.jodconverter/jodconverter-core "4.0.0-RELEASE"]])

src/excel_clj/core.clj

Lines changed: 88 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -13,16 +13,19 @@
1313
(:require [excel-clj.tree :as tree]
1414
[excel-clj.style :as style]
1515
[clojure.string :as string]
16-
[clojure.java.io :as io])
16+
[clojure.java.io :as io]
17+
[taoensso.tufte :as tufte :refer (defnp p profiled profile)])
1718
(:import (org.apache.poi.ss.usermodel Cell RichTextString)
18-
(org.apache.poi.xssf.usermodel XSSFWorkbook XSSFSheet)
19-
(java.io FileOutputStream File)
19+
(org.apache.poi.xssf.usermodel XSSFWorkbook XSSFSheet XSSFRow XSSFCell)
20+
(java.io File)
2021
(java.awt Desktop HeadlessException)
2122
(java.util Calendar Date)
2223
(org.apache.poi.ss.util CellRangeAddress)
2324
(org.jodconverter.office DefaultOfficeManagerBuilder)
2425
(org.jodconverter OfficeDocumentConverter)))
2526

27+
(set! *warn-on-reflection* true)
28+
2629
;;; Low level code to write to & style sheets; you probably shouldn't have to
2730
;;; touch this to make use of the API, but might choose to when adding or
2831
;;; extending functionality
@@ -71,20 +74,40 @@
7174
[^Cell cell data]
7275
;; These types are allowed natively
7376
(if-type [data [Boolean Calendar String Date Double RichTextString]]
74-
(doto cell (.setCellValue data))
77+
(doto cell (.setCellValue data))
7578

76-
;; Apache POI requires that numbers be doubles
77-
(if (number? data)
78-
(doto cell (.setCellValue (double data)))
79+
;; Apache POI requires that numbers be doubles
80+
(if (number? data)
81+
(doto cell (.setCellValue (double data)))
7982

80-
;; Otherwise stringify it
81-
(doto cell (.setCellValue ^String (or (some-> data pr-str) ""))))))
83+
;; Otherwise stringify it
84+
(let [to-write (or (some-> data pr-str) "")]
85+
(doto cell (.setCellValue ^String to-write))))))
8286

8387
(def ^:dynamic *max-col-width*
8488
"Sometimes POI's auto sizing isn't super intelligent, so set a sanity-max on
85-
the column width."
89+
the column width."
8690
15000)
8791

92+
(defmacro ^:private doparallel
  "Performance hack for writing the POI cells.

  Like (doseq [x xs] ...) but parallel. The collection expression is evaluated
  exactly once, then split into (available processors + 2) consecutive chunks
  of near-equal size. Each chunk is processed in its own future, with `sym`
  bound to each element in turn while `body` is executed for side effects.

  Blocks until every future has completed and returns nil. An exception thrown
  by `body` surfaces when the corresponding future is dereferenced."
  [[sym coll] & body]
  `(let [;; Bind the collection once so that the caller's expression is not
         ;; evaluated twice (once for the elements, once for the count).
         items# ~coll
         n# (+ 2 (.availableProcessors (Runtime/getRuntime)))
         ;; Ceiling division on each step keeps chunk sizes within one element
         ;; of each other; trailing chunks may be empty for short inputs.
         equal-chunks# (loop [num# n#, parts# [], remaining# items#, c# (count items#)]
                         (if (<= num# 0)
                           parts#
                           (let [t# (quot (+ c# num# -1) num#)]
                             (recur (dec num#)
                                    (conj parts# (take t# remaining#))
                                    (drop t# remaining#)
                                    (- c# t#)))))
         ;; doall forces every future to be started before we begin waiting.
         workers# (doall
                    (for [chunk# equal-chunks#]
                      (future
                        (doseq [~sym chunk#]
                          ~@body))))]
     (doseq [w# workers#]
       (deref w#))))
110+
88111
(defn- ^XSSFSheet write-grid!
89112
"Modify the given workbook by adding a sheet with the given name built from
90113
the provided grid.
@@ -101,35 +124,77 @@
101124
build-style' (memoize ;; Immutable styles can share mutable objects :)
102125
(fn [style-map]
103126
(->> (style/merge-all style/default-style (or style-map {}))
104-
(style/build-style workbook))))]
127+
(style/build-style workbook))))
128+
layout (volatile! {})]
105129
(try
130+
131+
;; N.B. So this code got uglier due to performance. Writing the cells
132+
;; takes many seconds for a large sheet (~50,000 rows) and we can improve
133+
;; the process a bit by doing the cell creation sequentially and the cell
134+
;; writing in parallel (on test data set reduced from ~19s to ~14s).
135+
136+
;; Unfortunately much of the time is spent writing to disk (~8s).
137+
138+
;; We have to do this part sequentially because POI doesn't use
139+
;; thread-safe data structures
106140
(doseq [[row-idx row-data] (map-indexed vector grid)]
107-
(let [row (.createRow sh (int row-idx))]
141+
(let [row (p :create-row (.createRow sh (int row-idx)))]
108142
(loop [col-idx 0 cells row-data]
109143
(when-let [cell-data (first cells)]
110-
(let [cell (.createCell row col-idx)
144+
;; (1) Build the cell
145+
(let [cell (p :create-cell (.createCell ^XSSFRow row col-idx))
111146
width (if (map? cell-data) (get cell-data :width 1) 1)]
112-
(write-cell! cell (cond-> cell-data (map? cell-data) :value))
113-
(.setCellStyle
114-
cell
115-
(build-style' (if (map? cell-data) (:style cell-data) {})))
147+
148+
;; (2) Merge if necessary into adjacent cells
116149
(when (> width 1)
117150
(.addMergedRegion
118151
sh (CellRangeAddress.
119152
row-idx row-idx col-idx (dec (+ col-idx width)))))
153+
154+
;; (3) Save the cell
155+
(vswap! layout assoc-in [row-idx col-idx] cell)
120156
(recur (+ col-idx ^long width) (rest cells)))))))
157+
158+
;; We can do this part in parallel at least, since the cells are all
159+
;; different objects
160+
(let [layout @layout]
161+
(doparallel [row (map-indexed vector grid)]
162+
(let [[row-idx row-data] row]
163+
(loop [col-idx 0, cells row-data]
164+
(when-let [cell-data (first cells)]
165+
;; (1) Find the cell
166+
(let [width (if (map? cell-data) (get cell-data :width 1) 1)
167+
^XSSFCell cell (get (get layout row-idx) col-idx)]
168+
169+
;; (2) Write the cell data
170+
(p :write-cell
171+
(write-cell! cell (cond-> cell-data (map? cell-data) :value)))
172+
173+
;; (3) Set the cell style
174+
(let [style (build-style'
175+
(if (map? cell-data) (:style cell-data) {}))]
176+
(p :set-cell-style
177+
(.setCellStyle cell style)))
178+
179+
(recur (+ col-idx ^long width) (rest cells))))))))
121180
(catch Exception e
122181
(-> "Failed to write grid!"
123182
(ex-info {:sheet-name sheet-name :grid grid} e)
124183
(throw))))
125184

126185
(dotimes [i (transduce (map count) (completing max) 0 grid)]
127-
(.autoSizeColumn sh i)
186+
187+
;; Only auto-size small tables because it takes forever (~10s on a large
188+
;; grid)
189+
(when (< (count grid) 2000)
190+
(p :auto-size (.autoSizeColumn sh i)))
191+
128192
(when (> (.getColumnWidth sh i) *max-col-width*)
129193
(.setColumnWidth sh i *max-col-width*)))
130194

131-
(.setFitToPage sh true)
132-
(.setFitWidth (.getPrintSetup sh) 1)
195+
(p :set-print-settings
196+
(.setFitToPage sh true)
197+
(.setFitWidth (.getPrintSetup sh) 1))
133198
sh))
134199

135200
(defn- workbook!
@@ -278,8 +343,9 @@
278343
(fn [wb [sheet-name grid]] (doto wb (write-grid! sheet-name grid)))
279344
(workbook!)
280345
(seq workbook))]
281-
(with-open [fos (FileOutputStream. (str path'))]
282-
(.write wb fos))
346+
(p :write-to-disk
347+
(with-open [fos (io/output-stream (io/file (str path')))]
348+
(.write wb fos)))
283349
(io/file path')))
284350

285351
(defn convert-pdf!
@@ -359,4 +425,3 @@
359425
;; with the same contents. On platforms without OpenOffice the convert-pdf!
360426
;; call will most likely fail.
361427
(open (convert-pdf! (example) (temp ".pdf"))))
362-

0 commit comments

Comments
 (0)