Skip to content

Commit a07c18c

Browse files
committed
Excel resource manager
1 parent 7138102 commit a07c18c

File tree

9 files changed

+290
-69
lines changed

9 files changed

+290
-69
lines changed

.github/copilot-instructions.md

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,9 @@ ALWAYS reference these instructions first and fallback to search or bash command
55
Scautable is a Scala 3 project using the mill build tool. It is a lightweight dataframe library with the twist that it expects the "dataframe" in question to have its structure identified by the compiler, at compile time. A dataframe here is modelled as an `Iterator[NamedTuple[K, V]]` where `K` is a compile time constant tuple of strings representing column names.
66

77
## Working Effectively
8-
- **NEVER CANCEL BUILDS**: Mill cold compilation takes 2 minutes or so. Tests take 1-3 minutes from cold. Set timeout to 2+ minutes.
8+
On Windows, use `mill scautable.jvm.test`; on macOS/Linux, use `./mill scautable.jvm.test` to run commands. Note the absence of `./` on Windows.
9+
10+
- **BUILDS**: Mill cold compilation takes 2 minutes or so. Tests take 1-3 minutes from cold. Set timeout to 2+ minutes.
911
- Compile all modules:
1012
- `./mill __.compile` -- Compiles all modules (JVM, JS, tests). Takes 3-5 minutes cold, fast cached. NEVER CANCEL.
1113
- Compile specific platforms:

scautable/src-jvm/Excel.scala

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,23 @@ object Excel:
4444

4545
transparent inline def resource[K](filePath: String, sheetName: String) =
4646
${ readExcelResource('filePath, 'sheetName, '{""}, '{TypeInferrer.FromAllRows}) }
47-
47+
48+
/** Cleanup cached workbook resources for a specific file.
49+
*
50+
* Call this method when you know that no more operations will be performed
51+
* on a specific Excel file to free up memory immediately.
52+
*
53+
* @param filePath Path to the Excel file to cleanup
54+
*/
55+
// def cleanup(filePath: String): Unit =
56+
// ExcelResourceManager.cleanup(filePath)
57+
58+
/** Cleanup all cached Excel workbook resources.
59+
*
60+
* This method is useful for application shutdown or when you want to
61+
* free up all Excel-related memory immediately.
62+
*/
63+
// def cleanupAll(): Unit =
64+
// ExcelResourceManager.cleanupAll()
4865

4966
end Excel

scautable/src-jvm/ExcelIterator.scala

Lines changed: 14 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,11 @@
11
package io.github.quafadas.scautable
22

3-
import java.io.File
43
import scala.NamedTuple.*
54
import scala.collection.JavaConverters.*
6-
import org.apache.poi.ss.usermodel.{Row, WorkbookFactory}
5+
import org.apache.poi.ss.usermodel.Row
76
import org.apache.poi.ss.util.CellRangeAddress
87
import io.github.quafadas.scautable.BadTableException
8+
import io.github.quafadas.scautable.ExcelWorkbookCache
99

1010
/** Iterator for reading Excel files with compile-time type safety
1111
*
@@ -50,7 +50,9 @@ class ExcelIterator[K <: Tuple, V <: Tuple](filePath: String, sheetName: String,
5050

5151
// Lazy-initialized sheet iterator to avoid opening file until needed
5252
private lazy val sheetIterator =
53-
val workbook = WorkbookFactory.create(new File(filePath))
53+
val workbook = ExcelWorkbookCache.getOrCreate(filePath).getOrElse(
54+
throw new BadTableException(s"Failed to open Excel file: $filePath")
55+
)
5456
val sheet = workbook.getSheet(sheetName)
5557
// Create an iterator that gives us rows by index for the specified range
5658
colRange match
@@ -88,16 +90,15 @@ class ExcelIterator[K <: Tuple, V <: Tuple](filePath: String, sheetName: String,
8890
*/
8991
private inline def extractHeadersFromRange(range: String): List[String] =
9092
val (firstRow, _, firstCol, lastCol) = parseRange(range)
91-
val workbook = WorkbookFactory.create(new File(filePath))
92-
try
93-
val sheet = workbook.getSheet(sheetName)
94-
val headerRow = sheet.getRow(firstRow)
95-
val cells =
96-
for (i <- firstCol.to(lastCol))
97-
yield headerRow.getCell(i, Row.MissingCellPolicy.CREATE_NULL_AS_BLANK).toString
98-
cells.toList
99-
finally
100-
workbook.close()
93+
val workbook = ExcelWorkbookCache.getOrCreate(filePath).getOrElse(
94+
throw new BadTableException(s"Failed to open Excel file: $filePath")
95+
)
96+
val sheet = workbook.getSheet(sheetName)
97+
val headerRow = sheet.getRow(firstRow)
98+
val cells =
99+
for (i <- firstCol.to(lastCol))
100+
yield headerRow.getCell(i, Row.MissingCellPolicy.CREATE_NULL_AS_BLANK).toString
101+
cells.toList
101102
end extractHeadersFromRange
102103

103104
/** Extract headers from the first row of the sheet This consumes the header row from the sheet iterator

scautable/src-jvm/ExcelMacros.scala

Lines changed: 23 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -3,12 +3,11 @@ package io.github.quafadas.scautable
33
import scala.quoted.*
44
import io.github.quafadas.scautable.ColumnTyped.*
55
import io.github.quafadas.table.TypeInferrer
6-
import org.apache.poi.ss.usermodel.{Row, WorkbookFactory, Cell, CellType, DateUtil}
6+
import org.apache.poi.ss.usermodel.{Row, Cell, CellType, DateUtil}
77
import org.apache.poi.ss.util.CellRangeAddress
88
import scala.collection.JavaConverters.*
9-
import java.io.File
109
import io.github.quafadas.scautable.BadTableException
11-
import io.github.quafadas.scautable.InferrerOps
10+
import io.github.quafadas.scautable.ExcelWorkbookCache
1211

1312
/** Compile-time macro functions for reading Excel files.
1413
* These macros perform Excel file inspection at compile time to determine structure
@@ -128,24 +127,23 @@ object ExcelMacros:
128127
/** Extracts headers from an Excel sheet, either from a specific range or the first row
129128
*/
130129
private def extractHeaders(filePath: String, sheetName: String, colRange: Option[String]): List[String] =
131-
val workbook = WorkbookFactory.create(new File(filePath))
132-
try
133-
val sheet = workbook.getSheet(sheetName)
134-
135-
colRange match
136-
case Some(range) if range.nonEmpty =>
137-
val cellRange = CellRangeAddress.valueOf(range)
138-
val firstRow = sheet.getRow(cellRange.getFirstRow)
139-
val cells =
140-
for (i <- cellRange.getFirstColumn to cellRange.getLastColumn)
141-
yield firstRow.getCell(i, Row.MissingCellPolicy.CREATE_NULL_AS_BLANK).toString
142-
cells.toList
143-
case _ =>
144-
if sheet.iterator().hasNext then sheet.iterator().next().cellIterator().asScala.toList.map(_.toString)
145-
else throw new BadTableException("No headers found in the first row of the sheet, and no range specified.")
146-
end match
147-
finally workbook.close()
148-
end try
130+
val workbook = ExcelWorkbookCache.getOrCreate(filePath).getOrElse(
131+
throw new BadTableException(s"Failed to open Excel file: $filePath")
132+
)
133+
val sheet = workbook.getSheet(sheetName)
134+
135+
colRange match
136+
case Some(range) if range.nonEmpty =>
137+
val cellRange = CellRangeAddress.valueOf(range)
138+
val firstRow = sheet.getRow(cellRange.getFirstRow)
139+
val cells =
140+
for (i <- cellRange.getFirstColumn to cellRange.getLastColumn)
141+
yield firstRow.getCell(i, Row.MissingCellPolicy.CREATE_NULL_AS_BLANK).toString
142+
cells.toList
143+
case _ =>
144+
if sheet.iterator().hasNext then sheet.iterator().next().cellIterator().asScala.toList.map(_.toString)
145+
else throw new BadTableException("No headers found in the first row of the sheet, and no range specified.")
146+
end match
149147
end extractHeaders
150148

151149
/** Validates that headers are unique (no duplicates)
@@ -200,9 +198,10 @@ object ExcelMacros:
200198
): quotes.reflect.TypeRepr =
201199
import quotes.reflect.*
202200

203-
val workbook = WorkbookFactory.create(new File(filePath))
204-
try
205-
val sheet = workbook.getSheet(sheetName)
201+
val workbook = ExcelWorkbookCache.getOrCreate(filePath).getOrElse(
202+
throw new BadTableException(s"Failed to open Excel file: $filePath")
203+
)
204+
val sheet = workbook.getSheet(sheetName)
206205

207206
// Extract data based on column range or use all columns
208207
val columnData: List[List[Cell]] = colRange match
@@ -251,14 +250,6 @@ object ExcelMacros:
251250
}
252251

253252
tupleType
254-
finally
255-
try
256-
workbook.close()
257-
catch
258-
case _: Exception =>
259-
// Workbook close can fail for Excel files with corrupted drawings - this is expected
260-
println(s"Warning: Could not close workbook for file: $filePath")
261-
end try
262253
end inferTypesFromExcelDataDirect
263254

264255
/** Infer the most appropriate Scala type for a column based on Apache POI cell types
Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
package io.github.quafadas.scautable
2+
3+
/** Public entry points for releasing Excel workbooks held by [[ExcelWorkbookCache]].
  *
  * Both methods simply forward to the cache; they exist so that callers do not need to
  * depend on the cache object directly.
  */
object ExcelResourceManager:

  /** Releases the cached workbook associated with a single Excel file.
    *
    * Forwards to `ExcelWorkbookCache.closeAndRemove`. Intended to be called once no further
    * operations will be performed on the file.
    *
    * @param filePath
    *   Path to the Excel file whose cached workbook should be released
    */
  def cleanup(filePath: String): Unit = ExcelWorkbookCache.closeAndRemove(filePath)

  /** Releases every cached Excel workbook.
    *
    * Forwards to `ExcelWorkbookCache.clearAll`. Useful at application shutdown, or whenever
    * all Excel-related resources should be dropped at once.
    */
  def cleanupAll(): Unit = ExcelWorkbookCache.clearAll()

end ExcelResourceManager

scautable/src-jvm/ExcelUtils.scala

Lines changed: 18 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,10 @@
11
package io.github.quafadas.scautable
22

3-
import org.apache.poi.ss.usermodel.{Row, WorkbookFactory}
3+
import org.apache.poi.ss.usermodel.Row
44
import org.apache.poi.ss.util.CellRangeAddress
55
import scala.collection.JavaConverters.*
6-
import java.io.File
76
import io.github.quafadas.scautable.BadTableException
7+
import io.github.quafadas.scautable.ExcelWorkbookCache
88

99
/** Common utilities and exceptions for Excel processing
1010
*/
@@ -22,24 +22,23 @@ object ExcelUtils:
2222
* List of header strings
2323
*/
2424
inline def extractHeaders(filePath: String, sheetName: String, colRange: Option[String]): List[String] =
25-
val workbook = WorkbookFactory.create(new File(filePath))
26-
try
27-
val sheet = workbook.getSheet(sheetName)
25+
val workbook = ExcelWorkbookCache.getOrCreate(filePath).getOrElse(
26+
throw new BadTableException(s"Failed to open Excel file: $filePath")
27+
)
28+
val sheet = workbook.getSheet(sheetName)
2829

29-
colRange match
30-
case Some(range) if range.nonEmpty =>
31-
val cellRange = CellRangeAddress.valueOf(range)
32-
val firstRow = sheet.getRow(cellRange.getFirstRow)
33-
val cells =
34-
for (i <- cellRange.getFirstColumn to cellRange.getLastColumn)
35-
yield firstRow.getCell(i, Row.MissingCellPolicy.CREATE_NULL_AS_BLANK).toString
36-
cells.toList
37-
case _ =>
38-
if sheet.iterator().hasNext then sheet.iterator().next().cellIterator().asScala.toList.map(_.toString)
39-
else throw new BadTableException("No headers found in the first row of the sheet, and no range specified.")
40-
end match
41-
finally workbook.close()
42-
end try
30+
colRange match
31+
case Some(range) if range.nonEmpty =>
32+
val cellRange = CellRangeAddress.valueOf(range)
33+
val firstRow = sheet.getRow(cellRange.getFirstRow)
34+
val cells =
35+
for (i <- cellRange.getFirstColumn to cellRange.getLastColumn)
36+
yield firstRow.getCell(i, Row.MissingCellPolicy.CREATE_NULL_AS_BLANK).toString
37+
cells.toList
38+
case _ =>
39+
if sheet.iterator().hasNext then sheet.iterator().next().cellIterator().asScala.toList.map(_.toString)
40+
else throw new BadTableException("No headers found in the first row of the sheet, and no range specified.")
41+
end match
4342
end extractHeaders
4443

4544
/** Validates that headers are unique (no duplicates)
Lines changed: 104 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,104 @@
1+
package io.github.quafadas.scautable
2+
3+
import org.apache.poi.ss.usermodel.Workbook
4+
import org.apache.poi.ss.usermodel.WorkbookFactory
5+
import java.io.File
6+
import java.lang.ref.WeakReference
7+
import java.util.concurrent.ConcurrentHashMap
8+
import scala.util.Try
9+
10+
/** Thread-safe cache of Excel workbook instances, keyed by canonical file path.
  *
  * Workbooks are held through weak references, so an entry does not by itself keep a workbook
  * alive; once the referent has been collected the next lookup opens the file again.
  *
  * NOTE(review): a workbook that is garbage collected without `closeAndRemove` / `clearAll`
  * having been called is never explicitly closed by this object — confirm that this is
  * acceptable for the underlying file handles.
  */
object ExcelWorkbookCache:

  import scala.util.control.NonFatal

  // Canonical path -> weakly referenced workbook. Entries whose referent has been collected
  // stay in the map until they are replaced by `getOrCreate` or removed by a cleanup call.
  private val cache = new ConcurrentHashMap[String, WeakReference[Workbook]]()

  /** Get or create a workbook for the specified file path.
    *
    * The lookup, validation and (re)creation happen inside a single `ConcurrentHashMap.compute`
    * call, so concurrent callers asking for the same file observe the same cached instance
    * instead of each opening the file. A workbook created to replace a stale or unusable entry
    * is stored in the cache as well.
    *
    * @param filePath
    *   Path to the Excel file; it is canonicalized before being used as the cache key
    * @return
    *   A `Success` holding the workbook, or a `Failure` if the path cannot be canonicalized or
    *   the file cannot be opened
    */
  def getOrCreate(filePath: String): Try[Workbook] =
    Try {
      // Normalize the file path so that different spellings of the same file share one entry
      val normalizedPath = new File(filePath).getCanonicalPath

      // Holds a strong reference to the chosen workbook so that it cannot be collected between
      // being (weakly) cached and being returned to the caller.
      val resolved = new java.util.concurrent.atomic.AtomicReference[Workbook]()

      cache.compute(
        normalizedPath,
        (_, cachedRef) => {
          liveWorkbook(cachedRef).filter(isUsable) match
            case Some(workbook) =>
              resolved.set(workbook)
              cachedRef
            case None =>
              // Nothing cached, the referent was collected, or validation failed: open the
              // file again and cache the replacement. If opening throws, the exception
              // propagates out of `compute` and the existing mapping is left untouched.
              val workbook = WorkbookFactory.create(new File(normalizedPath))
              resolved.set(workbook)
              new WeakReference(workbook)
          end match
        }
      )

      resolved.get()
    }
  end getOrCreate

  /** Explicitly close and remove a workbook from the cache.
    *
    * Best-effort: non-fatal failures while canonicalizing the path or closing the workbook are
    * ignored, so this method does not throw for those cases.
    *
    * @param filePath
    *   The path to the Excel file
    */
  def closeAndRemove(filePath: String): Unit =
    try
      val normalizedPath = new File(filePath).getCanonicalPath
      closeQuietly(cache.remove(normalizedPath))
    catch
      case NonFatal(_) =>
        // Ignore errors in cleanup - this is a best-effort operation
        ()
    end try
  end closeAndRemove

  /** Clear the entire cache and attempt to close all cached workbooks.
    *
    * Works on a snapshot of the keys present when the call starts. Keys are already canonical
    * paths, so entries are removed directly rather than being normalized a second time.
    */
  def clearAll(): Unit =
    val keys = cache.keySet().toArray(Array.empty[String])
    keys.foreach(key => closeQuietly(cache.remove(key)))
  end clearAll

  /** Get the current number of cached workbook references.
    *
    * Note that this includes weak references whose workbook may have been garbage collected.
    *
    * @return
    *   The number of entries currently in the cache map
    */
  def cacheSize: Int = cache.size()

  /** Dereferences a possibly-null weak reference into the workbook it still points to, if any. */
  private def liveWorkbook(ref: WeakReference[Workbook]): Option[Workbook] =
    Option(ref).flatMap(r => Option(r.get()))

  /** Probes a cached workbook by asking for its sheet count; any non-fatal failure marks it unusable.
    *
    * NOTE(review): whether POI actually throws from `getNumberOfSheets` on a closed workbook
    * is not established here — confirm that this probe detects closed workbooks.
    */
  private def isUsable(workbook: Workbook): Boolean =
    try
      workbook.getNumberOfSheets
      true
    catch
      case NonFatal(_) => false

  /** Closes the workbook behind `ref`, if it is still reachable, ignoring non-fatal close errors. */
  private def closeQuietly(ref: WeakReference[Workbook]): Unit =
    liveWorkbook(ref).foreach { workbook =>
      try workbook.close()
      catch
        case NonFatal(_) =>
          // Ignore close errors - workbook might already be corrupted/closed
          ()
    }

end ExcelWorkbookCache

0 commit comments

Comments
 (0)