Skip to content

Commit 014914d

Browse files
committed
Improve Excek reading... again
1 parent 7379d0f commit 014914d

File tree

5 files changed

+88
-42
lines changed

5 files changed

+88
-42
lines changed

scautable/src-jvm/ExcelMacros.scala

Lines changed: 41 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,8 @@ import java.io.File
1010
import io.github.quafadas.scautable.BadTableException
1111
import io.github.quafadas.scautable.InferrerOps
1212

13-
/** Compile-time macro functions for reading Excel files These macros perform Excel file inspection at compile time to determine structure
13+
/** Compile-time macro functions for reading val initial = ColumnTypeInfo()
14+
val finalInfo = cells.foldLeft(initial)(updateTypeInfo) files These macros perform Excel file inspection at compile time to determine structure
1415
*/
1516
object ExcelMacros:
1617

@@ -111,8 +112,8 @@ object ExcelMacros:
111112
inferredTypeRepr.asType match
112113
case '[v] => constructWithTypes[hdrs & Tuple, v & Tuple]
113114
end match
114-
case _ =>
115-
report.throwError("TypeInferrer not found")
115+
case other =>
116+
report.throwError(s"TypeInferrer not found: ${other}")
116117
case _ =>
117118
report.throwError(s"Could not summon Type for type: ${tupleExpr2.show}")
118119
end match
@@ -202,26 +203,30 @@ object ExcelMacros:
202203
val workbook = WorkbookFactory.create(new File(filePath))
203204
try
204205
val sheet = workbook.getSheet(sheetName)
205-
val sheetIterator = sheet.iterator().asScala
206-
207-
// Skip header row
208-
if sheetIterator.hasNext then sheetIterator.next()
209-
end if
210-
211-
val sampleRows = sheetIterator.take(numRows).toList
212206

213207
// Extract data based on column range or use all columns
214208
val columnData: List[List[Cell]] = colRange match
215209
case Some(range) if range.nonEmpty =>
216210
val cellRange = CellRangeAddress.valueOf(range)
211+
val firstRow = cellRange.getFirstRow
212+
val lastRow = cellRange.getLastRow
217213
val firstCol = cellRange.getFirstColumn
218214
val lastCol = cellRange.getLastColumn
219-
sampleRows.map { row =>
215+
216+
// Read only the specific rows from the range (including headers)
217+
val targetRows = (firstRow to lastRow).map(sheet.getRow).filter(_ != null).toList
218+
219+
targetRows.map { row =>
220220
(firstCol to lastCol).map { i =>
221221
row.getCell(i, Row.MissingCellPolicy.CREATE_NULL_AS_BLANK)
222222
}.toList
223223
}
224224
case _ =>
225+
val sheetIterator = sheet.iterator().asScala
226+
// Skip header row
227+
if sheetIterator.hasNext then sheetIterator.next()
228+
end if
229+
val sampleRows = sheetIterator.take(numRows).toList
225230
sampleRows.map { row =>
226231
// Ensure we extract exactly headers.length columns to match headers
227232
(0 until headers.length).map { i =>
@@ -233,8 +238,11 @@ object ExcelMacros:
233238
val columns = columnData.transpose
234239

235240
// Infer type for each column using Apache POI cell types
241+
// SKIP THE FIRST ROW (header row) for type inference
236242
val columnTypes: List[TypeRepr] = columns.map { columnCells =>
237-
inferColumnTypeFromCells(columnCells, preferIntToBoolean)
243+
// TODO Interaction with HeaderOptions
244+
val dataRows = columnCells.drop(1) // Skip header row
245+
inferColumnTypeFromCells(dataRows, preferIntToBoolean)
238246
}
239247

240248
// Build tuple type from column types
@@ -243,8 +251,14 @@ object ExcelMacros:
243251
}
244252

245253
tupleType
246-
finally workbook.close()
247-
end try
254+
finally
255+
try
256+
workbook.close()
257+
catch
258+
case _: Exception =>
259+
// Workbook close can fail for Excel files with corrupted drawings - this is expected
260+
println(s"Warning: Could not close workbook for file: $filePath")
261+
end try
248262
end inferTypesFromExcelDataDirect
249263

250264
/** Infer the most appropriate Scala type for a column based on Apache POI cell types
@@ -286,10 +300,11 @@ object ExcelMacros:
286300
)
287301
else
288302
val numericValue = cell.getNumericCellValue
289-
val isWholeNumber = numericValue == numericValue.toLong
303+
val isWholeNumber = numericValue == numericValue.toLong && !numericValue.isInfinite && !numericValue.isNaN
290304
info.copy(
291305
couldBeInt = info.couldBeInt && isWholeNumber && numericValue >= Int.MinValue && numericValue <= Int.MaxValue,
292306
couldBeLong = info.couldBeLong && isWholeNumber && numericValue >= Long.MinValue && numericValue <= Long.MaxValue,
307+
couldBeDouble = info.couldBeDouble && !numericValue.isInfinite && !numericValue.isNaN, // All finite numeric values can be Double
293308
// Be more conservative with Boolean inference for numeric cells - only if it's exactly 0 or 1
294309
couldBeBoolean = info.couldBeBoolean && isWholeNumber && (numericValue == 0.0 || numericValue == 1.0)
295310
)
@@ -331,9 +346,18 @@ object ExcelMacros:
331346
val initial = ColumnTypeInfo()
332347
val finalInfo = cells.foldLeft(initial)(updateTypeInfo)
333348

349+
// // Debug output to understand type inference
350+
// println(s"DEBUG inferColumnTypeFromCells: ${cells.length} cells")
351+
// cells.take(3).foreach { cell =>
352+
// println(s" Cell type: ${cell.getCellType}, value: '${cell.toString}'")
353+
// }
354+
// println(s" Final: couldBeInt=${finalInfo.couldBeInt}, couldBeDouble=${finalInfo.couldBeDouble}, couldBeBoolean=${finalInfo.couldBeBoolean}, seenEmpty=${finalInfo.seenEmpty}")
355+
334356
// Determine the most appropriate type based on what the column could be
335-
val baseType =
336-
if preferIntToBoolean then
357+
val baseType =
358+
// If we have no cells, default to String (safest option)
359+
if cells.isEmpty then TypeRepr.of[String]
360+
else if preferIntToBoolean then
337361
if finalInfo.couldBeInt then TypeRepr.of[Int]
338362
else if finalInfo.couldBeBoolean then TypeRepr.of[Boolean]
339363
else if finalInfo.couldBeLong then TypeRepr.of[Long]
@@ -352,5 +376,4 @@ object ExcelMacros:
352376
if finalInfo.seenEmpty then TypeRepr.of[Option].appliedTo(baseType) else baseType
353377
end if
354378
end inferColumnTypeFromCells
355-
356379
end ExcelMacros
58 Bytes
Binary file not shown.

scautable/test/src-jvm/testExcel.scala

Lines changed: 29 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,15 @@ package io.github.quafadas.scautable
44
import scala.NamedTuple.*
55

66
import io.github.quafadas.table.{*, given}
7+
8+
9+
/**
10+
* The cell contents of "Numbers.xlsx"
11+
*
12+
Doubles Int Longs Strings
13+
1.10 1.00 1.00 blah
14+
2.20 2.00 3.00 blah
15+
*/
716
class ExcelSuite extends munit.FunSuite:
817

918
test("excel provider compiles and typechecks") {
@@ -26,7 +35,7 @@ class ExcelSuite extends munit.FunSuite:
2635
}
2736

2837
test("Numbers") {
29-
val csv2 = Excel.resource("Numbers.xlsx", "Sheet1", TypeInferrer.FromAllRows)
38+
val csv2 = Excel.resource("Numbers.xlsx", "Sheet1", "A1:D3", TypeInferrer.FromAllRows)
3039
csv2.toSeq.ptbln
3140
val csv = Excel.resource("Numbers.xlsx", "Sheet1", "A1:C3", TypeInferrer.FromAllRows)
3241
val seq = csv.toSeq
@@ -121,33 +130,33 @@ class ExcelSuite extends munit.FunSuite:
121130

122131
test("excel provider with TypeInferrer.FirstRow should infer types from first row") {
123132
// Test FirstRow with Numbers.xlsx - should be equivalent to FirstN(1)
124-
def csv = Excel.resource("Numbers.xlsx", "Sheet1", "", TypeInferrer.FirstRow)
133+
val csv = Excel.resource("Numbers.xlsx", "Sheet1", "A1:D3", TypeInferrer.FirstRow)
125134

126135
// Verify that we can read the data and it compiles with inferred types
127-
val rows = csv.toList
136+
val rows: List[(Doubles : Double, Int : Int, Longs : Int, Strings : String)] = csv.toList
128137
assertEquals(rows.size, 2)
129138

130139
// Test access to columns with the inferred types (should be same as FirstN(1))
131-
assertEquals(csv.column["Doubles"].toList.head, 1.1) // Double
132-
assertEquals(csv.column["Strings"].toList.head, "blah") // String
133-
assertEquals(csv.column["Int"].toList.head, 1) // Apache POI correctly identifies as Int
134-
assertEquals(csv.column["Longs"].toList.head, 1) // Apache POI correctly identifies as Int
140+
assertEquals(rows.column["Doubles"].toList.head, 1.1) // Double
141+
assertEquals(rows.column["Strings"].toList.head, "blah") // String
142+
assertEquals(rows.column["Int"].toList.head, 1) // Apache POI correctly identifies as Int
143+
assertEquals(rows.column["Longs"].toList.head, 1) // Apache POI correctly identifies as Int
135144
}
136145

137146
test("excel provider with TypeInferrer.FromAllRows should infer types from all rows") {
138147
// Test FromAllRows with Numbers.xlsx - should consider all data rows for type inference
139-
def csv = Excel.resource("Numbers.xlsx", "Sheet1", "", TypeInferrer.FromAllRows)
148+
val csv = Excel.resource("Numbers.xlsx", "Sheet1", "", TypeInferrer.FromAllRows)
140149

141150
// Verify that we can read the data and it compiles with inferred types
142151
val rows = csv.toList
143152
assertEquals(rows.size, 2)
144153

145154
// Test access to columns with the inferred types
146-
assertEquals(csv.column["Doubles"].toList.head, 1.1) // Double
147-
assertEquals(csv.column["Strings"].toList.head, "blah") // String
155+
assertEquals(rows.column["Doubles"].toList.head, 1.1) // Double
156+
assertEquals(rows.column["Strings"].toList.head, "blah") // String
148157
// Apache POI correctly identifies these as integers
149-
assertEquals(csv.column["Int"].toList.head, 1) // Apache POI correctly identifies as Int
150-
assertEquals(csv.column["Longs"].toList.head, 1) // Apache POI correctly identifies as Int
158+
assertEquals(rows.column["Int"].toList.head, 1) // Apache POI correctly identifies as Int
159+
assertEquals(rows.column["Longs"].toList.head, 1) // Apache POI correctly identifies as Int
151160
}
152161

153162
test("excel provider all TypeInferrer variants now supported") {
@@ -163,32 +172,32 @@ class ExcelSuite extends munit.FunSuite:
163172
test("excel provider with TypeInferrer.FirstN should infer types automatically") {
164173
// Test FirstN with Numbers.xlsx - based on error message, types are inferred as:
165174
// (Doubles : Double, Int : Double, Longs : Double, Strings : String)
166-
def csv = Excel.resource("Numbers.xlsx", "Sheet1", "", TypeInferrer.FirstN(2))
175+
val csv = Excel.resource("Numbers.xlsx", "Sheet1", "", TypeInferrer.FirstN(2))
167176

168177
// Verify that we can read the data and it compiles with inferred types
169178
val rows = csv.toList
170179
assertEquals(rows.size, 2)
171180

172181
// Test access to columns with the actually inferred types
173-
assertEquals(csv.column["Doubles"].toList.head, 1.1) // Double
174-
assertEquals(csv.column["Strings"].toList.head, "blah") // String
182+
assertEquals(rows.column["Doubles"].toList.head, 1.1) // Double
183+
assertEquals(rows.column["Strings"].toList.head, "blah") // String
175184

176185
// Apache POI correctly identifies these as integers based on cell type
177-
assertEquals(csv.column["Int"].toList.head, 1) // Apache POI correctly identifies as Int
178-
assertEquals(csv.column["Longs"].toList.head, 1) // Apache POI correctly identifies as Int
186+
assertEquals(rows.column["Int"].toList.head, 1) // Apache POI correctly identifies as Int
187+
assertEquals(rows.column["Longs"].toList.head, 1) // Apache POI correctly identifies as Int
179188
}
180189

181190
test("excel provider with TypeInferrer.FirstN and preferIntToBoolean parameter") {
182191
// Test FirstN with custom preferIntToBoolean setting - just verify compilation and basic access
183-
def csv = Excel.resource("Numbers.xlsx", "Sheet1", "", TypeInferrer.FirstN(1, false))
192+
val csv: ExcelIterator[("Doubles", "Int", "Longs", "Strings"), (Double, Int, Int, String)] = Excel.resource("Numbers.xlsx", "Sheet1", "A1:D3", TypeInferrer.FirstN(1, false))
184193

185194
// Verify that we can read the data and it compiles with inferred types
186195
val rows = csv.toList
187196
assertEquals(rows.size, 2)
188197

189198
// Basic functionality test - verify we can access data (don't assert specific types due to preferIntToBoolean differences)
190-
assertEquals(csv.column["Doubles"].toList.head, 1.1)
191-
assertEquals(csv.column["Strings"].toList.head, "blah")
199+
assertEquals(rows.column["Doubles"].toList.head, 1.1)
200+
assertEquals(rows.column["Strings"].toList.head, "blah")
192201
// Note: Int/Longs columns may be inferred differently with preferIntToBoolean=false
193202
}
194203

scautable/test/src/csvWriterTest.scala

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -97,7 +97,7 @@ class CSVWriterSuite extends munit.FunSuite:
9797
3,4,8
9898
5,6,9"""
9999

100-
assertEquals(data.toCsv(true, ',', '"'), expected)
100+
assertNoDiff(data.toCsv(true, ',', '"'), expected)
101101
}
102102

103103
test("List[NamedTuple].toCsv with empty values") {
@@ -112,7 +112,7 @@ class CSVWriterSuite extends munit.FunSuite:
112112
3,,8
113113
5,6,"""
114114

115-
assertEquals(data.toCsv(true, ',', '"'), expected)
115+
assertNoDiff(data.toCsv(true, ',', '"'), expected)
116116
}
117117

118118
test("Vector[NamedTuple].toCsv with numeric types") {
@@ -125,7 +125,7 @@ class CSVWriterSuite extends munit.FunSuite:
125125
1,Alice,95.5
126126
2,Bob,87.1"""
127127

128-
assertEquals(data.toCsv(true, ',', '"'), expected)
128+
assertNoDiff(data.toCsv(true, ',', '"'), expected)
129129
}
130130

131131
test("empty collection toCsv") {

site/docs/excel.md

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
11
# Excel
22

3-
43
Reading tables from excel should work a very similar way to CSV. The default behaviour is a bit of a hail mary. It assumes the excel workbook is rather well behaved, and that a blindly configured apache POI `RowIterator` will capture appropriate data. You should _not_ expect this method to be robust to blank rows / columns/ data elsewhere in the sheet.
54

65
```scala mdoc
@@ -25,3 +24,18 @@ One can also read from an absolute path
2524
import io.github.quafadas.table.*
2625
val csv = Excel.absolutePath("path/to/SimpleTable.xlsx", "Sheet1")
2726
```
27+
28+
## Problems and Hints
29+
30+
It is strongly recommended to specify a complete range when working with Excel. e..g.
31+
32+
`val range = Excel.resource("Numbers.xlsx", "Sheet1", "A1:C3", TypeInferrer.FromAllRows)`
33+
34+
Although this _may_ work,
35+
36+
`val norange = Excel.resource("Numbers.xlsx", "Sheet1", TypeInferrer.FromAllRows)`
37+
`val norange = Excel.resource("Numbers.xlsx", "", TypeInferrer.FromAllRows)`
38+
39+
Excel is somewhat pathological with regard to sheet boundaries. It is likely this will include blank cells, which will muck up type inference and reading ranges.
40+
41+
In general, trhe implementation here is not robust to Excel's flexibility (reading formula's is unimplemented) and assumes that we are working with a simple, well formed table.

0 commit comments

Comments
 (0)