Skip to content

Commit e410621

Browse files
committed
Updated dialect sniffer
1 parent 0f76b5b commit e410621

7 files changed

+115
-57
lines changed

VBA-CSV-interface.zip

19.7 MB
Binary file not shown.

src/Access_version.zip

228 Bytes
Binary file not shown.

src/All_Host_version.zip

216 Bytes
Binary file not shown.

src/CSVinterface.cls

Lines changed: 88 additions & 57 deletions
Original file line numberDiff line numberDiff line change
@@ -1863,15 +1863,15 @@ Private Sub LoadNewStream()
18631863
End If
18641864
End Sub
18651865
Private Function MaxIndexVal(ByRef values As Variant) As Long
1866-
Dim tmpVal As Long, idx0 As Long, idx As Long
1866+
Dim tmpVal As Double, idx0 As Long, idx As Long
18671867

18681868
idx0 = LBound(values)
18691869
idx = idx0
1870-
tmpVal = values(idx0)
1870+
tmpVal = CDbl(values(idx0))
18711871
For idx0 = idx0 + 1 To UBound(values)
1872-
If values(idx0) > tmpVal Then
1872+
If CDbl(values(idx0)) > tmpVal Then
18731873
idx = idx0
1874-
tmpVal = values(idx0)
1874+
tmpVal = CDbl(values(idx0))
18751875
End If
18761876
Next idx0
18771877
MaxIndexVal = idx
@@ -3095,109 +3095,140 @@ Private Function SniffInString(ByRef confObject As CSVparserConfig, _
30953095
ByRef CSVstring As String, _
30963096
Optional Buffered As Boolean = False) As CSVdialect
30973097

3098-
Dim CtableScore() As Double
3099-
Dim DelIdx As Long
31003098
Dim EmptyParam() As Variant
31013099
Dim GuesserHelper As CSVSniffer
3102-
Dim GuessResults As CSVArrayList
3103-
Dim L1_FieldDelimiter As Long
3100+
Dim ImportedTable As CSVArrayList
3101+
Dim i As Long, j As Long, k As Long
31043102
Dim LinesEnds() As String
3105-
Dim Quote As String
31063103
Dim QuoteChar() As QuoteTokens
3104+
Dim ScoreArray As CSVArrayList
31073105
Dim tmpConfig As CSVparserConfig
3106+
Dim tmpResult As String
31083107
Dim TmpCSVstr As String
31093108
Dim TmpDelimiters() As String
3110-
Dim UBlinesEnds() As Long
3111-
Dim UBquoteChar() As Long
31123109

31133110
ReDim LinesEnds(0 To 2)
31143111
Set tmpConfig = confObject.CopyConfig
31153112
'@--------------------------------------------------------------------------------
3113+
'Field delimiters array
3114+
TmpDelimiters() = tmpConfig.delimitersToGuess
3115+
'@--------------------------------------------------------------------------------
31163116
'LinesEnds array
31173117
LinesEnds(0) = vbCrLf
31183118
LinesEnds(1) = vbCr
31193119
LinesEnds(2) = vbLf
3120-
ReDim UBlinesEnds(0 To 2)
31213120
'@--------------------------------------------------------------------------------
31223121
'Quotes array
31233122
ReDim QuoteChar(0 To 2)
31243123
QuoteChar(0) = DoubleQuotes
31253124
QuoteChar(1) = Apostrophe
31263125
QuoteChar(2) = Tilde
3127-
ReDim UBquoteChar(LBound(QuoteChar) To UBound(QuoteChar))
31283126
'@--------------------------------------------------------------------------------
31293127
'Parser config
31303128
ReDim EmptyParam(-1 To -1) 'To emule empty ParamArray parameter
3131-
TmpDelimiters() = tmpConfig.delimitersToGuess
31323129
Set SniffInString = New CSVdialect
31333130
'@--------------------------------------------------------------------------------
3134-
'Scores array
3135-
ReDim CtableScore(LBound(TmpDelimiters) To UBound(TmpDelimiters))
3136-
'@--------------------------------------------------------------------------------
3137-
'String to guess
3131+
'String source
31383132
If Not Buffered Then
31393133
TmpCSVstr = MidB$(CSVstring, 1, 524288) 'Load 0.5 MB of data
31403134
Else
31413135
TmpCSVstr = CSVstring
31423136
End If
3143-
'@--------------------------------------------------------------------------------
3144-
'Guess records delimiter
3145-
For DelIdx = LBound(LinesEnds) To UBound(LinesEnds)
3146-
UBlinesEnds(DelIdx) = UBound(Split(TmpCSVstr, LinesEnds(DelIdx)))
3147-
Next DelIdx
3148-
'@--------------------------------------------------------------------------------
3149-
'Guess Quote char
3150-
For DelIdx = LBound(QuoteChar) To UBound(QuoteChar)
3151-
Select Case QuoteChar(DelIdx)
3152-
Case 1
3153-
Quote = CHR_APOSTROPHE
3154-
Case 2
3155-
Quote = CHR_DOUBLE_QUOTES
3156-
Case Else
3157-
Quote = CHR_TILDE
3158-
End Select
3159-
UBquoteChar(DelIdx) = UBound(Split(TmpCSVstr, Quote))
3160-
Next DelIdx
3161-
TmpCSVstr = vbNullString
31623137
With tmpConfig
31633138
'@--------------------------------------------------------------------------------
3164-
'Set guessed chars
3165-
.dialect.recordsDelimiter = LinesEnds(MaxIndexVal(UBlinesEnds))
3166-
.dialect.quoteToken = QuoteChar(MaxIndexVal(UBquoteChar))
3167-
'@--------------------------------------------------------------------------------
3168-
'Guess fields delimiter using 10 samples lines with skipping
3139+
'Guess dialect using 10 samples records with skipping
31693140
.startingRecord = 1
31703141
.endingRecord = 10
31713142
.skipCommentLines = True
31723143
.skipEmptyLines = True
31733144
Set GuesserHelper = New CSVSniffer
3174-
For DelIdx = LBound(TmpDelimiters) To UBound(TmpDelimiters)
3175-
If InStrB(1, CSVstring, TmpDelimiters(DelIdx)) Then
3176-
.dialect.fieldsDelimiter = TmpDelimiters(DelIdx)
3177-
Set GuessResults = New CSVArrayList
3178-
ParseCSVstring CSVstring, tmpConfig, GuessResults, EmptyParam
3179-
CtableScore(DelIdx) = GuesserHelper.TableScore(GuessResults)
3145+
Set ScoreArray = New CSVArrayList
3146+
ScoreArray.indexing = True
3147+
For i = LBound(TmpDelimiters) To UBound(TmpDelimiters)
3148+
If InStrB(1, TmpCSVstr, TmpDelimiters(i)) Then
3149+
For j = LBound(LinesEnds) To UBound(LinesEnds)
3150+
If InStrB(1, TmpCSVstr, LinesEnds(j)) Then
3151+
For k = LBound(QuoteChar) To UBound(QuoteChar)
3152+
If InStrB(1, TmpCSVstr, GetQuoteChar(QuoteChar(k))) Then
3153+
'@--------------------------------------------------------------------------------
3154+
'Set CSV dialect
3155+
.dialect.fieldsDelimiter = TmpDelimiters(i)
3156+
.dialect.recordsDelimiter = LinesEnds(j)
3157+
.dialect.quoteToken = QuoteChar(k)
3158+
Set ImportedTable = New CSVArrayList
3159+
ParseCSVstring TmpCSVstr, tmpConfig, ImportedTable, EmptyParam
3160+
'@--------------------------------------------------------------------------------
3161+
'Save results with keys
3162+
ScoreArray.AddIndexedItem DialectToString(.dialect), GuesserHelper.TableScore(ImportedTable)
3163+
End If
3164+
Next k
3165+
End If
3166+
Next j
31803167
End If
3181-
Next DelIdx
3168+
Next i
3169+
End With
3170+
With ScoreArray
31823171
'@--------------------------------------------------------------------------------
31833172
'Choose the maximum score
3184-
L1_FieldDelimiter = MaxIndexVal(CtableScore)
3173+
tmpResult = .keys()(MaxIndexVal(.indexedItems))
31853174
'@--------------------------------------------------------------------------------
31863175
'Returns
3187-
SniffInString.quoteToken = .dialect.quoteToken
3188-
SniffInString.recordsDelimiter = .dialect.recordsDelimiter
3189-
SniffInString.fieldsDelimiter = TmpDelimiters(L1_FieldDelimiter)
3190-
If InStrB(1, CSVstring, CHR_BACKSLASH) Then
3176+
Set SniffInString = StringToDialect(tmpResult)
3177+
If InStrB(1, TmpCSVstr, CHR_BACKSLASH) Then
31913178
SniffInString.escapeMode = unix
31923179
End If
31933180
End With
31943181
Erase EmptyParam
3195-
Erase CtableScore
31963182
Erase LinesEnds
31973183
Erase QuoteChar
31983184
Erase TmpDelimiters
3199-
Erase UBlinesEnds
3200-
Erase UBquoteChar
3185+
Set ScoreArray = Nothing
3186+
End Function
3187+
Private Function DialectToString(ByRef dialectObj As CSVdialect) As String
3188+
Dim tmpResult() As String
3189+
ReDim tmpResult(0 To 2)
3190+
3191+
With dialectObj
3192+
tmpResult(0) = .fieldsDelimiter
3193+
tmpResult(1) = .recordsDelimiter
3194+
tmpResult(2) = CStr(.quoteToken)
3195+
End With
3196+
DialectToString = Join$(tmpResult, "ii")
3197+
End Function
3198+
Private Function StringToDialect(ByRef dialectString As String) As CSVdialect
3199+
Dim tmpArr() As String
3200+
Dim idx As Long
3201+
Dim tmpResult As CSVdialect
3202+
3203+
tmpArr() = Split(dialectString, "ii")
3204+
idx = LBound(tmpArr)
3205+
Set tmpResult = New CSVdialect
3206+
With tmpResult
3207+
.fieldsDelimiter = tmpArr(idx)
3208+
.recordsDelimiter = tmpArr(idx + 1)
3209+
.quoteToken = GetQuoteToken(CLng(tmpArr(idx + 2)))
3210+
End With
3211+
Set StringToDialect = tmpResult
3212+
End Function
3213+
Private Function GetQuoteChar(ByRef QTokenCode As QuoteTokens) As String
3214+
Select Case QTokenCode
3215+
Case 1
3216+
GetQuoteChar = CHR_APOSTROPHE
3217+
Case 2
3218+
GetQuoteChar = CHR_DOUBLE_QUOTES
3219+
Case Else
3220+
GetQuoteChar = CHR_TILDE
3221+
End Select
3222+
End Function
3223+
Private Function GetQuoteToken(ByRef QTokenCode As Long) As QuoteTokens
3224+
Select Case QTokenCode
3225+
Case 1
3226+
GetQuoteToken = QuoteTokens.Apostrophe
3227+
Case 2
3228+
GetQuoteToken = QuoteTokens.DoubleQuotes
3229+
Case Else
3230+
GetQuoteToken = QuoteTokens.Tilde
3231+
End Select
32013232
End Function
32023233
Public Function Sort(Optional ByVal fromIndex As Long = -1, _
32033234
Optional ByVal toIndex As Long = -1, _
16.2 KB
Binary file not shown.

testing/tests/delimiters-guessing/Line feed character is more frequent than the car return-line feed combination.csv

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
ID;Type;Item Name;Description1;Toys;Box of Tricks;Set for easy magic2;Toys;Lexibook;Educational laptop3;Books;Lessons in Chemistry by Bonnie Garmus;#1 NEW YORK TIMES BESTSELLER.
2+
Meet Elizabeth Zott: "a gifted research chemist, absurdly self-assured and immune to social convention"
3+
(The Washington Post) in 1960s California whose career takes a detour when she becomes the unlikely star of a beloved TV cooking show.
4+
This novel is "irresistible, satisfying and full of fuel" (The New York Times Book Review)
5+
and "witty, sometimes hilarious...the Catch-22 of early feminism." (Stephen King, via Twitter)
Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
=== Delimiters guessing test ===
2+
+ Mixed comma and semicolon
3+
+ File with multi-line field
4+
+ Optional quoted fields
5+
+ Mixed comma and semicolon - file B
6+
+ Geometric CSV
7+
+ Table embedded in the last record
8+
+ Table embedded in the second record
9+
+ Multiple commas in fields
10+
+ Uncommon char as field delimiter
11+
+ Wrong delimiters have been added to guessing operation
12+
+ FEC data - [clevercsv issue #15]
13+
+ Mixed comma and colon - [clevercsv issue #35]
14+
+ Json data type - [clevercsv issue #37]
15+
+ Undefined field delimiter
16+
+ Rainbow CSV [issue #92]
17+
+ Pipe character is more frequent than the comma
18+
+ Pipe character is more frequent than the semicolon
19+
+ Short pipe separated table embedded
20+
+ LF character is more frequent than CRLF combination
21+
= PASS (19 of 19 passed) = 25/3/2023 10:34:46 p.�m. =
22+

0 commit comments

Comments
 (0)