Skip to content

Commit 18250f2

Browse files
committed
Much more stable dialect sniffer
The parser can now infer CSV dialects in a much more stable way by no longer propagating averaging errors into the table record scores.
1 parent 4c1724b commit 18250f2

File tree

6 files changed

+43
-6
lines changed

6 files changed

+43
-6
lines changed

src/Access_version.zip

88 Bytes
Binary file not shown.

src/All_Host_version.zip

99 Bytes
Binary file not shown.

src/CSVSniffer.cls

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -527,7 +527,7 @@ End Function
527527
''' <summary>
528528
''' Calculates a score for the imported data based on the congruence
529529
''' of the detected data and the uniformity of the fields contained
530-
''' in each record. The score is in the range 0 < x <= 100.
530+
''' in each record. The score is in the range 0 < x <= inf.
531531
''' </summary>
532532
''' <param name="ArrayList">CSV imported data.</param>
533533
Public Function TableScore(ByRef ArrayList As CSVArrayList) As Double
@@ -540,7 +540,7 @@ Public Function TableScore(ByRef ArrayList As CSVArrayList) As Double
540540
For L0 = 0 To ArrayList.count - 1
541541
SumRecScores = SumRecScores + RecordScore(ArrayList(L0))
542542
Next L0
543-
TableScore = RecordsConsistencyFactor(ArrayList) * SumRecScores / ArrayList.count
543+
TableScore = RecordsConsistencyFactor(ArrayList) * SumRecScores
544544
End If
545545
End If
546546
End Function

src/CSVinterface.cls

Lines changed: 20 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3149,9 +3149,7 @@ Private Function SniffInString(ByRef confObject As CSVparserConfig, _
31493149
Set ScoreArray = New CSVArrayList
31503150
ScoreArray.indexing = True
31513151
For i = LBound(TmpDelimiters) To UBound(TmpDelimiters)
3152-
If InStrB(1, TmpCSVstr, TmpDelimiters(i)) Then
31533152
For j = LBound(LinesEnds) To UBound(LinesEnds)
3154-
If InStrB(1, TmpCSVstr, LinesEnds(j)) Then
31553153
For k = LBound(QuoteChar) To UBound(QuoteChar)
31563154
'@--------------------------------------------------------------------------------
31573155
'Set CSV dialect
@@ -3168,9 +3166,7 @@ Private Function SniffInString(ByRef confObject As CSVparserConfig, _
31683166
ScoreArray.AddIndexedItem DialectToString(.dialect) & CHR_CARET, GuesserHelper.TableScore(ImportedTable) / 2
31693167
End If
31703168
Next k
3171-
End If
31723169
Next j
3173-
End If
31743170
Next i
31753171
End With
31763172
With ScoreArray
@@ -3183,6 +3179,26 @@ Private Function SniffInString(ByRef confObject As CSVparserConfig, _
31833179
If InStrB(1, TmpCSVstr, CHR_BACKSLASH) Then
31843180
SniffInString.escapeMode = unix
31853181
End If
3182+
'@--------------------------------------------------------------------------------
3183+
'2nd pass for unique record CSV file
3184+
Set ImportedTable = New CSVArrayList
3185+
Set tmpConfig.dialect = SniffInString
3186+
ParseCSVstring TmpCSVstr, tmpConfig, ImportedTable, EmptyParam
3187+
If ImportedTable.count = 1 Then
3188+
If InStrB(1, TmpCSVstr, vbCrLf) Then
3189+
SniffInString.recordsDelimiter = vbCrLf
3190+
Else
3191+
If InStrB(1, TmpCSVstr, vbCr) Then
3192+
SniffInString.recordsDelimiter = vbCr
3193+
Else
3194+
If InStrB(1, TmpCSVstr, vbLf) Then
3195+
SniffInString.recordsDelimiter = vbLf
3196+
Else
3197+
SniffInString.recordsDelimiter = vbCrLf
3198+
End If
3199+
End If
3200+
End If
3201+
End If
31863202
End With
31873203
Erase EmptyParam
31883204
Erase LinesEnds
1.12 KB
Binary file not shown.
Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
=== Delimiters guessing test ===
2+
+ Mixed comma and semicolon
3+
+ File with multi-line field
4+
+ Optional quoted fields
5+
+ Mixed comma and semicolon - file B
6+
+ Geometric CSV
7+
+ Table embedded in the last record
8+
+ Table embedded in the second record
9+
+ Multiple commas in fields
10+
+ Uncommon char as field delimiter
11+
+ Wrong delimiters have been added to guessing operation
12+
+ FEC data - [clevercsv issue #15]
13+
+ Mixed comma and colon - [clevercsv issue #35]
14+
+ Json data type - [clevercsv issue #37]
15+
+ Undefined field delimiter
16+
+ Rainbow CSV [issue #92]
17+
+ Pipe character is more frequent than the comma
18+
+ Pipe character is more frequent than the semicolon
19+
+ Short pipe separated table embedded
20+
= PASS (18 of 18 passed) = 3/4/2023 11:55:13 p. m. =
21+

0 commit comments

Comments
 (0)