@@ -28,6 +28,7 @@ import (
2828 "log"
2929 "os"
3030 "path"
31+ "strings"
3132
3233 // My packages
3334 "github.com/caltechlibrary/cli"
@@ -56,6 +57,7 @@ merged-data.csv..
5657 %s -csv1=data1.csv -col1=1 \
5758 -csv2=data2.csv -col2=3 \
5859 -output=merged-data.csv
60+
5961`
6062
6163 // Standard Options
@@ -65,19 +67,75 @@ merged-data.csv..
6567 outputFName string
6668
6769 // App Options
68- csv1FName string
69- csv2FName string
70- col1 int
71- col2 int
70+ csv1FName string
71+ csv2FName string
72+ col1 int
73+ col2 int
74+ caseSensitive bool
75+ useContains bool
76+ useLevenshtein bool
77+ insertCost int
78+ deleteCost int
79+ substituteCost int
80+ maxEditDistance int
81+ stopWordsOption string
82+ allowDuplicates bool
7283)
7384
74- func scanTable (table [][]string , col2 int , val string ) ([]string , bool ) {
75- for _ , row := range table {
76- if col2 < len (row ) && row [col2 ] == val {
77- return row , true
85+ // cellsMatch checks if two cells' values match
86+ func cellsMatch (val1 , val2 string , stopWords []string ) bool {
87+ if caseSensitive == false {
88+ val2 = strings .ToLower (val2 )
89+ }
90+ if len (stopWords ) > 0 {
91+ val2 = strings .Join (datatools .ApplyStopWords (strings .Split (val2 , " " ), stopWords ), " " )
92+ }
93+ switch {
94+ case useLevenshtein == true :
95+ distance := datatools .Levenshtein (val2 , val1 , insertCost , deleteCost , substituteCost , caseSensitive )
96+ if distance <= maxEditDistance {
97+ return true
98+ }
99+ case useContains == true :
100+ if strings .Contains (val2 , val1 ) {
101+ return true
102+ }
103+ default :
104+ if val1 == val2 {
105+ return true
106+ }
107+ }
108+ return false
109+ }
110+
111+ func scanTable (w * csv.Writer , rowA []string , col1 int , table [][]string , col2 int , stopWords []string ) {
112+ if col1 >= len (rowA ) {
113+ return
114+ }
115+ val1 := rowA [col1 ]
116+ if caseSensitive == false {
117+ val1 = strings .ToLower (val1 )
118+ }
119+ if len (stopWords ) > 0 {
120+ val1 = strings .Join (datatools .ApplyStopWords (strings .Split (val1 , " " ), stopWords ), " " )
121+ }
122+ for i , rowB := range table {
123+ // Emit a joined row if we have a match
124+ if col2 < len (rowB ) {
125+ val2 := rowB [col2 ]
126+ if cellsMatch (val1 , val2 , stopWords ) == true {
127+ // We have a match, join the two rows and output
128+ combinedRows := append (rowA , rowB ... )
129+ if err := w .Write (combinedRows ); err != nil {
130+ fmt .Fprintf (os .Stderr , "Can't write csv row line %d of table 2, %s" , i , err )
131+ return
132+ }
133+ if allowDuplicates == false {
134+ return
135+ }
136+ }
78137 }
79138 }
80- return []string {}, false
81139}
82140
83141func init () {
@@ -96,6 +154,15 @@ func init() {
96154 flag .StringVar (& csv2FName , "csv2" , "" , "second CSV filename" )
97155 flag .IntVar (& col1 , "col1" , 0 , "column to on join on in first CSV file" )
98156 flag .IntVar (& col2 , "col2" , 0 , "column to on join on in second CSV file" )
157+ flag .BoolVar (& caseSensitive , "case-sensitive" , false , "make a case sensitive match (default is case insensitive)" )
158+ flag .BoolVar (& useContains , "contains" , false , "match columns based on csv1/col1 contained in csv2/col2" )
159+ flag .BoolVar (& useLevenshtein , "levenshtein" , false , "match columns using Levensthein edit distance" )
160+ flag .IntVar (& insertCost , "insert-cost" , 1 , "insertion cost to use when calculating Levenshtein edit distance" )
161+ flag .IntVar (& deleteCost , "delete-cost" , 1 , "deletion cost to use when calculating Levenshtein edit distance" )
162+ flag .IntVar (& substituteCost , "substitute-cost" , 1 , "substitution cost to use when calculating Levenshtein edit distance" )
163+ flag .IntVar (& maxEditDistance , "max-edit-distance" , 5 , "maximum edit distance for match using Levenshtein distance" )
164+ flag .StringVar (& stopWordsOption , "stop-words" , "" , "a column delimited list of stop words to ingnore when matching" )
165+ flag .BoolVar (& allowDuplicates , "allow-duplicates" , false , "allowing duplicates returns all rows that have matching columns rather than first match" )
99166}
100167
101168func main () {
@@ -150,6 +217,9 @@ func main() {
150217 os .Exit (1 )
151218 }
152219
220+ // FIXME: Should only read the smaller of two files into memory (and probably only the column value)
221+ // then iterate through the other file for matches. This would let you work with larger files.
222+
153223 // Read in CSV1 and CSV2 then iterate over CSV1 output rows that have
154224 // matching column's value
155225 src1 , err := ioutil .ReadFile (csv1FName )
@@ -189,19 +259,12 @@ func main() {
189259 csv2Table = append (csv2Table , record )
190260 }
191261
262+ stopWords := strings .Split (stopWordsOption , ":" )
192263 w := csv .NewWriter (out )
193- val := ""
194264 for _ , rowA := range csv1Table {
195- if col1 < len (rowA ) {
196- val = rowA [col1 ]
197- // Name see if we find matching row in table 2
198- if rowB , ok := scanTable (csv2Table , col2 , val ); ok == true {
199- // We have
200- combinedRows := append (rowA , rowB ... )
201- if err := w .Write (combinedRows ); err != nil {
202- log .Fatalf ("error wrint args as csv, %s" , err )
203- }
204- }
265+ if col1 < len (rowA ) && rowA [col1 ] != "" {
266+ // We are relying on the side effect of writing the CSV output in scanTable
267+ scanTable (w , rowA , col1 , csv2Table , col2 , stopWords )
205268 }
206269 }
207270 w .Flush ()
0 commit comments