1919package main
2020
2121import (
22- "bytes"
2322 "encoding/csv"
2423 "flag"
2524 "fmt"
2625 "io"
27- "io/ioutil"
2826 "os"
2927 "path"
3028 "strings"
@@ -55,8 +53,7 @@ merged-data.csv..
5553
5654 %s -csv1=data1.csv -col1=1 \
5755 -csv2=data2.csv -col2=3 \
58- -output=merged-data.csv
59-
56+ -output=merged-data.csv
6057`
6158
6259 // Standard Options
@@ -71,6 +68,7 @@ merged-data.csv..
7168 csv2FName string
7269 col1 int
7370 col2 int
71+ trimSpaces bool
7472 caseSensitive bool
7573 useContains bool
7674 useLevenshtein bool
@@ -84,6 +82,9 @@ merged-data.csv..
8482
8583// cellsMatch checks if two cells' values match
8684func cellsMatch (val1 , val2 string , stopWords []string ) bool {
85+ if trimSpaces == true {
86+ val2 = strings .TrimSpace (val2 )
87+ }
8788 if caseSensitive == false {
8889 val2 = strings .ToLower (val2 )
8990 }
@@ -113,6 +114,9 @@ func scanTable(w *csv.Writer, rowA []string, col1 int, table [][]string, col2 in
113114 return
114115 }
115116 val1 := rowA [col1 ]
117+ if trimSpaces == true {
118+ val1 = strings .TrimSpace (val1 )
119+ }
116120 if caseSensitive == false {
117121 val1 = strings .ToLower (val1 )
118122 }
@@ -127,7 +131,7 @@ func scanTable(w *csv.Writer, rowA []string, col1 int, table [][]string, col2 in
127131 // We have a match, join the two rows and output
128132 combinedRows := append (rowA , rowB ... )
129133 if err := w .Write (combinedRows ); err != nil {
130- fmt .Fprintf (os .Stderr , "Can't write csv row line %d of table 2, %s" , i , err )
134+ fmt .Fprintf (os .Stderr , "Can't write csv row line %d of table 2, %s\n " , i , err )
131135 return
132136 }
133137 if allowDuplicates == false {
@@ -163,7 +167,8 @@ func init() {
163167 flag .IntVar (& substituteCost , "substitute-cost" , 1 , "substitution cost to use when calculating Levenshtein edit distance" )
164168 flag .IntVar (& maxEditDistance , "max-edit-distance" , 5 , "maximum edit distance for match using Levenshtein distance" )
165169 flag .StringVar (& stopWordsOption , "stop-words" , "" , "a column delimited list of stop words to ingnore when matching" )
166- flag .BoolVar (& allowDuplicates , "allow-duplicates" , false , "allowing duplicates returns all rows that have matching columns rather than first match" )
170+ flag .BoolVar (& allowDuplicates , "allow-duplicates" , true , "allow duplicates when searching for matches" )
171+ flag .BoolVar (& trimSpaces , "trim-spaces" , false , "trim spaces around cell values before comparing" )
167172}
168173
169174func main () {
@@ -218,35 +223,28 @@ func main() {
218223 os .Exit (1 )
219224 }
220225
221- // FIXME: Should only read the smaller of two files into memory (and probably only the column value)
226+ // FIXME: Should only read the smaller of two files into memory
222227 // then iterate through the other file for matches. This would let you work with larger files.
223228
224- // Read in CSV1 and CSV2 then iterate over CSV1 output rows that have
229+ // Read in CSV2 to memory then iterate over CSV1 output rows that have
225230 // matching column's value
226- src1 , err := ioutil . ReadFile (csv1FName )
231+ fp1 , err := os . Open (csv1FName )
227232 if err != nil {
228233 fmt .Fprintf (os .Stderr , "Can't read %s, %s\n " , csv1FName , err )
229234 os .Exit (1 )
230235 }
231- src2 , err := ioutil .ReadFile (csv2FName )
236+ defer fp1 .Close ()
237+ csv1 := csv .NewReader (fp1 )
238+
239+ fp2 , err := os .Open (csv2FName )
232240 if err != nil {
233241 fmt .Fprintf (os .Stderr , "Can't read %s, %s\n " , csv2FName , err )
234242 os .Exit (1 )
235243 }
236- csv1 := csv .NewReader (bytes .NewReader (src1 ))
237- csv1Table := [][]string {}
238- for {
239- record , err := csv1 .Read ()
240- if err == io .EOF {
241- break
242- }
243- if err != nil {
244- fmt .Fprintf (os .Stderr , "%s, %s\n " , csv1FName , err )
245- fmt .Fprintf (os .Stderr , "%T %+v\n " , record , record )
246- }
247- csv1Table = append (csv1Table , record )
248- }
249- csv2 := csv .NewReader (bytes .NewReader (src2 ))
244+ defer fp2 .Close ()
245+ csv2 := csv .NewReader (fp2 )
246+
247+ // Note: we read one of the tables into memory to speed things up and limit disk reads
250248 csv2Table := [][]string {}
251249 for {
252250 record , err := csv2 .Read ()
@@ -262,17 +260,35 @@ func main() {
262260
263261 stopWords := strings .Split (stopWordsOption , ":" )
264262 w := csv .NewWriter (out )
265- for i , rowA := range csv1Table {
266- if col1 < len (rowA ) && rowA [col1 ] != "" {
267- // We are relying on the side effect of writing the CSV output in scanTable
268- scanTable (w , rowA , col1 , csv2Table , col2 , stopWords )
269- w .Flush ()
270- if err := w .Error (); err != nil {
271- fmt .Fprintf (os .Stderr , "Can't write CSV at line %d of csv table 1, %s" , i , err )
272- }
263+ lineNo := 0 // line number of csv 1 table
264+ for {
265+ rowA , err := csv1 .Read ()
266+ if err == io .EOF {
267+ break
273268 }
274- if verbose == true && (i % 100 ) == 0 {
275- fmt .Fprintf (os .Stderr , "%d rows of csv table 1 processed\n " , i )
269+ if err != nil {
270+ fmt .Fprintf (os .Stderr , "%d %s\n " , lineNo , err )
271+ } else {
272+ if col1 < len (rowA ) && rowA [col1 ] != "" {
273+ // We are relying on the side effect of writing the CSV output in scanTable
274+ scanTable (w , rowA , col1 , csv2Table , col2 , stopWords )
275+ w .Flush ()
276+ if err := w .Error (); err != nil {
277+ fmt .Fprintf (os .Stderr , "Can't write CSV at line %d of csv table 1, %s\n " , lineNo , err )
278+ }
279+ }
280+ if verbose == true {
281+ if (lineNo % 100 ) == 0 && lineNo > 0 {
282+ fmt .Fprintf (os .Stderr , "\n %d rows of %s processed\n " , lineNo , csv1FName )
283+ } else {
284+ fmt .Fprintf (os .Stderr , "." )
285+ }
286+ }
276287 }
288+ lineNo ++
289+ }
290+ w .Flush ()
291+ if err := w .Error (); err != nil {
292+ fmt .Fprintf (os .Stderr , "Can't write final CSV at line %d lines processed from CSV table 1, %s\n " , lineNo + 1 , err )
277293 }
278294}
0 commit comments