Skip to content

Commit 0ec064a

Browse files
committed
Added how-to; the csvcols -f option is now the -col option, to match the feel of csvfind and csvjoin
1 parent 13d1493 commit 0ec064a

28 files changed

+333
-49
lines changed

cmds/csvcols/csvcols.go

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -63,11 +63,11 @@ Example parsing a pipe delimited string into a CSV line
6363
6464
Filter a 10 column CSV file for columns 0,3,5 (left most column is number zero)
6565
66-
cat 10col.csv | csvcols -f 0 3 5 > 3col.csv
66+
cat 10col.csv | csvcols -col 0 3 5 > 3col.csv
6767
6868
Filter a 10 columns CSV file for columns 0,3,5 from input file
6969
70-
%s -i 10col.csv -f 0 3 5 > 3col.csv
70+
%s -i 10col.csv -col 0 3 5 > 3col.csv
7171
`
7272

7373
// Standard Options
@@ -139,7 +139,7 @@ func init() {
139139
// App Options
140140
flag.StringVar(&delimiter, "d", "", "set delimiter for conversion")
141141
flag.StringVar(&delimiter, "delimiter", "", "set delimiter for conversion")
142-
flag.BoolVar(&filterColumns, "f", false, "filter CSV input for columns requested")
142+
flag.BoolVar(&filterColumns, "col", false, "filter CSV input for columns requested")
143143
flag.BoolVar(&filterColumns, "filter-columns", false, "filter CSV input for columns requested")
144144
}
145145

cmds/csvfind/csvfind.go

Lines changed: 7 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,6 @@ In this example we've appended the edit distance to see how close the matches ar
6262
You can also search for phrases in columns.
6363
6464
%s -i books.csv -col=1 -contains "Red Book"
65-
6665
`
6766

6867
// Standard Options
@@ -84,17 +83,10 @@ You can also search for phrases in columns.
8483
maxEditDistance int
8584
appendEditDistance bool
8685
stopWordsOption string
86+
trimSpaces bool
87+
allowDuplicates bool
8788
)
8889

89-
func scanTable(table [][]string, col2 int, val string) ([]string, bool) {
90-
for _, row := range table {
91-
if col2 < len(row) && row[col2] == val {
92-
return row, true
93-
}
94-
}
95-
return []string{}, false
96-
}
97-
9890
func init() {
9991
// Basic Options
10092
flag.BoolVar(&showHelp, "h", false, "display help")
@@ -120,6 +112,8 @@ func init() {
120112
flag.BoolVar(&appendEditDistance, "append-edit-distance", false, "append column with edit distance found (useful for tuning levenshtein)")
121113
flag.StringVar(&stopWordsOption, "stop-words", "", "use the colon delimited list of stop words")
122114
flag.BoolVar(&skipHeaderRow, "skip-header-row", true, "skip the header row")
115+
flag.BoolVar(&allowDuplicates, "allow-duplicates", true, "allow duplicates when searching for matches")
116+
flag.BoolVar(&trimSpaces, "trim-spaces", false, "trim spaces around cell values before comparing")
123117
}
124118

125119
func main() {
@@ -239,6 +233,9 @@ func main() {
239233
}
240234
}
241235
}
236+
if allowDuplicates == false {
237+
break
238+
}
242239
} else {
243240
fmt.Fprintf(os.Stderr, "%d line skipped, missing column %d", lineNo, col)
244241
}

cmds/csvjoin/csvjoin.go

Lines changed: 50 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -19,12 +19,10 @@
1919
package main
2020

2121
import (
22-
"bytes"
2322
"encoding/csv"
2423
"flag"
2524
"fmt"
2625
"io"
27-
"io/ioutil"
2826
"os"
2927
"path"
3028
"strings"
@@ -55,8 +53,7 @@ merged-data.csv..
5553
5654
%s -csv1=data1.csv -col1=1 \
5755
-csv2=data2.csv -col2=3 \
58-
-output=merged-data.csv
59-
56+
-output=merged-data.csv
6057
`
6158

6259
// Standard Options
@@ -71,6 +68,7 @@ merged-data.csv..
7168
csv2FName string
7269
col1 int
7370
col2 int
71+
trimSpaces bool
7472
caseSensitive bool
7573
useContains bool
7674
useLevenshtein bool
@@ -84,6 +82,9 @@ merged-data.csv..
8482

8583
// cellsMatch checks if two cells' values match
8684
func cellsMatch(val1, val2 string, stopWords []string) bool {
85+
if trimSpaces == true {
86+
val2 = strings.TrimSpace(val2)
87+
}
8788
if caseSensitive == false {
8889
val2 = strings.ToLower(val2)
8990
}
@@ -113,6 +114,9 @@ func scanTable(w *csv.Writer, rowA []string, col1 int, table [][]string, col2 in
113114
return
114115
}
115116
val1 := rowA[col1]
117+
if trimSpaces == true {
118+
val1 = strings.TrimSpace(val1)
119+
}
116120
if caseSensitive == false {
117121
val1 = strings.ToLower(val1)
118122
}
@@ -127,7 +131,7 @@ func scanTable(w *csv.Writer, rowA []string, col1 int, table [][]string, col2 in
127131
// We have a match, join the two rows and output
128132
combinedRows := append(rowA, rowB...)
129133
if err := w.Write(combinedRows); err != nil {
130-
fmt.Fprintf(os.Stderr, "Can't write csv row line %d of table 2, %s", i, err)
134+
fmt.Fprintf(os.Stderr, "Can't write csv row line %d of table 2, %s\n", i, err)
131135
return
132136
}
133137
if allowDuplicates == false {
@@ -163,7 +167,8 @@ func init() {
163167
flag.IntVar(&substituteCost, "substitute-cost", 1, "substitution cost to use when calculating Levenshtein edit distance")
164168
flag.IntVar(&maxEditDistance, "max-edit-distance", 5, "maximum edit distance for match using Levenshtein distance")
165169
flag.StringVar(&stopWordsOption, "stop-words", "", "a column delimited list of stop words to ingnore when matching")
166-
flag.BoolVar(&allowDuplicates, "allow-duplicates", false, "allowing duplicates returns all rows that have matching columns rather than first match")
170+
flag.BoolVar(&allowDuplicates, "allow-duplicates", true, "allow duplicates when searching for matches")
171+
flag.BoolVar(&trimSpaces, "trim-spaces", false, "trim spaces around cell values before comparing")
167172
}
168173

169174
func main() {
@@ -218,35 +223,28 @@ func main() {
218223
os.Exit(1)
219224
}
220225

221-
// FIXME: Should only read the smaller of two files into memory (and probably only the column value)
226+
// FIXME: Should only read the smaller of two files into memory
222227
// then iterate through the other file for matches. This would let you work with larger files.
223228

224-
// Read in CSV1 and CSV2 then iterate over CSV1 output rows that have
229+
// Read in CSV2 to memory then iterate over CSV1 output rows that have
225230
// matching column's value
226-
src1, err := ioutil.ReadFile(csv1FName)
231+
fp1, err := os.Open(csv1FName)
227232
if err != nil {
228233
fmt.Fprintf(os.Stderr, "Can't read %s, %s\n", csv1FName, err)
229234
os.Exit(1)
230235
}
231-
src2, err := ioutil.ReadFile(csv2FName)
236+
defer fp1.Close()
237+
csv1 := csv.NewReader(fp1)
238+
239+
fp2, err := os.Open(csv2FName)
232240
if err != nil {
233241
fmt.Fprintf(os.Stderr, "Can't read %s, %s\n", csv2FName, err)
234242
os.Exit(1)
235243
}
236-
csv1 := csv.NewReader(bytes.NewReader(src1))
237-
csv1Table := [][]string{}
238-
for {
239-
record, err := csv1.Read()
240-
if err == io.EOF {
241-
break
242-
}
243-
if err != nil {
244-
fmt.Fprintf(os.Stderr, "%s, %s\n", csv1FName, err)
245-
fmt.Fprintf(os.Stderr, "%T %+v\n", record, record)
246-
}
247-
csv1Table = append(csv1Table, record)
248-
}
249-
csv2 := csv.NewReader(bytes.NewReader(src2))
244+
defer fp2.Close()
245+
csv2 := csv.NewReader(fp2)
246+
247+
// Note: we read one of the tables into memory to speed things up and limit disc reads
250248
csv2Table := [][]string{}
251249
for {
252250
record, err := csv2.Read()
@@ -262,17 +260,35 @@ func main() {
262260

263261
stopWords := strings.Split(stopWordsOption, ":")
264262
w := csv.NewWriter(out)
265-
for i, rowA := range csv1Table {
266-
if col1 < len(rowA) && rowA[col1] != "" {
267-
// We are relying on the side effect of writing the CSV output in scanTable
268-
scanTable(w, rowA, col1, csv2Table, col2, stopWords)
269-
w.Flush()
270-
if err := w.Error(); err != nil {
271-
fmt.Fprintf(os.Stderr, "Can't write CSV at line %d of csv table 1, %s", i, err)
272-
}
263+
lineNo := 0 // line number of csv 1 table
264+
for {
265+
rowA, err := csv1.Read()
266+
if err == io.EOF {
267+
break
273268
}
274-
if verbose == true && (i%100) == 0 {
275-
fmt.Fprintf(os.Stderr, "%d rows of csv table 1 processed\n", i)
269+
if err != nil {
270+
fmt.Fprintf(os.Stderr, "%d %s\n", lineNo, err)
271+
} else {
272+
if col1 < len(rowA) && rowA[col1] != "" {
273+
// We are relying on the side effect of writing the CSV output in scanTable
274+
scanTable(w, rowA, col1, csv2Table, col2, stopWords)
275+
w.Flush()
276+
if err := w.Error(); err != nil {
277+
fmt.Fprintf(os.Stderr, "Can't write CSV at line %d of csv table 1, %s\n", lineNo, err)
278+
}
279+
}
280+
if verbose == true {
281+
if (lineNo%100) == 0 && lineNo > 0 {
282+
fmt.Fprintf(os.Stderr, "\n%d rows of %s processed\n", lineNo, csv1FName)
283+
} else {
284+
fmt.Fprintf(os.Stderr, ".")
285+
}
286+
}
276287
}
288+
lineNo++
289+
}
290+
w.Flush()
291+
if err := w.Error(); err != nil {
292+
fmt.Fprintf(os.Stderr, "Can't write final CSV at line %d lines processed from CSV table 1, %s\n", lineNo+1, err)
277293
}
278294
}

demo/dups.csv

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
ID,Person,Email
2+
3+
2,Frida Kahlo,[email protected]
4+
3,Diego Rivera,[email protected]
5+
4,Jack Flanders,[email protected]
6+
5,Vincent Van Gough,[email protected]
7+
8+
7,Little Frieda,[email protected]
9+
10+
9,Frida Kahlo,[email protected]
11+
10,Diego Rivera,[email protected]
12+
11,Jack Flanders,[email protected]
13+
12,Vincent Van Gough,[email protected]
14+
13,Mojo Sam,[email protected]
15+
14,Little Frieda,[email protected]

demo/find-duplicates.bash

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
#!/bin/bash
2+
3+
if [ "$1" = "" ] && [ "$2" = "" ]; then
4+
echo "USAGE: $(basename "$0") CSV_FILENAME CSV_COL_NO"
5+
exit 1
6+
fi
7+
if [ "$1" = "" ]; then
8+
echo "Missing CSV FILE name"
9+
exit 1
10+
fi
11+
12+
if [ "$2" = "" ]; then
13+
echo "Missing column number to match on"
14+
exit 1
15+
fi
16+
17+
CSV_FILE="$1"
18+
CSV_COL_NO="$2"
19+
20+
csvcols -i "$CSV_FILE" -col "$CSV_COL_NO" | sort -u | while read CELL; do
21+
if [ "$CELL" != "" ]; then
22+
csvfind -i "$CSV_FILE" -trim-spaces -col "$CSV_COL_NO" "${CELL}"
23+
fi
24+
done

docs/csv2json.html

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
<li><a href="/">Home</a></li>
1515
<li><a href="../">up</a></li>
1616
<li><a href="./">Documentation</a></li>
17+
<li><a href="../how-to/">How To &hellip;</a></li>
1718
<li><a href="csv2json.html">csv2json</a></li>
1819
<li><a href="csv2mdtable.html">csv2mdtable</a></li>
1920
<li><a href="csv2xlsx.html">csv2xlsx</a></li>

docs/csv2mdtable.html

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
<li><a href="/">Home</a></li>
1515
<li><a href="../">up</a></li>
1616
<li><a href="./">Documentation</a></li>
17+
<li><a href="../how-to/">How To &hellip;</a></li>
1718
<li><a href="csv2json.html">csv2json</a></li>
1819
<li><a href="csv2mdtable.html">csv2mdtable</a></li>
1920
<li><a href="csv2xlsx.html">csv2xlsx</a></li>

docs/csv2xlsx.html

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
<li><a href="/">Home</a></li>
1515
<li><a href="../">up</a></li>
1616
<li><a href="./">Documentation</a></li>
17+
<li><a href="../how-to/">How To &hellip;</a></li>
1718
<li><a href="csv2json.html">csv2json</a></li>
1819
<li><a href="csv2mdtable.html">csv2mdtable</a></li>
1920
<li><a href="csv2xlsx.html">csv2xlsx</a></li>

docs/csvcols.html

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
<li><a href="/">Home</a></li>
1515
<li><a href="../">up</a></li>
1616
<li><a href="./">Documentation</a></li>
17+
<li><a href="../how-to/">How To &hellip;</a></li>
1718
<li><a href="csv2json.html">csv2json</a></li>
1819
<li><a href="csv2mdtable.html">csv2mdtable</a></li>
1920
<li><a href="csv2xlsx.html">csv2xlsx</a></li>

docs/csvfind.html

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
<li><a href="/">Home</a></li>
1515
<li><a href="../">up</a></li>
1616
<li><a href="./">Documentation</a></li>
17+
<li><a href="../how-to/">How To &hellip;</a></li>
1718
<li><a href="csv2json.html">csv2json</a></li>
1819
<li><a href="csv2mdtable.html">csv2mdtable</a></li>
1920
<li><a href="csv2xlsx.html">csv2xlsx</a></li>

0 commit comments

Comments
 (0)