Skip to content

Commit ab28227

Browse files
committed
adding support for Levenshtein and contains matching in csvjoin
1 parent f58f200 commit ab28227

File tree

3 files changed

+173
-36
lines changed

3 files changed

+173
-36
lines changed

cmds/csvfind/csvfind.go

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -83,7 +83,7 @@ You can also search for phrases in columns.
8383
caseSensitive bool
8484
maxEditDistance int
8585
appendEditDistance bool
86-
stopWords string
86+
stopWordsOption string
8787
)
8888

8989
func scanTable(table [][]string, col2 int, val string) ([]string, bool) {
@@ -118,7 +118,7 @@ func init() {
118118
flag.IntVar(&substituteCost, "substitute-cost", 1, "set the substitution cost to use for levenshtein matching")
119119
flag.BoolVar(&caseSensitive, "case-sensitive", false, "perform a case sensitive match (default is false)")
120120
flag.BoolVar(&appendEditDistance, "append-edit-distance", false, "append column with edit distance found (useful for tuning levenshtein)")
121-
flag.StringVar(&stopWords, "stop-words", "", "use the colon delimited list of stop words")
121+
flag.StringVar(&stopWordsOption, "stop-words", "", "use the colon delimited list of stop words")
122122
flag.BoolVar(&skipHeaderRow, "skip-header-row", true, "skip the header row")
123123
}
124124

@@ -158,16 +158,16 @@ func main() {
158158
}
159159

160160
target := args[0]
161-
stopList := []string{}
161+
stopWords := []string{}
162162

163163
// NOTE: If we're doing a case-insensitive search (the default) then lower-case everything before matching
164164
if caseSensitive == false {
165165
target = strings.ToLower(target)
166-
//stopWords = strings.ToLower(target)
166+
stopWordsOption = strings.ToLower(stopWordsOption)
167167
}
168-
if len(stopWords) > 0 {
169-
stopList = strings.Split(stopWords, ":")
170-
target = strings.Join(datatools.ApplyStopWords(strings.Split(target, " "), stopList), " ")
168+
if len(stopWordsOption) > 0 {
169+
stopWords = strings.Split(stopWordsOption, ":")
170+
target = strings.Join(datatools.ApplyStopWords(strings.Split(target, " "), stopWords), " ")
171171
}
172172

173173
in, err := cli.Open(inputFName, os.Stdin)
@@ -204,13 +204,13 @@ func main() {
204204
if caseSensitive == false {
205205
src = strings.ToLower(src)
206206
}
207-
if len(stopList) > 0 {
207+
if len(stopWords) > 0 {
208208
// Split into fields applying datatools filter
209209
fields := strings.FieldsFunc(src, func(c rune) bool {
210210
return datatools.Filter(c, "", false)
211211
})
212212
// Convert the array of strings back into a space separated string
213-
src = strings.Join(datatools.ApplyStopWords(fields, stopList), " ")
213+
src = strings.Join(datatools.ApplyStopWords(fields, stopWords), " ")
214214
}
215215
switch {
216216
case useContains:

cmds/csvjoin/csvjoin.go

Lines changed: 83 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ import (
2828
"log"
2929
"os"
3030
"path"
31+
"strings"
3132

3233
// My packages
3334
"github.com/caltechlibrary/cli"
@@ -56,6 +57,7 @@ merged-data.csv..
5657
%s -csv1=data1.csv -col1=1 \
5758
-csv2=data2.csv -col2=3 \
5859
-output=merged-data.csv
60+
5961
`
6062

6163
// Standard Options
@@ -65,19 +67,75 @@ merged-data.csv..
6567
outputFName string
6668

6769
// App Options
68-
csv1FName string
69-
csv2FName string
70-
col1 int
71-
col2 int
70+
csv1FName string
71+
csv2FName string
72+
col1 int
73+
col2 int
74+
caseSensitive bool
75+
useContains bool
76+
useLevenshtein bool
77+
insertCost int
78+
deleteCost int
79+
substituteCost int
80+
maxEditDistance int
81+
stopWordsOption string
82+
allowDuplicates bool
7283
)
7384

74-
func scanTable(table [][]string, col2 int, val string) ([]string, bool) {
75-
for _, row := range table {
76-
if col2 < len(row) && row[col2] == val {
77-
return row, true
85+
// cellsMatch checks if two cells' values match
86+
func cellsMatch(val1, val2 string, stopWords []string) bool {
87+
if caseSensitive == false {
88+
val2 = strings.ToLower(val2)
89+
}
90+
if len(stopWords) > 0 {
91+
val2 = strings.Join(datatools.ApplyStopWords(strings.Split(val2, " "), stopWords), " ")
92+
}
93+
switch {
94+
case useLevenshtein == true:
95+
distance := datatools.Levenshtein(val2, val1, insertCost, deleteCost, substituteCost, caseSensitive)
96+
if distance <= maxEditDistance {
97+
return true
98+
}
99+
case useContains == true:
100+
if strings.Contains(val2, val1) {
101+
return true
102+
}
103+
default:
104+
if val1 == val2 {
105+
return true
106+
}
107+
}
108+
return false
109+
}
110+
111+
func scanTable(w *csv.Writer, rowA []string, col1 int, table [][]string, col2 int, stopWords []string) {
112+
if col1 >= len(rowA) {
113+
return
114+
}
115+
val1 := rowA[col1]
116+
if caseSensitive == false {
117+
val1 = strings.ToLower(val1)
118+
}
119+
if len(stopWords) > 0 {
120+
val1 = strings.Join(datatools.ApplyStopWords(strings.Split(val1, " "), stopWords), " ")
121+
}
122+
for i, rowB := range table {
123+
// Emit a joined row if we have a match
124+
if col2 < len(rowB) {
125+
val2 := rowB[col2]
126+
if cellsMatch(val1, val2, stopWords) == true {
127+
// We have a match, join the two rows and output
128+
combinedRows := append(rowA, rowB...)
129+
if err := w.Write(combinedRows); err != nil {
130+
fmt.Fprintf(os.Stderr, "Can't write csv row line %d of table 2, %s", i, err)
131+
return
132+
}
133+
if allowDuplicates == false {
134+
return
135+
}
136+
}
78137
}
79138
}
80-
return []string{}, false
81139
}
82140

83141
func init() {
@@ -96,6 +154,15 @@ func init() {
96154
flag.StringVar(&csv2FName, "csv2", "", "second CSV filename")
97155
flag.IntVar(&col1, "col1", 0, "column to on join on in first CSV file")
98156
flag.IntVar(&col2, "col2", 0, "column to on join on in second CSV file")
157+
flag.BoolVar(&caseSensitive, "case-sensitive", false, "make a case sensitive match (default is case insensitive)")
158+
flag.BoolVar(&useContains, "contains", false, "match columns based on csv1/col1 contained in csv2/col2")
159+
flag.BoolVar(&useLevenshtein, "levenshtein", false, "match columns using Levensthein edit distance")
160+
flag.IntVar(&insertCost, "insert-cost", 1, "insertion cost to use when calculating Levenshtein edit distance")
161+
flag.IntVar(&deleteCost, "delete-cost", 1, "deletion cost to use when calculating Levenshtein edit distance")
162+
flag.IntVar(&substituteCost, "substitute-cost", 1, "substitution cost to use when calculating Levenshtein edit distance")
163+
flag.IntVar(&maxEditDistance, "max-edit-distance", 5, "maximum edit distance for match using Levenshtein distance")
164+
flag.StringVar(&stopWordsOption, "stop-words", "", "a column delimited list of stop words to ingnore when matching")
165+
flag.BoolVar(&allowDuplicates, "allow-duplicates", false, "allowing duplicates returns all rows that have matching columns rather than first match")
99166
}
100167

101168
func main() {
@@ -150,6 +217,9 @@ func main() {
150217
os.Exit(1)
151218
}
152219

220+
// FIXME: Should only read the smaller of the two files into memory (and probably only the column value)
221+
// then iterate through the other file for matches. This would let you work with larger files.
222+
153223
// Read in CSV1 and CSV2 then iterate over CSV1 output rows that have
154224
// matching column's value
155225
src1, err := ioutil.ReadFile(csv1FName)
@@ -189,19 +259,12 @@ func main() {
189259
csv2Table = append(csv2Table, record)
190260
}
191261

262+
stopWords := strings.Split(stopWordsOption, ":")
192263
w := csv.NewWriter(out)
193-
val := ""
194264
for _, rowA := range csv1Table {
195-
if col1 < len(rowA) {
196-
val = rowA[col1]
197-
// Name see if we find matching row in table 2
198-
if rowB, ok := scanTable(csv2Table, col2, val); ok == true {
199-
// We have
200-
combinedRows := append(rowA, rowB...)
201-
if err := w.Write(combinedRows); err != nil {
202-
log.Fatalf("error wrint args as csv, %s", err)
203-
}
204-
}
265+
if col1 < len(rowA) && rowA[col1] != "" {
266+
// We are relying on the side effect of writing the CSV output in scanTable
267+
scanTable(w, rowA, col1, csv2Table, col2, stopWords)
205268
}
206269
}
207270
w.Flush()

css/site.css

Lines changed: 81 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -19,13 +19,9 @@ body {
1919
width: 100%;
2020
height: 100%;
2121
color: black;
22-
background-color: #AAA99F; /* #76777B;*/
23-
/*
24-
color: #FF6E1E;
2522
background-color: white;
26-
*/
2723
font-family: Open Sans, Helvetica, Sans-Serif;
28-
font-size: 16px;
24+
font-size: calc(1em+1vm);
2925
}
3026

3127
header {
@@ -141,7 +137,7 @@ section {
141137
position: relative;
142138
display: inline-block;
143139
width: 100%;
144-
min-height: 84%;
140+
height: 100%;
145141
color: black;
146142
background-color: white;
147143
margin: 0;
@@ -151,6 +147,11 @@ section {
151147
padding-right: 0;
152148
}
153149

/* Paragraphs inside a section: 85% of the available width, natural height. */
section p {
    height: auto;
    width: 85%;
}
154+
154155
section h1 {
155156
font-size: 1.32em;
156157
}
@@ -194,6 +195,79 @@ section ul ul ul ul ul ul {
194195
}
195196

196197

/* Links inside a section: italic at rest. */
section a:link,
section a:visited {
    /* orange: #FF6E1E; */
    font-style: italic;
    font-weight: normal;
}

/* Links inside a section: orange while active, hovered or focused. */
section a:active,
section a:hover,
section a:focus {
    /* orange: #FF6E1E; */
    color: #FF6E1E;
}

/* Tables: fixed layout, collapsed borders, at most 60% wide. */
table {
    table-layout: fixed;
    max-width: 60%;
    border-collapse: collapse;
    border: 2px solid black;
    padding: 0;
    /* top/bottom 2em, left/right 5em */
    margin: 2em 5em;
}

/* The first four header columns size themselves automatically. */
thead th:nth-child(1),
thead th:nth-child(2),
thead th:nth-child(3),
thead th:nth-child(4) {
    width: auto;
}

/* Shared cell spacing: top 0.24em, right 0, bottom 0, left 1em. */
th, td {
    padding: 0.24em 0 0 1em;
    text-align: center;
}

/* Header cells override the shared padding: no vertical padding, 1em each side. */
th {
    padding: 0 1em;
    font-size: 0.9em;
    text-align: center;
}

td {
    font-size: 0.8em;
}

/* First header row: grey band with a solid rule beneath it. */
thead tr:nth-child(1) {
    background-color: lightgrey;
    border-bottom: 1px solid black;
}

tr {
    border-bottom: 1px groove black;
}

/* Zebra striping on even rows. */
tr:nth-child(even) {
    background-color: wheat;
}

197271
aside {
198272
margin: 0;
199273
border: 0;
@@ -229,7 +303,7 @@ aside ul > ul {
229303
}
230304

231305
footer {
232-
position: relative;
306+
position: fixed;
233307
bottom: 0;
234308
display: block;
235309
width: 100%;

0 commit comments

Comments
 (0)