adding support for Levenshtein and contains matching in csvjoin

rsdoiel · rsdoiel · commit ab282275cdb9 · 2017-04-13T12:01:10.000-07:00
diff --git a/cmds/csvfind/csvfind.go b/cmds/csvfind/csvfind.go
@@ -83,7 +83,7 @@ You can also search for phrases in columns.
 	caseSensitive      bool
 	maxEditDistance    int
 	appendEditDistance bool
-	stopWords          string
+	stopWordsOption    string
 )
 
 func scanTable(table [][]string, col2 int, val string) ([]string, bool) {
@@ -118,7 +118,7 @@ func init() {
 	flag.IntVar(&substituteCost, "substitute-cost", 1, "set the substitution cost to use for levenshtein matching")
 	flag.BoolVar(&caseSensitive, "case-sensitive", false, "perform a case sensitive match (default is false)")
 	flag.BoolVar(&appendEditDistance, "append-edit-distance", false, "append column with edit distance found (useful for tuning levenshtein)")
-	flag.StringVar(&stopWords, "stop-words", "", "use the colon delimited list of stop words")
+	flag.StringVar(&stopWordsOption, "stop-words", "", "use the colon delimited list of stop words")
 	flag.BoolVar(&skipHeaderRow, "skip-header-row", true, "skip the header row")
 }
 
@@ -158,16 +158,16 @@ func main() {
 	}
 
 	target := args[0]
-	stopList := []string{}
+	stopWords := []string{}
 
 	// NOTE: If we're doing a case-insensitive search (the default) then lower case everything before matching
 	if caseSensitive == false {
 		target = strings.ToLower(target)
-		//stopWords = strings.ToLower(target)
+		stopWordsOption = strings.ToLower(stopWordsOption)
 	}
-	if len(stopWords) > 0 {
-		stopList = strings.Split(stopWords, ":")
-		target = strings.Join(datatools.ApplyStopWords(strings.Split(target, " "), stopList), " ")
+	if len(stopWordsOption) > 0 {
+		stopWords = strings.Split(stopWordsOption, ":")
+		target = strings.Join(datatools.ApplyStopWords(strings.Split(target, " "), stopWords), " ")
 	}
 
 	in, err := cli.Open(inputFName, os.Stdin)
@@ -204,13 +204,13 @@ func main() {
 				if caseSensitive == false {
 					src = strings.ToLower(src)
 				}
-				if len(stopList) > 0 {
+				if len(stopWords) > 0 {
 					// Split into fields applying datatools filter
 					fields := strings.FieldsFunc(src, func(c rune) bool {
 						return datatools.Filter(c, "", false)
 					})
 					// Convert the array of strings back into a space separated string
-					src = strings.Join(datatools.ApplyStopWords(fields, stopList), " ")
+					src = strings.Join(datatools.ApplyStopWords(fields, stopWords), " ")
 				}
 				switch {
 				case useContains:
diff --git a/cmds/csvjoin/csvjoin.go b/cmds/csvjoin/csvjoin.go
@@ -28,6 +28,7 @@ import (
 	"log"
 	"os"
 	"path"
+	"strings"
 
 	// My packages
 	"github.com/caltechlibrary/cli"
@@ -56,6 +57,7 @@ merged-data.csv..
     %s -csv1=data1.csv -col1=1 \
        -csv2=data2.csv -col2=3 \
 	   -output=merged-data.csv
+
 `
 
 	// Standard Options
@@ -65,19 +67,75 @@ merged-data.csv..
 	outputFName string
 
 	// App Options
-	csv1FName string
-	csv2FName string
-	col1      int
-	col2      int
+	csv1FName       string
+	csv2FName       string
+	col1            int
+	col2            int
+	caseSensitive   bool
+	useContains     bool
+	useLevenshtein  bool
+	insertCost      int
+	deleteCost      int
+	substituteCost  int
+	maxEditDistance int
+	stopWordsOption string
+	allowDuplicates bool
 )
 
-func scanTable(table [][]string, col2 int, val string) ([]string, bool) {
-	for _, row := range table {
-		if col2 < len(row) && row[col2] == val {
-			return row, true
+// cellsMatch reports whether val2 matches val1 using the package-level options: Levenshtein distance, substring containment, or (by default) exact equality. Only val2 is normalized here; val1 is compared as passed in.
+func cellsMatch(val1, val2 string, stopWords []string) bool {
+	if caseSensitive == false { // case-insensitive mode: lower-case val2 (val1 is not touched here)
+		val2 = strings.ToLower(val2)
+	}
+	if len(stopWords) > 0 { // filter the space-separated words of val2 through datatools.ApplyStopWords
+		val2 = strings.Join(datatools.ApplyStopWords(strings.Split(val2, " "), stopWords), " ")
+	}
+	switch {
+	case useLevenshtein == true: // checked first, so it wins over useContains when both are set
+		distance := datatools.Levenshtein(val2, val1, insertCost, deleteCost, substituteCost, caseSensitive)
+		if distance <= maxEditDistance { // a match is any distance up to and including maxEditDistance
+			return true
+		}
+	case useContains == true: // substring mode
+		if strings.Contains(val2, val1) { // val1 must occur inside val2, not the other way round
+			return true
+		}
+	default: // neither option set: require exact equality
+		if val1 == val2 {
+			return true
+		}
+	}
+	return false
+}
+
+func scanTable(w *csv.Writer, rowA []string, col1 int, table [][]string, col2 int, stopWords []string) { // scanTable writes rowA joined with matching rows of table to w; it returns nothing, output is the side effect
+	if col1 >= len(rowA) { // rowA has no cell at col1: nothing to compare
+		return
+	}
+	val1 := rowA[col1]
+	if caseSensitive == false { // normalize val1 once here; cellsMatch only normalizes val2
+		val1 = strings.ToLower(val1)
+	}
+	if len(stopWords) > 0 { // filter the space-separated words of val1 through datatools.ApplyStopWords
+		val1 = strings.Join(datatools.ApplyStopWords(strings.Split(val1, " "), stopWords), " ")
+	}
+	for i, rowB := range table {
+		// Emit a joined row for each table row whose col2 cell matches val1
+		if col2 < len(rowB) { // rows too short to have a col2 cell are skipped
+			val2 := rowB[col2]
+			if cellsMatch(val1, val2, stopWords) == true {
+				// We have a match: output rowA's cells followed by rowB's cells
+				combinedRows := append(rowA, rowB...) // NOTE(review): append can reuse rowA's backing array if it has spare capacity — confirm rows arrive with cap == len
+				if err := w.Write(combinedRows); err != nil {
+					fmt.Fprintf(os.Stderr, "Can't write csv row line %d of table 2, %s", i, err) // i is the zero-based index into table; the message has no trailing newline
+					return // stop processing this rowA after a write error; the error is not returned to the caller
+				}
+				if allowDuplicates == false { // default behaviour: stop after the first matching row
+					return
+				}
+			}
 		}
 	}
-	return []string{}, false
 }
 
 func init() {
@@ -96,6 +154,15 @@ func init() {
 	flag.StringVar(&csv2FName, "csv2", "", "second CSV filename")
 	flag.IntVar(&col1, "col1", 0, "column to on join on in first CSV file")
 	flag.IntVar(&col2, "col2", 0, "column to on join on in second CSV file")
+	flag.BoolVar(&caseSensitive, "case-sensitive", false, "make a case sensitive match (default is case insensitive)")
+	flag.BoolVar(&useContains, "contains", false, "match columns based on csv1/col1 contained in csv2/col2")
+	flag.BoolVar(&useLevenshtein, "levenshtein", false, "match columns using Levensthein edit distance")
+	flag.IntVar(&insertCost, "insert-cost", 1, "insertion cost to use when calculating Levenshtein edit distance")
+	flag.IntVar(&deleteCost, "delete-cost", 1, "deletion cost to use when calculating Levenshtein edit distance")
+	flag.IntVar(&substituteCost, "substitute-cost", 1, "substitution cost to use when calculating Levenshtein edit distance")
+	flag.IntVar(&maxEditDistance, "max-edit-distance", 5, "maximum edit distance for match using Levenshtein distance")
+	flag.StringVar(&stopWordsOption, "stop-words", "", "a column delimited list of stop words to ingnore when matching")
+	flag.BoolVar(&allowDuplicates, "allow-duplicates", false, "allowing duplicates returns all rows that have matching columns rather than first match")
 }
 
 func main() {
@@ -150,6 +217,9 @@ func main() {
 		os.Exit(1)
 	}
 
+	// FIXME: Should only read the smaller of the two files into memory (and probably only the column value)
+	// then iterate through the other file for matches. This would let you work with larger files.
+
 	// Read in CSV1 and CSV2 then iterate over CSV1 output rows that have
 	// matching column's value
 	src1, err := ioutil.ReadFile(csv1FName)
@@ -189,19 +259,12 @@ func main() {
 		csv2Table = append(csv2Table, record)
 	}
 
+	stopWords := strings.Split(stopWordsOption, ":")
 	w := csv.NewWriter(out)
-	val := ""
 	for _, rowA := range csv1Table {
-		if col1 < len(rowA) {
-			val = rowA[col1]
-			// Name see if we find matching row in table 2
-			if rowB, ok := scanTable(csv2Table, col2, val); ok == true {
-				// We have
-				combinedRows := append(rowA, rowB...)
-				if err := w.Write(combinedRows); err != nil {
-					log.Fatalf("error wrint args as csv, %s", err)
-				}
-			}
+		if col1 < len(rowA) && rowA[col1] != "" {
+			// We are relying on the side effect of writing the CSV output in scanTable
+			scanTable(w, rowA, col1, csv2Table, col2, stopWords)
 		}
 	}
 	w.Flush()
diff --git a/css/site.css b/css/site.css
@@ -19,13 +19,9 @@ body {
      width: 100%;
      height: 100%;
      color: black;
-     background-color: #AAA99F; /* #76777B;*/
-     /*
-     color: #FF6E1E;
      background-color: white;
-     */
      font-family: Open Sans, Helvetica, Sans-Serif;
-     font-size: 16px;
+     font-size: calc(1em+1vm);
 }
 
 header {
@@ -141,7 +137,7 @@ section {
      position: relative;
      display: inline-block;
      width: 100%;
-     min-height: 84%;
+     height: 100%;
      color: black;
      background-color: white;
      margin: 0;
@@ -151,6 +147,11 @@ section {
      padding-right: 0;
 }
 
+section p { /* paragraphs nested inside a section */
+    width: 85%; /* 85% of the containing block's width */
+    height: auto;
+}
+
 section h1 {
     font-size: 1.32em;
 }
@@ -194,6 +195,79 @@ section ul ul ul ul ul ul {
 }
 
 
+section a:link, section a:visited {
+    /* No color is set for unvisited/visited links; orange (#FF6E1E) is recorded here for reference only. */
+    font-style: italic;
+    font-weight: normal;
+}
+
+section a:active, section a:hover, section a:focus {
+    /* Links turn orange (#FF6E1E) while active, hovered, or focused. */
+    color: #FF6E1E;
+}
+
+table { /* applies to every table on the site, not only those inside a section */
+    table-layout: fixed; /* NOTE(review): no width is set in this rule; fixed layout normally needs an explicit table width to take effect — confirm */
+    max-width: 60%;
+    border-collapse: collapse; /* adjacent cell and row borders are merged into a single border */
+    border: 2px solid black;
+    padding: 0;
+    margin-top: 2em;
+    margin-bottom: 2em;
+    margin-left: 5em;
+    margin-right: 5em;
+}
+
+thead th:nth-child(1) { /* per-column width rules for the first four header cells; all four are currently set to auto */
+    width: auto;
+}
+
+thead th:nth-child(2) {
+    width: auto;
+}
+
+thead th:nth-child(3) {
+    width: auto;
+}
+
+thead th:nth-child(4) {
+    width: auto;
+}
+
+th, td { /* shared defaults for header and data cells */
+    padding-top: 0.24em;
+    padding-bottom: 0;
+    padding-left: 1em;
+    padding-right: 0;
+    text-align: center;
+}
+
+th { /* header cells: override the shared top and right padding and set the header font size */
+    padding-top: 0;
+    padding-bottom: 0;
+    padding-left: 1em;
+    padding-right: 1em;
+    font-size: 0.9em;
+    text-align: center; /* terminated like every other declaration so a later addition cannot break the rule */
+}
+
+td { /* data cells use a slightly smaller font than the 0.9em header cells */
+    font-size: 0.8em;
+}
+
+thead tr:nth-child(1) { /* first header row: light grey band with a solid rule beneath */
+    background-color: lightgrey;
+    border-bottom: 1px solid black;
+}
+
+tr { /* every row gets a bottom rule */
+    border-bottom: 1px groove black;
+}
+
+tr:nth-child(even) { /* even-numbered rows (counted within their parent) get a wheat background */
+    background-color: wheat;
+}
+
 aside {
      margin: 0;
      border: 0;
@@ -229,7 +303,7 @@ aside ul > ul {
 }
 
 footer {
-     position: relative;
+     position: fixed;
      bottom: 0;
      display: block;
      width: 100%;