vanessaklee
diff --git a/‎README.md‎
Lines changed: 24 additions & 24 deletions b/‎README.md‎
Lines changed: 24 additions & 24 deletions
diff --git a/‎lib/akin.ex‎
Lines changed: 2 additions & 0 deletions b/‎lib/akin.ex‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎lib/akin/algorithms/helpers/initials_comparison.ex‎
Lines changed: 35 additions & 15 deletions b/‎lib/akin/algorithms/helpers/initials_comparison.ex‎
Lines changed: 35 additions & 15 deletions
diff --git a/‎lib/akin/algorithms/names.ex‎
Lines changed: 2 additions & 1 deletion b/‎lib/akin/algorithms/names.ex‎
Lines changed: 2 additions & 1 deletion
diff --git a/‎lib/akin/task.ex‎
Lines changed: 2 additions & 2 deletions b/‎lib/akin/task.ex‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎lib/akin/util.ex‎
Lines changed: 1 addition & 0 deletions b/‎lib/akin/util.ex‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎lib/scripts/ml.ex‎
Lines changed: 114 additions & 0 deletions b/‎lib/scripts/ml.ex‎
Lines changed: 114 additions & 0 deletions
diff --git a/‎test/algorithms/chunk_set_test.exs‎
Lines changed: 2 additions & 2 deletions b/‎test/algorithms/chunk_set_test.exs‎
Lines changed: 2 additions & 2 deletions
@@ -257,17 +257,14 @@ The bag distance is a cheap distance measure which always returns a distance sma
 </details>
 
 <details>
-  <summary><u>Substring Set</u></summary>
-
-Splits the strings on spaces, sorts, re-joins, and then determines Jaro-Winkler distance. Best when the strings contain irrelevent substrings. 
-</details>
-
-<details>
-  <summary><u>Sørensen–Dice</u></summary>
+  <summary><u>Double Metaphone</u></summary>
 
-Sørensen–Dice coefficient is calculated using bigrams. The equation is `2nt / nx + ny` where nx is the number of bigrams in string x, ny is the number of bigrams in string y, and nt is the number of bigrams in both strings. For example, the bigrams of `night` and `nacht` are `{ni,ig,gh,ht}` and `{na,ac,ch,ht}`. They each have four and the intersection is `ht`. 
+Calculates the [Double Metaphone Phonetic Algorithm](https://xlinux.nist.gov/dads/HTML/doubleMetaphone.html) metric of two strings. The return value is based on the match level: strict, strong, normal (default), or weak. 
 
-``` (2 · 1) / (4 + 4) = 0.25 ```
+  * "strict": both encodings for each string must match
+  * "strong": the primary encoding for each string must match
+  * "normal": the primary encoding of one string must match either encoding of the other string (default)
+  * "weak":   either the primary or secondary encoding of one string must match one encoding of the other string
 </details>
 
 <details>
@@ -303,14 +300,23 @@ Compares two strings by converting each to an approximate phonetic representatio
 </details>
 
 <details>
-  <summary><u>Double Metaphone</u></summary>
+  <summary><u>N-Gram Similarity</u></summary>
 
-Calculates the [Double Metaphone Phonetic Algorithm](https://xlinux.nist.gov/dads/HTML/doubleMetaphone.html) metric of two strings. The return value is based on the match level: strict, strong, normal (default), or weak. 
+Calculates the ngram distance between two strings. Default ngram: 2.
+</details>
 
-  * "strict": both encodings for each string must match
-  * "strong": the primary encoding for each string must match
-  * "normal": the primary encoding of one string must match either encoding of other string (default)
-  * "weak":   either primary or secondary encoding of one string must match one encoding of other string
+<details>
+  <summary><u>Overlap Metric</u></summary>
+
+Uses the Overlap Similarity metric to compare two strings by tokenizing the strings and measuring their overlap. Default ngram: 1.
+</details>
+
+<details>
+  <summary><u>Sørensen–Dice</u></summary>
+
+Sørensen–Dice coefficient is calculated using bigrams. The equation is `2nt / (nx + ny)` where nx is the number of bigrams in string x, ny is the number of bigrams in string y, and nt is the number of bigrams in both strings. For example, the bigrams of `night` and `nacht` are `{ni,ig,gh,ht}` and `{na,ac,ch,ht}`. They each have four and the intersection is `ht`. 
+
+``` (2 · 1) / (4 + 4) = 0.25 ```
 </details>
 
 <details>
@@ -324,15 +330,9 @@ accuracy for search terms containing more than one word.
 </details>
 
 <details>
-  <summary><u>N-Gram Similarity</u></summary>
-
-Calculates the ngram distance between two strings. Default ngram: 2.
-</details>
-
-<details>
-  <summary><u>Overlap Metric</u></summary>
+  <summary><u>Substring Set</u></summary>
 
-Uses the Overlap Similarity metric to compare two strings by tokenizing the strings and measuring their overlap. Default ngram: 1.
+Splits the strings on spaces, sorts, re-joins, and then determines Jaro-Winkler distance. Best when the strings contain irrelevant substrings. 
 </details>
 
 <details>
@@ -361,7 +361,7 @@ A generalization of Sørensen–Dice and Jaccard.
 
 ## In Development
 
-* Author Name Disambiguation (see lib/akin/and.ex for developments)
+* Further enhancements to name matching
 * Add Damerau-Levenshtein algorithm
   * [Damerau-Levenshtein](https://en.wikipedia.org/wiki/Damerau-Levenshtein_distance)
   * [Examples](https://datascience.stackexchange.com/questions/60019/damerau-levenshtein-edit-distance-in-python)
 
@@ -72,6 +72,8 @@ defmodule Akin do
   """
   def match_names(left, rights, opts \\ default_opts())
 
+  def match_names(_, [], _), do: []
+
   def match_names(left, rights, opts) when is_binary(left) and is_list(rights) do
     rights = Enum.map(rights, fn right -> compose(right) end)
     match_names(compose(left), rights, opts)
 
@@ -5,17 +5,9 @@ defmodule Akin.Helpers.InitialsComparison do
   import Akin.Util, only: [ngram_tokenize: 2]
   alias Akin.Corpus
 
-  # the mean bag distance from training is 0.71
-  @min_bag_distance 0.5
-
   def similarity(%Corpus{} = left, %Corpus{} = right) do
-    similarity(left, right, String.bag_distance(left.string, right.string) >= @min_bag_distance)
-  end
-
-  # do the inital letters of each string match?
-  def similarity(left, right, true) do
-    left_initials = initials(left)
-    right_initials = initials(right)
+    left_initials = initials(left) |> Enum.sort()
+    right_initials = initials(right) |> Enum.sort()
 
     left_i_count = Enum.count(left_initials)
     right_i_count = Enum.count(right_initials)
@@ -30,10 +22,34 @@ defmodule Akin.Helpers.InitialsComparison do
       |> List.flatten()
       |> Enum.uniq()
 
-    case {left_i_count, right_i_count} do
-      {li, ri} when li == ri -> left_initials == right_initials
-      {li, ri} when li > ri -> left_initials -- right_initials == []
-      {li, ri} when li < ri -> right_initials -- left_initials == []
+    if String.contains?(left.original, ["-", "'"]) or String.contains?(right.original, ["-", "'"]) do
+      case {left_i_count, right_i_count} do
+        {li, ri} when li == ri -> left_initials == right_initials
+        {li, ri} when li > ri ->
+          case left_initials -- right_initials do
+            [] -> true
+            [_i] ->
+              combined_hyphenation = right.list -- left.list
+              full_permutations = get_permuations(left.list)
+              combined_hyphenation -- full_permutations == []
+            _ -> false
+          end
+        {li, ri} when li < ri ->
+          case right_initials -- left_initials do
+            [] -> true
+            [_i] ->
+              combined_hyphenation = left.list -- right.list
+              full_permutations = get_permuations(right.list)
+              combined_hyphenation -- full_permutations == []
+            _ -> false
+          end
+      end
+    else
+      case {left_i_count, right_i_count} do
+        {li, ri} when li == ri -> left_initials == right_initials
+        {li, ri} when li > ri -> left_initials -- right_initials == []
+        {li, ri} when li < ri -> right_initials -- left_initials == []
+      end
     end
     |> cartesian_match(left_c_intials, right_c_intials)
     |> permutation_match(left.list, right.list)
@@ -45,6 +61,10 @@ defmodule Akin.Helpers.InitialsComparison do
     Enum.map(lists, fn list -> String.at(list, 0) end)
   end
 
+  defp initials(list) when is_list(list) do
+    Enum.map(list, fn l -> String.at(l, 0) end)
+  end
+
   defp initials(_), do: []
 
   defp actual_initials(list) do
@@ -68,7 +88,7 @@ defmodule Akin.Helpers.InitialsComparison do
   defp cartesian_match(false, left, right) do
     Enum.filter(left, fn l -> l in right end)
     |> Enum.count()
-    |> Kernel.>(0)
+    |> Kernel.>(1)
   end
 
   defp permutation_match(true, _, _), do: true
 
@@ -10,7 +10,6 @@ defmodule Akin.Names do
   @weight 0.05
   @shortness_boost 0.0175
 
-  @spec compare(binary() | %Corpus{}, binary() | %Corpus{}, keyword()) :: float()
   @doc """
   Manage the steps of comparing two names. Collect metrics from the algorithms requested
   in the options or the default algorithms. Give weight to the consideration of initials
@@ -32,10 +31,12 @@ defmodule Akin.Names do
     metrics = Akin.compare(left, right)
 
     short_length = opts(opts, :short_length)
+    initials_match? = if weight > 0, do: 1.0, else: 0.0
 
     score =
       calc(metrics, weight, short_length, len(right.string))
       |> Enum.map(fn {k, v} -> {k, r(v)} end)
+      |> Keyword.put(:initials, initials_match?)
 
     %{scores: score}
   end
 
@@ -2,7 +2,7 @@ defmodule Akin.Task do
   @moduledoc """
   API for all string comparison modules.
   """
-  @callback compare(%Akin.Corpus{}, %Akin.Corpus{}, Keyword.t(any())) :: number()
-  @callback compare(%Akin.Corpus{}, %Akin.Corpus{}) :: number()
+  @callback compare(%Akin.Corpus{}, %Akin.Corpus{}, Keyword.t(any())) :: number() | map()
+  @callback compare(%Akin.Corpus{}, %Akin.Corpus{}) :: number() | map()
   @optional_callbacks compare: 2
 end
@@ -87,6 +87,7 @@ defmodule Akin.Util do
   end
 
   defp replace(string) do
+    string = String.replace(string, "'", "")
     Regex.replace(@nontext_codepoints, string, " ")
     |> String.replace(~r/[\p{P}\p{S}]/u, " ")
     |> :unicode.characters_to_nfd_binary()
 
@@ -0,0 +1,114 @@
+defmodule Akin.ML do
+  @doc """
+  Regenerates `test/support/metrics_for_training.csv` from the labelled name pairs in
+  `test/support/dblp_for_training.csv`.
+
+  Each input line is split on tabs into a left name, a right name and a label. When
+  `Akin.match_names_metrics/2` returns exactly one result, its metrics followed by the
+  label (`"match"` when the label is `"1"`, `"non-match"` otherwise) are appended as one
+  CSV row. Any other result is skipped.
+  """
+  def training_data() do
+    NimbleCSV.define(CSVParse, separator: ",", escape: "\\")
+    File.rm("test/support/metrics_for_training.csv")
+
+    # Column order of the emitted CSV; the label is appended after these.
+    columns = [
+      :bag_distance,
+      :substring_set,
+      :sorensen_dice,
+      :metaphone,
+      :double_metaphone,
+      :substring_double_metaphone,
+      :jaccard,
+      :jaro_winkler,
+      :levenshtein,
+      :ngram,
+      :overlap,
+      :substring_sort,
+      :tversky
+    ]
+
+    lines =
+      "test/support/dblp_for_training.csv"
+      |> File.stream!()
+      |> Enum.map(&String.trim/1)
+
+    Enum.each(lines, fn line ->
+      [left, right, flag] = String.split(line, "\t")
+
+      case Akin.match_names_metrics(left, [right]) do
+        [%{left: _, right: _, metrics: metrics, match: _}] ->
+          label =
+            case flag do
+              "1" -> "match"
+              _ -> "non-match"
+            end
+
+          lookup = Map.new(metrics)
+          # Map.fetch!/2 raises KeyError on a missing metric, like the dot access it replaces.
+          row = Enum.map(columns, &Map.fetch!(lookup, &1)) ++ [label]
+          data = CSVParse.dump_to_iodata([row])
+
+          File.write!("test/support/metrics_for_training.csv", [data], [:append])
+
+        _ ->
+          nil
+      end
+    end)
+  end
+
+  @doc """
+  Builds metric rows for the name pairs found in `test/support/orcid/predict_b.csv`.
+
+  Each input line is split on tabs into eleven fields. The sixth field (with `|`
+  replaced by `", "`) is compared against the fourth, tenth and eleventh fields (the
+  last two with `_` replaced by spaces) via `Akin.match_names_metrics/2`. For every
+  result, the metrics followed by the literal label `"match"` are appended as one
+  tab-separated row to `test/support/orcid_for_training.csv`.
+
+  Returns `:ok`. Raises if a line does not have exactly eleven tab-separated fields or
+  if an expected metric is missing from a result.
+  """
+  def tangram_data() do
+    NimbleCSV.define(CSVParse, separator: "\t")
+    # NOTE(review): this removes metrics_for_predicting.csv, but rows are appended to
+    # orcid_for_training.csv below, which is never cleared here. Confirm the intended path.
+    File.rm("test/support/metrics_for_predicting.csv")
+
+    # File.stream!("test/support/orcid_for_predicting.csv")
+    File.stream!("test/support/orcid/predict_b.csv")
+    |> Stream.map(&String.trim(&1))
+    |> Enum.to_list()
+    |> Enum.each(fn row ->
+      # Phase 4 prediction data for tangram
+      # [a, b, c, d] = String.split(row, "\t")
+      [_, _, _, a, _, b, _, _, _, c, d] = String.split(row, "\t")
+      b = String.replace(b, "|", ", ")
+      c = String.replace(c, "_", " ")
+      d = String.replace(d, "_", " ")
+
+      Akin.match_names_metrics(b, [a, c, d])
+      |> Enum.each(fn %{left: _l, right: _r, metrics: s, match: _m} ->
+        scores = Enum.into(s, %{})
+        match = "match"
+
+        # Debug output kept from the original script.
+        IO.inspect(scores)
+
+        # The row ends with the local `match` label. The original had a dangling
+        # `scores.` before it, which parsed as `scores.match` and looked up a `:match`
+        # key on the metrics map instead of using the label.
+        data =
+          [
+            [
+              scores.bag_distance,
+              scores.substring_set,
+              scores.sorensen_dice,
+              scores.metaphone,
+              scores.double_metaphone,
+              scores.substring_double_metaphone,
+              scores.jaccard,
+              scores.jaro_winkler,
+              scores.levenshtein,
+              scores.ngram,
+              scores.overlap,
+              scores.substring_sort,
+              scores.tversky,
+              match
+            ]
+          ]
+          |> CSVParse.dump_to_iodata()
+
+        File.write!("test/support/orcid_for_training.csv", [data], [:append])
+      end)
+    end)
+  end
+end
@@ -50,10 +50,10 @@ defmodule SubstringSetTest do
 
   test "returns expected float value for comparing string of extreme length difference" do
     left = "alice in wonderland"
-    right = "alice's adventures through the looking glass"
+    right = "alice's adventures in wonderland"
 
     normal = normal(left, right)
-    assert normal == 0.79
+    assert normal == 0.83
     assert normal < weak(left, right)
   end