summaryrefslogtreecommitdiff
path: root/lib/did_you_mean/jaro_winkler.rb
diff options
context:
space:
mode:
Diffstat (limited to 'lib/did_you_mean/jaro_winkler.rb')
-rw-r--r--lib/did_you_mean/jaro_winkler.rb87
1 files changed, 87 insertions, 0 deletions
diff --git a/lib/did_you_mean/jaro_winkler.rb b/lib/did_you_mean/jaro_winkler.rb
new file mode 100644
index 0000000000..56db130af4
--- /dev/null
+++ b/lib/did_you_mean/jaro_winkler.rb
@@ -0,0 +1,87 @@
+module DidYouMean
+ module Jaro
+ module_function
+
+ def distance(str1, str2)
+ str1, str2 = str2, str1 if str1.length > str2.length
+ length1, length2 = str1.length, str2.length
+
+ m = 0.0
+ t = 0.0
+ range = (length2 / 2).floor - 1
+ range = 0 if range < 0
+ flags1 = 0
+ flags2 = 0
+
+ # Avoid duplicating enumerable objects
+ str1_codepoints = str1.codepoints
+ str2_codepoints = str2.codepoints
+
+ i = 0
+ while i < length1
+ last = i + range
+ j = (i >= range) ? i - range : 0
+
+ while j <= last
+ if flags2[j] == 0 && str1_codepoints[i] == str2_codepoints[j]
+ flags2 |= (1 << j)
+ flags1 |= (1 << i)
+ m += 1
+ break
+ end
+
+ j += 1
+ end
+
+ i += 1
+ end
+
+ k = i = 0
+ while i < length1
+ if flags1[i] != 0
+ j = index = k
+
+ k = while j < length2
+ index = j
+ break(j + 1) if flags2[j] != 0
+
+ j += 1
+ end
+
+ t += 1 if str1_codepoints[i] != str2_codepoints[index]
+ end
+
+ i += 1
+ end
+ t = (t / 2).floor
+
+ m == 0 ? 0 : (m / length1 + m / length2 + (m - t) / m) / 3
+ end
+ end
+
+ module JaroWinkler
+ WEIGHT = 0.1
+ THRESHOLD = 0.7
+
+ module_function
+
+ def distance(str1, str2)
+ jaro_distance = Jaro.distance(str1, str2)
+
+ if jaro_distance > THRESHOLD
+ codepoints2 = str2.codepoints
+ prefix_bonus = 0
+
+ i = 0
+ str1.each_codepoint do |char1|
+ char1 == codepoints2[i] && i < 4 ? prefix_bonus += 1 : break
+ i += 1
+ end
+
+ jaro_distance + (prefix_bonus * WEIGHT * (1 - jaro_distance))
+ else
+ jaro_distance
+ end
+ end
+ end
+end