summaryrefslogtreecommitdiff
path: root/lib/did_you_mean/jaro_winkler.rb
blob: 56db130af45993517ead8b70e4a163e5061c5b25 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
module DidYouMean
  module Jaro
    module_function

    def distance(str1, str2)
      str1, str2 = str2, str1 if str1.length > str2.length
      length1, length2 = str1.length, str2.length

      m          = 0.0
      t          = 0.0
      range      = (length2 / 2).floor - 1
      range      = 0 if range < 0
      flags1     = 0
      flags2     = 0

      # Avoid duplicating enumerable objects
      str1_codepoints = str1.codepoints
      str2_codepoints = str2.codepoints

      i = 0
      while i < length1
        last = i + range
        j    = (i >= range) ? i - range : 0

        while j <= last
          if flags2[j] == 0 && str1_codepoints[i] == str2_codepoints[j]
            flags2 |= (1 << j)
            flags1 |= (1 << i)
            m += 1
            break
          end

          j += 1
        end

        i += 1
      end

      k = i = 0
      while i < length1
        if flags1[i] != 0
          j = index = k

          k = while j < length2
            index = j
            break(j + 1) if flags2[j] != 0

            j += 1
          end

          t += 1 if str1_codepoints[i] != str2_codepoints[index]
        end

        i += 1
      end
      t = (t / 2).floor

      m == 0 ? 0 : (m / length1 + m / length2 + (m - t) / m) / 3
    end
  end

  module JaroWinkler
    WEIGHT    = 0.1
    THRESHOLD = 0.7

    module_function

    def distance(str1, str2)
      jaro_distance = Jaro.distance(str1, str2)

      if jaro_distance > THRESHOLD
        codepoints2  = str2.codepoints
        prefix_bonus = 0

        i = 0
        str1.each_codepoint do |char1|
          char1 == codepoints2[i] && i < 4 ? prefix_bonus += 1 : break
          i += 1
        end

        jaro_distance + (prefix_bonus * WEIGHT * (1 - jaro_distance))
      else
        jaro_distance
      end
    end
  end
end