summaryrefslogtreecommitdiff
path: root/src/util/didyoumean.cpp
blob: 0f95722fd55bf616feb68ca67e4f5a50814c6fb1 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
/******************************************************************************
 * Top contributors (to current version):
 *   Kshitij Bansal, Tim King, Clark Barrett
 *
 * This file is part of the cvc5 project.
 *
 * Copyright (c) 2009-2021 by the authors listed in the file AUTHORS
 * in the top-level source directory and their institutional affiliations.
 * All rights reserved.  See the file COPYING in the top-level source
 * directory for licensing information.
 * ****************************************************************************
 *
 * Did-you-mean style suggestions.
 *
 * ``What do you mean? I don't understand.'' An attempt to be more
 * helpful than that. Similar to one in git.
 *
 * There are no dependencies on cvc5 (except namespace).
 */

#include "util/didyoumean.h"

#include <algorithm>
#include <array>
#include <sstream>
#include <string>
#include <vector>

namespace cvc5 {

namespace {

uint64_t editDistance(const std::string& a, const std::string& b) {
  // input string: a
  // desired string: b

  constexpr uint64_t swapCost = 0;
  constexpr uint64_t substituteCost = 2;
  constexpr uint64_t addCost = 1;
  constexpr uint64_t deleteCost = 3;
  constexpr uint64_t switchCaseCost = 0;

  uint64_t len1 = a.size();
  uint64_t len2 = b.size();

  std::array<std::vector<uint64_t>, 3> C;
  for (auto& c: C)
  {
    c.resize(len2 + 1);
  }
  //  int C[3][len2+1];             // cost

  for (uint64_t j = 0; j <= len2; ++j)
  {
    C[0][j] = j * addCost;
  }

  for (uint64_t i = 1; i <= len1; ++i)
  {
    uint64_t cur = i % 3;
    uint64_t prv = (i + 2) % 3;
    uint64_t pr2 = (i + 1) % 3;

    C[cur][0] = i * deleteCost;

    for (uint64_t j = 1; j <= len2; ++j)
    {
      C[cur][j] = 100000000;  // INF

      if (a[i - 1] == b[j - 1]) {
        // match
        C[cur][j] = std::min(C[cur][j], C[prv][j - 1]);
      } else if (tolower(a[i - 1]) == tolower(b[j - 1])) {
        // switch case
        C[cur][j] = std::min(C[cur][j], C[prv][j - 1] + switchCaseCost);
      } else {
        // substitute
        C[cur][j] = std::min(C[cur][j], C[prv][j - 1] + substituteCost);
      }

      // swap
      if (i >= 2 && j >= 2 && a[i - 1] == b[j - 2] && a[i - 2] == b[j - 1]) {
        C[cur][j] = std::min(C[cur][j], C[pr2][j - 2] + swapCost);
      }

      // add
      C[cur][j] = std::min(C[cur][j], C[cur][j - 1] + addCost);

      // delete
      C[cur][j] = std::min(C[cur][j], C[prv][j] + deleteCost);
    }
  }
  return C[len1 % 3][len2];
}

}

std::vector<std::string> DidYouMean::getMatch(const std::string& input)
{
  {
    std::sort(d_words.begin(), d_words.end());
    auto it = std::unique(d_words.begin(), d_words.end());
    d_words.erase(it, d_words.end());
  }

  /** Magic numbers */
  constexpr uint64_t similarityThreshold = 7;
  constexpr uint64_t numMatchesThreshold = 10;

  std::vector<std::pair<uint64_t,std::string>> scores;
  std::vector<std::string> ret;
  for (const auto& s: d_words) {
    if (s == input) {
      // if input matches AS-IS just return that
      ret.emplace_back(s);
      return ret;
    }
    uint64_t score = 0;
    if (s.compare(0, input.size(), input) != 0) {
      score = editDistance(input, s) + 1;
    }
    scores.emplace_back(std::make_pair(score, s));
  }
  std::sort(scores.begin(), scores.end());
  const uint64_t min_score = scores.begin()->first;
  for (const auto& score: scores) {
    // from here on, matches are not similar enough
    if (score.first > similarityThreshold) break;
    // from here on, matches are way worse than the best one
    if (score.first > min_score + 2) break;
    // we already have enough matches
    if (ret.size() >= numMatchesThreshold) break;
    ret.push_back(score.second);
  }
  return ret;
}

std::string DidYouMean::getMatchAsString(const std::string& input)
{
  std::vector<std::string> matches = getMatch(input);
  std::ostringstream oss;
  if (matches.size() > 0) {
    oss << std::endl << std::endl;
    if (matches.size() == 1) {
      oss << "Did you mean this?";
    } else {
      oss << "Did you mean any of these?";
    }
    for (size_t i = 0; i < matches.size(); ++i)
    {
      oss << "\n        " << matches[i];
    }
  }
  return oss.str();
}

}  // namespace cvc5
generated by cgit on debian on lair
contact matthew@masot.net with questions or feedback