-
Notifications
You must be signed in to change notification settings - Fork 1
/
BoyerMoore.h
175 lines (155 loc) · 6.53 KB
/
BoyerMoore.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
#include <string>
#include <map>
#include <vector>
#include <assert.h>
#ifndef max
#define max(a, b) ((a < b) ? b : a)
#endif
template<bool caseSensitive>
wchar_t make_case(wchar_t c);
template<>
wchar_t make_case<true>(wchar_t c) { return c; }
template<>
wchar_t make_case<false>(wchar_t c) { return towupper(c); }
// delta1 table: delta1[c] contains the distance between the last
// character of pat and the rightmost occurrence of c in pat.
// If c does not occur in pat, then delta1[c] = pat.size().
// If c is at string[i] and c != pat[pat.size()-1], we can
// safely shift i over by delta1[c], which is the minimum distance
// needed to shift pat forward to get string[i] lined up
// with some character in pat.
// this algorithm runs in pat.size() time.
template<bool caseSensitive, class T>
std::map<wchar_t, size_t> make_delta1(const T& patBegin, const T& patEnd) {
assert(patBegin != patEnd);
std::map<wchar_t, size_t> delta1;
size_t patSize = patEnd - patBegin;
for (size_t i = 0; i < patSize - 1; i++) {
delta1[make_case<caseSensitive>(patBegin[i])] = patSize - 1 - i;
}
return delta1;
}
// true if the suffix of word starting from word[pos] is a prefix
// of word
template<bool caseSensitive, class T>
bool is_prefix(const T& wordBegin, const T& wordEnd, size_t pos) {
size_t suffixlen = (wordEnd - wordBegin) - pos;
for (size_t i = 0; i < suffixlen; i++) {
if (make_case<caseSensitive>(wordBegin[i]) != make_case<caseSensitive>(wordBegin[pos + i])) {
return false;
}
}
return true;
}
// length of the longest suffix of word ending on word[pos].
// suffix_length(L"dddbcabc", 4) = 2
template<bool caseSensitive, class T>
size_t suffix_length(const T& wordBegin, const T& wordEnd, size_t pos) {
size_t i;
size_t wordSize = (wordEnd - wordBegin);
// increment suffix length i to the first mismatch or beginning
// of the word
for (i = 0; (make_case<caseSensitive>(wordBegin[pos - i]) == make_case<caseSensitive>(wordBegin[wordSize - 1 - i])) && (i < pos); i++);
return i;
}
// delta2 table: given a mismatch at pat[pos], we want to align
// with the next possible full match could be based on what we
// know about pat[pos+1] to pat[pat.size()-1].
//
// In case 1:
// pat[pos+1] to pat[pat.size()-1] does not occur elsewhere in pat,
// the next plausible match starts at or after the mismatch.
// If, within the substring pat[pos+1 .. pat.size()-1], lies a prefix
// of pat, the next plausible match is here (if there are multiple
// prefixes in the substring, pick the longest). Otherwise, the
// next plausible match starts past the character aligned with
// pat[pat.size()-1].
//
// In case 2:
// pat[pos+1] to pat[pat.size()-1] does occur elsewhere in pat. The
// mismatch tells us that we are not looking at the end of a match.
// We may, however, be looking at the middle of a match.
//
// The first loop, which takes care of case 1, is analogous to
// the KMP table, adapted for a 'backwards' scan order with the
// additional restriction that the substrings it considers as
// potential prefixes are all suffixes. In the worst case scenario
// pat consists of the same letter repeated, so every suffix is
// a prefix. This loop alone is not sufficient, however:
// Suppose that pat is "ABYXCDBYX", and text is ".....ABYXCDEYX".
// We will match X, Y, and find B != E. There is no prefix of pat
// in the suffix "YX", so the first loop tells us to skip forward
// by 9 characters.
// Although superficially similar to the KMP table, the KMP table
// relies on information about the beginning of the partial match
// that the BM algorithm does not have.
//
// The second loop addresses case 2. Since suffix_length may not be
// unique, we want to take the minimum value, which will tell us
// how far away the closest potential match is.
template<bool caseSensitive, class T>
std::vector<size_t> make_delta2(const T& patBegin, const T& patEnd) {
assert(patBegin != patEnd);
size_t patSize = patEnd - patBegin;
std::vector<size_t> delta2(patSize);
size_t last_prefix_index = patSize - 1;
// first loop
for (size_t p = patSize; p > 0; --p) {
if (is_prefix<caseSensitive>(patBegin, patEnd, p)) {
last_prefix_index = p;
}
delta2[p - 1] = last_prefix_index + (patSize - p);
}
// second loop
for (size_t p = 0; p < patSize - 1; p++) {
size_t slen = suffix_length<caseSensitive>(patBegin, patEnd, p);
if (make_case<caseSensitive>(patBegin[p - slen]) != make_case<caseSensitive>(patBegin[patSize - 1 - slen])) {
delta2[patSize - 1 - slen] = patSize - 1 - p + slen;
}
}
return delta2;
}
template<bool caseSensitive, class T, class U>
size_t boyer_moore_it(const T& string, size_t begin, size_t end, const U& patBegin, const U& patEnd) {
// The empty pattern must be considered specially
size_t patSize = patEnd - patBegin;
if (patSize == 0 || patSize > (end - begin)) {
return UINT_MAX;
}
const std::map<wchar_t, size_t> delta1(make_delta1<caseSensitive>(patBegin, patEnd));
const std::vector<size_t> delta2(make_delta2<caseSensitive>(patBegin, patEnd));
size_t i = begin + patSize;
while (i < end) {
size_t j = patSize;
while (j > 0 && (make_case<caseSensitive>(string[i - 1]) == make_case<caseSensitive>(patBegin[j - 1]))) {
--i;
--j;
}
if (j == 0) {
return i;
}
std::map<wchar_t, size_t>::const_iterator it = delta1.find(make_case<caseSensitive>(string[i - 1]));
size_t d1 = it != delta1.end() ? d1 = it->second : patSize;
i += max(d1, delta2[j - 1]);
}
return UINT_MAX;
}
template<bool caseSensitive, class T>
size_t boyer_moore(const T& string, size_t begin, size_t end, std::wstring pat) {
return boyer_moore_it<caseSensitive>(string.begin(), begin, end, pat.begin(), pat.end());
}
template<bool caseSensitive, class T>
size_t boyer_moore_r(const T& string, size_t begin, size_t end, const std::wstring& pat) {
size_t p = boyer_moore_it<caseSensitive>(string.rbegin(), string.size() - end, string.size() - begin, pat.rbegin(), pat.rend());
if (p != UINT_MAX)
p = string.size() - pat.size() - p;
return p;
}
template<bool caseSensitive>
size_t boyer_moore(const std::wstring& string, const std::wstring& pat) {
return boyer_moore<caseSensitive>(string, 0, string.size(), pat);
}
template<bool caseSensitive>
size_t boyer_moore_r(const std::wstring& string, const std::wstring& pat) {
return boyer_moore_r<caseSensitive>(string, 0, string.size(), pat);
}