From 8679a70c5567770c532e28e6c7342ffe5ea11779 Mon Sep 17 00:00:00 2001 From: lauraste1 <30854371+lauraste1@users.noreply.github.com> Date: Sat, 2 Mar 2019 10:00:27 +1100 Subject: [PATCH] Add Boyer-Moore article to string algorithms section of guide (#34937) * Add files via upload * fix: corrected front matter * fix: renamed to be index.md --- .../boyermoore/index.md | 89 +++++++++++++++++++ 1 file changed, 89 insertions(+) create mode 100644 guide/english/algorithms/string-matching-algorithms/boyermoore/index.md diff --git a/guide/english/algorithms/string-matching-algorithms/boyermoore/index.md b/guide/english/algorithms/string-matching-algorithms/boyermoore/index.md new file mode 100644 index 0000000000..075018e3f9 --- /dev/null +++ b/guide/english/algorithms/string-matching-algorithms/boyermoore/index.md @@ -0,0 +1,89 @@ +--- +title: Boyer-Moore +--- + +## Boyer-Moore + +This algorithm has been shown to be more effective than the Knuth-Morris-Pratt algorithm and others for pattern matching in natural languages like English. It relies on the use of [two key heuristics][1], namely: +* The looking-glass heuristic: where the pattern P is compared to a substring of the text T starting from P’s last letter +* Character-jump heuristic: if there is a mismatch at T[i] = c then + - If P contains c, shift P to line up the last instance of c in P with T[i] + - Else move P to align P[0] with T[i+1] + +Before applying these heuristics however, the algorithm analyses pattern P and alphabet Σ to create a last occurrence function. This function ties the letters of the alphabet to the letters in P according to where they occur in P. + +So if the last occurence of the letter c in P is at index P[i] then at L(c) the index i of that letter will be stored. If the letter c does not occur in P, -1 will be stored there. + +#### Example: + +__Σ =__ {e, f, g, h} +__P =__ egef + +| | | | | | +|--- |--- |--- |--- |--- | +| __c__ | e | f | g | h | +| __L(c)__ | 2 | 3 | 1 | -1 | + +The last occurrence function can be stored as an array indexed by the numeric codes of the characters. This function can be calculated in O(m+s) time where m is the length of the pattern and s the size of the alphabet. + +### Code of Boyer-Moore Algorithm in C++ +```c++ +#include +#define MAXCHAR 256 //there are 256 ASCII characters +using namespace std; + +int min(int num1, int num2) { + if (num1 < num2) { + return num1; + } + return num2; +} + +void makeLOFunction(string pattern, int LOFunction[MAXCHAR]) { + int n = pattern.size(); + for (int c = 0; c < MAXCHAR; c++) + LOFunction[c] = -1; + for (int c = 0; c < n; c++) + LOFunction[(int)pattern[c]] = c; +} + +int boyerMoore(string pattern, string text) { + int m = pattern.size(); + int n = text.size(); + int LOFunction[MAXCHAR]; + makeLOFunction(pattern, LOFunction); //fill array of last occurrence function + int i = m-1; + int j = m-1; + do { + if (text[i] == pattern[j]) { + if (j == 0) { + cout << "Pattern starts at index " << i << "\n"; + return 1; //match found at i + } else { + i--; + j--; //move backwards through pattern + } + } else { + int leap = LOFunction[(int)text[i]]; //character jump + i = i + m - min(j, leap+1); + j = m - 1; + } + } + while (i <= n-1); //while not reached end of text + cout << "Match not found\n"; + return -1; //no match found +} + +int main() { + string pattern = "abacab"; + string text = "abacaabadcabacabaabb"; + boyerMoore(pattern, text); +} + +``` +#### References +[1]: https://www.cs.purdue.edu/homes/aliaga/cs251-16/lectures/22-PatternMatching.pdf +1. Aliaga, D (2016) *Strings and Pattern Matching*, lecture notes, Data Structures and Algorithms CS251, Purdue University, delivered 9 April 2016. +2. Christine, S (2018) Bad Character Heuristic [Source code]. https://www.tutorialspoint.com/Bad-Character-Heuristic +3. Buerger, H (2012) Boyer-Moore Algorithm [Source code]. +https://gist.github.com/obstschale/3060059