Week12KMP
串的模式匹配(KMP)
什么是串
线性存储的 组数据 一组数据(默 是字符 认是字符)
特殊操作集
- 求串的长度
- 比较两串是否相等
- 两串相接
- 求子串
- 插入子串
- 匹配子串
- 删除子串
目标
给定一段文本,从中找出某个指定的关键字。
例如从一本 Thomas Love Peacock 写于十九世纪的小说《 Headlong Hall 》 中找到那个最长的单词
或者从古希腊喜剧《 Assemblywomen 》 中找到一道菜的名字
给定一段文本: $string = s_0s_1 …… s_{n-1}$
给定一个模式:$ pattern = p_0p_1 …… p_{m-1}$
Position PatternMatch(char *string, char *pattern)
简单实现
方法1: C的库函数 strstr
#include <stdio.h>
#include <string.h>
//char *strstr(char *string, char *pattern);
typedef char *Position;
int main() {
char string[] = "This is a simple example.";
char pattern[] = "simple";
Position p = strstr(string, pattern);
if (p == NotFound) printf("Not Found.\n");
else printf("%s\n", p);
return 0;
}
$strlen(string)=n,strlen(pattern)=m$,下同
$T
= O(n·m) $
简单改进
方法2:从末尾开始比
改了个寂寞
大师改进
方法3: KMP( Knuth、 Morris、 Pratt ) 算法
$T = O(n+m) $
$$
match(j)=\left\{
\begin{aligned}
&满足p_0\cdots p_i=p_{j-i}\cdots p_j的最大i(<j) \newline
&-1\ 如果这样的i不存在
\end{aligned}
\right.
$$
KMP算法实现
#include <stdio.h>
#include <string.h>
typedef int Position;
#define NotFound -1
int main() {
char string[] = "This is a simple example.";
char pattern[] = "simple";
Position p = KMP(string, pattern);
if (p == NotFound) printf("Not Found.\n");
else printf("%s\n", string + p);
return 0;
}
Position KMP(char *string, char *pattern) {
int n = strlen(string); /*O(n)*/
int m = strlen(pattern); /*O(m)*/
int s, p, *match;
match = (int *)malloc(sizeof(int) * m);
BuildMatch(pattern, match); /*Tm=o(?)*/
s = p = 0;
while (s < n && p < m) { /*O(n)*/
if (string[s] == pattern[p]) { s++; p++;}
else if (p > 0) p = match[p - 1] + 1;
else s++;
}
return (p == m) ? (s - m) : NotFound;
}
$T = O(n+m+T_m)$
BuildMatch 的实现
for ( j = 0; j < m; j++ )
$1+2+\cdots+\frac{j+1}{2}+\cdots+j=O(j^2)$
$T_m = O( m^3 )$ Bad!
void BuildMatch(char *pattern, int *match) {
int i, j;
int m = strlen(pattern); /*O(m)*/
match[0] = -1;
for (j = 1; j < m; < j++) { /*O(m)*/
i = match[j - 1];
while ((i >= 0) && (pattern[i + 1] != pattern[j])) i = match[i];
if (pattern[i + 1] == pattern[j]) match[j] = i + 1;
else match[j] = -1;
}
}
$i$ 回退的总次数不会超过 $i$增加的总次数
$T_m=O(m)$
代码模板
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
typedef int Position;
#define NotFound -1
void BuildMatch(char *pattern, int *match) {
Position i, j;
int m = strlen(pattern);
match[0] = -1;
for (j = 1; j < m; j++) {
i = match[j - 1];
while ((i >= 0) && (pattern[i + 1] != pattern[j]))
i = match[i];
if (pattern[i + 1] == pattern[j])
match[j] = i + 1;
else
match[j] = -1;
}
}
Position KMP(char *string, char *pattern) {
int n = strlen(string);
int m = strlen(pattern);
Position s, p, *match;
if (n < m)
return NotFound;
match = (Position *)malloc(sizeof(Position) * m);
BuildMatch(pattern, match);
s = p = 0;
while (s < n && p < m) {
if (string[s] == pattern[p]) {
s++;
p++;
} else if (p > 0)
p = match[p - 1] + 1;
else
s++;
}
return (p == m) ? (s - m) : NotFound;
}
int main() {
char string[] = "This is a simple example.";
char pattern[] = "simple";
Position p = KMP(string, pattern);
if (p == NotFound)
printf("Not Found.\n");
else
printf("%s\n", string + p);
return 0;
}