数据结构week12


Week12KMP

串的模式匹配(KMP)

什么是串

线性存储的 组数据 一组数据(默 是字符 认是字符)

特殊操作集

  • 求串的长度
  • 比较两串是否相等
  • 两串相接
  • 求子串
  • 插入子串
  • 匹配子串
  • 删除子串

目标

给定一段文本,从中找出某个指定的关键字。

例如从一本 Thomas Love Peacock 写于十九世纪的小说《 Headlong Hall 》 中找到那个最长的单词

或者从古希腊喜剧《 Assemblywomen 》 中找到一道菜的名字

给定一段文本: $string = s_0s_1 …… s_{n-1}$

给定一个模式:$ pattern = p_0p_1 …… p_{m-1}$

Position PatternMatch(char *string, char *pattern)

简单实现

方法1: C的库函数 strstr

#include <stdio.h>
#include <string.h>
//char *strstr(char *string, char *pattern);
typedef char *Position;
int main() {
    char string[] = "This is a simple example.";
    char pattern[] = "simple";
    Position p = strstr(string, pattern);
    if (p == NotFound) printf("Not Found.\n");
	else printf("%s\n", p);
    return 0;
}

$strlen(string)=n,strlen(pattern)=m$,下同

$T
= O(n·m) $

简单改进

方法2:从末尾开始比

改了个寂寞

大师改进

方法3: KMP( Knuth、 Morris、 Pratt ) 算法

$T = O(n+m) $
$$
match(j)=\left\{
\begin{aligned}
&满足p_0\cdots p_i=p_{j-i}\cdots p_j的最大i(<j) \newline
&-1\ 如果这样的i不存在
\end{aligned}
\right.
$$

KMP算法实现

#include <stdio.h>
#include <string.h>
typedef int Position;
#define NotFound -1
int main() {
    char string[] = "This is a simple example.";
    char pattern[] = "simple";
    Position p = KMP(string, pattern);
    if (p == NotFound) printf("Not Found.\n");
    else printf("%s\n", string + p);
    return 0;
}

Position KMP(char *string, char *pattern) {
    int n = strlen(string);		/*O(n)*/
    int m = strlen(pattern);	/*O(m)*/
    int s, p, *match;
    
    match = (int *)malloc(sizeof(int) * m);
    BuildMatch(pattern, match);	/*Tm=o(?)*/
    s = p = 0;
    while (s < n && p < m) {	/*O(n)*/
        if (string[s] == pattern[p]) { s++; p++;} 
        else if (p > 0) p = match[p - 1] + 1;
        else s++;
    }
    return (p == m) ? (s - m) : NotFound;
}

$T = O(n+m+T_m)$

BuildMatch 的实现

for ( j = 0; j < m; j++ )

$1+2+\cdots+\frac{j+1}{2}+\cdots+j=O(j^2)$

$T_m = O( m^3 )$​ Bad!

void BuildMatch(char *pattern, int *match) {
    int i, j;
    int m = strlen(pattern);	/*O(m)*/
    match[0] = -1;
    for (j = 1; j < m; < j++) {	/*O(m)*/
        i = match[j - 1];
        while ((i >= 0) && (pattern[i + 1] != pattern[j])) i = match[i];
        if (pattern[i + 1] == pattern[j]) match[j] = i + 1;
        else match[j] = -1;
    }
}

$i$ 回退的总次数不会超过 $i$增加的总次数

$T_m=O(m)$

代码模板

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

typedef int Position;
#define NotFound -1

void BuildMatch(char *pattern, int *match) {
    Position i, j;
    int m = strlen(pattern);
    match[0] = -1;

    for (j = 1; j < m; j++) {
        i = match[j - 1];
        while ((i >= 0) && (pattern[i + 1] != pattern[j]))
            i = match[i];
        if (pattern[i + 1] == pattern[j])
            match[j] = i + 1;
        else
            match[j] = -1;
    }
}

Position KMP(char *string, char *pattern) {
    int n = strlen(string);
    int m = strlen(pattern);
    Position s, p, *match;

    if (n < m)
        return NotFound;
    match = (Position *)malloc(sizeof(Position) * m);
    BuildMatch(pattern, match);
    s = p = 0;
    while (s < n && p < m) {
        if (string[s] == pattern[p]) {
            s++;
            p++;
        } else if (p > 0)
            p = match[p - 1] + 1;
        else
            s++;
    }
    return (p == m) ? (s - m) : NotFound;
}

int main() {
    char string[] = "This is a simple example.";
    char pattern[] = "simple";
    Position p = KMP(string, pattern);
    if (p == NotFound)
        printf("Not Found.\n");
    else
        printf("%s\n", string + p);
    return 0;
}

文章作者: Phoenix
版权声明: 本博客所有文章除特別声明外,均采用 CC BY 4.0 许可协议。转载请注明来源 Phoenix !
  目录