A Fast String Searching Algorithm

BF算法(Brute Force)

将模式串和主串进行比较,一致时则继续比较下一字符,直到比较完整个模式串。不一致时则将模式串后移一位,重新从模式串的首位开始对比

给定一个 haystack 字符串和一个 needle 字符串,在 haystack 字符串中找出 needle 字符串出现的第一个位置 (从0开始)。如果不存在,则返回 -1。

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
package main

import "fmt"

func main() {
    haystack := "hello"
    needle := "ll"
    n := searching(haystack, needle)
    fmt.Println(n) // 2
}

func searching(haystack, needle string) int {
    //i代表主串指针,j代表模式串
    var i, j int
    var halen = len(haystack)
    var nelen = len(needle)
    for i < halen && j < nelen {
        if haystack[i] == needle[j] {
            j++
        } else {
            i -= j
            j = 0
        }
        i++
    }
    if j == nelen {
        return i - nelen
    }
    return -1
}

BM算法 (Boyer-Moore)

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
package main

import (
    "fmt"
)

func main() {
    haystack := "hello"
    needle := "ll"

    hay := []rune(haystack)
    need := []rune(needle)

    var halen = len(haystack)
    var nelen = len(needle)

    n := bm(hay, need, halen, nelen)
    fmt.Println(n) // 2
}

func badChar(b []rune, m int) (bc []int) {
    bc = make([]int, 256)
    var i int
    for i < 256 { // charset
        bc[i] = -1
        i++
    }

    i = 0
    for i < m {
        var ascii = b[i]
        bc[ascii] = i
        i++
    }
    return
}

func goodSuffix(b []rune, m int) (suffix []int, prefix []bool) {
    suffix = make([]int, m)
    prefix = make([]bool, m)
    var i int
    for i < m {
        suffix[i] = -1
        //prefix[i] = false
        i++
    }

    i = 0
    for i < (m - 1) {
        var j = i
        var k = 0

        for j >= 0 && (b[j] == b[m-1-k]) {
            k++
            suffix[k] = j + 1
            j--
        }
        if j == -1 {
            prefix[k] = true
        }
        i++
    }
    return
}

func bm(a, b []rune, n, m int) int {
    bc := badChar(b, m)
    suffix_index, ispre := goodSuffix(b, m)
    var i = 0
    var j int
    for i <= (n - m) {
        j = 0
        //从后往前匹配,找到坏字符
        for j = m - 1; j >= 0; {
            if a[i+j] != b[j] {
                break
            }
            j--
        }
        //匹配成功
        if j < 0 {
            return i
        }
        //求坏字符规则下移动位数
        var x = j - bc[a[i+j]]
        var y int
        if y < (m-1) && (m-1-j) > 0 {
            y = move(j, m, suffix_index, ispre)
        }
        //移动
        if x >= y {
            i += x
        } else {
            i += y
        }
    }
    return -1
}

// j代表坏字符的下标
func move(j, m int, suffix_index []int, ispre []bool) int {
    //好后缀长度
    var k = m - 1 - j
    //如果含有长度为k的好后缀,返回移动位数
    if suffix_index[k] != -1 {
        return j - suffix_index[k] + 1
    }
    //找头部为好后缀子串的最大长度,从长度最大的子串开始
    for r := j + 2; r <= m-1; {
        //如果是头部
        if ispre[m-r] {
            return r
        }
        r++
    }
    //没有好后缀,或头部为好后缀子串,返回匹配串长度m
    return m
}

KMP算法(Knuth-Morris-Pratt)

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
package main

import "fmt"

func main() {
    haystack := "hello"
    needle := "ll"
    n := searching(haystack, needle)
    fmt.Println(n) // 2
}

func searching(haystack, needle string) int {
    if len(needle) == 0 {
        return 0
    }
    if len(haystack) == 0 {
        return -1
    }

    hasyarr := []rune(haystack)
    nearr := []rune(needle)

    halen := len(haystack)
    nelen := len(nearr)

    return kmp(hasyarr, nearr, halen, nelen)
}

func kmp(hasyarr, nearr []rune, halen, nelen int) int {
    //获取next数组
    next := next(nearr, nelen)
    var i, j int
    for i < halen {
        //发现不匹配字符,按next数组移动到最大公共后缀
        //前缀的后一位
        for j > 0 && hasyarr[i] != nearr[j] {
            j = next[j-1] + 1
            //超出长度
            if nelen-j+i > halen {
                return -1
            }
        }
        //相同后移比较下个字符
        if hasyarr[i] == nearr[j] {
            j++
        }
        //遍历完,返回起点下标
        if j == nelen {
            return i - nelen + 1
        }
        i++
    }
    return -1
}

func next(nearr []rune, nelen int) (next []int) {
    next = make([]int, nelen)
    next[0] = -1
    var k = -1
    var i = 1
    for i < nelen {
        for (k != -1) && nearr[k+1] != nearr[i] {
            k = next[k]
        }
        if nearr[k+1] == nearr[i] {
            k++
        }
        next[i] = k
        i++
    }
    return
}

参考