字符串匹配算法

旌磅箱 · 2 小时前

Rabin-Karp算法

Rabin-Karp算法是一种基于哈希函数的字符串匹配算法，由 Michael O. Rabin 和 Richard M. Karp 于1987年提出，核心思想是用哈希函数将模式串和文本串中的子串转换为数值进行比较，避免大量不必要的字符比较。这个算法特别适合多模式串匹配场景，时间复杂度平均为O(n+m)，n是文本串长度，m是模式串长度。
Rabin-Karp算法的关键在于使用滚动哈希函数（Rolling Hash），它可以在常数时间内计算出滑动窗口的新哈希值，保证算法在大多数情况下的高效性。
算法步骤

计算模式串的哈希值
计算文本串中长度为m的第一个子串的哈希值（m为模式串长度）
在文本串上滑动窗口，对于每个位置：
- 使用滚动哈希技术高效计算当前窗口的哈希值
- 如果哈希值与模式串相等，则进行字符逐一比较以避免哈希冲突
- 如果完全匹配，则找到一个匹配位置
重复步骤3，直到处理完整个文本串

核心特性

基于哈希比较：通过哈希值比较代替直接字符比较
滚动哈希：O(1)时间复杂度计算下一窗口的哈希值
时间复杂度：平均情况O(n+m)，最坏情况O(n*m)
空间复杂度：O(1)，只需常数额外空间
适用范围：单模式和多模式串匹配场景，特别是多模式匹配

基础实现

接下来大家一起看下Rabin-Karp算法的部分主流语言实现：

public class RabinKarp {
private final static int PRIME = 101; // 哈希计算使用的质数
public static int search(String text, String pattern) {
int m = pattern.length();
int n = text.length();
if (m > n) return -1;
if (m == 0) return 0;
// 计算哈希乘数，等于d^(m-1) % PRIME，用于滚动哈希计算
int h = 1;
for (int i = 0; i < m - 1; i++) {
h = (h * 256) % PRIME;
}
// 计算模式串和第一个窗口的哈希值
int patternHash = 0;
int textHash = 0;
for (int i = 0; i < m; i++) {
patternHash = (256 * patternHash + pattern.charAt(i)) % PRIME;
textHash = (256 * textHash + text.charAt(i)) % PRIME;
}
// 滑动窗口，比较哈希值
for (int i = 0; i <= n - m; i++) {
// 哈希值相等时，检查是否真正匹配
if (patternHash == textHash) {
boolean match = true;
for (int j = 0; j < m; j++) {
if (text.charAt(i + j) != pattern.charAt(j)) {
match = false;
break;
}
}
if (match) {
return i; // 找到匹配
}
}
// 计算下一个窗口的哈希值
if (i < n - m) {
textHash = (256 * (textHash - text.charAt(i) * h) + text.charAt(i + m)) % PRIME;
// 处理负数哈希值
if (textHash < 0) {
textHash += PRIME;
}
}
}
return -1; // 未找到匹配
}
// 打印结果
public static void main(String[] args) {
String text = "ABABCABABDABACDABABCABAB";
String pattern = "ABABCABAB";
int position = search(text, pattern);
if (position == -1) {
System.out.println("未找到匹配");
} else {
System.out.println("模式串在位置 " + position + " 处匹配");
System.out.println(text);
// 打印指示匹配位置的指针
for (int i = 0; i < position; i++) {
System.out.print(" ");
}
System.out.println(pattern);
}
}
}

复制代码

在上述代码中：

public class ImprovedRabinKarp {
private final static long PRIME1 = 1000000007; // 第一个哈希的质数
private final static long PRIME2 = 1000000009; // 第二个哈希的质数
// 使用双哈希来减少冲突
public static int search(String text, String pattern) {
int m = pattern.length();
int n = text.length();
if (m > n) return -1;
if (m == 0) return 0;
// 计算哈希乘数
long h1 = 1;
long h2 = 1;
for (int i = 0; i < m - 1; i++) {
h1 = (h1 * 256) % PRIME1;
h2 = (h2 * 256) % PRIME2;
}
// 计算模式串和第一个窗口的哈希值
long patternHash1 = 0;
long patternHash2 = 0;
long textHash1 = 0;
long textHash2 = 0;
for (int i = 0; i < m; i++) {
patternHash1 = (256 * patternHash1 + pattern.charAt(i)) % PRIME1;
patternHash2 = (256 * patternHash2 + pattern.charAt(i)) % PRIME2;
textHash1 = (256 * textHash1 + text.charAt(i)) % PRIME1;
textHash2 = (256 * textHash2 + text.charAt(i)) % PRIME2;
}
// 滑动窗口，比较哈希值
for (int i = 0; i <= n - m; i++) {
// 两个哈希都相等时，再进行字符比较
if (patternHash1 == textHash1 && patternHash2 == textHash2) {
boolean match = true;
for (int j = 0; j < m; j++) {
if (text.charAt(i + j) != pattern.charAt(j)) {
match = false;
break;
}
}
if (match) {
return i; // 找到匹配
}
}
// 计算下一个窗口的哈希值
if (i < n - m) {
textHash1 = (256 * (textHash1 - text.charAt(i) * h1) + text.charAt(i + m)) % PRIME1;
textHash2 = (256 * (textHash2 - text.charAt(i) * h2) + text.charAt(i + m)) % PRIME2;
// 处理负数哈希值
if (textHash1 < 0) textHash1 += PRIME1;
if (textHash2 < 0) textHash2 += PRIME2;
}
}
return -1; // 未找到匹配
}
}

复制代码

是 KMP 算法的核心，在匹配失败时根据预先计算的next数组来确定模式串指针的回退位置。
优化

优化后的 next 数组

public class RabinKarpFingerprint {
private final static long PRIME = 1000000007;
private final static int WINDOW_SIZE = 5; // 指纹窗口大小
public static Set<Long> generateFingerprints(String text) {
Set<Long> fingerprints = new HashSet<>();
int n = text.length();
if (n < WINDOW_SIZE) {
fingerprints.add(calculateHash(text, n));
return fingerprints;
}
// 计算第一个窗口的哈希值
long textHash = calculateHash(text, WINDOW_SIZE);
fingerprints.add(textHash);
// 计算哈希乘数
long h = 1;
for (int i = 0; i < WINDOW_SIZE - 1; i++) {
h = (h * 256) % PRIME;
}
// 滑动窗口，计算所有长度为WINDOW_SIZE的子串哈希值
for (int i = 0; i <= n - WINDOW_SIZE - 1; i++) {
textHash = (256 * (textHash - text.charAt(i) * h) + text.charAt(i + WINDOW_SIZE)) % PRIME;
if (textHash < 0) {
textHash += PRIME;
}
fingerprints.add(textHash);
}
return fingerprints;
}
public static double calculateSimilarity(String text1, String text2) {
Set<Long> fingerprints1 = generateFingerprints(text1);
Set<Long> fingerprints2 = generateFingerprints(text2);
// 计算交集大小
Set<Long> intersection = new HashSet<>(fingerprints1);
intersection.retainAll(fingerprints2);
// 计算并集大小
Set<Long> union = new HashSet<>(fingerprints1);
union.addAll(fingerprints2);
// 杰卡德相似度系数
return (double) intersection.size() / union.size();
}
private static long calculateHash(String str, int length) {
long hash = 0;
for (int i = 0; i < length; i++) {
hash = (256 * hash + str.charAt(i)) % PRIME;
}
return hash;
}
}

复制代码

预处理减少分支实现

public class SubstringHash {
private static final long PRIME = 1000000007;
private static final int BASE = 256;
private long[] hash; // 前缀哈希值
private long[] pow; // BASE的幂
private String s; // 源字符串
public SubstringHash(String s) {
this.s = s;
int n = s.length();
hash = new long[n + 1];
pow = new long[n + 1];
// 预计算BASE的幂
pow[0] = 1;
for (int i = 1; i <= n; i++) {
pow[i] = (pow[i - 1] * BASE) % PRIME;
}
// 计算所有前缀的哈希值
hash[0] = 0;
for (int i = 0; i < n; i++) {
hash[i + 1] = (hash[i] * BASE + s.charAt(i)) % PRIME;
}
}
// 计算子串s[l..r]的哈希值（0-indexed）
public long substringHash(int l, int r) {
// 获取s[0...r]的哈希值，减去s[0...l-1]的哈希值（需要进行适当调整）
long result = (hash[r + 1] - (hash[l] * pow[r - l + 1]) % PRIME) % PRIME;
if (result < 0) {
result += PRIME;
}
return result;
}
// 检查两个子串是否相同
public boolean areSubstringsEqual(int l1, int r1, int l2, int r2) {
if (r1 - l1 != r2 - l2) {
return false; // 长度不同
}
return substringHash(l1, r1) == substringHash(l2, r2);
}
}

复制代码

优点

时间复杂度为O(m+n)，优于朴素的字符串匹配算法(暴力解法)
文本串只需扫描一次，不会回退
对于包含重复模式的字符串会高效
预处理模式串，可以多次用于不同的文本串
能快速跳过已知不会匹配的位置

缺点

需要额外的空间存储next数组
构建next数组的逻辑较为复杂，不易理解
在模式串较短或无重复模式时，相比简单算法优势不明显
实现时容易出错，特别是处理边界情况

应用场景

1）生物信息学中的DNA序列匹配
2）网络入侵检测系统中的模式匹配
3）搜索引擎的关键词匹配
4）数据压缩算法中的模式识别
扩展：多模式字符串匹配

public class BoyerMoore {
private final int R; // 字符集大小
private int[] badChar; // 坏字符表
private int[] goodSuffix; // 好后缀表
private int[] borderPos; // 边界位置表
private String pattern; // 模式串
public BoyerMoore(String pattern) {
this.R = 256; // ASCII字符集
this.pattern = pattern;
int m = pattern.length();
// 初始化坏字符表
badChar = new int[R];
for (int c = 0; c < R; c++) {
badChar[c] = -1; // 初始化为-1
}
for (int j = 0; j < m; j++) {
badChar[pattern.charAt(j)] = j; // 记录每个字符最右出现位置
}
// 初始化好后缀表和边界位置表
goodSuffix = new int[m];
borderPos = new int[m];
processSuffixes();
}
// 预处理好后缀表
private void processSuffixes() {
int m = pattern.length();
int i = m, j = m + 1;
borderPos[i] = j;
// 计算边界位置
while (i > 0) {
while (j <= m && pattern.charAt(i - 1) != pattern.charAt(j - 1)) {
if (goodSuffix[j] == 0) {
goodSuffix[j] = j - i;
}
j = borderPos[j];
}
i--; j--;
borderPos[i] = j;
}
// 计算好后缀表
j = borderPos[0];
for (i = 0; i <= m; i++) {
if (goodSuffix[i] == 0) {
goodSuffix[i] = j;
}
if (i == j) {
j = borderPos[j];
}
}
}
// 搜索文本串中的匹配
public int search(String text) {
int n = text.length();
int m = pattern.length();
if (m == 0) return 0;
int skip;
for (int i = 0; i <= n - m; i += skip) {
skip = 0;
for (int j = m - 1; j >= 0; j--) {
if (pattern.charAt(j) != text.charAt(i + j)) {
// 坏字符规则
skip = Math.max(1, j - badChar[text.charAt(i + j)]);
// 好后缀规则
if (j < m - 1) {
skip = Math.max(skip, goodSuffix[j + 1]);
}
break;
}
}
if (skip == 0) return i; // 找到匹配
}
return -1; // 没有找到匹配
}
// 测试
public static void main(String[] args) {
String text = "HERE IS A SIMPLE EXAMPLE";
String pattern = "EXAMPLE";
BoyerMoore bm = new BoyerMoore(pattern);
int position = bm.search(text);
if (position == -1) {
System.out.println("未找到匹配");
} else {
System.out.println("模式串在位置 " + position + " 处匹配");
System.out.println(text);
for (int i = 0; i < position; i++) {
System.out.print(" ");
}
System.out.println(pattern);
}
}
}

复制代码

账号		自动登录	找回密码
密码			立即注册

字符串匹配算法

相关帖子

签约作者

字符串匹配算法

相关帖子

相关推荐

签约作者