问题描述
我正在使用加权的Levenshtein距离,其成本如下:
I am using a weighted Levenshtein distance with the following costs:
- 插入:1
- 删除:1
- 替换:2
正如wildwasser在评论中指出的,这意味着将替换视为插入和删除.因此该算法可以避免替换.
As pointed out by wildwasser in a comment, this means, that a substitution is treated as an insertion and a deletion. So substitutions could be avoided by the algorithm.
对于每个操作成本为1的常规实现,有多种位并行实现,例如Myers/Hyyrö:
For the normal implementation with a cost of 1 for each operation there are multiple bitparallel implementations like e.g. Myers/Hyyrö:
static const uint64_t masks[64] = {
0x0000000000000001, 0x0000000000000003, 0x0000000000000007, 0x000000000000000f,
0x000000000000001f, 0x000000000000003f, 0x000000000000007f, 0x00000000000000ff,
0x00000000000001ff, 0x00000000000003ff, 0x00000000000007ff, 0x0000000000000fff,
0x0000000000001fff, 0x0000000000003fff, 0x0000000000007fff, 0x000000000000ffff,
0x000000000001ffff, 0x000000000003ffff, 0x000000000007ffff, 0x00000000000fffff,
0x00000000001fffff, 0x00000000003fffff, 0x00000000007fffff, 0x0000000000ffffff,
0x0000000001ffffff, 0x0000000003ffffff, 0x0000000007ffffff, 0x000000000fffffff,
0x000000001fffffff, 0x000000003fffffff, 0x000000007fffffff, 0x00000000ffffffff,
0x00000001ffffffff, 0x00000003ffffffff, 0x00000007ffffffff, 0x0000000fffffffff,
0x0000001fffffffff, 0x0000003fffffffff, 0x0000007fffffffff, 0x000000ffffffffff,
0x000001ffffffffff, 0x000003ffffffffff, 0x000007ffffffffff, 0x00000fffffffffff,
0x00001fffffffffff, 0x00003fffffffffff, 0x00007fffffffffff, 0x0000ffffffffffff,
0x0001ffffffffffff, 0x0003ffffffffffff, 0x0007ffffffffffff, 0x000fffffffffffff,
0x001fffffffffffff, 0x003fffffffffffff, 0x007fffffffffffff, 0x00ffffffffffffff,
0x01ffffffffffffff, 0x03ffffffffffffff, 0x07ffffffffffffff, 0x0fffffffffffffff,
0x1fffffffffffffff, 0x3fffffffffffffff, 0x7fffffffffffffff, 0xffffffffffffffff,
};
int distance(char* a, int len_a, char* b, int len_b) {
if (len_a > 64) {
return -1;
}
uint64_t posbits[256] = {0};
for (int i=0; i < len_a; i++){
posbits[(unsigned char)a[i]] |= 1ull << i;
}
int dist = len_a;
uint64_t mask = 1ull << (len_a - 1);
uint64_t VP = masks[len_a - 1];
uint64_t VN = 0;
for (int i=0; i < len_b; i++){
uint64_t y = posbits[(unsigned char)b[i]];
uint64_t X = y | VN;
uint64_t D0 = ((VP + (X & VP)) ^ VP) | X;
uint64_t HN = VP & D0;
uint64_t HP = VN | ~(VP|D0);
X = (HP << 1) | 1;
VN = X & D0;
VP = (HN << 1) | ~(X | D0);
if (HP & mask) { dist++; }
if (HN & mask) { dist--; }
}
return dist;
}
是否有类似的算法来计算莱文施泰因距离的此加权版本?
Is there a similar algorithm to calculate this weighted version of the levenshtein distance?
推荐答案
我能够使用 BitPAl:一种位并行,通用整数计分序列比对算法.
int weighted_levenshtein_bitpal(char* a, int len_a, char* b, int len_b)
{
if (len_a > 64) {
return -1;
}
uint64_t posbits[256] = {0};
for (int i = 0; i < len_a; i++){
posbits[(unsigned char)a[i]] |= 1ull << i;
}
uint64_t DHneg1 = ~0x0ull;
uint64_t DHzero = 0;
uint64_t DHpos1 = 0;
//recursion
for (int i = 0; i < len_b; ++i)
{
uint64_t Matches = posbits[(unsigned char)b[i]];
//Complement Matches
uint64_t NotMatches = ~Matches;
//Finding the vertical values. //Find 1s
uint64_t INITpos1s = DHneg1 & Matches;
uint64_t DVpos1shift = (((INITpos1s + DHneg1) ^ DHneg1) ^ INITpos1s);
//set RemainingDHneg1
uint64_t RemainDHneg1 = DHneg1 ^ (DVpos1shift >> 1);
//combine 1s and Matches
uint64_t DVpos1shiftorMatch = DVpos1shift | Matches;
//Find 0s
uint64_t INITzeros = (DHzero & DVpos1shiftorMatch) ;
uint64_t DVzeroshift = ((INITzeros << 1) + RemainDHneg1) ^ RemainDHneg1;
//Find -1s
uint64_t DVneg1shift = ~(DVpos1shift | DVzeroshift);
DHzero &= NotMatches;
//combine 1s and Matches
uint64_t DHpos1orMatch = DHpos1| Matches;
//Find 0s
DHzero = (DVzeroshift & DHpos1orMatch) | (DVneg1shift & DHzero);
//Find 1s
DHpos1 = (DVneg1shift & DHpos1orMatch);
//Find -1s
DHneg1 = ~(DHzero | DHpos1);
}
//find scores in last row
uint64_t add1 = DHzero;
uint64_t add2 = DHpos1;
int dist = len_b;
for (int i = 0; i < len_a; i++)
{
uint64_t bitmask = 1ull << i;
dist -= ((add1 & bitmask) >> i) * 1 + ((add2 & bitmask) >> i) * 2 - 1;
}
return dist;
}
该算法要求两个字符串都只使用256个以下字符(扩展的Ascii),并且字符串 a
的 length<65
.可以使用简单的键值存储而不是数组来添加对超过256个字符(完整utf32)的支持:
The algorithm requires, that both strings only use characters below 256 (extended Ascii), and that string a
has a length < 65
. Support for characters over 256 (full utf32) can be added using a simple key value storage instead of the array:
typedef struct {
uint32_t key[128];
uint64_t val[128];
} blockmap_entry;
void blockmap_insert(blockmap_entry* map, uint32_t ch, int pos) {
uint8_t hash = ch % 128;
uint32_t key = ch | 0x80000000U;
while (map->key[hash] && map->key[hash] != key) {
if (hash == 127) hash = 0;
else hash++;
}
map->key[hash] = key;
map->val[hash] |= 1 << pos;
}
uint64_t blockmap_get(blockmap_entry* map, uint32_t ch) {
uint8_t hash = ch % 128;
uint32_t key = ch | 0x80000000U;
while (map->key[hash] && map->key[hash] != key) {
if (hash == 127) hash = 0;
else hash++;
}
return (map->key[hash] == key) ? map->val[hash] : 0;
}
然后可以通过以下方式实现算法:
The algorithm can then be implemented in the following way:
int weighted_levenshtein_bitpal(uint32_t* a, int len_a, uint32_t* b, int len_b)
{
if (len_a > 64) {
return -1;
}
blockmap_entry posbits = {0};
for (int i = 0; i < len_a; i++){
blockmap_insert(&posbits, a[i], i);
}
uint64_t DHneg1 = ~0x0ull;
uint64_t DHzero = 0;
uint64_t DHpos1 = 0;
//recursion
for (int i = 0; i < len_b; ++i)
{
uint64_t Matches = blockmap_get(&posbits, a[i]);
//Complement Matches
uint64_t NotMatches = ~Matches;
//Finding the vertical values. //Find 1s
uint64_t INITpos1s = DHneg1 & Matches;
uint64_t DVpos1shift = (((INITpos1s + DHneg1) ^ DHneg1) ^ INITpos1s);
//set RemainingDHneg1
uint64_t RemainDHneg1 = DHneg1 ^ (DVpos1shift >> 1);
//combine 1s and Matches
uint64_t DVpos1shiftorMatch = DVpos1shift | Matches;
//Find 0s
uint64_t INITzeros = (DHzero & DVpos1shiftorMatch) ;
uint64_t DVzeroshift = ((INITzeros << 1) + RemainDHneg1) ^ RemainDHneg1;
//Find -1s
uint64_t DVneg1shift = ~(DVpos1shift | DVzeroshift);
DHzero &= NotMatches;
//combine 1s and Matches
uint64_t DHpos1orMatch = DHpos1| Matches;
//Find 0s
DHzero = (DVzeroshift & DHpos1orMatch) | (DVneg1shift & DHzero);
//Find 1s
DHpos1 = (DVneg1shift & DHpos1orMatch);
//Find -1s
DHneg1 = ~(DHzero | DHpos1);
}
//find scores in last row
uint64_t add1 = DHzero;
uint64_t add2 = DHpos1;
int dist = len_b;
for (int i = 0; i < len_a; i++)
{
uint64_t bitmask = 1ull << i;
dist -= ((add1 & bitmask) >> i) * 1 + ((add2 & bitmask) >> i) * 2 - 1;
}
return dist;
}
对于更长的字符串,可以使用多个块来实现相同的算法.可以找到这里.
It is possible to implement the same algorithm using multiple blocks for longer strings. A C++ implementation of this can be found here.
这篇关于比特并行加权Levenshtein距离的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持!