一段时间以来,我使用了Levenshtein distance的Apache Commons lang StringUtils实现。 它实现了一些众所周知的技巧,通过仅挂接到两个数组而不是为备忘录表分配巨大的nxm表来使用较少的内存。 它还仅检查宽度为2 * k +1的“条带”,其中k为最大编辑次数。

在levenshtein的大多数实际用法中,您只关心一个字符串是否在另一个字符串的少量编辑(1、2、3)之内。 这避免了使levenstein变得“昂贵”的大部分n * m计算。 我们发现,在ak <= 3时,具有这些技巧的levenshtein的速度比Jaro-Winkler distance快,后者是一种近似编辑距离计算,被创建为更快的近似值(这有很多原因)。

不幸的是,Apache Commons Lang实现仅计算Levenshtein,而不计算可能更有用的Damerau-Levenshtein距离 。 Levenshtein定义了编辑操作的插入,删除和替换。 Damerau变体将* transposition *添加到列表中,这对于我使用编辑距离的大多数位置都非常有用。 不幸的是,DL距离不是真正的度量标准,因为它不考虑三角形不等式,但是许多应用程序不受此影响。 从该维基百科页面可以看到,“最佳字符串对齐”和DL距离之间经常会混淆。 实际上,OSA是一种更简单的算法,并且需要较少的簿记,因此运行时间可能略微更快。

我找不到任何使用我在Apache Commons Lang中看到的内存技巧和“条带化”技巧的OSA或DL实现。 因此,我使用这些技巧实现了自己的OSA。 在某些时候,我还将使用这些技巧来实现DL,并查看性能差异是什么:

这是Java中的OSA。 它是公共领域; 随意使用。 单元测试如下。 唯一的依赖关系是Guava-,但它只是前提条件类和文档注释,因此如果您愿意,可以轻松删除该依赖关系:

package com.github.steveash.util;

import static com.google.common.base.Preconditions.checkArgument;
import static com.google.common.base.Preconditions.checkNotNull;
import static com.google.common.primitives.Shorts.checkedCast;
import static java.lang.Math.abs;
import static java.lang.Math.max;

import java.util.Arrays;

import com.google.common.annotations.VisibleForTesting;

/**
 * Implementation of the OSA which is similar to the Damerau-Levenshtein in that it allows for transpositions to
 * count as a single edit distance, but is not a true metric and can over-estimate the cost because it disallows
 * substrings to edited more than once.  See wikipedia for more discussion on OSA vs DL
 * <p/>
 * See Algorithms on Strings, Trees and Sequences by Dan Gusfield for more information.
 * <p/>
 * This also has a set of local buffer implementations to avoid allocating new buffers each time, which might be
 * a premature optimization
 * <p/>
 * @author Steve Ash
 */
public class OptimalStringAlignment {

    private static final int threadLocalBufferSize = 64;

    private static final ThreadLocal<short[]> costLocal = new ThreadLocal<short[]>() {
        @Override
        protected short[] initialValue() {
            return new short[threadLocalBufferSize];
        }
    };

    private static final ThreadLocal<short[]> back1Local = new ThreadLocal<short[]>() {
        @Override
        protected short[] initialValue() {
            return new short[threadLocalBufferSize];
        }
    };

    private static final ThreadLocal<short[]> back2Local = new ThreadLocal<short[]>() {
        @Override
        protected short[] initialValue() {
            return new short[threadLocalBufferSize];
        }
    };

    public static int editDistance(CharSequence s, CharSequence t, int threshold) {
        checkNotNull(s, "cannot measure null strings");
        checkNotNull(t, "cannot measure null strings");
        checkArgument(threshold >= 0, "Threshold must not be negative");
        checkArgument(s.length() < Short.MAX_VALUE, "Cannot take edit distance of strings longer than 32k chars");
        checkArgument(t.length() < Short.MAX_VALUE, "Cannot take edit distance of strings longer than 32k chars");

        if (s.length() + 1 > threadLocalBufferSize || t.length() + 1 > threadLocalBufferSize)
            return editDistanceWithNewBuffers(s, t, checkedCast(threshold));

        short[] cost = costLocal.get();
        short[] back1 = back1Local.get();
        short[] back2 = back2Local.get();
        return editDistanceWithBuffers(s, t, checkedCast(threshold), back2, back1, cost);
    }

    @VisibleForTesting
    static int editDistanceWithNewBuffers(CharSequence s, CharSequence t, short threshold) {
        int slen = s.length();
        short[] back1 = new short[slen + 1];    // "up 1" row in table
        short[] back2 = new short[slen + 1];    // "up 2" row in table
        short[] cost = new short[slen + 1];     // "current cost"

        return editDistanceWithBuffers(s, t, threshold, back2, back1, cost);
    }

    private static int editDistanceWithBuffers(CharSequence s, CharSequence t, short threshold,
            short[] back2, short[] back1, short[] cost) {

        short slen = (short) s.length();
        short tlen = (short) t.length();

        // if one string is empty, the edit distance is necessarily the length of the other
        if (slen == 0) {
            return tlen <= threshold ? tlen : -1;
        } else if (tlen == 0) {
            return slen <= threshold ? slen : -1;
        }

        // if lengths are different > k, then can't be within edit distance
        if (abs(slen - tlen) > threshold)
            return -1;

        if (slen > tlen) {
            // swap the two strings to consume less memory
            CharSequence tmp = s;
            s = t;
            t = tmp;
            slen = tlen;
            tlen = (short) t.length();
        }

        initMemoiseTables(threshold, back2, back1, cost, slen);

        for (short j = 1; j <= tlen; j++) {
            cost[0] = j; // j is the cost of inserting this many characters

            // stripe bounds
            int min = max(1, j - threshold);
            int max = min(slen, (short) (j + threshold));

            // at this iteration the left most entry is "too much" so reset it
            if (min > 1) {
                cost[min - 1] = Short.MAX_VALUE;
            }

            iterateOverStripe(s, t, j, cost, back1, back2, min, max);

            // swap our cost arrays to move on to the next "row"
            short[] tempCost = back2;
            back2 = back1;
            back1 = cost;
            cost = tempCost;
        }

        // after exit, the current cost is in back1
        // if back1[slen] > k then we exceeded, so return -1
        if (back1[slen] > threshold) {
            return -1;
        }
        return back1[slen];
    }

    private static void iterateOverStripe(CharSequence s, CharSequence t, short j,
            short[] cost, short[] back1, short[] back2, int min, int max) {

        // iterates over the stripe
        for (int i = min; i <= max; i++) {

            if (s.charAt(i - 1) == t.charAt(j - 1)) {
                cost[i] = back1[i - 1];
            } else {
                cost[i] = (short) (1 + min(cost[i - 1], back1[i], back1[i - 1]));
            }
            if (i >= 2 && j >= 2) {
                // possible transposition to check for
                if ((s.charAt(i - 2) == t.charAt(j - 1)) &&
                        s.charAt(i - 1) == t.charAt(j - 2)) {
                    cost[i] = min(cost[i], (short) (back2[i - 2] + 1));
                }
            }
        }
    }

    private static void initMemoiseTables(short threshold, short[] back2, short[] back1, short[] cost, short slen) {
        // initial "starting" values for inserting all the letters
        short boundary = (short) (min(slen, threshold) + 1);
        for (short i = 0; i < boundary; i++) {
            back1[i] = i;
            back2[i] = i;
        }
        // need to make sure that we don't read a default value when looking "up"
        Arrays.fill(back1, boundary, slen + 1, Short.MAX_VALUE);
        Arrays.fill(back2, boundary, slen + 1, Short.MAX_VALUE);
        Arrays.fill(cost, 0, slen + 1, Short.MAX_VALUE);
    }

    private static short min(short a, short b) {
        return (a <= b ? a : b);
    }

    private static short min(short a, short b, short c) {
        return min(a, min(b, c));
    }
}
import org.junit.Test

import static com.github.steveash.util.OptimalStringAlignment.editDistance

/**
 * @author Steve Ash
 */
class OptimalStringAlignmentTest {

    @Test
    public void shouldBeZeroForEqualStrings() throws Exception {
        assert 0 == editDistance("steve", "steve", 1)
        assert 0 == editDistance("steve", "steve", 0)
        assert 0 == editDistance("steve", "steve", 2)
        assert 0 == editDistance("steve", "steve", 100)

        assert 0 == editDistance("s", "s", 1)
        assert 0 == editDistance("s", "s", 0)
        assert 0 == editDistance("s", "s", 2)
        assert 0 == editDistance("s", "s", 100)

        assert 0 == editDistance("", "", 0)
        assert 0 == editDistance("", "", 1)
        assert 0 == editDistance("", "", 100)
    }

    @Test
    public void shouldBeOneForSingleOperation() throws Exception {
        def a = "steve";
        for (int i = 0; i < 5; i++) {
            assertOneOp(new StringBuilder(a).insert(i, 'f'), a)
            assertOneOp(new StringBuilder(a).deleteCharAt(i), a)
            def sb = new StringBuilder(a)
            sb.setCharAt(i, 'x' as char);
            assertOneOp(sb, a)

            if (i > 1) {
                sb = new StringBuilder(a)
                char t = sb.charAt(i - 1)
                sb.setCharAt(i - 1, sb.charAt(i))
                sb.setCharAt(i, t)
                println "comparing " + sb.toString() + " -> " + a
                assertOneOp(sb, a)
            }
        }
    }

    @Test
    public void shouldCountTransposeAsOne() throws Exception {
        assert 3 == editDistance("xxsteve", "steev", 4)
        assert 3 == editDistance("xxsteve", "steev", 3)
        assert 3 == editDistance("steev", "xxsteve", 4)
        assert 3 == editDistance("steev", "xxsteve", 3)
        assert -1 == editDistance("steev", "xxsteve", 2)

        assert 4 == editDistance("xxtseve", "steev", 4)
        assert 5 == editDistance("xxtsevezx", "steevxz", 5)
        assert 6 == editDistance("xxtsevezx", "steevxzpp", 6)
        assert 7 == editDistance("xxtsfevezx", "steevxzpp", 7)

        assert 4 == editDistance("xxtsf", "st", 7)
        assert 4 == editDistance("evezx", "eevxzpp", 7)
        assert 7 == editDistance("xxtsfevezx", "steevxzpp", 7)
    }

    @Test
    public void shouldCountLeadingCharacterTranspositionsAsOne() throws Exception {
        assert 1 == editDistance("rosa", "orsa", 2)
    }

    private void assertOneOp(CharSequence a, CharSequence b) {
        assert 1 == editDistance(a, b, 1)
        assert 1 == editDistance(b, a, 1)
        assert 1 == editDistance(a, b, 2)
        assert 1 == editDistance(b, a, 2)
    }

    @Test
    public void shouldShortCutWhenSpecialCase() throws Exception {
        assert 1 == editDistance("s", "", 1)
        assert 1 == editDistance("", "s", 1)
        assert -1 == editDistance("s", "", 0)
        assert -1 == editDistance("", "s", 0)
        assert -1 == editDistance("st", "", 1)
        assert -1 == editDistance("", "st", 1)
        assert -1 == editDistance("steve", "ste", 0)
        assert -1 == editDistance("ste", "steve", 0)
        assert -1 == editDistance("stev", "steve", 0)
        assert -1 == editDistance("ste", "steve", 1)
        assert -1 == editDistance("steve", "ste", 1)
        assert 1 == editDistance("steve", "stev", 1)
        assert 1 == editDistance("stev", "steve", 1)
    }
}