-
-
Notifications
You must be signed in to change notification settings - Fork 92
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
fix: Implement Weighted Distance Algorithm (#2255)
* doc: add comments to suggest
* dev: First pass at Levenshtein
* dev: Add an A* distance calculation function.
* dev: Implement weighted distance
- Loading branch information
Showing
12 changed files
with
641 additions
and
17 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
18 changes: 18 additions & 0 deletions
18
packages/cspell-trie-lib/src/lib/suggestions/distanceAStar.test.ts
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,18 @@ | ||
import { distanceAStar } from './distanceAStar'; | ||
import { levenshteinDistance } from './levenshtein'; | ||
|
||
describe('distanceAStar', () => {
    // Every edit costs 100 in `distanceAStar`, so the result is expected to be the
    // reference `levenshteinDistance` scaled by 100, in both argument orders.
    test.each`
        wordA        | wordB
        ${''}        | ${''}
        ${'apple'}   | ${'apple'}
        ${'apple'}   | ${''}
        ${'apple'}   | ${'apples'}
        ${'apple'}   | ${'maple'}
        ${'grapple'} | ${'maples'}
    `('distanceAStar vs Levenshtein "$wordA" "$wordB"', (row) => {
        const { wordA, wordB } = row;
        const scaled = levenshteinDistance(wordA, wordB) * 100;
        const directions = [
            [wordA, wordB],
            [wordB, wordA],
        ];
        for (const [from, to] of directions) {
            expect(distanceAStar(from, to)).toBe(scaled);
        }
    });
});
75 changes: 75 additions & 0 deletions
75
packages/cspell-trie-lib/src/lib/suggestions/distanceAStar.ts
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,75 @@ | ||
import { PairingHeap } from '../utils/PairingHeap'; | ||
|
||
/**
 * Calculate the edit distance between two words using a best-first search.
 *
 * Every insert, delete, substitute, or adjacent-character swap costs 100; matching
 * characters cost nothing. Using these basic weights, this algorithm has the same
 * results as the Damerau-Levenshtein algorithm (scaled by 100).
 *
 * Candidates are taken from the heap in the order defined by `compare`. Each
 * `(ai, bi)` position is expanded at most once: because all edit costs are
 * non-negative and the heap is expected to yield the lowest-cost candidate first,
 * the first time a position is dequeued it already carries its minimal cost, so
 * later duplicates can be skipped without changing the result.
 *
 * @param a - the source word
 * @param b - the target word
 * @returns the accumulated cost of turning `a` into `b`, or `-1` if the search
 *   runs out of candidates before reaching the end of both words.
 */
export function distanceAStar(a: string, b: string): number {
    const aN = a.length;
    const bN = b.length;
    const cost = 100;

    const candidates = new PairingHeap(compare);
    // Positions already expanded, keyed as `ai * stride + bi` (unique since bi <= bN).
    const expanded = new Set<number>();
    const stride = bN + 1;

    candidates.add({ ai: 0, bi: 0, c: 0 });

    /** Consume one character from both words; free when they match. */
    function opSub(n: Node) {
        const { ai, bi, c } = n;
        if (ai < aN && bi < bN) {
            const cc = a[ai] === b[bi] ? c : c + cost;
            candidates.add({ ai: ai + 1, bi: bi + 1, c: cc });
        }
    }

    /** Insert the next character of `b`. */
    function opIns(n: Node) {
        const { ai, bi, c } = n;
        if (bi < bN) {
            candidates.add({ ai: ai, bi: bi + 1, c: c + cost });
        }
    }

    /** Delete the next character of `a`. */
    function opDel(n: Node) {
        const { ai, bi, c } = n;
        if (ai < aN) {
            candidates.add({ ai: ai + 1, bi: bi, c: c + cost });
        }
    }

    /** Swap the next two characters of `a` when that matches the next two of `b`. */
    function opSwap(n: Node) {
        const { ai, bi, c } = n;
        // Explicit bounds: do not rely on out-of-range reads comparing as `undefined`.
        if (ai + 1 < aN && bi + 1 < bN && a[ai] === b[bi + 1] && a[ai + 1] === b[bi]) {
            candidates.add({ ai: ai + 2, bi: bi + 2, c: c + cost });
        }
    }

    let best: Node | undefined;
    while ((best = candidates.dequeue())) {
        if (best.ai === aN && best.bi === bN) break;

        const key = best.ai * stride + best.bi;
        // A cheaper (or equal) candidate for this position was already expanded.
        if (expanded.has(key)) continue;
        expanded.add(key);

        opSwap(best);
        opIns(best);
        opDel(best);
        opSub(best);
    }

    return best?.c ?? -1;
}
|
||
/** A position within both words being compared. */
interface Pos {
    /** offset into string `a` */
    ai: number;
    /** offset into string `b` */
    bi: number;
}

/** A search candidate: a position together with the cost accumulated to reach it. */
interface Node extends Pos {
    /** cost accumulated so far */
    c: number;
}
|
||
/**
 * Ordering for search candidates: lower cost first; on equal cost, the candidate
 * that has advanced farther through both words (larger `ai + bi`) first.
 */
function compare(a: Node, b: Node): number {
    const costDelta = a.c - b.c;
    if (costDelta) return costDelta;
    const progressB = b.ai + b.bi;
    return progressB - a.ai - a.bi;
}
42 changes: 42 additions & 0 deletions
42
packages/cspell-trie-lib/src/lib/suggestions/distanceAStarWeighted.test.ts
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,42 @@ | ||
import { distanceAStarWeighted } from './distanceAStarWeighted'; | ||
import { levenshteinDistance } from './levenshtein'; | ||
import { buildWeightedMapTrie } from './weightedMaps'; | ||
|
||
describe('distanceAStarWeighted', () => {
    // With an empty map only the basic edits (cost 100 each) apply, so the result is
    // expected to be the reference `levenshteinDistance` scaled by 100, in both
    // argument orders.
    test.each`
        wordA        | wordB
        ${''}        | ${''}
        ${'apple'}   | ${'apple'}
        ${'apple'}   | ${''}
        ${'apple'}   | ${'apples'}
        ${'apple'}   | ${'maple'}
        ${'grapple'} | ${'maples'}
    `('distanceAStarWeighted vs Levenshtein "$wordA" "$wordB"', ({ wordA, wordB }) => {
        const expected = levenshteinDistance(wordA, wordB) * 100;
        expect(distanceAStarWeighted(wordA, wordB, {})).toBe(expected);
        expect(distanceAStarWeighted(wordB, wordA, {})).toBe(expected);
    });

    // Explicit expectations for custom weight maps; each row is checked in both
    // argument orders against the same expected cost.
    // cspell:ignore aeiou
    test.each`
        wordA         | wordB                                     | map                                             | expected
        ${''}         | ${''}                                     | ${undefined}                                    | ${0}
        ${'apple'}    | ${'apple'}                                | ${{ map: 'ae', insDel: 75 }}                    | ${0}
        ${'apple'}    | ${''}                                     | ${{ map: 'ae', insDel: 75 }}                    | ${450}
        ${'apple'}    | ${''}                                     | ${{ map: 'ae|(ap)', insDel: 75 }}               | ${350}
        ${'apple'}    | ${''}                                     | ${{ map: '(ap)', insDel: 1 }}                   | ${301}
        ${'apple'}    | ${'apples'}                               | ${{ map: '(les)(le)', replace: 50 }}            | ${50}
        ${'apple'}    | ${'maple'}                                | ${{ map: '(pp)p', replace: 50 }}                | ${150}
        ${'grapple'}  | ${'maples'}                               | ${{ map: '(pp)p', replace: 50 }}                | ${350}
        ${'bite'}     | ${'bate'}                                 | ${{ map: 'aei', replace: 25 }}                  | ${25}
        ${'receive'}  | ${'recieve' /* cspell:ignore recieve */}  | ${{ map: 'ei', swap: 25 }}                      | ${25}
        ${'airplane'} | ${'aeroplane'}                            | ${{ map: '(ai)(ae)', replace: 25 }}             | ${125}
        ${'airplane'} | ${'aeroplane'}                            | ${{ map: '(air)(aero)|aeiou', replace: 25 }}    | ${25}
        ${'airplane'} | ${'aeroplane'}                            | ${{ map: 'aeiou', replace: 25 }}                | ${125}
        ${'plain'}    | ${'plane'}                                | ${{ map: '(ane)(ain)', replace: 100 }}          | ${100}
    `('distanceAStarWeighted "$wordA" "$wordB" $map', ({ wordA, wordB, map, expected }) => {
        const trie = map ? buildWeightedMapTrie([map]) : buildWeightedMapTrie([]);
        expect(distanceAStarWeighted(wordA, wordB, trie)).toBe(expected);
        expect(distanceAStarWeighted(wordB, wordA, trie)).toBe(expected);
    });
});
168 changes: 168 additions & 0 deletions
168
packages/cspell-trie-lib/src/lib/suggestions/distanceAStarWeighted.ts
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,168 @@ | ||
import { PairingHeap } from '../utils/PairingHeap'; | ||
import { WeightedMapTrie, WeightedRepMapTrie } from './weightedMaps'; | ||
|
||
/**
 * Calculate the weighted edit distance between two words using a best-first search.
 *
 * Basic edits (insert, delete, substitute, adjacent swap) cost 100 each; matching
 * characters cost nothing. In addition, `map` is walked character by character to
 * find cheaper multi-character edits:
 * - `map[ch].insDel` / `map[ch].t`: cost to insert or delete the sequence matched so far.
 * - `map[ch].r[...].rep`: cost to replace a sequence of `a` with a sequence of `b`.
 * - `map[ch].r[...].swap`: cost to swap two adjacent sequences of `a`.
 *
 * With an empty map, this algorithm has the same results as the Damerau-Levenshtein
 * algorithm (scaled by 100).
 *
 * @param a - the source word
 * @param b - the target word
 * @param map - trie of weighted edits; `{}` means "basic edits only"
 * @returns the accumulated cost of turning `a` into `b`, or `-1` if the search
 *   runs out of candidates before reaching the end of both words.
 */
export function distanceAStarWeighted(a: string, b: string, map: WeightedMapTrie): number {
    const aN = a.length;
    const bN = b.length;
    const cost = 100;

    const candidates = new PairingHeap(compare);

    candidates.add({ ai: 0, bi: 0, c: 0 });

    /** Consume one character from both words; free when they match. */
    function opSub(n: Node) {
        const { ai, bi, c } = n;
        if (ai < aN && bi < bN) {
            const cc = a[ai] === b[bi] ? c : c + cost;
            candidates.add({ ai: ai + 1, bi: bi + 1, c: cc });
        }
    }

    /** Insert the next character of `b`. */
    function opIns(n: Node) {
        const { ai, bi, c } = n;
        if (bi < bN) {
            candidates.add({ ai: ai, bi: bi + 1, c: c + cost });
        }
    }

    /** Delete the next character of `a`. */
    function opDel(n: Node) {
        const { ai, bi, c } = n;
        if (ai < aN) {
            candidates.add({ ai: ai + 1, bi: bi, c: c + cost });
        }
    }

    /** Swap the next two characters of `a` when that matches the next two of `b`. */
    function opSwap(n: Node) {
        const { ai, bi, c } = n;
        // Explicit bounds: do not rely on out-of-range reads comparing as `undefined`.
        if (ai + 1 < aN && bi + 1 < bN && a[ai] === b[bi + 1] && a[ai + 1] === b[bi]) {
            candidates.add({ ai: ai + 2, bi: bi + 2, c: c + cost });
        }
    }

    /** Queue every weighted edit from `map` that applies at the position of `n`. */
    function opMap(n: Node) {
        // `c` is the cost accumulated up to `n`; every queued candidate adds to it.
        const { ai, bi, c } = n;

        /** Weighted insert: walk `map` along `b`, queueing each prefix that has an `insDel` cost. */
        function ins(ai: number, bi: number, m: WeightedMapTrie | undefined) {
            if (bi >= bN || !m) return;
            const entry = m[b[bi]];
            if (!entry) return;
            const weight = entry.insDel;
            ++bi;
            if (weight !== undefined) {
                candidates.add({ ai, bi, c: c + weight });
            }
            ins(ai, bi, entry.t);
        }

        /** Weighted delete: walk `map` along `a`, queueing each prefix that has an `insDel` cost. */
        function del(ai: number, bi: number, m: WeightedMapTrie | undefined) {
            if (ai >= aN || !m) return;
            const entry = m[a[ai]];
            if (!entry) return;
            ++ai;
            const weight = entry.insDel;
            if (weight !== undefined) {
                candidates.add({ ai, bi, c: c + weight });
            }
            del(ai, bi, entry.t);
        }

        /** Walk the replacement side of the trie along `b`, queueing each prefix that has a `rep` cost. */
        function repApply(ai: number, bi: number, m: WeightedRepMapTrie | undefined) {
            if (!m || bi >= bN) return;
            const char = b[bi];
            const entry = m[char];
            if (!entry) return;
            ++bi;
            const weight = entry.rep;
            if (weight !== undefined) {
                candidates.add({ ai, bi, c: c + weight });
            }
            repApply(ai, bi, entry.r);
        }

        /** Weighted replace: for each prefix of `a` found in `map`, try its replacements against `b`. */
        function rep(ai: number, bi: number, m: WeightedMapTrie | undefined) {
            if (!m || ai >= aN || bi >= bN) return;
            const entry = m[a[ai]];
            if (!entry) return;
            ++ai;
            repApply(ai, bi, entry.r);
            rep(ai, bi, entry.t);
        }

        /** Weighted swap: exchange two adjacent sequences of `a` when the result matches `b`. */
        function swap(ai: number, bi: number, m: WeightedMapTrie | undefined) {
            if (!m || ai >= aN || bi >= bN) return;

            /** `a[ai..mid)` is the left sequence, `a[mid..right)` the right one. */
            function apply(mid: number, right: number, weight: number | undefined) {
                if (weight === undefined) return;
                const swapped = a.slice(mid, right) + a.slice(ai, mid);
                const len = swapped.length;

                const subB = b.slice(bi, bi + len);
                if (swapped === subB) {
                    // Add to the accumulated cost; queuing the bare weight would
                    // discard the cost of every edit made before this swap.
                    candidates.add({ ai: ai + len, bi: bi + len, c: c + weight });
                }
            }

            /** Extend the right-hand sequence starting at `aim`, applying each prefix with a `swap` cost. */
            function right(aim: number, ail: number, m: WeightedRepMapTrie | undefined) {
                if (!m || ail >= aN) return;
                const entry = m[a[ail]];
                if (!entry) return;
                ++ail;
                apply(aim, ail, entry.swap);
                right(aim, ail, entry.r);
            }

            /** Extend the left-hand sequence, then try every right-hand sequence that follows it. */
            function left(aim: number, m: WeightedMapTrie | undefined) {
                if (!m || aim >= aN) return;
                const entry = m[a[aim]];
                if (!entry) return;
                ++aim;
                right(aim, aim, entry.r);
                left(aim, entry.t);
            }

            left(ai, m);
        }

        ins(ai, bi, map);
        del(ai, bi, map);
        rep(ai, bi, map);
        swap(ai, bi, map);
    }

    let best: Node | undefined;
    while ((best = candidates.dequeue())) {
        if (best.ai === aN && best.bi === bN) break;

        opSwap(best);
        opIns(best);
        opDel(best);
        opMap(best);
        opSub(best);
    }

    // istanbul ignore else
    return best ? best.c : -1;
}
|
||
/** A position within both words being compared. */
interface Pos {
    /** offset into string `a` */
    ai: number;
    /** offset into string `b` */
    bi: number;
}

/** A search candidate: a position together with the cost accumulated to reach it. */
interface Node extends Pos {
    /** cost accumulated so far */
    c: number;
}
|
||
/**
 * Ordering for search candidates: lower cost first; on equal cost, the candidate
 * that has advanced farther through both words (larger `ai + bi`) first.
 */
function compare(a: Node, b: Node): number {
    const costDelta = a.c - b.c;
    if (costDelta) return costDelta;
    const progressB = b.ai + b.bi;
    return progressB - a.ai - a.bi;
}
23 changes: 23 additions & 0 deletions
23
packages/cspell-trie-lib/src/lib/suggestions/levenshtein.test.ts
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,23 @@ | ||
import { levenshteinDistance } from './levenshtein'; | ||
|
||
describe('levenshtein', () => {
    // Each row is checked in both argument orders against the same expected distance.
    test.each`
        left          | right        | expected
        ${'abc'}      | ${'abc'}     | ${0}
        ${'abc'}      | ${'ab'}      | ${1}
        ${'abc'}      | ${''}        | ${3}
        ${'kitten'}   | ${'sitting'} | ${3}
        ${'Saturday'} | ${'Sunday'}  | ${3}
        ${'ab'}       | ${'ba'}      | ${1}
        ${'aba'}      | ${'bab'}     | ${2}
        ${'abab'}     | ${'baba'}    | ${2}
        ${'abab'}     | ${'ababa'}   | ${1}
        ${'appear'}   | ${'apple'}   | ${3}
        ${'appease'}  | ${'apple'}   | ${3}
    `('levenshteinDistance "$left" vs "$right"', (row) => {
        const { left, right, expected } = row;
        const forward = levenshteinDistance(left, right);
        expect(forward).toBe(expected);
        const backward = levenshteinDistance(right, left);
        expect(backward).toBe(expected);
    });
});
|
||
// cspell:ignore ababa |
Oops, something went wrong.