diff --git a/cspell.json b/cspell.json index 96b036d6..313ed5e3 100644 --- a/cspell.json +++ b/cspell.json @@ -10,6 +10,10 @@ "packages/@d-zero/page-cluster/src/__fixtures__/production-scale/**" ], "words": [ + // page-cluster clustering/distance terminology + "jaccard", + "medoids", + // "gaxios", "pngjs", diff --git a/packages/@d-zero/page-cluster/src/array-edit-distance.spec.ts b/packages/@d-zero/page-cluster/src/array-edit-distance.spec.ts new file mode 100644 index 00000000..6154d260 --- /dev/null +++ b/packages/@d-zero/page-cluster/src/array-edit-distance.spec.ts @@ -0,0 +1,41 @@ +import { describe, expect, test } from 'vitest'; + +import { arrayEditDistance } from './array-edit-distance.js'; + +describe('arrayEditDistance', () => { + test('two empty arrays have distance 0', () => { + expect(arrayEditDistance([], [])).toBe(0); + }); + + test('identical arrays have distance 0', () => { + expect( + arrayEditDistance(['body>ul>li', 'body>ul>li'], ['body>ul>li', 'body>ul>li']), + ).toBe(0); + }); + + test('one empty array costs the length of the other (all insertions)', () => { + expect(arrayEditDistance([], ['a', 'b', 'c'])).toBe(3); + expect(arrayEditDistance(['a', 'b', 'c'], [])).toBe(3); + }); + + test('reordering the same elements is not free', () => { + // plain Levenshtein has no transposition operation, so swapping the first two elements costs 2 substitutions, not 0 + expect(arrayEditDistance(['a', 'b', 'c'], ['b', 'a', 'c'])).toBe(2); + }); + + test('a single substitution costs 1', () => { + expect(arrayEditDistance(['a', 'b', 'c'], ['a', 'x', 'c'])).toBe(1); + }); + + test('a single insertion costs 1', () => { + expect(arrayEditDistance(['a', 'c'], ['a', 'b', 'c'])).toBe(1); + }); + + test('a single deletion costs 1', () => { + expect(arrayEditDistance(['a', 'b', 'c'], ['a', 'c'])).toBe(1); + }); + + test('mixed insertions, deletions and substitutions combine', () => { + expect(arrayEditDistance(['a', 'b', 'c', 'd'], ['a', 'x', 'c', 'e', 'd'])).toBe(2); + }); +}); diff --git 
a/packages/@d-zero/page-cluster/src/array-edit-distance.ts b/packages/@d-zero/page-cluster/src/array-edit-distance.ts new file mode 100644 index 00000000..125f2c81 --- /dev/null +++ b/packages/@d-zero/page-cluster/src/array-edit-distance.ts @@ -0,0 +1,62 @@ +/** + * Reads `row[index]`, throwing instead of returning `undefined`. The DP loop + * below only ever indexes within bounds it just built, so the thrown branch + * is unreachable in practice; it exists to satisfy `noUncheckedIndexedAccess` + * without a non-null assertion. Named `readDpValue` rather than `at` to avoid + * reading as (and being confused for) `Array.prototype.at`, whose negative- + * index-from-end semantics this helper does not share. + * @param row - DP row to read from. + * @param index - Zero-based position in `row` whose value is returned. + */ +function readDpValue(row: readonly number[], index: number): number { + const value = row[index]; + if (value === undefined) { + throw new Error('arrayEditDistance: DP row index out of bounds'); + } + return value; +} + +/** + * Element-wise Levenshtein distance between two `tokenize()` outputs, for + * the small set of comparisons that need order/nesting to matter (refining + * a merge distance between near-duplicate candidates, spot-checking cluster + * quality) rather than the set-based similarity used for bulk narrowing. + * Operates on whole array elements, not characters, so a single differing + * path costs 1 edit regardless of its string length. This is the same + * O(n*m) dynamic-programming shape as tree edit distance, but requires no + * tree: run directly on the flat leaf-path arrays `tokenize()` already + * produces, which is why it stays viable at the scale this comparison is + * meant for (small numbers of already-narrowed candidates, not all-pairs). 
+ * @param a - Source sequence; elements are compared whole with `===`. + * @param b - Target sequence; the result counts the fewest single-element insertions, deletions and substitutions that turn `a` into `b`. + * @example + * ```ts + * arrayEditDistance(['body>ul>li', 'body>ul>li'], ['body>ul>li']); + * // 1 + * ``` + */ +export function arrayEditDistance(a: readonly string[], b: readonly string[]): number { + const rowCount = a.length; + const colCount = b.length; + + let previousRow = Array.from({ length: colCount + 1 }, (_, index) => index); /* row 0: an empty prefix of a reaches b[0..col) with col insertions */ + + for (let row = 1; row <= rowCount; row++) { + const currentRow = [row]; /* column 0: dropping the first `row` elements of a */ + for (let col = 1; col <= colCount; col++) { + currentRow.push( + a[row - 1] === b[col - 1] + ? readDpValue(previousRow, col - 1) /* equal elements: carry the diagonal, no edit */ + : 1 + + Math.min( + readDpValue(previousRow, col), /* delete a[row - 1] */ + readDpValue(currentRow, col - 1), /* insert b[col - 1] */ + readDpValue(previousRow, col - 1), /* substitute a[row - 1] with b[col - 1] */ + ), + ); + } + previousRow = currentRow; /* only the previous row is ever read, so two rows suffice */ + } + + return readDpValue(previousRow, colCount); +} diff --git a/packages/@d-zero/page-cluster/src/jaccard-similarity.spec.ts b/packages/@d-zero/page-cluster/src/jaccard-similarity.spec.ts new file mode 100644 index 00000000..10c0d05f --- /dev/null +++ b/packages/@d-zero/page-cluster/src/jaccard-similarity.spec.ts @@ -0,0 +1,43 @@ +import { describe, expect, test } from 'vitest'; + +import { jaccardSimilarity } from './jaccard-similarity.js'; + +describe('jaccardSimilarity', () => { + test('identical sets return 1', () => { + const a = new Set(['body>ul>li', 'body>.card', 'body>.card>img']); + const b = new Set(['body>ul>li', 'body>.card', 'body>.card>img']); + expect(jaccardSimilarity(a, b)).toBe(1); + }); + + test('disjoint sets return 0', () => { + const a = new Set(['body>ul>li']); + const b = new Set(['body>.card']); + expect(jaccardSimilarity(a, b)).toBe(0); + }); + + test('partial overlap returns intersection over union', () => { + const a = new Set(['a', 'b', 'c']); + const b = new Set(['b', 'c', 'd']); + // intersection = {b, c} = 2, union = {a, b, c, d} = 4 + expect(jaccardSimilarity(a, b)).toBe(0.5); + }); + + test('one empty set returns 0', () => { + const a = new Set(['a']); + const b = new Set(); 
expect(jaccardSimilarity(a, b)).toBe(0); + expect(jaccardSimilarity(b, a)).toBe(0); + }); + + test('both empty sets return 1', () => { + expect(jaccardSimilarity(new Set(), new Set())).toBe(1); + }); + + test('result is symmetric and correct when set sizes differ', () => { + const small = new Set(['a', 'b']); + const large = new Set(['a', 'b', 'c', 'd', 'e']); + // intersection = {a, b} = 2, union = {a, b, c, d, e} = 5 + expect(jaccardSimilarity(small, large)).toBe(0.4); + expect(jaccardSimilarity(large, small)).toBe(0.4); + }); +}); diff --git a/packages/@d-zero/page-cluster/src/jaccard-similarity.ts b/packages/@d-zero/page-cluster/src/jaccard-similarity.ts new file mode 100644 index 00000000..1bc86e89 --- /dev/null +++ b/packages/@d-zero/page-cluster/src/jaccard-similarity.ts @@ -0,0 +1,41 @@ +/** + * Structural-similarity primitive shared by two downstream stages that are + * not yet implemented: base-cluster generation over `tokenize()` output + * turned into sets (MinHash/LSH candidate scoring approximates this same + * ratio) and the default distance for merging medoids in the eventual + * hierarchical clustering step. Both need "how much of these two token sets + * overlaps" as a plain, parameter-free calculation, independent of whichever + * hashing/banding scheme ends up approximating it at scale. + * + * Two empty sets return `1`, not `0` or `NaN`: an empty `
` compared + * against another empty `` has no structural difference to report, so + * treating them as identical (rather than "undefined" or "no overlap") keeps + * the result usable directly as a similarity score without a caller-side + * special case. + * @param a + * @param b + * @example + * ```ts + * jaccardSimilarity(new Set(['body>ul>li']), new Set(['body>ul>li'])); + * // 1 + * ``` + */ +export function jaccardSimilarity( + a: ReadonlySet