Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions cspell.json
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,10 @@
"packages/@d-zero/page-cluster/src/__fixtures__/production-scale/**"
],
"words": [
// page-cluster clustering/distance terminology
"jaccard",
"medoids",

//
"gaxios",
"pngjs",
Expand Down
41 changes: 41 additions & 0 deletions packages/@d-zero/page-cluster/src/array-edit-distance.spec.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
import { describe, expect, test } from 'vitest';

import { arrayEditDistance } from './array-edit-distance.js';

// Pins the element-wise edit distance contract: whole array entries are the
// unit of comparison, and every insertion, deletion or substitution costs 1.
describe('arrayEditDistance', () => {
  test('two empty arrays have distance 0', () => {
    const distance = arrayEditDistance([], []);
    expect(distance).toBe(0);
  });

  test('identical arrays have distance 0', () => {
    const paths = ['body>ul>li', 'body>ul>li'];
    const distance = arrayEditDistance(paths, [...paths]);
    expect(distance).toBe(0);
  });

  test('one empty array costs the length of the other (all insertions)', () => {
    const letters = ['a', 'b', 'c'];
    // Checked in both argument orders: empty-vs-full and full-vs-empty.
    expect(arrayEditDistance([], letters)).toBe(3);
    expect(arrayEditDistance(letters, [])).toBe(3);
  });

  test('reordering the same elements is not free', () => {
    const original = ['a', 'b', 'c'];
    const swapped = ['b', 'a', 'c'];
    // Order matters: exchanging the first two entries is 2 substitutions, not 0.
    expect(arrayEditDistance(original, swapped)).toBe(2);
  });

  test('a single substitution costs 1', () => {
    const before = ['a', 'b', 'c'];
    const after = ['a', 'x', 'c'];
    expect(arrayEditDistance(before, after)).toBe(1);
  });

  test('a single insertion costs 1', () => {
    const before = ['a', 'c'];
    const after = ['a', 'b', 'c'];
    expect(arrayEditDistance(before, after)).toBe(1);
  });

  test('a single deletion costs 1', () => {
    const before = ['a', 'b', 'c'];
    const after = ['a', 'c'];
    expect(arrayEditDistance(before, after)).toBe(1);
  });

  test('mixed insertions, deletions and substitutions combine', () => {
    const before = ['a', 'b', 'c', 'd'];
    const after = ['a', 'x', 'c', 'e', 'd'];
    // One substitution (b -> x) plus one insertion (e).
    expect(arrayEditDistance(before, after)).toBe(2);
  });
});
62 changes: 62 additions & 0 deletions packages/@d-zero/page-cluster/src/array-edit-distance.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
/**
 * Bounds-checked read of one cell from a DP row. A missing cell is raised as
 * an error instead of being handed back as `undefined`, which gives callers a
 * plain `number` under `noUncheckedIndexedAccess` without a non-null
 * assertion. The DP loop in this module only reads cells it has already
 * written, so the error path is a safety net rather than an expected outcome.
 *
 * Deliberately not named `at`: unlike `Array.prototype.at`, a negative index
 * is not counted from the end of the row; it simply misses and throws.
 *
 * @param row - DP row to read from.
 * @param index - Zero-based position within `row`.
 * @returns The number stored at `row[index]`.
 * @throws {Error} When `row[index]` is `undefined`.
 */
function readDpValue(row: readonly number[], index: number): number {
  const cell = row[index];
  if (cell !== undefined) {
    return cell;
  }
  throw new Error('arrayEditDistance: DP row index out of bounds');
}

/**
 * Levenshtein distance computed over whole array elements rather than
 * characters: two entries either match exactly or differ, so one differing
 * path costs a single edit no matter how long its string is.
 *
 * Intended for the flat leaf-path arrays that `tokenize()` produces, in the
 * few comparisons where order and repetition must count (refining a merge
 * distance between near-duplicate candidates, spot-checking cluster quality)
 * as opposed to the set-based similarity used for bulk narrowing. The cost is
 * the usual O(n*m) dynamic program, the same shape as tree edit distance but
 * with no tree to build, so it suits small, already-narrowed candidate sets
 * rather than all-pairs comparison. Only two DP rows are kept alive at a time.
 *
 * @param a - Source sequence.
 * @param b - Target sequence.
 * @returns The minimum number of single-element insertions, deletions and
 * substitutions that turn `a` into `b`.
 * @example
 * ```ts
 * arrayEditDistance(['body>ul>li', 'body>ul>li'], ['body>ul>li']);
 * // 1
 * ```
 */
export function arrayEditDistance(a: readonly string[], b: readonly string[]): number {
  const width = b.length;

  // Row 0: an empty prefix of `a` becomes the first j entries of `b` with j insertions.
  let above: number[] = [];
  for (let j = 0; j <= width; j++) {
    above.push(j);
  }

  for (const [i, left] of a.entries()) {
    // Column 0: dropping the first i + 1 entries of `a` to reach an empty prefix of `b`.
    const current: number[] = [i + 1];

    for (const [j, right] of b.entries()) {
      let cost: number;
      if (left === right) {
        // Matching entries carry the diagonal value forward at no extra cost.
        cost = readDpValue(above, j);
      } else {
        const deletion = readDpValue(above, j + 1);
        const insertion = readDpValue(current, j);
        const substitution = readDpValue(above, j);
        cost = 1 + Math.min(deletion, insertion, substitution);
      }
      current.push(cost);
    }

    above = current;
  }

  return readDpValue(above, width);
}
43 changes: 43 additions & 0 deletions packages/@d-zero/page-cluster/src/jaccard-similarity.spec.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
import { describe, expect, test } from 'vitest';

import { jaccardSimilarity } from './jaccard-similarity.js';

// Pins the contract of jaccardSimilarity: intersection size divided by union
// size over string sets, with the empty-vs-empty case fixed at 1.
describe('jaccardSimilarity', () => {
  test('identical sets return 1', () => {
    const a = new Set(['body>ul>li', 'body>.card', 'body>.card>img']);
    const b = new Set(['body>ul>li', 'body>.card', 'body>.card>img']);
    expect(jaccardSimilarity(a, b)).toBe(1);
  });

  test('disjoint sets return 0', () => {
    const a = new Set(['body>ul>li']);
    const b = new Set(['body>.card']);
    expect(jaccardSimilarity(a, b)).toBe(0);
  });

  test('partial overlap returns intersection over union', () => {
    const a = new Set(['a', 'b', 'c']);
    const b = new Set(['b', 'c', 'd']);
    // intersection = {b, c} = 2, union = {a, b, c, d} = 4
    expect(jaccardSimilarity(a, b)).toBe(0.5);
  });

  test('one empty set returns 0', () => {
    const a = new Set(['a']);
    // Element type is spelled out: an argument-less `new Set()` bound to a
    // variable has nothing to infer `string` from, so it would not be typed
    // as the `ReadonlySet<string>` the function expects.
    const b = new Set<string>();
    expect(jaccardSimilarity(a, b)).toBe(0);
    expect(jaccardSimilarity(b, a)).toBe(0);
  });

  test('both empty sets return 1', () => {
    expect(jaccardSimilarity(new Set<string>(), new Set<string>())).toBe(1);
  });

  test('result is symmetric and correct when set sizes differ', () => {
    const small = new Set(['a', 'b']);
    const large = new Set(['a', 'b', 'c', 'd', 'e']);
    // intersection = {a, b} = 2, union = {a, b, c, d, e} = 5
    expect(jaccardSimilarity(small, large)).toBe(0.4);
    expect(jaccardSimilarity(large, small)).toBe(0.4);
  });
});
41 changes: 41 additions & 0 deletions packages/@d-zero/page-cluster/src/jaccard-similarity.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
/**
 * Jaccard similarity of two token sets: the number of shared tokens divided
 * by the number of distinct tokens across both sets.
 *
 * This is the plain, parameter-free overlap measure that two planned (not yet
 * implemented) stages build on: base-cluster generation over `tokenize()`
 * output converted to sets, where MinHash/LSH candidate scoring approximates
 * this same ratio, and the default distance for merging medoids in the
 * eventual hierarchical clustering step. Keeping it exact and standalone
 * makes it independent of whichever hashing/banding scheme approximates it
 * at scale.
 *
 * Two empty sets yield `1`, not `0` or `NaN`: an empty `<body>` compared with
 * another empty `<body>` has no structural difference to report, so treating
 * them as identical lets the result be used directly as a similarity score
 * with no caller-side special case.
 *
 * @param a - First token set.
 * @param b - Second token set.
 * @returns A ratio from `0` (no shared tokens) to `1` (identical sets).
 * @example
 * ```ts
 * jaccardSimilarity(new Set(['body>ul>li']), new Set(['body>ul>li']));
 * // 1
 * ```
 */
export function jaccardSimilarity(
  a: ReadonlySet<string>,
  b: ReadonlySet<string>,
): number {
  const bothEmpty = a.size === 0 && b.size === 0;
  if (bothEmpty) {
    return 1;
  }

  // Walk the smaller set and probe the larger one, so the number of lookups
  // is bounded by the smaller size. Ties keep `a` as the walked set.
  let probe = a;
  let lookup = b;
  if (b.size < a.size) {
    probe = b;
    lookup = a;
  }

  let shared = 0;
  probe.forEach((token) => {
    if (lookup.has(token)) {
      shared += 1;
    }
  });

  // Inclusion-exclusion: |A union B| = |A| + |B| - |A intersect B|.
  const distinct = a.size + b.size - shared;
  return shared / distinct;
}
Loading