d-zero-dev · YusukeHirao · Jul 3, 2026 · Jul 3, 2026
@@ -10,6 +10,10 @@
 		"packages/@d-zero/page-cluster/src/__fixtures__/production-scale/**"
 	],
 	"words": [
+		// page-cluster clustering/distance terminology
+		"jaccard",
+		"medoids",
+
 		//
 		"gaxios",
 		"pngjs",

@@ -0,0 +1,41 @@
+import { describe, expect, test } from 'vitest';
+
+import { arrayEditDistance } from './array-edit-distance.js';
+
+// Behavioural contract for `arrayEditDistance`: an element-wise Levenshtein
+// distance where every insertion, deletion or substitution of a whole array
+// element costs exactly 1.
+describe('arrayEditDistance', () => {
+	test('two empty arrays have distance 0', () => {
+		expect(arrayEditDistance([], [])).toBe(0);
+	});
+
+	test('identical arrays have distance 0', () => {
+		expect(
+			arrayEditDistance(['body>ul>li', 'body>ul>li'], ['body>ul>li', 'body>ul>li']),
+		).toBe(0);
+	});
+
+	test('one empty array costs the length of the other', () => {
+		// empty -> non-empty: every element is an insertion
+		expect(arrayEditDistance([], ['a', 'b', 'c'])).toBe(3);
+		// non-empty -> empty: every element is a deletion
+		expect(arrayEditDistance(['a', 'b', 'c'], [])).toBe(3);
+	});
+
+	test('repeated elements count per occurrence, unlike a set comparison', () => {
+		// both sides contain the same single distinct path, but one occurrence
+		// has to be deleted to make the arrays equal
+		expect(arrayEditDistance(['body>ul>li', 'body>ul>li'], ['body>ul>li'])).toBe(1);
+	});
+
+	test('reordering the same elements is not free', () => {
+		// swapping the first two elements costs 2 substitutions, not 0
+		expect(arrayEditDistance(['a', 'b', 'c'], ['b', 'a', 'c'])).toBe(2);
+	});
+
+	test('a single substitution costs 1', () => {
+		expect(arrayEditDistance(['a', 'b', 'c'], ['a', 'x', 'c'])).toBe(1);
+	});
+
+	test('a single insertion costs 1', () => {
+		expect(arrayEditDistance(['a', 'c'], ['a', 'b', 'c'])).toBe(1);
+	});
+
+	test('a single deletion costs 1', () => {
+		expect(arrayEditDistance(['a', 'b', 'c'], ['a', 'c'])).toBe(1);
+	});
+
+	test('mixed insertions, deletions and substitutions combine', () => {
+		// 'b' -> 'x' is one substitution, 'e' is one insertion
+		expect(arrayEditDistance(['a', 'b', 'c', 'd'], ['a', 'x', 'c', 'e', 'd'])).toBe(2);
+	});
+});
@@ -0,0 +1,62 @@
+/**
+ * Bounds-checked read of one cell from a dynamic-programming row.
+ *
+ * With `noUncheckedIndexedAccess` enabled, `row[index]` has type
+ * `number | undefined`. This helper narrows that to `number` by throwing on
+ * `undefined`, so the caller needs no non-null assertion. The DP loop that
+ * uses it only reads cells it has already written, which makes the throwing
+ * path a safety net rather than an expected outcome.
+ *
+ * The name is deliberately not `at`: unlike `Array.prototype.at`, a negative
+ * index does not count from the end here; it is simply out of bounds.
+ * @param row - The DP row to read from.
+ * @param index - Zero-based position of the cell within `row`.
+ * @returns The number stored at `row[index]`.
+ * @throws {Error} When `row[index]` is `undefined` (index outside the row).
+ */
+function readDpValue(row: readonly number[], index: number): number {
+	const cell = row[index];
+	if (cell !== undefined) {
+		return cell;
+	}
+	throw new Error('arrayEditDistance: DP row index out of bounds');
+}
+
+/**
+ * Levenshtein distance computed over whole array elements instead of
+ * characters: inserting, deleting or replacing one element costs 1, however
+ * long the element's string is.
+ *
+ * Intended for two `tokenize()` outputs in the few comparisons where order
+ * and repetition must count (refining a merge distance between
+ * near-duplicate candidates, spot-checking cluster quality), as opposed to
+ * the set-based similarity used for bulk narrowing. It has the same O(n*m)
+ * dynamic-programming shape as tree edit distance but needs no tree, because
+ * it runs directly on the flat leaf-path arrays. That keeps it practical for
+ * a small number of already-narrowed candidates; it is not meant for
+ * all-pairs comparison.
+ *
+ * Only two DP rows are held at any time: the finished row above and the row
+ * currently being filled.
+ * @param a - First sequence of tokens.
+ * @param b - Second sequence of tokens.
+ * @returns The minimum number of single-element edits that turn `a` into `b`.
+ * @example
+ * ```ts
+ * arrayEditDistance(['body>ul>li', 'body>ul>li'], ['body>ul>li']);
+ * // 1
+ * ```
+ */
+export function arrayEditDistance(a: readonly string[], b: readonly string[]): number {
+	// Row 0: an empty prefix of `a` becomes the first `j` elements of `b`
+	// through `j` insertions.
+	let above: number[] = [];
+	for (let j = 0; j <= b.length; j++) {
+		above.push(j);
+	}
+
+	for (const [i, left] of a.entries()) {
+		// Column 0: the first `i + 1` elements of `a` become an empty prefix
+		// of `b` through `i + 1` deletions.
+		const current = [i + 1];
+
+		for (const [j, right] of b.entries()) {
+			if (left === right) {
+				// Matching elements extend the diagonal at no extra cost.
+				current.push(readDpValue(above, j));
+				continue;
+			}
+
+			const deletion = readDpValue(above, j + 1);
+			const insertion = readDpValue(current, j);
+			const substitution = readDpValue(above, j);
+			current.push(1 + Math.min(deletion, insertion, substitution));
+		}
+
+		above = current;
+	}
+
+	return readDpValue(above, b.length);
+}
@@ -0,0 +1,43 @@
+import { describe, expect, test } from 'vitest';
+
+import { jaccardSimilarity } from './jaccard-similarity.js';
+
+// Behavioural contract for `jaccardSimilarity`: |A ∩ B| / |A ∪ B| over token
+// sets, with the convention that two empty sets are identical.
+describe('jaccardSimilarity', () => {
+	test('identical sets return 1', () => {
+		const a = new Set(['body>ul>li', 'body>.card', 'body>.card>img']);
+		const b = new Set(['body>ul>li', 'body>.card', 'body>.card>img']);
+		expect(jaccardSimilarity(a, b)).toBe(1);
+	});
+
+	test('disjoint sets return 0', () => {
+		const a = new Set(['body>ul>li']);
+		const b = new Set(['body>.card']);
+		expect(jaccardSimilarity(a, b)).toBe(0);
+	});
+
+	test('partial overlap returns intersection over union', () => {
+		const a = new Set(['a', 'b', 'c']);
+		const b = new Set(['b', 'c', 'd']);
+		// intersection = {b, c} = 2, union = {a, b, c, d} = 4
+		expect(jaccardSimilarity(a, b)).toBe(0.5);
+	});
+
+	test('one empty set returns 0', () => {
+		const a = new Set(['a']);
+		// Explicit element type: a bare `new Set()` has nothing to infer from
+		// and would not type-check against the `ReadonlySet<string>` parameter.
+		const b = new Set<string>();
+		expect(jaccardSimilarity(a, b)).toBe(0);
+		expect(jaccardSimilarity(b, a)).toBe(0);
+	});
+
+	test('both empty sets return 1', () => {
+		expect(jaccardSimilarity(new Set<string>(), new Set<string>())).toBe(1);
+	});
+
+	test('result is symmetric and correct when set sizes differ', () => {
+		const small = new Set(['a', 'b']);
+		const large = new Set(['a', 'b', 'c', 'd', 'e']);
+		// intersection = {a, b} = 2, union = {a, b, c, d, e} = 5
+		expect(jaccardSimilarity(small, large)).toBe(0.4);
+		expect(jaccardSimilarity(large, small)).toBe(0.4);
+	});
+});
@@ -0,0 +1,41 @@
+/**
+ * Jaccard index of two token sets: the number of shared tokens divided by
+ * the number of distinct tokens across both sets.
+ *
+ * This is the exact, parameter-free overlap measure that two planned (not
+ * yet implemented) stages build on: base-cluster generation over
+ * `tokenize()` output converted to sets, where MinHash/LSH candidate scoring
+ * approximates this same ratio, and the default distance for merging medoids
+ * in the later hierarchical clustering step. Keeping it separate makes the
+ * calculation independent of whichever hashing/banding scheme approximates
+ * it at scale.
+ *
+ * Two empty sets yield `1` rather than `0` or `NaN`. An empty `<body>`
+ * compared with another empty `<body>` has no structural difference, so the
+ * pair is reported as identical and callers can use the result as a
+ * similarity score without special-casing it.
+ * @param a - First token set.
+ * @param b - Second token set.
+ * @returns A ratio from `0` (no shared tokens) to `1` (identical sets).
+ * @example
+ * ```ts
+ * jaccardSimilarity(new Set(['body>ul>li']), new Set(['body>ul>li']));
+ * // 1
+ * ```
+ */
+export function jaccardSimilarity(
+	a: ReadonlySet<string>,
+	b: ReadonlySet<string>,
+): number {
+	const bothEmpty = a.size === 0 && b.size === 0;
+	if (bothEmpty) {
+		// Nothing differs between two empty sets, so they count as identical.
+		return 1;
+	}
+
+	// Walk the smaller set and probe the larger one, so the loop runs
+	// min(|a|, |b|) times.
+	const aIsSmaller = a.size <= b.size;
+	const probe = aIsSmaller ? a : b;
+	const lookup = aIsSmaller ? b : a;
+
+	let shared = 0;
+	probe.forEach((token) => {
+		if (lookup.has(token)) {
+			shared += 1;
+		}
+	});
+
+	// |A ∪ B| = |A| + |B| - |A ∩ B|
+	return shared / (a.size + b.size - shared);
+}