From 87728a72e7028d1438b5f9270801c005690d23e9 Mon Sep 17 00:00:00 2001
From: Yusuke Hirao
Date: Sat, 4 Jul 2026 01:08:29 +0900
Subject: [PATCH] feat(page-cluster): add complete-linkage structural clustering within a block

Add resolveStructuralClusterKeys, which clusters pages already grouped
into one blocking key (e.g. from resolveBlockingGroupKeys) by structural
similarity, using complete-linkage hierarchical clustering computed via
the NN-chain algorithm for O(n^2) time.

Single-linkage (connected components of a similarity-threshold graph)
was considered first but rejected: its "chaining" failure mode lets one
unrepresentative page transitively merge two otherwise-unrelated
templates, which defeats template detection. Complete-linkage requires
every pair across two clusters to clear the threshold, ruling that out,
at no extra asymptotic cost since the pairwise similarity matrix is
computed either way.

Add "medoid" and "Murtagh" to the cspell dictionary for the new file's
JSDoc.
---
 cspell.json                                 |   2 +
 .../resolve-structural-cluster-keys.spec.ts | 282 ++++++++++++++++++
 .../src/resolve-structural-cluster-keys.ts  | 263 ++++++++++++++++
 3 files changed, 547 insertions(+)
 create mode 100644 packages/@d-zero/page-cluster/src/resolve-structural-cluster-keys.spec.ts
 create mode 100644 packages/@d-zero/page-cluster/src/resolve-structural-cluster-keys.ts

diff --git a/cspell.json b/cspell.json
index a046e6d4..bd69e85a 100644
--- a/cspell.json
+++ b/cspell.json
@@ -12,8 +12,10 @@
 	"words": [
 		// page-cluster clustering/distance terminology
 		"jaccard",
+		"medoid",
 		"medoids",
 		"hrefs",
+		"Murtagh",
 
 		//
 		"gaxios",
diff --git a/packages/@d-zero/page-cluster/src/resolve-structural-cluster-keys.spec.ts b/packages/@d-zero/page-cluster/src/resolve-structural-cluster-keys.spec.ts
new file mode 100644
index 00000000..f8b7c7fc
--- /dev/null
+++ b/packages/@d-zero/page-cluster/src/resolve-structural-cluster-keys.spec.ts
@@ -0,0 +1,282 @@
+import { describe, expect, test } from 'vitest';
+
+import { jaccardSimilarity } from './jaccard-similarity.js';
+import { resolveStructuralClusterKeys } from './resolve-structural-cluster-keys.js';
+
+/**
+ * Naive, obviously-correct reference implementation of threshold-cut
+ * complete-linkage clustering: repeatedly rescans every live cluster pair
+ * and merges the single best (highest minimum-pairwise-similarity) pair,
+ * with no NN-chain bookkeeping. Used only to differentially verify the
+ * production NN-chain implementation, which computes the exact same
+ * clustering faster (O(n²) vs this function's O(n³)) — see
+ * resolve-structural-cluster-keys.ts's JSDoc for why NN-chain is a genuine
+ * speedup, not an approximation. Returns numeric cluster labels rather than
+ * `cluster:N` strings; label *values* are allowed to differ from the
+ * production function's own numbering (traversal order differs between the
+ * two algorithms), only the partition (which pages end up together) must
+ * match, which `samePartition` below checks.
+ * @param tokenSets
+ * @param threshold
+ */
+function bruteForceCompleteLinkage(
+	tokenSets: readonly ReadonlySet<string>[],
+	threshold: number,
+): number[] {
+	let clusters: number[][] = tokenSets.map((_, index) => [index]);
+
+	for (;;) {
+		let bestPair: [number, number] | undefined;
+		let bestScore = Number.NEGATIVE_INFINITY;
+		for (let i = 0; i < clusters.length; i++) {
+			for (let j = i + 1; j < clusters.length; j++) {
+				let minSimilarity = Number.POSITIVE_INFINITY;
+				for (const p of clusters[i] ?? []) {
+					for (const q of clusters[j] ?? []) {
+						minSimilarity = Math.min(
+							minSimilarity,
+							jaccardSimilarity(tokenSets[p] ?? new Set(), tokenSets[q] ?? new Set()),
+						);
+					}
+				}
+				if (minSimilarity > bestScore) {
+					bestScore = minSimilarity;
+					bestPair = [i, j];
+				}
+			}
+		}
+
+		if (!bestPair || bestScore < threshold) {
+			break;
+		}
+
+		const [i, j] = bestPair;
+		clusters[i] = [...(clusters[i] ?? []), ...(clusters[j] ?? [])];
+		clusters = clusters.filter((_, index) => index !== j);
+	}
+
+	const labels = Array.from<number>({ length: tokenSets.length });
+	for (const [clusterIndex, members] of clusters.entries()) {
+		for (const member of members) {
+			labels[member] = clusterIndex;
+		}
+	}
+	return labels;
+}
+
+/**
+ * Whether two label arrays (of any label type) describe the same partition
+ * — i.e. every pair of positions is grouped together in one array if and
+ * only if it is in the other. Deliberately ignores the concrete label
+ * values themselves, since two different (but equally valid) clustering
+ * algorithms/traversal orders may number the same groups differently.
+ * @param a
+ * @param b
+ */
+function samePartition(a: readonly unknown[], b: readonly unknown[]): boolean {
+	if (a.length !== b.length) {
+		return false;
+	}
+	for (let i = 0; i < a.length; i++) {
+		for (let j = i + 1; j < a.length; j++) {
+			if ((a[i] === a[j]) !== (b[i] === b[j])) {
+				return false;
+			}
+		}
+	}
+	return true;
+}
+
+/**
+ * Deterministic PRNG (mulberry32) so the property test is reproducible across CI runs.
+ * @param seed
+ */
+function mulberry32(seed: number): () => number {
+	let state = seed;
+	return () => {
+		// `| 0` intentionally wraps to a signed 32-bit integer (mulberry32's
+		// overflow behavior); `Math.trunc()` alone would not wrap, so the
+		// usual unicorn/prefer-math-trunc autofix would silently change this
+		// PRNG's output sequence.
+		// eslint-disable-next-line unicorn/prefer-math-trunc
+		state = (state + 0x6d_2b_79_f5) | 0;
+		let t = state;
+		t = Math.imul(t ^ (t >>> 15), t | 1);
+		t ^= t + Math.imul(t ^ (t >>> 7), t | 61);
+		return ((t ^ (t >>> 14)) >>> 0) / 4_294_967_296;
+	};
+}
+
+/**
+ * Generates `count` random token sets drawn from a `vocabularySize`-word
+ * vocabulary (each word independently included with 50% probability), for
+ * the differential property test below.
+ * @param seed
+ * @param count
+ * @param vocabularySize
+ */
+function randomTokenSets(
+	seed: number,
+	count: number,
+	vocabularySize: number,
+): Set<string>[] {
+	const random = mulberry32(seed);
+	const vocabulary = Array.from(
+		{ length: vocabularySize },
+		(_, index) => `token-${index}`,
+	);
+	return Array.from({ length: count }, () => {
+		const tokens = vocabulary.filter(() => random() < 0.5);
+		return new Set(tokens);
+	});
+}
+
+describe('resolveStructuralClusterKeys', () => {
+	test('an empty array returns an empty array', () => {
+		expect(resolveStructuralClusterKeys([])).toEqual([]);
+	});
+
+	test('a single token set forms its own cluster', () => {
+		const result = resolveStructuralClusterKeys([new Set(['body>header'])]);
+		expect(result).toEqual(['cluster:0']);
+	});
+
+	test('two identical token sets share a cluster key', () => {
+		const a = new Set(['body>header', 'body>main>.card', 'body>footer']);
+		const b = new Set(['body>header', 'body>main>.card', 'body>footer']);
+		const result = resolveStructuralClusterKeys([a, b]);
+		expect(result[0]).toBe(result[1]);
+	});
+
+	test('a pair at exactly the default threshold (0.8) still merges', () => {
+		// shared = 8 tokens; a/b each add one unique token: intersection = 8,
+		// union = 10, similarity = 8/10 = 0.8 (the >= boundary is inclusive)
+		const shared = Array.from({ length: 8 }, (_, index) => `shared-${index}`);
+		const a = new Set([...shared, 'unique-a']);
+		const b = new Set([...shared, 'unique-b']);
+		const result = resolveStructuralClusterKeys([a, b]);
+		expect(result[0]).toBe(result[1]);
+	});
+
+	test('a pair just below the default threshold does not merge', () => {
+		// shared = 7 tokens; a/b each add two unique tokens: intersection = 7,
+		// union = 11, similarity = 7/11 ≈ 0.636, below the default 0.8
+		const shared = Array.from({ length: 7 }, (_, index) => `shared-${index}`);
+		const a = new Set([...shared, 'a1', 'a2']);
+		const b = new Set([...shared, 'b1', 'b2']);
+		const result = resolveStructuralClusterKeys([a, b]);
+		expect(result[0]).not.toBe(result[1]);
+	});
+
+	test('a threshold assembled from arithmetic (0.1 + 0.2) still merges a pair at the equivalent exact boundary', () => {
+		// 0.1 + 0.2 === 0.30000000000000004, not the mathematically equivalent
+		// 0.3 — regression test for the floating-point boundary bug found by
+		// /code-review xhigh: comparing a pair's exact similarity against this
+		// threshold with no epsilon tolerance would wrongly reject a pair the
+		// caller intended to be at the (inclusive) boundary.
+		// shared = 3, a-only = 4, b-only = 3: intersection = 3, union = 10,
+		// similarity = 3/10 = 0.3 exactly
+		const a = new Set(['s1', 's2', 's3', 'a1', 'a2', 'a3', 'a4']);
+		const b = new Set(['s1', 's2', 's3', 'b1', 'b2', 'b3']);
+		const result = resolveStructuralClusterKeys([a, b], {
+			similarityThreshold: 0.1 + 0.2,
+		});
+		expect(result[0]).toBe(result[1]);
+	});
+
+	test('similarityThreshold: 0 merges every page into a single cluster, however dissimilar', () => {
+		const result = resolveStructuralClusterKeys(
+			[new Set(['a']), new Set(['b']), new Set(['c'])],
+			{ similarityThreshold: 0 },
+		);
+		expect(result[0]).toBe(result[1]);
+		expect(result[1]).toBe(result[2]);
+	});
+
+	test('similarityThreshold: 1 only merges pages with an identical token set', () => {
+		const result = resolveStructuralClusterKeys(
+			[new Set(['a', 'b']), new Set(['a', 'b']), new Set(['a', 'b', 'c'])],
+			{ similarityThreshold: 1 },
+		);
+		expect(result[0]).toBe(result[1]);
+		expect(result[2]).not.toBe(result[0]);
+	});
+
+	test('complete-linkage refuses to chain A into C through a shared bridge B', () => {
+		// similarity(A,B) = |{a,b}| / |{a,b,c}| = 2/3 ≈ 0.667
+		// similarity(B,C) = |{a,c}| / |{a,b,c,d}| = 2/4 = 0.5
+		// similarity(A,C) = |{a}| / |{a,b,c,d}| = 1/4 = 0.25
+		// With threshold 0.5: A-B and B-C both clear it, but A-C does not.
+		// Single-linkage/connected-components would merge all three via B;
+		// complete-linkage must not, because {A,B,C} would require every pair
+		// (including A-C) to clear the threshold.
+		const a = new Set(['a', 'b']);
+		const b = new Set(['a', 'b', 'c']);
+		const c = new Set(['a', 'c', 'd']);
+		const result = resolveStructuralClusterKeys([a, b, c], { similarityThreshold: 0.5 });
+
+		expect(result[0]).toBe(result[1]);
+		expect(result[2]).not.toBe(result[0]);
+	});
+
+	test('matches the exact output documented in the JSDoc @example', () => {
+		// Kept in sync with resolveStructuralClusterKeys's @example: if this
+		// ever fails, the JSDoc example is out of date and must be corrected
+		// alongside the implementation, not the other way around.
+		const result = resolveStructuralClusterKeys([
+			new Set(['body>header', 'body>main>.card', 'body>footer']),
+			new Set(['body>header', 'body>main>.card', 'body>footer']),
+			new Set(['body>nav', 'body>main>form']),
+		]);
+		expect(result).toEqual(['cluster:0', 'cluster:0', 'cluster:1']);
+	});
+
+	test('three mutually dissimilar token sets each form their own cluster', () => {
+		const result = resolveStructuralClusterKeys([
+			new Set(['body>header']),
+			new Set(['body>nav', 'body>main>form']),
+			new Set(['body>aside', 'body>footer', 'body>footer>small']),
+		]);
+		expect(new Set(result).size).toBe(3);
+	});
+
+	test.each([-0.1, 1.1, Number.NaN])(
+		'rejects a similarityThreshold outside [0, 1] (%s)',
+		(similarityThreshold) => {
+			expect(() => resolveStructuralClusterKeys([], { similarityThreshold })).toThrow(
+				RangeError,
+			);
+		},
+	);
+
+	test.each([0, 1])(
+		'accepts the boundary similarityThreshold values (%s)',
+		(similarityThreshold) => {
+			expect(() =>
+				resolveStructuralClusterKeys([], { similarityThreshold }),
+			).not.toThrow();
+		},
+	);
+
+	const propertyTestCases = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10].flatMap((seed) =>
+		[4, 6, 10].flatMap((vocabularySize) =>
+			[0.3, 0.5, 0.8].map(
+				(threshold) => [seed, vocabularySize, threshold] as [number, number, number],
+			),
+		),
+	);
+
+	test.each(propertyTestCases)(
+		'matches a naive brute-force complete-linkage reference on random inputs (seed %s, vocabulary %s, threshold %s)',
+		(seed, vocabularySize, threshold) => {
+			const tokenSets = randomTokenSets(seed, 8, vocabularySize);
+
+			const actual = resolveStructuralClusterKeys(tokenSets, {
+				similarityThreshold: threshold,
+			});
+			const expected = bruteForceCompleteLinkage(tokenSets, threshold);
+
+			expect(samePartition(actual, expected)).toBe(true);
+		},
+	);
+});
diff --git a/packages/@d-zero/page-cluster/src/resolve-structural-cluster-keys.ts b/packages/@d-zero/page-cluster/src/resolve-structural-cluster-keys.ts
new file mode 100644
index 00000000..a537b3ed
--- /dev/null
+++ b/packages/@d-zero/page-cluster/src/resolve-structural-cluster-keys.ts
@@ -0,0 +1,263 @@
+import { jaccardSimilarity } from './jaccard-similarity.js';
+
+/**
+ * @see resolveStructuralClusterKeys
+ */
+export type ResolveStructuralClusterKeysOptions = {
+	/**
+	 * Minimum `jaccardSimilarity()` score required between *every* pair of
+	 * pages within a cluster (complete-linkage criterion) for those pages to
+	 * be grouped together. Must be a number in `[0, 1]` (`RangeError`
+	 * otherwise). 0.8 is a starting-point heuristic, not validated against
+	 * real corpora — tune per site once real cluster boundaries are
+	 * inspected.
+	 */
+	similarityThreshold?: number;
+};
+
+const DEFAULT_SIMILARITY_THRESHOLD = 0.8;
+
+/**
+ * `jaccardSimilarity()` returns `intersectionSize / unionSize`, a
+ * floating-point division that can land a hair below the caller's intended
+ * threshold even when the two are mathematically equal (e.g. a threshold
+ * assembled from arithmetic like `0.1 + 0.2` is `0.30000000000000004`, not
+ * `0.3`), which would otherwise make a pair at the documented inclusive
+ * boundary fail the `>=` check it should pass. Subtracting this epsilon
+ * before comparing absorbs that rounding noise (same technique and value as
+ * `BOUNDARY_EPSILON` in `split-tokens-by-frequency.ts`).
+ */
+const BOUNDARY_EPSILON = 1e-9;
+
+/**
+ * Reads `values[index]`, throwing instead of returning `undefined`. Every
+ * call site here indexes within bounds it just established itself (loop
+ * ranges, or an index freshly returned by the same array's own scan), so the
+ * thrown branch is unreachable in practice; it exists to satisfy
+ * `noUncheckedIndexedAccess` without a non-null assertion (same rationale as
+ * `readDpValue` in `array-edit-distance.ts`, generalized to any array-like).
+ * @param values
+ * @param index
+ */
+function requireIndex<T>(values: ArrayLike<T>, index: number): T {
+	const value = values[index];
+	if (value === undefined) {
+		throw new Error('resolveStructuralClusterKeys: index out of bounds');
+	}
+	return value;
+}
+
+/**
+ * Finds the representative (root) of `index`'s set, compressing every
+ * traversed link so future lookups on the same path are near-constant time.
+ * @param parent
+ * @param index
+ */
+function find(parent: Int32Array, index: number): number {
+	let root = index;
+	while (requireIndex(parent, root) !== root) {
+		root = requireIndex(parent, root);
+	}
+	let current = index;
+	while (current !== root) {
+		const next = requireIndex(parent, current);
+		parent[current] = root;
+		current = next;
+	}
+	return root;
+}
+
+/**
+ * Complete-linkage hierarchical clustering of `tokenSets`, cut at
+ * `threshold`, computed via the NN-chain algorithm (Murtagh, F., 1983, "A
+ * Survey of Recent Advances in Hierarchical Clustering Algorithms," The
+ * Computer Journal 26(4)). NN-chain produces the exact same dendrogram as
+ * naively re-scanning every live cluster pair for the best merge at each
+ * step, but in O(n²) time instead of O(n³): each cluster follows a chain of
+ * mutually-improving nearest neighbors until it lands on a pair that are
+ * each other's nearest neighbor (a "reciprocal nearest neighbor", RNN); that
+ * pair's merge is provably a valid next step in the correct dendrogram. This
+ * is a genuine algorithmic speedup, not an approximation — see
+ * `resolveStructuralClusterKeys`'s JSDoc for why an approximation was
+ * rejected.
+ *
+ * Complete-linkage was chosen over single-linkage (connected components of
+ * the threshold graph) because single-linkage's "chaining" lets one
+ * unrepresentative page transitively merge two otherwise-unrelated
+ * templates — the opposite of what template detection needs. Complete-
+ * linkage requires *every* pair across two clusters to clear the threshold
+ * before merging them, which rules that out. Cluster-to-cluster similarity
+ * is maintained via the Lance-Williams update for complete-linkage:
+ * `similarity(merged, Z) = min(similarity(X, Z), similarity(Y, Z))`.
+ *
+ * The algorithm always runs every one of the `size - 1` possible merges to
+ * completion (down to a single root), never stopping early at `threshold`.
+ * This looks wasteful but isn't optional: Lance-Williams monotonicity
+ * (Lance, G. N. & Williams, W. T., 1967, "A General Theory of Classificatory
+ * Sorting Strategies," The Computer Journal 9(4)) guarantees no height
+ * inversions inside the dendrogram itself (a merge's similarity is always ≤
+ * the similarity of every merge nested inside it), but says nothing about
+ * the chronological order in which independent, not-yet-connected
+ * chains happen to resolve their own RNN pairs — one chain can easily
+ * stumble onto a low-similarity RNN pair before a different, still-unvisited
+ * chain uncovers a high-similarity one elsewhere. Stopping the whole
+ * algorithm at the first below-threshold merge would therefore discard
+ * later, still-valid above-threshold merges (confirmed by this file's
+ * differential test against a naive reference — an earlier version of this
+ * function that broke early on the first below-threshold RNN pair failed it
+ * for exactly this reason). Instead, every merge is always folded into the
+ * `active`/`similarity` bookkeeping so the algorithm can keep discovering
+ * the rest of the true dendrogram, but only merges scoring `>= threshold`
+ * are recorded in `parent` (the union-find used for final membership).
+ * Monotonicity guarantees this is safe: any merge scoring `>= threshold` was
+ * necessarily built out of children merges that scored at least as high, so
+ * restricting the union-find to threshold-clearing merges — regardless of
+ * the chronological order they were discovered in — reconstructs exactly
+ * the correct threshold cut.
+ * @param tokenSets
+ * @param threshold
+ */
+function clusterByCompleteLinkage(
+	tokenSets: readonly ReadonlySet<string>[],
+	threshold: number,
+): number[] {
+	const size = tokenSets.length;
+	const parent = Int32Array.from({ length: size }, (_, index) => index);
+
+	const similarity = new Float64Array(size * size);
+	for (let i = 0; i < size; i++) {
+		for (let j = i + 1; j < size; j++) {
+			const score = jaccardSimilarity(
+				requireIndex(tokenSets, i),
+				requireIndex(tokenSets, j),
+			);
+			similarity[i * size + j] = score;
+			similarity[j * size + i] = score;
+		}
+	}
+
+	const active = new Uint8Array(size).fill(1);
+	const chain: number[] = [];
+
+	const findFreshStart = (): number => {
+		for (let index = 0; index < size; index++) {
+			if (requireIndex(active, index) === 1) {
+				return index;
+			}
+		}
+		throw new Error(
+			'resolveStructuralClusterKeys: no active cluster left to resume from',
+		);
+	};
+
+	let activeCount = size;
+	while (activeCount > 1) {
+		if (chain.length === 0) {
+			chain.push(findFreshStart());
+		}
+
+		const top = requireIndex(chain, chain.length - 1);
+		let best = -1;
+		let bestScore = Number.NEGATIVE_INFINITY;
+		for (let candidate = 0; candidate < size; candidate++) {
+			if (candidate !== top && requireIndex(active, candidate) === 1) {
+				const score = requireIndex(similarity, top * size + candidate);
+				if (score > bestScore) {
+					bestScore = score;
+					best = candidate;
+				}
+			}
+		}
+
+		const secondFromTop = chain.length >= 2 ? chain.at(-2) : undefined;
+		if (best === secondFromTop) {
+			chain.pop();
+			chain.pop();
+
+			const survivor = Math.min(top, best);
+			const dead = Math.max(top, best);
+			for (let candidate = 0; candidate < size; candidate++) {
+				if (
+					candidate !== top &&
+					candidate !== best &&
+					requireIndex(active, candidate) === 1
+				) {
+					const merged = Math.min(
+						requireIndex(similarity, top * size + candidate),
+						requireIndex(similarity, best * size + candidate),
+					);
+					similarity[survivor * size + candidate] = merged;
+					similarity[candidate * size + survivor] = merged;
+				}
+			}
+
+			active[dead] = 0;
+			if (bestScore >= threshold - BOUNDARY_EPSILON) {
+				parent[find(parent, dead)] = find(parent, survivor);
+			}
+			activeCount--;
+		} else {
+			chain.push(best);
+		}
+	}
+
+	return Array.from({ length: size }, (_, index) => find(parent, index));
+}
+
+/**
+ * Resolves, within a single already-blocked group of pages (e.g. one key
+ * from {@link ./resolve-blocking-group-keys.js | resolveBlockingGroupKeys}),
+ * which pages share a structural template. Returns one cluster key per
+ * page, in the same order as `tokenSets`. Does not call
+ * {@link ./tokenize.js | tokenize} itself (callers pass pages already
+ * tokenized and turned into `Set<string>`s, mirroring
+ * {@link ./compute-document-frequency.js | computeDocumentFrequency}'s
+ * contract) and does not orchestrate multiple blocks — a heterogeneous
+ * corpus should be split into blocks by the caller before reaching this
+ * function.
+ *
+ * MinHash/LSH-based approximation and medoid-based refinement of these
+ * clusters are intentionally out of scope: NN-chain already computes the
+ * exact complete-linkage clustering in O(n²), so there is no accuracy being
+ * traded away by not approximating, and no evidence yet that O(n²) is a
+ * real bottleneck at the block sizes this function actually sees.
+ * @param tokenSets
+ * @param options
+ * @example
+ * ```ts
+ * resolveStructuralClusterKeys([
+ *   new Set(['body>header', 'body>main>.card', 'body>footer']),
+ *   new Set(['body>header', 'body>main>.card', 'body>footer']),
+ *   new Set(['body>nav', 'body>main>form']),
+ * ]);
+ * // ['cluster:0', 'cluster:0', 'cluster:1']
+ * ```
+ */
+export function resolveStructuralClusterKeys(
+	tokenSets: readonly ReadonlySet<string>[],
+	options?: ResolveStructuralClusterKeysOptions,
+): string[] {
+	const similarityThreshold =
+		options?.similarityThreshold ?? DEFAULT_SIMILARITY_THRESHOLD;
+	if (!(similarityThreshold >= 0 && similarityThreshold <= 1)) {
+		throw new RangeError(
+			`resolveStructuralClusterKeys: similarityThreshold must be between 0 and 1, got ${similarityThreshold}`,
+		);
+	}
+
+	if (tokenSets.length === 0) {
+		return [];
+	}
+
+	const roots = clusterByCompleteLinkage(tokenSets, similarityThreshold);
+
+	const rootToLabel = new Map<number, string>();
+	return roots.map((root) => {
+		let label = rootToLabel.get(root);
+		if (label === undefined) {
+			label = `cluster:${rootToLabel.size}`;
+			rootToLabel.set(root, label);
+		}
+		return label;
+	});
+}