diff --git a/cspell.json b/cspell.json
index a046e6d4..bd69e85a 100644
--- a/cspell.json
+++ b/cspell.json
@@ -12,8 +12,10 @@
 	"words": [
 		// page-cluster clustering/distance terminology
 		"jaccard",
+		"medoid",
 		"medoids",
 		"hrefs",
+		"Murtagh",
 
 		//
 		"gaxios",
diff --git a/packages/@d-zero/page-cluster/src/resolve-structural-cluster-keys.spec.ts b/packages/@d-zero/page-cluster/src/resolve-structural-cluster-keys.spec.ts
new file mode 100644
index 00000000..f8b7c7fc
--- /dev/null
+++ b/packages/@d-zero/page-cluster/src/resolve-structural-cluster-keys.spec.ts
@@ -0,0 +1,282 @@
+import { describe, expect, test } from 'vitest';
+
+import { jaccardSimilarity } from './jaccard-similarity.js';
+import { resolveStructuralClusterKeys } from './resolve-structural-cluster-keys.js';
+
+/**
+ * Naive, obviously-correct reference implementation of threshold-cut
+ * complete-linkage clustering: repeatedly rescans every live cluster pair
+ * and merges the single best (highest minimum-pairwise-similarity) pair,
+ * with no NN-chain bookkeeping. Used only to differentially verify the
+ * production NN-chain implementation, which computes the exact same
+ * clustering faster (O(n²) vs this function's O(n³)) — see
+ * resolve-structural-cluster-keys.ts's JSDoc for why NN-chain is a genuine
+ * speedup, not an approximation. Returns numeric cluster labels rather than
+ * `cluster:N` strings; label *values* are allowed to differ from the
+ * production function's own numbering (traversal order differs between the
+ * two algorithms), only the partition (which pages end up together) must
+ * match, which `samePartition` below checks.
+ * @param tokenSets One token set per page, in input order.
+ * @param threshold Minimum complete-linkage similarity the best remaining cluster pair must reach to be merged.
+ */
+function bruteForceCompleteLinkage(
+	tokenSets: readonly ReadonlySet<string>[],
+	threshold: number,
+): number[] {
+	let clusters: number[][] = tokenSets.map((_, index) => [index]);
+
+	for (;;) {
+		let bestPair: [number, number] | undefined;
+		let bestScore = Number.NEGATIVE_INFINITY;
+		for (let i = 0; i < clusters.length; i++) {
+			for (let j = i + 1; j < clusters.length; j++) {
+				let minSimilarity = Number.POSITIVE_INFINITY;
+				for (const p of clusters[i] ?? []) {
+					for (const q of clusters[j] ?? []) {
+						minSimilarity = Math.min(
+							minSimilarity,
+							jaccardSimilarity(tokenSets[p] ?? new Set(), tokenSets[q] ?? new Set()),
+						);
+					}
+				}
+				if (minSimilarity > bestScore) {
+					bestScore = minSimilarity;
+					bestPair = [i, j];
+				}
+			}
+		}
+
+		if (!bestPair || bestScore < threshold) {
+			break;
+		}
+
+		const [i, j] = bestPair;
+		clusters[i] = [...(clusters[i] ?? []), ...(clusters[j] ?? [])];
+		clusters = clusters.filter((_, index) => index !== j);
+	}
+
+	const labels = Array.from<number>({ length: tokenSets.length });
+	for (const [clusterIndex, members] of clusters.entries()) {
+		for (const member of members) {
+			labels[member] = clusterIndex;
+		}
+	}
+	return labels;
+}
+
+/**
+ * Whether two label arrays (of any label type) describe the same partition
+ * — i.e. every pair of positions is grouped together in one array if and
+ * only if it is in the other. Deliberately ignores the concrete label
+ * values themselves, since two different (but equally valid) clustering
+ * algorithms/traversal orders may number the same groups differently.
+ * @param a First label array, one label per position.
+ * @param b Second label array, compared pairwise against `a`; a different length yields `false`.
+ */
+function samePartition(a: readonly unknown[], b: readonly unknown[]): boolean {
+	if (a.length !== b.length) {
+		return false;
+	}
+	for (let i = 0; i < a.length; i++) {
+		for (let j = i + 1; j < a.length; j++) {
+			if ((a[i] === a[j]) !== (b[i] === b[j])) {
+				return false;
+			}
+		}
+	}
+	return true;
+}
+
+/**
+ * Deterministic PRNG (mulberry32) so the property test is reproducible across CI runs.
+ * @param seed Initial generator state; the returned function yields numbers in `[0, 1)`.
+ */
+function mulberry32(seed: number): () => number {
+	let state = seed;
+	return () => {
+		// `| 0` intentionally wraps to a signed 32-bit integer (mulberry32's
+		// overflow behavior); `Math.trunc()` alone would not wrap, so the
+		// usual unicorn/prefer-math-trunc autofix would silently change this
+		// PRNG's output sequence.
+		// eslint-disable-next-line unicorn/prefer-math-trunc
+		state = (state + 0x6d_2b_79_f5) | 0;
+		let t = state;
+		t = Math.imul(t ^ (t >>> 15), t | 1);
+		t ^= t + Math.imul(t ^ (t >>> 7), t | 61);
+		return ((t ^ (t >>> 14)) >>> 0) / 4_294_967_296;
+	};
+}
+
+/**
+ * Generates `count` random token sets drawn from a `vocabularySize`-word
+ * vocabulary (each word independently included with 50% probability), for
+ * the differential property test below.
+ * @param seed Seed handed to `mulberry32`, so the same arguments always produce the same sets.
+ * @param count Number of token sets to generate.
+ * @param vocabularySize Number of distinct `token-N` words available to draw from.
+ */
+function randomTokenSets(
+	seed: number,
+	count: number,
+	vocabularySize: number,
+): Set<string>[] {
+	const random = mulberry32(seed);
+	const vocabulary = Array.from(
+		{ length: vocabularySize },
+		(_, index) => `token-${index}`,
+	);
+	return Array.from({ length: count }, () => {
+		const tokens = vocabulary.filter(() => random() < 0.5);
+		return new Set(tokens);
+	});
+}
+
+describe('resolveStructuralClusterKeys', () => {
+	test('an empty array returns an empty array', () => {
+		expect(resolveStructuralClusterKeys([])).toEqual([]);
+	});
+
+	test('a single token set forms its own cluster', () => {
+		const result = resolveStructuralClusterKeys([new Set(['body>header'])]);
+		expect(result).toEqual(['cluster:0']);
+	});
+
+	test('two identical token sets share a cluster key', () => {
+		const a = new Set(['body>header', 'body>main>.card', 'body>footer']);
+		const b = new Set(['body>header', 'body>main>.card', 'body>footer']);
+		const result = resolveStructuralClusterKeys([a, b]);
+		expect(result[0]).toBe(result[1]);
+	});
+
+	test('a pair at exactly the default threshold (0.8) still merges', () => {
+		// shared = 8 tokens; a/b each add one unique token: intersection = 8,
+		// union = 10, similarity = 8/10 = 0.8 (the >= boundary is inclusive)
+		const shared = Array.from({ length: 8 }, (_, index) => `shared-${index}`);
+		const a = new Set([...shared, 'unique-a']);
+		const b = new Set([...shared, 'unique-b']);
+		const result = resolveStructuralClusterKeys([a, b]);
+		expect(result[0]).toBe(result[1]);
+	});
+
+	test('a pair just below the default threshold does not merge', () => {
+		// shared = 7 tokens; a/b each add two unique tokens: intersection = 7,
+		// union = 11, similarity = 7/11 ≈ 0.636, below the default 0.8
+		const shared = Array.from({ length: 7 }, (_, index) => `shared-${index}`);
+		const a = new Set([...shared, 'a1', 'a2']);
+		const b = new Set([...shared, 'b1', 'b2']);
+		const result = resolveStructuralClusterKeys([a, b]);
+		expect(result[0]).not.toBe(result[1]);
+	});
+
+	test('a threshold assembled from arithmetic (0.1 + 0.2) still merges a pair at the equivalent exact boundary', () => {
+		// 0.1 + 0.2 === 0.30000000000000004, not the mathematically equivalent
+		// 0.3 — regression test for the floating-point boundary bug found by
+		// /code-review xhigh: comparing a pair's exact similarity against this
+		// threshold with no epsilon tolerance would wrongly reject a pair the
+		// caller intended to be at the (inclusive) boundary.
+		// shared = 3, a-only = 4, b-only = 3: intersection = 3, union = 10,
+		// similarity = 3/10 = 0.3 exactly
+		const a = new Set(['s1', 's2', 's3', 'a1', 'a2', 'a3', 'a4']);
+		const b = new Set(['s1', 's2', 's3', 'b1', 'b2', 'b3']);
+		const result = resolveStructuralClusterKeys([a, b], {
+			similarityThreshold: 0.1 + 0.2,
+		});
+		expect(result[0]).toBe(result[1]);
+	});
+
+	test('similarityThreshold: 0 merges every page into a single cluster, however dissimilar', () => {
+		const result = resolveStructuralClusterKeys(
+			[new Set(['a']), new Set(['b']), new Set(['c'])],
+			{ similarityThreshold: 0 },
+		);
+		expect(result[0]).toBe(result[1]);
+		expect(result[1]).toBe(result[2]);
+	});
+
+	test('similarityThreshold: 1 only merges pages with an identical token set', () => {
+		const result = resolveStructuralClusterKeys(
+			[new Set(['a', 'b']), new Set(['a', 'b']), new Set(['a', 'b', 'c'])],
+			{ similarityThreshold: 1 },
+		);
+		expect(result[0]).toBe(result[1]);
+		expect(result[2]).not.toBe(result[0]);
+	});
+
+	test('complete-linkage refuses to chain A into C through a shared bridge B', () => {
+		// similarity(A,B) = |{a,b}| / |{a,b,c}| = 2/3 ≈ 0.667
+		// similarity(B,C) = |{a,c}| / |{a,b,c,d}| = 2/4 = 0.5
+		// similarity(A,C) = |{a}| / |{a,b,c,d}| = 1/4 = 0.25
+		// With threshold 0.5: A-B and B-C both clear it, but A-C does not.
+		// Single-linkage/connected-components would merge all three via B;
+		// complete-linkage must not, because {A,B,C} would require every pair
+		// (including A-C) to clear the threshold.
+		const a = new Set(['a', 'b']);
+		const b = new Set(['a', 'b', 'c']);
+		const c = new Set(['a', 'c', 'd']);
+		const result = resolveStructuralClusterKeys([a, b, c], { similarityThreshold: 0.5 });
+
+		expect(result[0]).toBe(result[1]);
+		expect(result[2]).not.toBe(result[0]);
+	});
+
+	test('matches the exact output documented in the JSDoc @example', () => {
+		// Kept in sync with resolveStructuralClusterKeys's @example: if this
+		// ever fails, the JSDoc example is out of date and must be corrected
+		// alongside the implementation, not the other way around.
+		const result = resolveStructuralClusterKeys([
+			new Set(['body>header', 'body>main>.card', 'body>footer']),
+			new Set(['body>header', 'body>main>.card', 'body>footer']),
+			new Set(['body>nav', 'body>main>form']),
+		]);
+		expect(result).toEqual(['cluster:0', 'cluster:0', 'cluster:1']);
+	});
+
+	test('three mutually dissimilar token sets each form their own cluster', () => {
+		const result = resolveStructuralClusterKeys([
+			new Set(['body>header']),
+			new Set(['body>nav', 'body>main>form']),
+			new Set(['body>aside', 'body>footer', 'body>footer>small']),
+		]);
+		expect(new Set(result).size).toBe(3);
+	});
+
+	test.each([-0.1, 1.1, Number.NaN])(
+		'rejects a similarityThreshold outside [0, 1] (%s)',
+		(similarityThreshold) => {
+			expect(() => resolveStructuralClusterKeys([], { similarityThreshold })).toThrow(
+				RangeError,
+			);
+		},
+	);
+
+	test.each([0, 1])(
+		'accepts the boundary similarityThreshold values (%s)',
+		(similarityThreshold) => {
+			expect(() =>
+				resolveStructuralClusterKeys([], { similarityThreshold }),
+			).not.toThrow();
+		},
+	);
+
+	const propertyTestCases = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10].flatMap((seed) =>
+		[4, 6, 10].flatMap((vocabularySize) =>
+			[0.3, 0.5, 0.8].map(
+				(threshold) => [seed, vocabularySize, threshold] as [number, number, number],
+			),
+		),
+	);
+
+	test.each(propertyTestCases)(
+		'matches a naive brute-force complete-linkage reference on random inputs (seed %s, vocabulary %s, threshold %s)',
+		(seed, vocabularySize, threshold) => {
+			const tokenSets = randomTokenSets(seed, 8, vocabularySize);
+
+			const actual = resolveStructuralClusterKeys(tokenSets, {
+				similarityThreshold: threshold,
+			});
+			const expected = bruteForceCompleteLinkage(tokenSets, threshold);
+
+			expect(samePartition(actual, expected)).toBe(true);
+		},
+	);
+});
diff --git a/packages/@d-zero/page-cluster/src/resolve-structural-cluster-keys.ts b/packages/@d-zero/page-cluster/src/resolve-structural-cluster-keys.ts
new file mode 100644
index 00000000..a537b3ed
--- /dev/null
+++ b/packages/@d-zero/page-cluster/src/resolve-structural-cluster-keys.ts
@@ -0,0 +1,263 @@
+import { jaccardSimilarity } from './jaccard-similarity.js';
+
+/**
+ * @see resolveStructuralClusterKeys
+ */
+export type ResolveStructuralClusterKeysOptions = {
+	/**
+	 * Minimum `jaccardSimilarity()` score required between *every* pair of
+	 * pages within a cluster (complete-linkage criterion) for those pages to
+	 * be grouped together. Must be a number in `[0, 1]` (`RangeError`
+	 * otherwise). 0.8 is a starting-point heuristic, not validated against
+	 * real corpora — tune per site once real cluster boundaries are
+	 * inspected.
+	 */
+	similarityThreshold?: number;
+};
+
+const DEFAULT_SIMILARITY_THRESHOLD = 0.8;
+
+/**
+ * `jaccardSimilarity()` returns `intersectionSize / unionSize`, a
+ * floating-point division that can land a hair below the caller's intended
+ * threshold even when the two are mathematically equal (e.g. a threshold
+ * assembled from arithmetic like `0.1 + 0.2` is `0.30000000000000004`, not
+ * `0.3`), which would otherwise make a pair at the documented inclusive
+ * boundary fail the `>=` check it should pass. Subtracting this epsilon
+ * before comparing absorbs that rounding noise (same technique and value as
+ * `BOUNDARY_EPSILON` in `split-tokens-by-frequency.ts`).
+ */
+const BOUNDARY_EPSILON = 1e-9;
+
+/**
+ * Reads `values[index]`, throwing instead of returning `undefined`. Every
+ * call site here indexes within bounds it just established itself (loop
+ * ranges, or an index freshly returned by the same array's own scan), so the
+ * thrown branch is unreachable in practice; it exists to satisfy
+ * `noUncheckedIndexedAccess` without a non-null assertion (same rationale as
+ * `readDpValue` in `array-edit-distance.ts`, generalized to any array-like).
+ * @param values Array-like to read from.
+ * @param index Position to read; an `undefined` value at that position throws.
+ */
+function requireIndex<T>(values: ArrayLike<T>, index: number): T {
+	const value = values[index];
+	if (value === undefined) {
+		throw new Error('resolveStructuralClusterKeys: index out of bounds');
+	}
+	return value;
+}
+
+/**
+ * Finds the representative (root) of `index`'s set, compressing every
+ * traversed link so future lookups on the same path are near-constant time.
+ * @param parent Union-find parent links (a root points to itself); mutated in place by the path compression.
+ * @param index Element whose set representative is wanted.
+ */
+function find(parent: Int32Array, index: number): number {
+	let root = index;
+	while (requireIndex(parent, root) !== root) {
+		root = requireIndex(parent, root);
+	}
+	let current = index;
+	while (current !== root) {
+		const next = requireIndex(parent, current);
+		parent[current] = root;
+		current = next;
+	}
+	return root;
+}
+
+/**
+ * Complete-linkage hierarchical clustering of `tokenSets`, cut at
+ * `threshold`, computed via the NN-chain algorithm (Murtagh, F., 1983, "A
+ * Survey of Recent Advances in Hierarchical Clustering Algorithms," The
+ * Computer Journal 26(4)). NN-chain produces the exact same dendrogram as
+ * naively re-scanning every live cluster pair for the best merge at each
+ * step, but in O(n²) time instead of O(n³): each cluster follows a chain of
+ * mutually-improving nearest neighbors until it lands on a pair that are
+ * each other's nearest neighbor (a "reciprocal nearest neighbor", RNN); that
+ * pair's merge is provably a valid next step in the correct dendrogram. This
+ * is a genuine algorithmic speedup, not an approximation — see
+ * `resolveStructuralClusterKeys`'s JSDoc for why an approximation was
+ * rejected.
+ *
+ * Complete-linkage was chosen over single-linkage (connected components of
+ * the threshold graph) because single-linkage's "chaining" lets one
+ * unrepresentative page transitively merge two otherwise-unrelated
+ * templates — the opposite of what template detection needs. Complete-
+ * linkage requires *every* pair across two clusters to clear the threshold
+ * before merging them, which rules that out. Cluster-to-cluster similarity
+ * is maintained via the Lance-Williams update for complete-linkage:
+ * `similarity(merged, Z) = min(similarity(X, Z), similarity(Y, Z))`.
+ *
+ * The algorithm always runs every one of the `size - 1` possible merges to
+ * completion (down to a single root), never stopping early at `threshold`.
+ * This looks wasteful but isn't optional: Lance-Williams monotonicity
+ * (Lance, G. N. & Williams, W. T., 1967, "A General Theory of Classificatory
+ * Sorting Strategies," The Computer Journal 9(4)) guarantees no height
+ * inversions inside the dendrogram itself (a merge's similarity is always ≤
+ * the similarity of every merge nested inside it), but says nothing about
+ * the chronological order in which independent, not-yet-connected
+ * chains happen to resolve their own RNN pairs — one chain can easily
+ * stumble onto a low-similarity RNN pair before a different, still-unvisited
+ * chain uncovers a high-similarity one elsewhere. Stopping the whole
+ * algorithm at the first below-threshold merge would therefore discard
+ * later, still-valid above-threshold merges (confirmed by this file's
+ * differential test against a naive reference — an earlier version of this
+ * function that broke early on the first below-threshold RNN pair failed it
+ * for exactly this reason). Instead, every merge is always folded into the
+ * `active`/`similarity` bookkeeping so the algorithm can keep discovering
+ * the rest of the true dendrogram, but only merges scoring `>= threshold`
+ * are recorded in `parent` (the union-find used for final membership).
+ * Monotonicity guarantees this is safe: any merge scoring `>= threshold` was
+ * necessarily built out of children merges that scored at least as high, so
+ * restricting the union-find to threshold-clearing merges — regardless of
+ * the chronological order they were discovered in — reconstructs exactly
+ * the correct threshold cut.
+ * @param tokenSets One token set per page, in input order; the result has one root index per entry.
+ * @param threshold Similarity cut; a merge is recorded only when it scores at least `threshold - BOUNDARY_EPSILON`.
+ */
+function clusterByCompleteLinkage(
+	tokenSets: readonly ReadonlySet<string>[],
+	threshold: number,
+): number[] {
+	const size = tokenSets.length;
+	const parent = Int32Array.from({ length: size }, (_, index) => index);
+
+	const similarity = new Float64Array(size * size);
+	for (let i = 0; i < size; i++) {
+		for (let j = i + 1; j < size; j++) {
+			const score = jaccardSimilarity(
+				requireIndex(tokenSets, i),
+				requireIndex(tokenSets, j),
+			);
+			similarity[i * size + j] = score;
+			similarity[j * size + i] = score;
+		}
+	}
+
+	const active = new Uint8Array(size).fill(1);
+	const chain: number[] = [];
+
+	const findFreshStart = (): number => {
+		for (let index = 0; index < size; index++) {
+			if (requireIndex(active, index) === 1) {
+				return index;
+			}
+		}
+		throw new Error(
+			'resolveStructuralClusterKeys: no active cluster left to resume from',
+		);
+	};
+
+	let activeCount = size;
+	while (activeCount > 1) {
+		if (chain.length === 0) {
+			chain.push(findFreshStart());
+		}
+
+		const top = requireIndex(chain, chain.length - 1);
+		let best = -1;
+		let bestScore = Number.NEGATIVE_INFINITY;
+		for (let candidate = 0; candidate < size; candidate++) {
+			if (candidate !== top && requireIndex(active, candidate) === 1) {
+				const score = requireIndex(similarity, top * size + candidate);
+				if (score > bestScore) {
+					bestScore = score;
+					best = candidate;
+				}
+			}
+		}
+
+		const secondFromTop = chain.length >= 2 ? chain.at(-2) : undefined;
+		if (best === secondFromTop) {
+			chain.pop();
+			chain.pop();
+
+			const survivor = Math.min(top, best);
+			const dead = Math.max(top, best);
+			for (let candidate = 0; candidate < size; candidate++) {
+				if (
+					candidate !== top &&
+					candidate !== best &&
+					requireIndex(active, candidate) === 1
+				) {
+					const merged = Math.min(
+						requireIndex(similarity, top * size + candidate),
+						requireIndex(similarity, best * size + candidate),
+					);
+					similarity[survivor * size + candidate] = merged;
+					similarity[candidate * size + survivor] = merged;
+				}
+			}
+
+			active[dead] = 0;
+			if (bestScore >= threshold - BOUNDARY_EPSILON) {
+				parent[find(parent, dead)] = find(parent, survivor);
+			}
+			activeCount--;
+		} else {
+			chain.push(best);
+		}
+	}
+
+	return Array.from({ length: size }, (_, index) => find(parent, index));
+}
+
+/**
+ * Resolves, within a single already-blocked group of pages (e.g. one key
+ * from {@link ./resolve-blocking-group-keys.js | resolveBlockingGroupKeys}),
+ * which pages share a structural template. Returns one cluster key per
+ * page, in the same order as `tokenSets`. Does not call
+ * {@link ./tokenize.js | tokenize} itself (callers pass pages already
+ * tokenized and turned into `Set`s, mirroring
+ * {@link ./compute-document-frequency.js | computeDocumentFrequency}'s
+ * contract) and does not orchestrate multiple blocks — a heterogeneous
+ * corpus should be split into blocks by the caller before reaching this
+ * function.
+ *
+ * MinHash/LSH-based approximation and medoid-based refinement of these
+ * clusters are intentionally out of scope: NN-chain already computes the
+ * exact complete-linkage clustering in O(n²), so there is no accuracy being
+ * traded away by not approximating, and no evidence yet that O(n²) is a
+ * real bottleneck at the block sizes this function actually sees.
+ * @param tokenSets One pre-tokenized token set per page of the block; an empty array yields an empty result.
+ * @param options Optional settings; `similarityThreshold` defaults to 0.8 and must lie in `[0, 1]`, otherwise a `RangeError` is thrown.
+ * @example
+ * ```ts
+ * resolveStructuralClusterKeys([
+ * 	new Set(['body>header', 'body>main>.card', 'body>footer']),
+ * 	new Set(['body>header', 'body>main>.card', 'body>footer']),
+ * 	new Set(['body>nav', 'body>main>form']),
+ * ]);
+ * // ['cluster:0', 'cluster:0', 'cluster:1']
+ * ```
+ */
+export function resolveStructuralClusterKeys(
+	tokenSets: readonly ReadonlySet<string>[],
+	options?: ResolveStructuralClusterKeysOptions,
+): string[] {
+	const similarityThreshold =
+		options?.similarityThreshold ?? DEFAULT_SIMILARITY_THRESHOLD;
+	if (!(similarityThreshold >= 0 && similarityThreshold <= 1)) {
+		throw new RangeError(
+			`resolveStructuralClusterKeys: similarityThreshold must be between 0 and 1, got ${similarityThreshold}`,
+		);
+	}
+
+	if (tokenSets.length === 0) {
+		return [];
+	}
+
+	const roots = clusterByCompleteLinkage(tokenSets, similarityThreshold);
+
+	const rootToLabel = new Map<number, string>();
+	return roots.map((root) => {
+		let label = rootToLabel.get(root);
+		if (label === undefined) {
+			label = `cluster:${rootToLabel.size}`;
+			rootToLabel.set(root, label);
+		}
+		return label;
+	});
+}