From 68acfe840a8e38b506972f65fb51c726385b4541 Mon Sep 17 00:00:00 2001 From: Yusuke Hirao Date: Fri, 3 Jul 2026 18:09:37 +0900 Subject: [PATCH] feat(page-cluster): add frequency-based template/content token split Add computeDocumentFrequency() and splitTokensByFrequency(), a preprocessing layer that separates a page's shared site chrome (header/nav/footer) from its page-specific content by document frequency, before either half is compared with jaccardSimilarity(). A single flat Jaccard over a page's full token set has two failure modes: common chrome dilutes genuine content differences at loose similarity thresholds, and page-specific content variation (e.g. a freeform CMS block-editor page, where the exact block mix differs per page) swamps a real layout match. Splitting first and comparing each axis separately fixes both. Validated against two real crawls: a small single-layout corporate site (a few hundred pages) showed a clean bimodal frequency split stable across a wide threshold range; a much larger site that turned out to be a federation of independent sub-sections (no single section covering even half the pages) showed the split requires a homogeneous input, and recovers cleanly once scoped to one section. code-review (xhigh) surfaced 9 findings, all in the frequency-cutoff comparison: unvalidated threshold allowing degenerate cutoffs (0, NaN, or a percentage instead of a fraction), floating-point rounding at the documented inclusive boundary, and pageCount being passable out of sync with the documentFrequency it was computed from. Fixed by validating threshold eagerly, applying an epsilon tolerance to the boundary comparison, and bundling pageCount with documentFrequency into one DocumentFrequency result so they cannot be passed independently. 
--- cspell.json | 3 + .../src/compute-document-frequency.spec.ts | 51 +++++++ .../src/compute-document-frequency.ts | 44 ++++++ .../src/split-tokens-by-frequency.spec.ts | 134 ++++++++++++++++++ .../src/split-tokens-by-frequency.ts | 101 +++++++++++++ packages/@d-zero/page-cluster/src/types.ts | 14 ++ 6 files changed, 347 insertions(+) create mode 100644 packages/@d-zero/page-cluster/src/compute-document-frequency.spec.ts create mode 100644 packages/@d-zero/page-cluster/src/compute-document-frequency.ts create mode 100644 packages/@d-zero/page-cluster/src/split-tokens-by-frequency.spec.ts create mode 100644 packages/@d-zero/page-cluster/src/split-tokens-by-frequency.ts diff --git a/cspell.json b/cspell.json index 96b036d6..89d0ef91 100644 --- a/cspell.json +++ b/cspell.json @@ -10,6 +10,9 @@ "packages/@d-zero/page-cluster/src/__fixtures__/production-scale/**" ], "words": [ + // page-cluster clustering/distance terminology + "jaccard", + // "gaxios", "pngjs", diff --git a/packages/@d-zero/page-cluster/src/compute-document-frequency.spec.ts b/packages/@d-zero/page-cluster/src/compute-document-frequency.spec.ts new file mode 100644 index 00000000..1ab48252 --- /dev/null +++ b/packages/@d-zero/page-cluster/src/compute-document-frequency.spec.ts @@ -0,0 +1,51 @@ +import { describe, expect, test } from 'vitest'; + +import { computeDocumentFrequency } from './compute-document-frequency.js'; + +describe('computeDocumentFrequency', () => { + test('empty input returns an empty map and a page count of 0', () => { + expect(computeDocumentFrequency([])).toEqual({ + documentFrequency: new Map(), + pageCount: 0, + }); + }); + + test('pageCount reflects the number of token sets passed in', () => { + const sets = [new Set(['a']), new Set(['b']), new Set(['c'])]; + expect(computeDocumentFrequency(sets).pageCount).toBe(3); + }); + + test('a token present in every set gets a count equal to the set count', () => { + const sets = [ + new Set(['body>header>a']), + new 
Set(['body>header>a']), + new Set(['body>header>a']), + ]; + expect(computeDocumentFrequency(sets)).toEqual({ + documentFrequency: new Map([['body>header>a', 3]]), + pageCount: 3, + }); + }); + + test('a token present in only one set gets a count of 1', () => { + const sets = [new Set(['body>main>p']), new Set(['body>footer>span'])]; + const { documentFrequency } = computeDocumentFrequency(sets); + expect(documentFrequency.get('body>main>p')).toBe(1); + expect(documentFrequency.get('body>footer>span')).toBe(1); + }); + + test('counts multiple distinct tokens independently across overlapping sets', () => { + const sets = [new Set(['a', 'b']), new Set(['b', 'c']), new Set(['b'])]; + const { documentFrequency } = computeDocumentFrequency(sets); + expect(documentFrequency.get('a')).toBe(1); + expect(documentFrequency.get('b')).toBe(3); + expect(documentFrequency.get('c')).toBe(1); + }); + + test('a single empty set contributes no frequency entries but still counts as one page', () => { + expect(computeDocumentFrequency([new Set()])).toEqual({ + documentFrequency: new Map(), + pageCount: 1, + }); + }); +}); diff --git a/packages/@d-zero/page-cluster/src/compute-document-frequency.ts b/packages/@d-zero/page-cluster/src/compute-document-frequency.ts new file mode 100644 index 00000000..8f5dc1dd --- /dev/null +++ b/packages/@d-zero/page-cluster/src/compute-document-frequency.ts @@ -0,0 +1,44 @@ +import type { DocumentFrequency } from './types.js'; + +/** + * Counts, for each token, how many of the given per-page token sets contain + * it. This is the first half of separating a page's shared site chrome + * (header/nav/footer) from its page-specific content: a token that recurs + * across nearly every page in `tokenSets` is chrome, one that shows up on + * only a handful of pages is content — see `splitTokensByFrequency`, which + * consumes this result to make that call per token. 
/**
 * Counts, for each token, how many of the given per-page token sets contain
 * it (its document frequency), and reports how many pages were counted.
 *
 * This is the first half of separating a page's shared site chrome
 * (header/nav/footer) from its page-specific content: a token that recurs
 * across nearly every page in `tokenSets` is chrome, one that shows up on
 * only a handful of pages is content. `splitTokensByFrequency` consumes this
 * result to make that call per token.
 *
 * `tokenSets` must be a *homogeneous* page collection (typically one site,
 * or one section of a large multi-template site), not an arbitrary pool.
 * Real-data validation against a small single-layout corporate site (a few
 * hundred pages) found a clean bimodal frequency split: site chrome tokens
 * showed up on 95%+ of pages, content tokens on well under 50%, with nothing
 * in between. The same computation against the *whole* crawl of a much
 * larger site that turned out to be a federation of independent
 * sub-sections (the largest covering under half of all pages) found no
 * token crossing even a 50% document-frequency threshold. Splitting such a
 * site into its sections first (by URL path, or by a coarse structural
 * clustering pass) and calling this function per section recovered the same
 * clean bimodal split. Grouping heterogeneous pages is the caller's
 * responsibility; this function has no way to detect that its input mixes
 * multiple layouts.
 *
 * @param tokenSets - One token set per page. Because each page is a set, a
 *   page contributes at most 1 to any token's count.
 * @returns The per-token page counts bundled with `pageCount`, the number of
 *   sets passed in (empty sets still count as a page).
 * @example
 * ```ts
 * computeDocumentFrequency([new Set(['body>header>a']), new Set(['body>header>a'])]);
 * // { documentFrequency: Map { 'body>header>a' => 2 }, pageCount: 2 }
 * ```
 */
export function computeDocumentFrequency(
	tokenSets: readonly ReadonlySet<string>[],
): DocumentFrequency {
	// Explicit type arguments: a bare `new Map()` is `Map<any, any>`, which
	// would let a wrongly-typed key or count through unchecked.
	const documentFrequency = new Map<string, number>();
	for (const tokens of tokenSets) {
		for (const token of tokens) {
			documentFrequency.set(token, (documentFrequency.get(token) ?? 0) + 1);
		}
	}
	return { documentFrequency, pageCount: tokenSets.length };
}
splitTokensByFrequency(new Set(['exact']), corpusFrequency, 0.9); + expect(result.templateTokens).toEqual(new Set(['exact'])); + }); + + test('frequency exactly at a threshold that overshoots by floating-point error still counts as template', () => { + // 0.55 * 100 === 55.00000000000001 in IEEE-754, which would fail a naive + // `frequency >= threshold * pageCount` check for a token at exactly the + // documented inclusive boundary + const corpusFrequency = { + documentFrequency: new Map([['boundary', 55]]), + pageCount: 100, + }; + const result = splitTokensByFrequency(new Set(['boundary']), corpusFrequency, 0.55); + expect(result.templateTokens).toEqual(new Set(['boundary'])); + }); + + test('default threshold is 0.9 when omitted', () => { + const corpusFrequency = { + documentFrequency: new Map([['just-below-default', 8]]), + pageCount: 10, + }; + const result = splitTokensByFrequency( + new Set(['just-below-default']), + corpusFrequency, + ); + expect(result.contentTokens).toEqual(new Set(['just-below-default'])); + expect(result.templateTokens).toEqual(new Set()); + }); + + test('a custom threshold is honored', () => { + const corpusFrequency = { documentFrequency: new Map([['half', 5]]), pageCount: 10 }; + expect( + splitTokensByFrequency(new Set(['half']), corpusFrequency, 0.5).templateTokens, + ).toEqual(new Set(['half'])); + expect( + splitTokensByFrequency(new Set(['half']), corpusFrequency, 0.6).contentTokens, + ).toEqual(new Set(['half'])); + }); + + test('an empty token set returns two empty sets', () => { + const corpusFrequency = { documentFrequency: new Map(), pageCount: 10 }; + const result = splitTokensByFrequency(new Set(), corpusFrequency); + expect(result.templateTokens).toEqual(new Set()); + expect(result.contentTokens).toEqual(new Set()); + }); + + test('a zero-page corpus classifies every token as content (no evidence of repetition)', () => { + const corpusFrequency = { documentFrequency: new Map(), pageCount: 0 }; + const result = 
splitTokensByFrequency(new Set(['anything']), corpusFrequency); + expect(result.contentTokens).toEqual(new Set(['anything'])); + expect(result.templateTokens).toEqual(new Set()); + }); + + test('splits a mixed set of high- and low-frequency tokens correctly', () => { + const corpusFrequency = { + documentFrequency: new Map([ + ['chrome-a', 10], + ['chrome-b', 9], + ['content-a', 2], + ['content-b', 1], + ]), + pageCount: 10, + }; + const result = splitTokensByFrequency( + new Set(['chrome-a', 'chrome-b', 'content-a', 'content-b']), + corpusFrequency, + 0.9, + ); + expect(result.templateTokens).toEqual(new Set(['chrome-a', 'chrome-b'])); + expect(result.contentTokens).toEqual(new Set(['content-a', 'content-b'])); + }); + + test.each([0, -0.5, 1.5, 90, Number.NaN])( + 'rejects an out-of-range threshold (%s)', + (threshold) => { + const corpusFrequency = { documentFrequency: new Map(), pageCount: 10 }; + expect(() => + splitTokensByFrequency(new Set(['a']), corpusFrequency, threshold), + ).toThrow(RangeError); + }, + ); + + test('accepts a threshold of exactly 1 (upper boundary is inclusive)', () => { + const corpusFrequency = { + documentFrequency: new Map([['always', 10]]), + pageCount: 10, + }; + expect(() => + splitTokensByFrequency(new Set(['always']), corpusFrequency, 1), + ).not.toThrow(); + }); +}); diff --git a/packages/@d-zero/page-cluster/src/split-tokens-by-frequency.ts b/packages/@d-zero/page-cluster/src/split-tokens-by-frequency.ts new file mode 100644 index 00000000..6954b725 --- /dev/null +++ b/packages/@d-zero/page-cluster/src/split-tokens-by-frequency.ts @@ -0,0 +1,101 @@ +import type { DocumentFrequency } from './types.js'; + +/** + * Default document-frequency cutoff: a token present in at least 90% of the + * pages passed to `computeDocumentFrequency` is treated as site chrome. 
/**
 * Default document-frequency cutoff: a token present in at least 90% of the
 * pages passed to `computeDocumentFrequency` is treated as site chrome.
 * Real-data validation against a small single-layout corporate site (a few
 * hundred pages) found the corpus's tokens cleanly bimodal: the same set of
 * chrome tokens was identified whether the cutoff was set anywhere from 50%
 * to 95%, so the exact value is not sensitive within that range for a
 * homogeneous corpus. 90% was picked as a value comfortably inside that
 * stable range rather than at either edge.
 */
const DEFAULT_TEMPLATE_FREQUENCY_THRESHOLD = 0.9;

/**
 * Minimum tolerance subtracted from `threshold * pageCount` before the
 * inclusive `>=` comparison. The product is a floating-point value and can
 * overshoot the intended integer boundary (`0.55 * 100` evaluates to
 * `55.00000000000001` in JS), which would otherwise make a token sitting
 * exactly on the documented boundary fail the check. Frequencies are
 * integers, so a tolerance far below 1 cannot promote a token that is a
 * whole page short of the boundary.
 *
 * This is only the floor of the tolerance: see the cutoff computation in
 * `splitTokensByFrequency` for why it also scales with the product.
 */
const BOUNDARY_EPSILON = 1e-9;

/**
 * Splits one page's tokens into "template" (site chrome: header/nav/footer,
 * or any other structure repeated across most of the corpus) and "content"
 * (page-specific structure), using each token's document frequency from
 * `computeDocumentFrequency`. Comparing the two groups separately with
 * `jaccardSimilarity()`, rather than the page's full token set at once,
 * addresses two failures of a single flat Jaccard: common chrome diluting
 * genuine content differences at loose similarity thresholds, and
 * page-specific content differences (e.g. a freeform CMS block editor page,
 * where the exact block mix varies per page) swamping a real *layout* match.
 * `corpusFrequency` must come from a homogeneous page collection for the
 * split to be meaningful.
 *
 * `corpusFrequency` bundles `documentFrequency` with the `pageCount` it was
 * computed from (rather than taking `pageCount` as a separate argument) so
 * the two cannot be passed out of sync, e.g. by a caller re-slicing or
 * filtering the page list after computing frequencies.
 *
 * A token absent from `documentFrequency` is treated as frequency 0 (i.e.
 * content): it never appeared in the corpus the map was built from, so it
 * cannot be corpus-wide chrome. If `pageCount` is 0 (empty corpus), every
 * token is classified as content for the same reason.
 *
 * @param tokens - The token set of the single page to split.
 * @param corpusFrequency - Result of `computeDocumentFrequency` for the
 *   corpus this page is compared against.
 * @param threshold - Fraction in `(0, 1]` of pages a token must appear on to
 *   count as template. It is a fraction, not a percentage: `90` instead of
 *   `0.9` would put the cutoff above every possible frequency, so it is
 *   rejected eagerly instead of silently misclassifying everything.
 * @returns Two fresh, disjoint sets that together contain every input token.
 * @throws RangeError if `threshold` is not in `(0, 1]` (this includes `NaN`).
 * @example
 * ```ts
 * const corpusFrequency = computeDocumentFrequency(allPagesTokenSets);
 * splitTokensByFrequency(pageTokens, corpusFrequency);
 * // { templateTokens: Set(...), contentTokens: Set(...) }
 * ```
 */
export function splitTokensByFrequency(
	tokens: ReadonlySet<string>,
	corpusFrequency: DocumentFrequency,
	threshold: number = DEFAULT_TEMPLATE_FREQUENCY_THRESHOLD,
): { templateTokens: Set<string>; contentTokens: Set<string> } {
	// Written as a negated range check so that NaN (for which every
	// comparison is false) is rejected too.
	if (!(threshold > 0 && threshold <= 1)) {
		throw new RangeError(
			`splitTokensByFrequency: threshold must be a fraction in (0, 1], got ${threshold}`,
		);
	}

	const { documentFrequency, pageCount } = corpusFrequency;
	const templateTokens = new Set<string>();
	const contentTokens = new Set<string>();

	// Empty corpus: no repetition was ever observed, so nothing is chrome.
	if (pageCount === 0) {
		for (const token of tokens) {
			contentTokens.add(token);
		}
		return { templateTokens, contentTokens };
	}

	// Floating-point error in the product is relative, not absolute: near
	// 5.5e7 adjacent doubles are ~7.5e-9 apart, so a fixed 1e-9 no longer
	// absorbs a one-step overshoot (0.55 * 1e8 lands just above 55,000,000).
	// Scale the tolerance with the product, keeping BOUNDARY_EPSILON as the
	// floor so results for small corpora are unchanged. The scaled term stays
	// far below 1 for any realistic page count, so it still cannot promote a
	// token that is a whole page short of the boundary.
	const product = threshold * pageCount;
	const tolerance = Math.max(BOUNDARY_EPSILON, product * Number.EPSILON * 4);
	const cutoff = product - tolerance;

	for (const token of tokens) {
		const frequency = documentFrequency.get(token) ?? 0;
		if (frequency >= cutoff) {
			templateTokens.add(token);
		} else {
			contentTokens.add(token);
		}
	}

	return { templateTokens, contentTokens };
}