diff --git a/packages/@d-zero/page-cluster/src/compute-document-frequency.spec.ts b/packages/@d-zero/page-cluster/src/compute-document-frequency.spec.ts
new file mode 100644
index 00000000..1ab48252
--- /dev/null
+++ b/packages/@d-zero/page-cluster/src/compute-document-frequency.spec.ts
@@ -0,0 +1,51 @@
+import { describe, expect, test } from 'vitest';
+
+import { computeDocumentFrequency } from './compute-document-frequency.js';
+
+describe('computeDocumentFrequency', () => {
+	test('empty input returns an empty map and a page count of 0', () => {
+		expect(computeDocumentFrequency([])).toEqual({
+			documentFrequency: new Map(),
+			pageCount: 0,
+		});
+	});
+
+	test('pageCount reflects the number of token sets passed in', () => {
+		const sets = [new Set(['a']), new Set(['b']), new Set(['c'])];
+		expect(computeDocumentFrequency(sets).pageCount).toBe(3);
+	});
+
+	test('a token present in every set gets a count equal to the set count', () => {
+		const sets = [
+			new Set(['body>header>a']),
+			new Set(['body>header>a']),
+			new Set(['body>header>a']),
+		];
+		expect(computeDocumentFrequency(sets)).toEqual({
+			documentFrequency: new Map([['body>header>a', 3]]),
+			pageCount: 3,
+		});
+	});
+
+	test('a token present in only one set gets a count of 1', () => {
+		const sets = [new Set(['body>main>p']), new Set(['body>footer>span'])];
+		const { documentFrequency } = computeDocumentFrequency(sets);
+		expect(documentFrequency.get('body>main>p')).toBe(1);
+		expect(documentFrequency.get('body>footer>span')).toBe(1);
+	});
+
+	test('counts multiple distinct tokens independently across overlapping sets', () => {
+		const sets = [new Set(['a', 'b']), new Set(['b', 'c']), new Set(['b'])];
+		const { documentFrequency } = computeDocumentFrequency(sets);
+		expect(documentFrequency.get('a')).toBe(1);
+		expect(documentFrequency.get('b')).toBe(3);
+		expect(documentFrequency.get('c')).toBe(1);
+	});
+
+	test('a single empty set contributes no frequency entries but still counts as one page', () => {
+		expect(computeDocumentFrequency([new Set()])).toEqual({
+			documentFrequency: new Map(),
+			pageCount: 1,
+		});
+	});
+});
diff --git a/packages/@d-zero/page-cluster/src/compute-document-frequency.ts b/packages/@d-zero/page-cluster/src/compute-document-frequency.ts
new file mode 100644
index 00000000..8f5dc1dd
--- /dev/null
+++ b/packages/@d-zero/page-cluster/src/compute-document-frequency.ts
@@ -0,0 +1,47 @@
+import type { DocumentFrequency } from './types.js';
+
+/**
+ * Counts, for each token, how many of the given per-page token sets contain
+ * it. This is the first half of separating a page's shared site chrome
+ * (header/nav/footer) from its page-specific content: a token that recurs
+ * across nearly every page in `tokenSets` is chrome, one that shows up on
+ * only a handful of pages is content — see `splitTokensByFrequency`, which
+ * consumes this result to make that call per token.
+ *
+ * `tokenSets` must be a *homogeneous* page collection (typically one site,
+ * or one section of a large multi-template site), not an arbitrary pool.
+ * Real-data validation against a small single-layout corporate site (a few
+ * hundred pages) found a clean bimodal frequency split (site chrome tokens
+ * showed up on 95%+ of pages, content tokens on well under 50%, with
+ * nothing in between). The same computation against the *whole* crawl of a
+ * much larger site that turned out to be a federation of independent
+ * sub-sections (the largest covering under half of all pages) found no
+ * token crossing even a 50% document-frequency threshold: with no single
+ * dominant layout, frequency-based chrome detection needs a mostly-
+ * homogeneous input to work at all. Splitting such a site into its
+ * sections first (by URL path, or by a coarse structural clustering pass)
+ * and calling this function per section recovered the same clean bimodal
+ * split. Grouping heterogeneous pages before calling this function is the
+ * caller's responsibility; this function has no way to detect that its
+ * input mixes multiple layouts.
+ * @param tokenSets - One token set per page of a homogeneous page collection;
+ *   each page contributes at most 1 to a token's count.
+ * @returns The per-token page counts together with the number of token sets
+ *   (`pageCount`) they were computed from.
+ * @example
+ * ```ts
+ * computeDocumentFrequency([new Set(['body>header>a']), new Set(['body>header>a'])]);
+ * // { documentFrequency: Map { 'body>header>a' => 2 }, pageCount: 2 }
+ * ```
+ */
+export function computeDocumentFrequency(
+	tokenSets: readonly ReadonlySet<string>[],
+): DocumentFrequency {
+	const documentFrequency = new Map<string, number>();
+	for (const tokens of tokenSets) {
+		for (const token of tokens) {
+			documentFrequency.set(token, (documentFrequency.get(token) ?? 0) + 1);
+		}
+	}
+	return { documentFrequency, pageCount: tokenSets.length };
+}
diff --git a/packages/@d-zero/page-cluster/src/split-tokens-by-frequency.spec.ts b/packages/@d-zero/page-cluster/src/split-tokens-by-frequency.spec.ts
new file mode 100644
index 00000000..edc4107a
--- /dev/null
+++ b/packages/@d-zero/page-cluster/src/split-tokens-by-frequency.spec.ts
@@ -0,0 +1,134 @@
+import { describe, expect, test } from 'vitest';
+
+import { splitTokensByFrequency } from './split-tokens-by-frequency.js';
+
+describe('splitTokensByFrequency', () => {
+	test('a token at or above the frequency threshold is classified as template', () => {
+		const corpusFrequency = {
+			documentFrequency: new Map([['body>header>a', 9]]),
+			pageCount: 10,
+		};
+		const result = splitTokensByFrequency(
+			new Set(['body>header>a']),
+			corpusFrequency,
+			0.9,
+		);
+		expect(result.templateTokens).toEqual(new Set(['body>header>a']));
+		expect(result.contentTokens).toEqual(new Set());
+	});
+
+	test('a token below the frequency threshold is classified as content', () => {
+		const corpusFrequency = {
+			documentFrequency: new Map([['body>main>p', 1]]),
+			pageCount: 10,
+		};
+		const result = splitTokensByFrequency(new Set(['body>main>p']), corpusFrequency, 0.9);
+		expect(result.templateTokens).toEqual(new Set());
+		expect(result.contentTokens).toEqual(new Set(['body>main>p']));
+	});
+
+	test('a token missing from the document-frequency map is treated as frequency 0 (content)', () => {
+		const corpusFrequency = { documentFrequency: new Map(), pageCount: 10 };
+		const result = splitTokensByFrequency(
+			new Set(['body>main>unseen']),
+			corpusFrequency,
+			0.9,
+		);
+		expect(result.contentTokens).toEqual(new Set(['body>main>unseen']));
+		expect(result.templateTokens).toEqual(new Set());
+	});
+
+	test('frequency exactly at the threshold counts as template (inclusive boundary)', () => {
+		const corpusFrequency = { documentFrequency: new Map([['exact', 9]]), pageCount: 10 };
+		const result = splitTokensByFrequency(new Set(['exact']), corpusFrequency, 0.9);
+		expect(result.templateTokens).toEqual(new Set(['exact']));
+	});
+
+	test('frequency exactly at a threshold that overshoots by floating-point error still counts as template', () => {
+		// 0.55 * 100 === 55.00000000000001 in IEEE-754, which would fail a naive
+		// `frequency >= threshold * pageCount` check for a token at exactly the
+		// documented inclusive boundary
+		const corpusFrequency = {
+			documentFrequency: new Map([['boundary', 55]]),
+			pageCount: 100,
+		};
+		const result = splitTokensByFrequency(new Set(['boundary']), corpusFrequency, 0.55);
+		expect(result.templateTokens).toEqual(new Set(['boundary']));
+	});
+
+	test('default threshold is 0.9 when omitted', () => {
+		const corpusFrequency = {
+			documentFrequency: new Map([['just-below-default', 8]]),
+			pageCount: 10,
+		};
+		const result = splitTokensByFrequency(
+			new Set(['just-below-default']),
+			corpusFrequency,
+		);
+		expect(result.contentTokens).toEqual(new Set(['just-below-default']));
+		expect(result.templateTokens).toEqual(new Set());
+	});
+
+	test('a custom threshold is honored', () => {
+		const corpusFrequency = { documentFrequency: new Map([['half', 5]]), pageCount: 10 };
+		expect(
+			splitTokensByFrequency(new Set(['half']), corpusFrequency, 0.5).templateTokens,
+		).toEqual(new Set(['half']));
+		expect(
+			splitTokensByFrequency(new Set(['half']), corpusFrequency, 0.6).contentTokens,
+		).toEqual(new Set(['half']));
+	});
+
+	test('an empty token set returns two empty sets', () => {
+		const corpusFrequency = { documentFrequency: new Map(), pageCount: 10 };
+		const result = splitTokensByFrequency(new Set(), corpusFrequency);
+		expect(result.templateTokens).toEqual(new Set());
+		expect(result.contentTokens).toEqual(new Set());
+	});
+
+	test('a zero-page corpus classifies every token as content (no evidence of repetition)', () => {
+		const corpusFrequency = { documentFrequency: new Map(), pageCount: 0 };
+		const result = splitTokensByFrequency(new Set(['anything']), corpusFrequency);
+		expect(result.contentTokens).toEqual(new Set(['anything']));
+		expect(result.templateTokens).toEqual(new Set());
+	});
+
+	test('splits a mixed set of high- and low-frequency tokens correctly', () => {
+		const corpusFrequency = {
+			documentFrequency: new Map([
+				['chrome-a', 10],
+				['chrome-b', 9],
+				['content-a', 2],
+				['content-b', 1],
+			]),
+			pageCount: 10,
+		};
+		const result = splitTokensByFrequency(
+			new Set(['chrome-a', 'chrome-b', 'content-a', 'content-b']),
+			corpusFrequency,
+			0.9,
+		);
+		expect(result.templateTokens).toEqual(new Set(['chrome-a', 'chrome-b']));
+		expect(result.contentTokens).toEqual(new Set(['content-a', 'content-b']));
+	});
+
+	test.each([0, -0.5, 1.5, 90, Number.NaN])(
+		'rejects an out-of-range threshold (%s)',
+		(threshold) => {
+			const corpusFrequency = { documentFrequency: new Map(), pageCount: 10 };
+			expect(() =>
+				splitTokensByFrequency(new Set(['a']), corpusFrequency, threshold),
+			).toThrow(RangeError);
+		},
+	);
+
+	test('accepts a threshold of exactly 1 (upper boundary is inclusive)', () => {
+		const corpusFrequency = {
+			documentFrequency: new Map([['always', 10]]),
+			pageCount: 10,
+		};
+		expect(() =>
+			splitTokensByFrequency(new Set(['always']), corpusFrequency, 1),
+		).not.toThrow();
+	});
+});
diff --git a/packages/@d-zero/page-cluster/src/split-tokens-by-frequency.ts b/packages/@d-zero/page-cluster/src/split-tokens-by-frequency.ts
new file mode 100644
index 00000000..6954b725
--- /dev/null
+++ b/packages/@d-zero/page-cluster/src/split-tokens-by-frequency.ts
@@ -0,0 +1,109 @@
+import type { DocumentFrequency } from './types.js';
+
+/**
+ * Default document-frequency cutoff: a token present in at least 90% of the
+ * pages passed to `computeDocumentFrequency` is treated as site chrome.
+ * Real-data validation against a small single-layout corporate site (a few
+ * hundred pages) found the corpus's tokens cleanly bimodal — the same set
+ * of chrome tokens was identified whether the cutoff was set anywhere from
+ * 50% to 95% — so the exact value is not sensitive within that range for a
+ * homogeneous corpus. 90% was picked as a value comfortably inside that
+ * stable range rather than at either edge.
+ */
+const DEFAULT_TEMPLATE_FREQUENCY_THRESHOLD = 0.9;
+
+/**
+ * `threshold * pageCount` is a floating-point product and can overshoot the
+ * intended integer boundary (verified: `0.55 * 100 === 55.00000000000001`
+ * in JS), which would otherwise make a token at the exact documented
+ * inclusive boundary (`frequency === threshold * pageCount`) fail the
+ * `>=` check it should pass. Subtracting this epsilon before comparing
+ * absorbs that rounding noise without being large enough to affect any
+ * genuinely-below-threshold token (frequencies are integers, so the true
+ * gap between "at the boundary" and "one below it" is always >= 1).
+ */
+const BOUNDARY_EPSILON = 1e-9;
+
+/**
+ * Splits one page's tokens into "template" (site chrome: header/nav/footer,
+ * or any other structure repeated across most of the corpus) and "content"
+ * (page-specific structure), using each token's document frequency from
+ * `computeDocumentFrequency`. Comparing these two groups separately with
+ * `jaccardSimilarity()` — rather than the page's full token set at once —
+ * is what fixes two failures a single flat Jaccard has: common chrome
+ * diluting genuine content differences at loose similarity thresholds, and
+ * page-specific content differences (e.g. a freeform CMS block editor page,
+ * where the exact block mix varies per page) swamping a real *layout*
+ * match. See `computeDocumentFrequency`'s JSDoc for why `corpusFrequency`
+ * must come from a homogeneous page collection for this split to work.
+ *
+ * `corpusFrequency` bundles `documentFrequency` with the `pageCount` it was
+ * computed from (rather than taking `pageCount` as a separate argument) so
+ * the two can never be passed out of sync — e.g. a caller re-slicing or
+ * filtering the page list after computing frequencies but before using
+ * them, which would otherwise silently produce a wrong cutoff with no error
+ * raised anywhere.
+ *
+ * A token absent from `documentFrequency` is treated as frequency 0 (i.e.
+ * content): it never appeared in the corpus the frequency map was built
+ * from, so it cannot be corpus-wide chrome. If `pageCount` is 0 (empty
+ * corpus), every token is classified as content for the same reason: with
+ * no pages to have observed repetition across, nothing can be confirmed as
+ * chrome.
+ *
+ * `threshold` must be a fraction in `(0, 1]`, not a percentage — passing
+ * `90` instead of `0.9` would make the cutoff exceed every possible
+ * frequency and misclassify even universal chrome as content, so this is
+ * validated eagerly rather than left to fail silently downstream.
+ * @param tokens - The tokens of the single page to classify.
+ * @param corpusFrequency - Document frequencies and page count, as returned by
+ *   `computeDocumentFrequency`.
+ * @param threshold - Minimum fraction of pages, in `(0, 1]`, a token must
+ *   appear on to count as template. Defaults to 0.9.
+ * @returns Two new sets that together hold every token of `tokens`:
+ *   `templateTokens` (frequency at or above the cutoff) and `contentTokens`
+ *   (everything else).
+ * @throws {RangeError} If `threshold` is not in `(0, 1]` (including `NaN`).
+ * @example
+ * ```ts
+ * const corpusFrequency = computeDocumentFrequency(allPagesTokenSets);
+ * splitTokensByFrequency(pageTokens, corpusFrequency);
+ * // { templateTokens: Set(...), contentTokens: Set(...) }
+ * ```
+ */
+export function splitTokensByFrequency(
+	tokens: ReadonlySet<string>,
+	corpusFrequency: DocumentFrequency,
+	threshold: number = DEFAULT_TEMPLATE_FREQUENCY_THRESHOLD,
+): { templateTokens: Set<string>; contentTokens: Set<string> } {
+	if (!(threshold > 0 && threshold <= 1)) {
+		throw new RangeError(
+			`splitTokensByFrequency: threshold must be a fraction in (0, 1], got ${threshold}`,
+		);
+	}
+
+	const { documentFrequency, pageCount } = corpusFrequency;
+	const templateTokens = new Set<string>();
+	const contentTokens = new Set<string>();
+
+	// With no pages the cutoff below would be negative (0 - epsilon), so every
+	// token (frequency 0) would wrongly pass `>=`; classify all as content.
+	if (pageCount === 0) {
+		for (const token of tokens) {
+			contentTokens.add(token);
+		}
+		return { templateTokens, contentTokens };
+	}
+
+	const cutoff = threshold * pageCount - BOUNDARY_EPSILON;
+	for (const token of tokens) {
+		const frequency = documentFrequency.get(token) ?? 0;
+		if (frequency >= cutoff) {
+			templateTokens.add(token);
+		} else {
+			contentTokens.add(token);
+		}
+	}
+
+	return { templateTokens, contentTokens };
+}
diff --git a/packages/@d-zero/page-cluster/src/types.ts b/packages/@d-zero/page-cluster/src/types.ts
index df3b8cf9..e1fe903e 100644
--- a/packages/@d-zero/page-cluster/src/types.ts
+++ b/packages/@d-zero/page-cluster/src/types.ts
@@ -38,6 +38,20 @@ export type Frame = {
 	pendingPaths: string[];
 };
 
+/**
+ * Result of {@link ../compute-document-frequency.js | computeDocumentFrequency}: how many pages (out of `pageCount`) contain each token.
+ *
+ * `pageCount` travels bundled with `documentFrequency` rather than being a
+ * separate argument at call sites that consume it (e.g.
+ * `splitTokensByFrequency`), so the two can never be passed out of sync with
+ * each other (e.g. a caller re-slicing the page list after computing
+ * frequencies but before using them).
+ */
+export type DocumentFrequency = {
+	documentFrequency: ReadonlyMap<string, number>;
+	pageCount: number;
+};
+
 /**
  * Tags whose contents are hashed instead of being tokenized further.
  */