d-zero-dev · YusukeHirao · Jul 3, 2026 · Jul 3, 2026 · Jul 3, 2026
@@ -0,0 +1,51 @@
+import { describe, expect, test } from 'vitest';
+
+import { computeDocumentFrequency } from './compute-document-frequency.js';
+
+describe('computeDocumentFrequency', () => {
+	test('empty input returns an empty map and a page count of 0', () => {
+		const result = computeDocumentFrequency([]);
+		expect(result).toEqual({ documentFrequency: new Map(), pageCount: 0 });
+	});
+
+	test('pageCount reflects the number of token sets passed in', () => {
+		// Three single-token pages with nothing in common
+		const { pageCount } = computeDocumentFrequency([
+			new Set(['a']),
+			new Set(['b']),
+			new Set(['c']),
+		]);
+		expect(pageCount).toBe(3);
+	});
+
+	test('a token present in every set gets a count equal to the set count', () => {
+		const sharedToken = 'body>header>a';
+		const pages = [
+			new Set([sharedToken]),
+			new Set([sharedToken]),
+			new Set([sharedToken]),
+		];
+		const expected = {
+			documentFrequency: new Map([[sharedToken, 3]]),
+			pageCount: 3,
+		};
+		expect(computeDocumentFrequency(pages)).toEqual(expected);
+	});
+
+	test('a token present in only one set gets a count of 1', () => {
+		const result = computeDocumentFrequency([
+			new Set(['body>main>p']),
+			new Set(['body>footer>span']),
+		]);
+		expect(result.documentFrequency.get('body>main>p')).toBe(1);
+		expect(result.documentFrequency.get('body>footer>span')).toBe(1);
+	});
+
+	test('counts multiple distinct tokens independently across overlapping sets', () => {
+		// 'b' is on all three pages; 'a' and 'c' are on one page each
+		const result = computeDocumentFrequency([
+			new Set(['a', 'b']),
+			new Set(['b', 'c']),
+			new Set(['b']),
+		]);
+		const counts = result.documentFrequency;
+		expect(counts.get('a')).toBe(1);
+		expect(counts.get('b')).toBe(3);
+		expect(counts.get('c')).toBe(1);
+	});
+
+	test('a single empty set contributes no frequency entries but still counts as one page', () => {
+		const emptyPage = new Set<string>();
+		const result = computeDocumentFrequency([emptyPage]);
+		expect(result).toEqual({ documentFrequency: new Map(), pageCount: 1 });
+	});
+});
@@ -0,0 +1,44 @@
+import type { DocumentFrequency } from './types.js';
+
+/**
+ * Builds a document-frequency table: for every token, the number of
+ * per-page token sets in `tokenSets` that contain it.
+ *
+ * This is step one of telling shared site chrome (header/nav/footer) apart
+ * from page-specific content. A token that recurs on nearly every page is
+ * chrome; a token seen on only a few pages is content. The per-token
+ * decision itself is made by `splitTokensByFrequency`, which consumes the
+ * value returned here.
+ *
+ * The input must be a *homogeneous* page collection (typically one site,
+ * or one section of a large multi-template site), not an arbitrary pool:
+ *
+ * - Validation against a small single-layout corporate site (a few hundred
+ *   pages) showed a clean bimodal split: chrome tokens on 95%+ of pages,
+ *   content tokens on well under 50%, nothing in between.
+ * - Running the same computation over the *whole* crawl of a much larger
+ *   site, which turned out to be a federation of independent sub-sections
+ *   (the largest covering under half of all pages), found no token crossing
+ *   even a 50% document-frequency threshold. Without a single dominant
+ *   layout, frequency-based chrome detection has nothing to latch onto.
+ * - Splitting that site into its sections first (by URL path, or by a
+ *   coarse structural clustering pass) and calling this function once per
+ *   section recovered the same clean bimodal split.
+ *
+ * Grouping heterogeneous pages beforehand is the caller's responsibility;
+ * this function cannot detect that its input mixes multiple layouts.
+ * @param tokenSets One token set per page. Each set contributes at most 1
+ *   to any token's count, and every set (including an empty one) counts as
+ *   one page.
+ * @returns The per-token page counts together with the number of sets that
+ *   were passed in (`pageCount`).
+ * @example
+ * ```ts
+ * computeDocumentFrequency([new Set(['body>header>a']), new Set(['body>header>a'])]);
+ * // { documentFrequency: Map { 'body>header>a' => 2 }, pageCount: 2 }
+ * ```
+ */
+export function computeDocumentFrequency(
+	tokenSets: readonly ReadonlySet<string>[],
+): DocumentFrequency {
+	const pagesPerToken = new Map<string, number>();
+
+	for (const pageTokens of tokenSets) {
+		// A Set yields each token once, so one page adds at most 1 per token.
+		pageTokens.forEach((token) => {
+			const seenSoFar = pagesPerToken.get(token) ?? 0;
+			pagesPerToken.set(token, seenSoFar + 1);
+		});
+	}
+
+	const pageCount = tokenSets.length;
+	return { documentFrequency: pagesPerToken, pageCount };
+}
@@ -0,0 +1,134 @@
+import { describe, expect, test } from 'vitest';
+
+import { splitTokensByFrequency } from './split-tokens-by-frequency.js';
+
+describe('splitTokensByFrequency', () => {
+	/**
+	 * Builds a corpus-frequency fixture from a page count and optional
+	 * `[token, pagesContainingIt]` pairs.
+	 */
+	const corpusOf = (
+		pageCount: number,
+		counts: readonly (readonly [string, number])[] = [],
+	) => ({
+		documentFrequency: new Map<string, number>(counts),
+		pageCount,
+	});
+
+	test('a token at or above the frequency threshold is classified as template', () => {
+		const corpus = corpusOf(10, [['body>header>a', 9]]);
+		const { templateTokens, contentTokens } = splitTokensByFrequency(
+			new Set(['body>header>a']),
+			corpus,
+			0.9,
+		);
+		expect(templateTokens).toEqual(new Set(['body>header>a']));
+		expect(contentTokens).toEqual(new Set());
+	});
+
+	test('a token below the frequency threshold is classified as content', () => {
+		const corpus = corpusOf(10, [['body>main>p', 1]]);
+		const { templateTokens, contentTokens } = splitTokensByFrequency(
+			new Set(['body>main>p']),
+			corpus,
+			0.9,
+		);
+		expect(templateTokens).toEqual(new Set());
+		expect(contentTokens).toEqual(new Set(['body>main>p']));
+	});
+
+	test('a token missing from the document-frequency map is treated as frequency 0 (content)', () => {
+		const corpus = corpusOf(10);
+		const { templateTokens, contentTokens } = splitTokensByFrequency(
+			new Set(['body>main>unseen']),
+			corpus,
+			0.9,
+		);
+		expect(contentTokens).toEqual(new Set(['body>main>unseen']));
+		expect(templateTokens).toEqual(new Set());
+	});
+
+	test('frequency exactly at the threshold counts as template (inclusive boundary)', () => {
+		const corpus = corpusOf(10, [['exact', 9]]);
+		const { templateTokens } = splitTokensByFrequency(new Set(['exact']), corpus, 0.9);
+		expect(templateTokens).toEqual(new Set(['exact']));
+	});
+
+	test('frequency exactly at a threshold that overshoots by floating-point error still counts as template', () => {
+		// In IEEE-754, 0.55 * 100 evaluates to 55.00000000000001, so a naive
+		// `frequency >= threshold * pageCount` comparison would reject a token
+		// sitting exactly on the documented inclusive boundary
+		const corpus = corpusOf(100, [['boundary', 55]]);
+		const { templateTokens } = splitTokensByFrequency(
+			new Set(['boundary']),
+			corpus,
+			0.55,
+		);
+		expect(templateTokens).toEqual(new Set(['boundary']));
+	});
+
+	test('default threshold is 0.9 when omitted', () => {
+		// 8 of 10 pages is just under the 0.9 default
+		const corpus = corpusOf(10, [['just-below-default', 8]]);
+		const { templateTokens, contentTokens } = splitTokensByFrequency(
+			new Set(['just-below-default']),
+			corpus,
+		);
+		expect(contentTokens).toEqual(new Set(['just-below-default']));
+		expect(templateTokens).toEqual(new Set());
+	});
+
+	test('a custom threshold is honored', () => {
+		const corpus = corpusOf(10, [['half', 5]]);
+
+		const atHalf = splitTokensByFrequency(new Set(['half']), corpus, 0.5);
+		expect(atHalf.templateTokens).toEqual(new Set(['half']));
+
+		const aboveHalf = splitTokensByFrequency(new Set(['half']), corpus, 0.6);
+		expect(aboveHalf.contentTokens).toEqual(new Set(['half']));
+	});
+
+	test('an empty token set returns two empty sets', () => {
+		const { templateTokens, contentTokens } = splitTokensByFrequency(
+			new Set(),
+			corpusOf(10),
+		);
+		expect(templateTokens).toEqual(new Set());
+		expect(contentTokens).toEqual(new Set());
+	});
+
+	test('a zero-page corpus classifies every token as content (no evidence of repetition)', () => {
+		const { templateTokens, contentTokens } = splitTokensByFrequency(
+			new Set(['anything']),
+			corpusOf(0),
+		);
+		expect(contentTokens).toEqual(new Set(['anything']));
+		expect(templateTokens).toEqual(new Set());
+	});
+
+	test('splits a mixed set of high- and low-frequency tokens correctly', () => {
+		const corpus = corpusOf(10, [
+			['chrome-a', 10],
+			['chrome-b', 9],
+			['content-a', 2],
+			['content-b', 1],
+		]);
+		const pageTokens = new Set(['chrome-a', 'chrome-b', 'content-a', 'content-b']);
+		const { templateTokens, contentTokens } = splitTokensByFrequency(
+			pageTokens,
+			corpus,
+			0.9,
+		);
+		expect(templateTokens).toEqual(new Set(['chrome-a', 'chrome-b']));
+		expect(contentTokens).toEqual(new Set(['content-a', 'content-b']));
+	});
+
+	test.each([0, -0.5, 1.5, 90, Number.NaN])(
+		'rejects an out-of-range threshold (%s)',
+		(threshold) => {
+			const corpus = corpusOf(10);
+			const attempt = () => splitTokensByFrequency(new Set(['a']), corpus, threshold);
+			expect(attempt).toThrow(RangeError);
+		},
+	);
+
+	test('accepts a threshold of exactly 1 (upper boundary is inclusive)', () => {
+		const corpus = corpusOf(10, [['always', 10]]);
+		const attempt = () => splitTokensByFrequency(new Set(['always']), corpus, 1);
+		expect(attempt).not.toThrow();
+	});
+});
@@ -0,0 +1,101 @@
+import type { DocumentFrequency } from './types.js';
+
+/**
+ * Default document-frequency cutoff: a token present in at least 90% of the
+ * pages passed to `computeDocumentFrequency` is treated as site chrome.
+ * This is the value `splitTokensByFrequency` uses for its `threshold`
+ * parameter when the caller omits it; it is a fraction, not a percentage.
+ *
+ * Real-data validation against a small single-layout corporate site (a few
+ * hundred pages) found the corpus's tokens cleanly bimodal — the same set
+ * of chrome tokens was identified whether the cutoff was set anywhere from
+ * 50% to 95% — so the exact value is not sensitive within that range for a
+ * homogeneous corpus. 90% was picked as a value comfortably inside that
+ * stable range rather than at either edge.
+ */
+const DEFAULT_TEMPLATE_FREQUENCY_THRESHOLD = 0.9;
+
+/**
+ * Absolute floor of the tolerance subtracted from the cutoff in
+ * `splitTokensByFrequency`.
+ *
+ * `threshold * pageCount` is a floating-point product and can overshoot the
+ * intended integer boundary (verified: `0.55 * 100 === 55.00000000000001`
+ * in JS), which would otherwise make a token at the exact documented
+ * inclusive boundary (`frequency === threshold * pageCount`) fail the
+ * `>=` check it should pass. Subtracting this epsilon before comparing
+ * absorbs that rounding noise for small and mid-sized corpora without
+ * being large enough to affect any genuinely-below-threshold token
+ * (frequencies are integers, so the gap between "at the boundary" and "one
+ * below it" is always >= 1).
+ *
+ * On its own a fixed epsilon is not enough: the rounding error of the
+ * product grows with its magnitude, so it is combined with
+ * `BOUNDARY_RELATIVE_TOLERANCE` and the larger of the two is used.
+ */
+const BOUNDARY_EPSILON = 1e-9;
+
+/**
+ * Relative part of the boundary tolerance, as a multiple of the cutoff.
+ *
+ * The product `threshold * pageCount` can be off by about one ulp of its
+ * own value, and an ulp outgrows `BOUNDARY_EPSILON` once the product
+ * reaches the tens of millions (one ulp of 5.5e7 is ~7.5e-9; e.g.
+ * `0.55 * 1e8` lands one ulp above 55,000,000, and subtracting 1e-9 rounds
+ * straight back to the same value). Four times `Number.EPSILON` covers
+ * that error with margin to spare, while the resulting tolerance stays
+ * under 0.001 for corpora of up to 10^12 pages — nowhere near the gap of 1
+ * between adjacent integer frequencies.
+ */
+const BOUNDARY_RELATIVE_TOLERANCE = 4 * Number.EPSILON;
+
+/**
+ * Splits one page's tokens into "template" (site chrome: header/nav/footer,
+ * or any other structure repeated across most of the corpus) and "content"
+ * (page-specific structure), using each token's document frequency from
+ * `computeDocumentFrequency`. Comparing these two groups separately with
+ * `jaccardSimilarity()` — rather than the page's full token set at once —
+ * is what fixes two failures a single flat Jaccard has: common chrome
+ * diluting genuine content differences at loose similarity thresholds, and
+ * page-specific content differences (e.g. a freeform CMS block editor page,
+ * where the exact block mix varies per page) swamping a real *layout*
+ * match. See `computeDocumentFrequency`'s JSDoc for why `corpusFrequency`
+ * must come from a homogeneous page collection for this split to work.
+ *
+ * `corpusFrequency` bundles `documentFrequency` with the `pageCount` it was
+ * computed from (rather than taking `pageCount` as a separate argument) so
+ * the two can never be passed out of sync — e.g. a caller re-slicing or
+ * filtering the page list after computing frequencies but before using
+ * them, which would otherwise silently produce a wrong cutoff with no error
+ * raised anywhere.
+ *
+ * A token absent from `documentFrequency` is treated as frequency 0 (i.e.
+ * content): it never appeared in the corpus the frequency map was built
+ * from, so it cannot be corpus-wide chrome. If `pageCount` is 0 (empty
+ * corpus), every token is classified as content for the same reason: with
+ * no pages to have observed repetition across, nothing can be confirmed as
+ * chrome.
+ *
+ * `threshold` must be a fraction in `(0, 1]`, not a percentage — passing
+ * `90` instead of `0.9` would make the cutoff exceed every possible
+ * frequency and misclassify even universal chrome as content, so this is
+ * validated eagerly rather than left to fail silently downstream.
+ *
+ * The boundary is inclusive: a token whose frequency equals
+ * `threshold * pageCount` is template. The comparison tolerates the
+ * floating-point rounding of that product at any corpus size (see
+ * `BOUNDARY_EPSILON` and `BOUNDARY_RELATIVE_TOLERANCE`).
+ * @param tokens The token set of the single page being split.
+ * @param corpusFrequency Per-token page counts and the page count they
+ *   were computed from, as returned by `computeDocumentFrequency`.
+ * @param threshold Minimum fraction of pages, in `(0, 1]`, a token must
+ *   appear on to be classified as template. Defaults to
+ *   `DEFAULT_TEMPLATE_FREQUENCY_THRESHOLD`.
+ * @returns Two freshly created, disjoint sets that together contain every
+ *   token of `tokens`.
+ * @throws {RangeError} If `threshold` is `NaN` or outside `(0, 1]`.
+ * @example
+ * ```ts
+ * const corpusFrequency = computeDocumentFrequency(allPagesTokenSets);
+ * splitTokensByFrequency(pageTokens, corpusFrequency);
+ * // { templateTokens: Set(...), contentTokens: Set(...) }
+ * ```
+ */
+export function splitTokensByFrequency(
+	tokens: ReadonlySet<string>,
+	corpusFrequency: DocumentFrequency,
+	threshold: number = DEFAULT_TEMPLATE_FREQUENCY_THRESHOLD,
+): { templateTokens: Set<string>; contentTokens: Set<string> } {
+	// Written as a negated conjunction so that NaN (for which every
+	// comparison is false) is rejected along with out-of-range numbers.
+	if (!(threshold > 0 && threshold <= 1)) {
+		throw new RangeError(
+			`splitTokensByFrequency: threshold must be a fraction in (0, 1], got ${threshold}`,
+		);
+	}
+
+	const { documentFrequency, pageCount } = corpusFrequency;
+	const templateTokens = new Set<string>();
+	const contentTokens = new Set<string>();
+
+	// An empty corpus needs its own branch: the cutoff would be 0 minus the
+	// tolerance, i.e. negative, and every token's frequency of 0 would pass
+	// the `>=` check and be mislabelled as template.
+	if (pageCount === 0) {
+		for (const token of tokens) {
+			contentTokens.add(token);
+		}
+		return { templateTokens, contentTokens };
+	}
+
+	// Tolerance = the larger of the absolute floor and a few ulps of the
+	// product, so the inclusive boundary holds for both small corpora and
+	// corpora with tens of millions of pages.
+	const rawCutoff = threshold * pageCount;
+	const tolerance = Math.max(
+		BOUNDARY_EPSILON,
+		rawCutoff * BOUNDARY_RELATIVE_TOLERANCE,
+	);
+	const cutoff = rawCutoff - tolerance;
+
+	for (const token of tokens) {
+		// Tokens never seen in the corpus count as frequency 0.
+		const frequency = documentFrequency.get(token) ?? 0;
+		if (frequency >= cutoff) {
+			templateTokens.add(token);
+		} else {
+			contentTokens.add(token);
+		}
+	}
+
+	return { templateTokens, contentTokens };
+}
@@ -38,6 +38,20 @@ export type Frame = {
 	pendingPaths: string[];
 };
 
+/**
+ * Result of {@link ../compute-document-frequency.js | computeDocumentFrequency}: how many pages (out of `pageCount`) contain each token.
+ *
+ * `pageCount` travels bundled with `documentFrequency` rather than being a
+ * separate argument at call sites that consume it (e.g.
+ * `splitTokensByFrequency`), so the two can never be passed out of sync with
+ * each other (e.g. a caller re-slicing the page list after computing
+ * frequencies but before using them).
+ */
+export type DocumentFrequency = {
+	/** For each token, the number of pages whose token set contains it. */
+	documentFrequency: ReadonlyMap<string, number>;
+	/** Number of per-page token sets the frequencies were counted over. */
+	pageCount: number;
+};
+
 /**
  * Tags whose contents are hashed instead of being tokenized further.
  */