Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
import { describe, expect, test } from 'vitest';

import { computeDocumentFrequency } from './compute-document-frequency.js';

describe('computeDocumentFrequency', () => {
	/** Builds one page's token set from the listed tokens. */
	const page = (...tokens: string[]): Set<string> => new Set(tokens);

	test('empty input returns an empty map and a page count of 0', () => {
		const result = computeDocumentFrequency([]);
		expect(result).toEqual({ documentFrequency: new Map(), pageCount: 0 });
	});

	test('pageCount reflects the number of token sets passed in', () => {
		const { pageCount } = computeDocumentFrequency([page('a'), page('b'), page('c')]);
		expect(pageCount).toBe(3);
	});

	test('a token present in every set gets a count equal to the set count', () => {
		const token = 'body>header>a';
		const result = computeDocumentFrequency([page(token), page(token), page(token)]);
		expect(result).toEqual({
			documentFrequency: new Map([[token, 3]]),
			pageCount: 3,
		});
	});

	test('a token present in only one set gets a count of 1', () => {
		const result = computeDocumentFrequency([
			page('body>main>p'),
			page('body>footer>span'),
		]);
		expect(result.documentFrequency.get('body>main>p')).toBe(1);
		expect(result.documentFrequency.get('body>footer>span')).toBe(1);
	});

	test('counts multiple distinct tokens independently across overlapping sets', () => {
		const result = computeDocumentFrequency([page('a', 'b'), page('b', 'c'), page('b')]);
		const countOf = (token: string) => result.documentFrequency.get(token);
		expect(countOf('a')).toBe(1);
		expect(countOf('b')).toBe(3);
		expect(countOf('c')).toBe(1);
	});

	test('a single empty set contributes no frequency entries but still counts as one page', () => {
		const result = computeDocumentFrequency([page()]);
		expect(result).toEqual({ documentFrequency: new Map(), pageCount: 1 });
	});
});
44 changes: 44 additions & 0 deletions packages/@d-zero/page-cluster/src/compute-document-frequency.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
import type { DocumentFrequency } from './types.js';

/**
 * Tallies document frequency: for every token, the number of per-page token
 * sets in `tokenSets` that contain it. Because each page contributes a *set*,
 * a token is counted at most once per page.
 *
 * This is step one of telling shared site chrome (header/nav/footer) apart
 * from page-specific content. Tokens recurring on nearly every page are
 * chrome; tokens seen on only a few pages are content. The per-token
 * decision itself is made by `splitTokensByFrequency`, which consumes the
 * value returned here.
 *
 * The input must be a *homogeneous* collection of pages (typically one
 * site, or one section of a large multi-template site) rather than an
 * arbitrary pool:
 *
 * - Validated against a small single-layout corporate site (a few hundred
 *   pages), the frequencies were cleanly bimodal: chrome tokens appeared on
 *   95%+ of pages, content tokens on well under 50%, with nothing in
 *   between.
 * - Run over the *whole* crawl of a much larger site that turned out to be
 *   a federation of independent sub-sections (the largest covering under
 *   half of all pages), no token crossed even a 50% document-frequency
 *   threshold. Without a single dominant layout, frequency-based chrome
 *   detection cannot work.
 * - Splitting that site into sections first (by URL path, or by a coarse
 *   structural clustering pass) and calling this function once per section
 *   recovered the same clean bimodal split.
 *
 * Grouping heterogeneous pages beforehand is the caller's responsibility;
 * nothing in this function can detect that its input mixes several layouts.
 * @param tokenSets One token set per page; the array length becomes `pageCount`.
 * @returns The token-to-page-count map together with the number of sets counted.
 * @example
 * ```ts
 * computeDocumentFrequency([new Set(['body>header>a']), new Set(['body>header>a'])]);
 * // { documentFrequency: Map { 'body>header>a' => 2 }, pageCount: 2 }
 * ```
 */
export function computeDocumentFrequency(
	tokenSets: readonly ReadonlySet<string>[],
): DocumentFrequency {
	const pagesPerToken = new Map<string, number>();
	for (const pageTokens of tokenSets) {
		pageTokens.forEach((token) => {
			const seenSoFar = pagesPerToken.get(token);
			// First sighting starts the tally at 1; later pages increment it.
			pagesPerToken.set(token, seenSoFar === undefined ? 1 : seenSoFar + 1);
		});
	}
	return { documentFrequency: pagesPerToken, pageCount: tokenSets.length };
}
134 changes: 134 additions & 0 deletions packages/@d-zero/page-cluster/src/split-tokens-by-frequency.spec.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,134 @@
import { describe, expect, test } from 'vitest';

import { splitTokensByFrequency } from './split-tokens-by-frequency.js';

describe('splitTokensByFrequency', () => {
	/** Builds a corpus-frequency fixture from `[token, count]` pairs and a page count. */
	const corpusOf = (counts: [string, number][], pageCount: number) => ({
		documentFrequency: new Map<string, number>(counts),
		pageCount,
	});

	test('a token at or above the frequency threshold is classified as template', () => {
		const { templateTokens, contentTokens } = splitTokensByFrequency(
			new Set(['body>header>a']),
			corpusOf([['body>header>a', 9]], 10),
			0.9,
		);
		expect(templateTokens).toEqual(new Set(['body>header>a']));
		expect(contentTokens).toEqual(new Set());
	});

	test('a token below the frequency threshold is classified as content', () => {
		const { templateTokens, contentTokens } = splitTokensByFrequency(
			new Set(['body>main>p']),
			corpusOf([['body>main>p', 1]], 10),
			0.9,
		);
		expect(templateTokens).toEqual(new Set());
		expect(contentTokens).toEqual(new Set(['body>main>p']));
	});

	test('a token missing from the document-frequency map is treated as frequency 0 (content)', () => {
		const { templateTokens, contentTokens } = splitTokensByFrequency(
			new Set(['body>main>unseen']),
			corpusOf([], 10),
			0.9,
		);
		expect(contentTokens).toEqual(new Set(['body>main>unseen']));
		expect(templateTokens).toEqual(new Set());
	});

	test('frequency exactly at the threshold counts as template (inclusive boundary)', () => {
		const { templateTokens } = splitTokensByFrequency(
			new Set(['exact']),
			corpusOf([['exact', 9]], 10),
			0.9,
		);
		expect(templateTokens).toEqual(new Set(['exact']));
	});

	test('frequency exactly at a threshold that overshoots by floating-point error still counts as template', () => {
		// In IEEE-754, 0.55 * 100 evaluates to 55.00000000000001, so a naive
		// `frequency >= threshold * pageCount` comparison would reject a token
		// sitting exactly on the documented inclusive boundary.
		const { templateTokens } = splitTokensByFrequency(
			new Set(['boundary']),
			corpusOf([['boundary', 55]], 100),
			0.55,
		);
		expect(templateTokens).toEqual(new Set(['boundary']));
	});

	test('default threshold is 0.9 when omitted', () => {
		const { templateTokens, contentTokens } = splitTokensByFrequency(
			new Set(['just-below-default']),
			corpusOf([['just-below-default', 8]], 10),
		);
		expect(contentTokens).toEqual(new Set(['just-below-default']));
		expect(templateTokens).toEqual(new Set());
	});

	test('a custom threshold is honored', () => {
		const corpusFrequency = corpusOf([['half', 5]], 10);
		const atHalf = splitTokensByFrequency(new Set(['half']), corpusFrequency, 0.5);
		expect(atHalf.templateTokens).toEqual(new Set(['half']));
		const atSixTenths = splitTokensByFrequency(new Set(['half']), corpusFrequency, 0.6);
		expect(atSixTenths.contentTokens).toEqual(new Set(['half']));
	});

	test('an empty token set returns two empty sets', () => {
		const { templateTokens, contentTokens } = splitTokensByFrequency(
			new Set(),
			corpusOf([], 10),
		);
		expect(templateTokens).toEqual(new Set());
		expect(contentTokens).toEqual(new Set());
	});

	test('a zero-page corpus classifies every token as content (no evidence of repetition)', () => {
		const { templateTokens, contentTokens } = splitTokensByFrequency(
			new Set(['anything']),
			corpusOf([], 0),
		);
		expect(contentTokens).toEqual(new Set(['anything']));
		expect(templateTokens).toEqual(new Set());
	});

	test('splits a mixed set of high- and low-frequency tokens correctly', () => {
		const corpusFrequency = corpusOf(
			[
				['chrome-a', 10],
				['chrome-b', 9],
				['content-a', 2],
				['content-b', 1],
			],
			10,
		);
		const pageTokens = new Set(['chrome-a', 'chrome-b', 'content-a', 'content-b']);
		const { templateTokens, contentTokens } = splitTokensByFrequency(
			pageTokens,
			corpusFrequency,
			0.9,
		);
		expect(templateTokens).toEqual(new Set(['chrome-a', 'chrome-b']));
		expect(contentTokens).toEqual(new Set(['content-a', 'content-b']));
	});

	test.each([0, -0.5, 1.5, 90, Number.NaN])(
		'rejects an out-of-range threshold (%s)',
		(threshold) => {
			const corpusFrequency = corpusOf([], 10);
			const callWithBadThreshold = () =>
				splitTokensByFrequency(new Set(['a']), corpusFrequency, threshold);
			expect(callWithBadThreshold).toThrow(RangeError);
		},
	);

	test('accepts a threshold of exactly 1 (upper boundary is inclusive)', () => {
		const corpusFrequency = corpusOf([['always', 10]], 10);
		const callAtUpperBound = () =>
			splitTokensByFrequency(new Set(['always']), corpusFrequency, 1);
		expect(callAtUpperBound).not.toThrow();
	});
});
101 changes: 101 additions & 0 deletions packages/@d-zero/page-cluster/src/split-tokens-by-frequency.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
import type { DocumentFrequency } from './types.js';

/**
 * Default document-frequency cutoff: a token present in at least 90% of the
 * pages passed to `computeDocumentFrequency` is treated as site chrome.
 * Real-data validation against a small single-layout corporate site (a few
 * hundred pages) found the corpus's tokens cleanly bimodal — the same set
 * of chrome tokens was identified whether the cutoff was set anywhere from
 * 50% to 95% — so the exact value is not sensitive within that range for a
 * homogeneous corpus. 90% was picked as a value comfortably inside that
 * stable range rather than at either edge.
 */
const DEFAULT_TEMPLATE_FREQUENCY_THRESHOLD = 0.9;

/**
 * Minimum slack subtracted from `threshold * pageCount` before comparing.
 *
 * `threshold * pageCount` is a floating-point product and can overshoot the
 * intended integer boundary (`0.55 * 100 === 55.00000000000001` in JS),
 * which would otherwise make a token at the exact documented inclusive
 * boundary (`frequency === threshold * pageCount`) fail the `>=` check it
 * should pass. Subtracting a small slack absorbs that rounding noise
 * without affecting any genuinely-below-threshold token (frequencies are
 * integers, so the true gap between "at the boundary" and "one below it" is
 * always >= 1).
 *
 * This constant is only the *floor* of the slack. Rounding error grows with
 * the magnitude of the product, so `splitTokensByFrequency` widens the slack
 * proportionally for very large corpora, where a fixed `1e-9` would be
 * smaller than the rounding error itself.
 */
const BOUNDARY_EPSILON = 1e-9;

/**
 * Splits one page's tokens into "template" (site chrome: header/nav/footer,
 * or any other structure repeated across most of the corpus) and "content"
 * (page-specific structure), using each token's document frequency from
 * `computeDocumentFrequency`. Comparing these two groups separately with
 * `jaccardSimilarity()` — rather than the page's full token set at once —
 * is what fixes two failures a single flat Jaccard has: common chrome
 * diluting genuine content differences at loose similarity thresholds, and
 * page-specific content differences (e.g. a freeform CMS block editor page,
 * where the exact block mix varies per page) swamping a real *layout*
 * match. See `computeDocumentFrequency`'s JSDoc for why `corpusFrequency`
 * must come from a homogeneous page collection for this split to work.
 *
 * `corpusFrequency` bundles `documentFrequency` with the `pageCount` it was
 * computed from (rather than taking `pageCount` as a separate argument) so
 * the two can never be passed out of sync — e.g. a caller re-slicing or
 * filtering the page list after computing frequencies but before using
 * them, which would otherwise silently produce a wrong cutoff with no error
 * raised anywhere.
 *
 * A token absent from `documentFrequency` is treated as frequency 0 and is
 * always content, whatever the threshold: it never appeared in the corpus
 * the frequency map was built from, so it cannot be corpus-wide chrome. If
 * `pageCount` is 0 (empty corpus), every token is classified as content for
 * the same reason: with no pages to have observed repetition across,
 * nothing can be confirmed as chrome.
 *
 * `threshold` must be a fraction in `(0, 1]`, not a percentage — passing
 * `90` instead of `0.9` would make the cutoff exceed every possible
 * frequency and misclassify even universal chrome as content, so this is
 * validated eagerly rather than left to fail silently downstream.
 * @param tokens The tokens of the single page being split.
 * @param corpusFrequency Per-token page counts and the page count they were computed over.
 * @param threshold Inclusive fraction of pages, in `(0, 1]`, at which a token counts as template. Defaults to 0.9.
 * @returns Two disjoint sets that together contain every token of `tokens`.
 * @throws {RangeError} If `threshold` is `NaN` or lies outside `(0, 1]`.
 * @example
 * ```ts
 * const corpusFrequency = computeDocumentFrequency(allPagesTokenSets);
 * splitTokensByFrequency(pageTokens, corpusFrequency);
 * // { templateTokens: Set(...), contentTokens: Set(...) }
 * ```
 */
export function splitTokensByFrequency(
	tokens: ReadonlySet<string>,
	corpusFrequency: DocumentFrequency,
	threshold: number = DEFAULT_TEMPLATE_FREQUENCY_THRESHOLD,
): { templateTokens: Set<string>; contentTokens: Set<string> } {
	// Written as a negated range test so that NaN is rejected as well.
	if (!(threshold > 0 && threshold <= 1)) {
		throw new RangeError(
			`splitTokensByFrequency: threshold must be a fraction in (0, 1], got ${threshold}`,
		);
	}

	const { documentFrequency, pageCount } = corpusFrequency;
	const templateTokens = new Set<string>();
	const contentTokens = new Set<string>();

	if (pageCount === 0) {
		for (const token of tokens) {
			contentTokens.add(token);
		}
		return { templateTokens, contentTokens };
	}

	const exactCutoff = threshold * pageCount;
	// The rounding error of the product scales with its magnitude: one ULP
	// already exceeds BOUNDARY_EPSILON once the product passes ~8.4e6. Widen
	// the slack to a few ULPs in that range; for smaller products the fixed
	// floor still wins, so the cutoff there is unchanged.
	const slack = Math.max(BOUNDARY_EPSILON, exactCutoff * Number.EPSILON * 4);
	const cutoff = exactCutoff - slack;

	for (const token of tokens) {
		const frequency = documentFrequency.get(token) ?? 0;
		// `frequency > 0` keeps never-seen tokens out of the template group even
		// when a tiny threshold lets the slack push `cutoff` to zero or below.
		if (frequency > 0 && frequency >= cutoff) {
			templateTokens.add(token);
		} else {
			contentTokens.add(token);
		}
	}

	return { templateTokens, contentTokens };
}
14 changes: 14 additions & 0 deletions packages/@d-zero/page-cluster/src/types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,20 @@ export type Frame = {
pendingPaths: string[];
};

/**
 * Result of `computeDocumentFrequency` (in `compute-document-frequency.ts`):
 * how many pages (out of `pageCount`) contain each token.
 *
 * `pageCount` travels bundled with `documentFrequency` rather than being a
 * separate argument at call sites that consume it (e.g.
 * `splitTokensByFrequency`), so the two can never be passed out of sync with
 * each other (e.g. a caller re-slicing the page list after computing
 * frequencies but before using them).
 */
export type DocumentFrequency = {
/** For each token, the number of per-page token sets that contained it. */
documentFrequency: ReadonlyMap<string, number>;
/** Number of per-page token sets the counts above were tallied over. */
pageCount: number;
};

/**
* Tags whose contents are hashed instead of being tokenized further.
*/
Expand Down
Loading