Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,159 @@
import { describe, expect, test } from 'vitest';

import { resolveBlockingGroupKeys } from './resolve-blocking-group-keys.js';

// Behavioural spec for resolveBlockingGroupKeys. Each test builds a small
// batch of pages whose stylesheet hrefs are chosen to land on a specific
// side of the common-href document-frequency threshold, then checks whether
// each page ends up with a `css:` key or a `path:` fallback key.
describe('resolveBlockingGroupKeys', () => {
test('an empty page list returns an empty array', () => {
expect(resolveBlockingGroupKeys([])).toEqual([]);
});

// Shared by the next two tests: a.css is loaded only by the two dept-a
// pages (2/4 = 50%, below the common-href threshold) while common.css is
// loaded by all 4 pages (100%, above it).
const pagesWithOneCommonAndOneDistinctiveHref = [
{
paths: ['dept-a', 'news', '1'],
stylesheetHrefs: ['https://example.com/a.css', 'https://example.com/common.css'],
},
{
paths: ['dept-a', 'news', '2'],
stylesheetHrefs: ['https://example.com/a.css', 'https://example.com/common.css'],
},
{ paths: ['dept-b', 'about'], stylesheetHrefs: ['https://example.com/common.css'] },
{ paths: ['dept-c', 'contact'], stylesheetHrefs: ['https://example.com/common.css'] },
];

test('pages sharing a distinctive stylesheet (below the common-href threshold) get the same css: key', () => {
const result = resolveBlockingGroupKeys(pagesWithOneCommonAndOneDistinctiveHref);

// Hard-coded key: presumably the hash deriveStylesheetGroupKey produces for
// a.css alone once common.css has been filtered out — the same literal is
// asserted for an a.css-only page in the minCssGroupSize test further down.
expect(result[0]).toBe('css:ac39f3dbf4cdfdbf');
expect(result[1]).toBe('css:ac39f3dbf4cdfdbf');
});

test('a stylesheet shared by every page in the batch carries no discriminative signal and falls back to the path key for all of them', () => {
// common.css above is present on all 4 pages (100%), so document-frequency
// filtering strips it before hashing; unrelated dept-b/dept-c pages must
// not be merged into one group just because they both load only that
// site-wide common stylesheet.
const result = resolveBlockingGroupKeys(pagesWithOneCommonAndOneDistinctiveHref);

expect(result[2]).toBe('path:dept-b');
expect(result[3]).toBe('path:dept-c');
expect(result[2]).not.toBe(result[3]);
});

test('a stylesheet that is only common relative to stylesheet-bearing pages is still filtered out, even when diluted by stylesheet-less pages', () => {
// Without excluding stylesheet-less pages from the document-frequency
// denominator, common.css's frequency would read as 2/10 = 20% (well
// below the threshold) instead of 2/2 = 100%, wrongly treating it as
// distinctive and merging two unrelated pages into one css: group.
const result = resolveBlockingGroupKeys([
{ paths: ['dept-a', 'about'], stylesheetHrefs: ['https://example.com/common.css'] },
{ paths: ['dept-b', 'about'], stylesheetHrefs: ['https://example.com/common.css'] },
// Eight filler pages with no stylesheets at all: they are the "dilution".
...Array.from({ length: 8 }, (_, i) => ({
paths: ['dept-c', `page-${i}`],
stylesheetHrefs: [],
})),
]);

expect(result[0]).toBe('path:dept-a');
expect(result[1]).toBe('path:dept-b');
});

test('pages sharing a multi-href distinctive stylesheet set (not just a single href) get the same css: key', () => {
// The two dept-a pages list the same two hrefs in opposite order, so the
// equality assertion below also shows the key does not depend on href order.
const result = resolveBlockingGroupKeys([
{
paths: ['dept-a', 'news', '1'],
stylesheetHrefs: ['https://example.com/a.css', 'https://example.com/b.css'],
},
{
paths: ['dept-a', 'news', '2'],
stylesheetHrefs: ['https://example.com/b.css', 'https://example.com/a.css'],
},
{ paths: ['dept-b', 'about'], stylesheetHrefs: ['https://example.com/c.css'] },
]);

expect(result[0]).toBe(result[1]);
expect(result[0]).toMatch(/^css:/);
});

test('pages with no stylesheets at all never share a css: bucket and fall back to their own path key', () => {
const result = resolveBlockingGroupKeys([
{ paths: ['dept-d', 'x'], stylesheetHrefs: [] },
{ paths: ['dept-e', 'y'], stylesheetHrefs: [] },
]);

expect(result).toEqual(['path:dept-d', 'path:dept-e']);
});

test('a stylesheet-based group smaller than minCssGroupSize falls back to the path key', () => {
// a third, unrelated page keeps a.css's document frequency at 2/3 (below
// the common-href threshold) instead of 2/2 = 100%, which would
// otherwise get a.css itself filtered out as non-discriminative.
const pages = [
{ paths: ['dept-a', 'news', '1'], stylesheetHrefs: ['https://example.com/a.css'] },
{ paths: ['dept-a', 'news', '2'], stylesheetHrefs: ['https://example.com/a.css'] },
{ paths: ['dept-b', 'about'], stylesheetHrefs: ['https://example.com/b.css'] },
];

// Exactly two pages share a.css, so the group is accepted at
// minCssGroupSize 2 and rejected (path fallback) at 3.
expect(resolveBlockingGroupKeys(pages, { minCssGroupSize: 2 })[0]).toBe(
'css:ac39f3dbf4cdfdbf',
);
expect(resolveBlockingGroupKeys(pages, { minCssGroupSize: 3 })[0]).toBe(
'path:dept-a',
);
expect(resolveBlockingGroupKeys(pages, { minCssGroupSize: 3 })[1]).toBe(
'path:dept-a',
);
});

test('pathDepth is forwarded to derivePathGroupKey for fallback keys', () => {
// A single stylesheet-less page always takes the path fallback, so the
// result isolates the effect of pathDepth on the path key.
const result = resolveBlockingGroupKeys(
[{ paths: ['dept-a', 'news', '1'], stylesheetHrefs: [] }],
{ pathDepth: 2 },
);

expect(result).toEqual(['path:dept-a/news']);
});

test('hrefCommonThreshold is forwarded to splitTokensByFrequency', () => {
// a.css is shared by 2 of 4 pages (50%). With a threshold looser than
// that (0.4), it counts as "common" too and both dept-a pages fall back
// to their path key instead of matching via css:.
const result = resolveBlockingGroupKeys(
[
{
paths: ['dept-a', 'news', '1'],
stylesheetHrefs: ['https://example.com/a.css'],
},
{
paths: ['dept-a', 'news', '2'],
stylesheetHrefs: ['https://example.com/a.css'],
},
{ paths: ['dept-b', 'about'], stylesheetHrefs: ['https://example.com/b.css'] },
{ paths: ['dept-c', 'contact'], stylesheetHrefs: ['https://example.com/c.css'] },
],
{ hrefCommonThreshold: 0.4 },
);

expect(result[0]).toBe('path:dept-a');
expect(result[1]).toBe('path:dept-a');
});

// Every value listed fails the "integer >= 2" requirement: three integers
// below 2, one non-integer, and NaN. An empty page list is used so the
// rejection cannot depend on any page data.
test.each([1, 0, -1, 0.5, Number.NaN])(
'rejects a minCssGroupSize below 2 (%s)',
(minCssGroupSize) => {
expect(() => resolveBlockingGroupKeys([], { minCssGroupSize })).toThrow(RangeError);
},
);

// The next two tests pin eager option validation: with no pages there is
// no per-page work that could surface the bad option, yet the call must
// still throw.
test('rejects an invalid pathDepth eagerly, even with an empty page list', () => {
expect(() => resolveBlockingGroupKeys([], { pathDepth: 0 })).toThrow(RangeError);
});

test('rejects an invalid hrefCommonThreshold eagerly, even with an empty page list', () => {
expect(() => resolveBlockingGroupKeys([], { hrefCommonThreshold: -1 })).toThrow(
RangeError,
);
});
});
167 changes: 167 additions & 0 deletions packages/@d-zero/page-cluster/src/resolve-blocking-group-keys.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,167 @@
import { computeDocumentFrequency } from './compute-document-frequency.js';
import { derivePathGroupKey } from './derive-path-group-key.js';
import { deriveStylesheetGroupKey } from './derive-stylesheet-group-key.js';
import { splitTokensByFrequency } from './split-tokens-by-frequency.js';

/**
* Per-page input to `resolveBlockingGroupKeys`: the raw data behind the two
* blocking signals consumed by
* {@link ./derive-path-group-key.js | derivePathGroupKey} and
* {@link ./derive-stylesheet-group-key.js | deriveStylesheetGroupKey}.
* Both are bundled on one object so the caller hands each page over once
* instead of deriving the two keys separately.
*/
export type PageBlockingSignals = {
/**
* URL path segments of the page, e.g. `['dept-a', 'news', '1']`. Passed to
* `derivePathGroupKey` when the page falls back to a `path:` key.
*/
paths: readonly string[];
/**
* Hrefs of the stylesheets the page loads. Collected into a `Set` before
* use, so duplicates within one page count once; may be empty.
*/
stylesheetHrefs: readonly string[];
};

/**
* Optional tuning knobs for `resolveBlockingGroupKeys`. Every field may be
* omitted; all three are validated up front, before any page is inspected.
* @see resolveBlockingGroupKeys
*/
export type ResolveBlockingGroupKeysOptions = {
/**
* Forwarded to `derivePathGroupKey` as-is (including `undefined`, which
* leaves the choice of default depth to that function). Controls how many
* leading path segments make up a `path:` fallback key.
*/
pathDepth?: number;
/**
* Minimum number of pages that must share a stylesheet-derived key before
* it's trusted as real evidence, rather than a coincidence. Defaults to 2.
* Must be an integer of at least 2, otherwise a `RangeError` is thrown: a
* page always "shares" its own key with itself, so 1 would
* accept every stylesheet-bearing page unconditionally and make this
* check a no-op. This is a structural floor (below 2, no pair of distinct
* pages can exist at all), not a statistically-derived
* confidence threshold — entity-resolution blocking literature has no
* closed-form value for "how many shared pages prove a true match", so
* this is a starting default to be tuned against real corpora, not a
* validated constant.
*/
minCssGroupSize?: number;
/**
* Forwarded to `splitTokensByFrequency` as-is (including `undefined`, which
* leaves the choice of default threshold to that function). Decides how
* frequent a stylesheet href must be across the batch to be discarded as
* common, non-discriminative chrome.
*/
hrefCommonThreshold?: number;
};

/**
 * Default for `minCssGroupSize`, and also the smallest value the function
 * accepts: a stylesheet-derived key is only trusted once two pages share it.
 */
const DEFAULT_MIN_CSS_GROUP_SIZE = 2;

/**
 * Chooses exactly one grouping key per page — a `css:` key built from the
 * page's distinctive stylesheet set when that signal is usable, otherwise a
 * `path:` key built from its URL path — and returns the keys in the same
 * order as `pages`.
 *
 * ### Why priority-with-fallback instead of OR-merge
 *
 * Entity-resolution blocking literature (Michelson & Knoblock's DNF scheme,
 * canopy clustering, ensemble blocking) OR-combines independent blocking
 * predicates, but it does so to generate *candidate pairs* that a later
 * similarity/classification pass refines. Here every page is committed to a
 * single final key and nothing refines it afterwards, so OR-merging the two
 * signals (union-find over shared keys) is the wrong tool: a union of
 * equivalence relations can only coarsen a partition, never split it, and
 * the entire reason to prefer the stylesheet signal is that it *splits* pages
 * a path-only grouping would lump together. Real crawl data confirmed the
 * motivating case: one page sitting under an otherwise-uniform URL section
 * while loading a completely different stylesheet set is missed by a
 * path-only key and caught by a stylesheet key. The applicable pattern is
 * therefore "try the strong signal, fall back to the weak one".
 *
 * ### Filtering out common stylesheets
 *
 * Before stylesheet sets are compared, hrefs that recur across most of
 * `pages` (a shared reset or font stylesheet, say) are removed using
 * {@link ./compute-document-frequency.js | computeDocumentFrequency} and
 * {@link ./split-tokens-by-frequency.js | splitTokensByFrequency} — helpers
 * originally written to separate site-wide chrome from page-specific HTML
 * structure. Skipping this step would let two pages from unrelated sections
 * that load only that one shared stylesheet reach `minCssGroupSize` and be
 * treated as one template family. Raising `minCssGroupSize` does not cure
 * that: the trouble is not that too few pages share the key but that the key
 * carries no discriminative information to begin with.
 *
 * A page whose stylesheet set is empty, or is emptied by this filtering,
 * always takes the path key. Loading no distinctive stylesheet is an absence
 * of evidence, not evidence of a shared template, so it must never act as a
 * matching signal in its own right.
 *
 * ### Which pages count toward document frequency
 *
 * Only pages that load at least one stylesheet enter the frequency
 * computation. Counting stylesheet-less pages in the denominator would
 * dilute every href's ratio — a stylesheet loaded by 100% of the pages that
 * load *any* stylesheet would look rare, hence "distinctive", whenever most
 * of the batch loads none — and a genuinely site-wide stylesheet would slip
 * past the common-href filter.
 *
 * ### Expectations on the batch
 *
 * As with `computeDocumentFrequency`, `pages` should be a roughly
 * homogeneous batch (one site, or one section of a large multi-template
 * site); that function's JSDoc explains why a federation of independently
 * templated sub-sections defeats frequency-based filtering. Splitting a
 * heterogeneous crawl into sections beforehand is the caller's job.
 *
 * The filter also needs enough stylesheet-bearing pages to distinguish
 * "loaded by every page that has any stylesheet" from "coincidentally the
 * only stylesheet two pages happen to load". With just two stylesheet-bearing
 * pages and nothing to contrast them against, whatever they share reads as
 * 100% common and is filtered out, yielding a path-key fallback even when the
 * two pages are a genuine template match. A third, differently-styled page —
 * as in the example below — is what pulls the shared stylesheet's frequency
 * under the common-href cutoff.
 *
 * @param pages - Path segments and stylesheet hrefs for each page in the
 *   batch. The array is not modified.
 * @param options - Optional `pathDepth`, `minCssGroupSize` (defaults to 2)
 *   and `hrefCommonThreshold`; the first and last are passed through
 *   untouched to `derivePathGroupKey` and `splitTokensByFrequency`.
 * @returns One key per page, index-aligned with `pages`: `css:<key>` when
 *   the page's filtered stylesheet set is non-empty and at least
 *   `minCssGroupSize` pages share its key, otherwise `path:<key>`.
 * @throws {RangeError} When `minCssGroupSize` is not an integer >= 2.
 *   Invalid `pathDepth` / `hrefCommonThreshold` values are rejected by the
 *   helpers that own them, also before any page is processed.
 * @example
 * ```ts
 * resolveBlockingGroupKeys([
 *   { paths: ['dept-a', 'news', '1'], stylesheetHrefs: ['https://example.com/a.css', 'https://example.com/common.css'] },
 *   { paths: ['dept-a', 'news', '2'], stylesheetHrefs: ['https://example.com/a.css', 'https://example.com/common.css'] },
 *   { paths: ['dept-b', 'about'], stylesheetHrefs: ['https://example.com/common.css'] },
 * ]);
 * // ['css:<hash of a.css>', 'css:<hash of a.css>', 'path:dept-b']
 * // common.css is loaded by all 3 pages and is filtered out as non-discriminative chrome.
 * ```
 */
export function resolveBlockingGroupKeys(
	pages: readonly PageBlockingSignals[],
	options?: ResolveBlockingGroupKeysOptions,
): string[] {
	const pathDepth = options?.pathDepth;
	const minCssGroupSize = options?.minCssGroupSize ?? DEFAULT_MIN_CSS_GROUP_SIZE;
	const hrefCommonThreshold = options?.hrefCommonThreshold;

	// NaN and fractional values fail the integer test; 1, 0 and negatives
	// fail the lower bound.
	if (!Number.isInteger(minCssGroupSize) || minCssGroupSize < 2) {
		throw new RangeError(
			`resolveBlockingGroupKeys: minCssGroupSize must be an integer >= 2, got ${minCssGroupSize}`,
		);
	}

	// Probe calls on empty inputs, results discarded: they let the helpers
	// that own pathDepth / hrefCommonThreshold validate those options right
	// away, rather than only when some page happens to exercise them.
	derivePathGroupKey([], pathDepth);
	splitTokensByFrequency(
		new Set(),
		{ documentFrequency: new Map(), pageCount: 0 },
		hrefCommonThreshold,
	);

	const hrefSetPerPage = pages.map(({ stylesheetHrefs }) => new Set(stylesheetHrefs));

	// Stylesheet-less pages are kept out of the frequency corpus so they
	// cannot dilute the ratios of hrefs that are, in fact, site-wide.
	const stylesheetBearingSets = hrefSetPerPage.filter((hrefSet) => hrefSet.size > 0);
	const frequencyAmongStyledPages = computeDocumentFrequency(stylesheetBearingSets);

	// Phase 1 strips common hrefs from every page; phase 2 hashes what is
	// left. A page with nothing left has no stylesheet key at all.
	const cssKeyPerPage = hrefSetPerPage
		.map(
			(hrefSet) =>
				splitTokensByFrequency(hrefSet, frequencyAmongStyledPages, hrefCommonThreshold)
					.contentTokens,
		)
		.map((distinctive) =>
			distinctive.size > 0 ? deriveStylesheetGroupKey([...distinctive]) : undefined,
		);

	// Tally how many pages landed on each stylesheet key.
	const pagesSharingKey = new Map<string, number>();
	for (const key of cssKeyPerPage) {
		if (key === undefined) {
			continue;
		}
		pagesSharingKey.set(key, (pagesSharingKey.get(key) ?? 0) + 1);
	}

	return pages.map((page, position) => {
		const cssKey = cssKeyPerPage[position];
		const sharedBy = cssKey === undefined ? 0 : (pagesSharingKey.get(cssKey) ?? 0);

		// No stylesheet key, or too few pages agree on it: use the path signal.
		if (cssKey === undefined || sharedBy < minCssGroupSize) {
			return `path:${derivePathGroupKey(page.paths, pathDepth)}`;
		}
		return `css:${cssKey}`;
	});
}
Loading