From eeb937f09e4cde20eba1e29dbf41e65d97c4603e Mon Sep 17 00:00:00 2001
From: Yusuke Hirao
Date: Fri, 3 Jul 2026 22:31:59 +0900
Subject: [PATCH] feat(page-cluster): resolve which blocking key to use per page

Combine the independent URL-path and stylesheet blocking keys into a
single grouping key per page, preferring the stylesheet signal when it
is backed by enough shared pages and falling back to the URL path
otherwise.

Reuses computeDocumentFrequency/splitTokensByFrequency to filter out
stylesheets common across most of the batch (e.g. a shared reset/font
file) before hashing, so two unrelated pages that only share such a
file are not wrongly merged.
---
 .../src/resolve-blocking-group-keys.spec.ts | 159 +++++++++++++++++
 .../src/resolve-blocking-group-keys.ts | 167 ++++++++++++++++++
 2 files changed, 326 insertions(+)
 create mode 100644 packages/@d-zero/page-cluster/src/resolve-blocking-group-keys.spec.ts
 create mode 100644 packages/@d-zero/page-cluster/src/resolve-blocking-group-keys.ts

diff --git a/packages/@d-zero/page-cluster/src/resolve-blocking-group-keys.spec.ts b/packages/@d-zero/page-cluster/src/resolve-blocking-group-keys.spec.ts
new file mode 100644
index 00000000..df3d6782
--- /dev/null
+++ b/packages/@d-zero/page-cluster/src/resolve-blocking-group-keys.spec.ts
@@ -0,0 +1,159 @@
+import { describe, expect, test } from 'vitest';
+
+import { resolveBlockingGroupKeys } from './resolve-blocking-group-keys.js';
+
+describe('resolveBlockingGroupKeys', () => {
+	test('an empty page list returns an empty array', () => {
+		expect(resolveBlockingGroupKeys([])).toEqual([]);
+	});
+
+	// Shared by the next two tests: a.css is loaded only by the two dept-a
+	// pages (2/4 = 50%, below the common-href threshold) while common.css is
+	// loaded by all 4 pages (100%, above it).
+	const pagesWithOneCommonAndOneDistinctiveHref = [
+		{
+			paths: ['dept-a', 'news', '1'],
+			stylesheetHrefs: ['https://example.com/a.css', 'https://example.com/common.css'],
+		},
+		{
+			paths: ['dept-a', 'news', '2'],
+			stylesheetHrefs: ['https://example.com/a.css', 'https://example.com/common.css'],
+		},
+		{ paths: ['dept-b', 'about'], stylesheetHrefs: ['https://example.com/common.css'] },
+		{ paths: ['dept-c', 'contact'], stylesheetHrefs: ['https://example.com/common.css'] },
+	];
+
+	test('pages sharing a distinctive stylesheet (below the common-href threshold) get the same css: key', () => {
+		const result = resolveBlockingGroupKeys(pagesWithOneCommonAndOneDistinctiveHref);
+
+		expect(result[0]).toBe('css:ac39f3dbf4cdfdbf');
+		expect(result[1]).toBe('css:ac39f3dbf4cdfdbf');
+	});
+
+	test('a stylesheet shared by every page in the batch carries no discriminative signal and falls back to the path key for all of them', () => {
+		// common.css above is present on all 4 pages (100%), so document-frequency
+		// filtering strips it before hashing; unrelated dept-b/dept-c pages must
+		// not be merged into one group just because they both load only that
+		// site-wide common stylesheet.
+		const result = resolveBlockingGroupKeys(pagesWithOneCommonAndOneDistinctiveHref);
+
+		expect(result[2]).toBe('path:dept-b');
+		expect(result[3]).toBe('path:dept-c');
+		expect(result[2]).not.toBe(result[3]);
+	});
+
+	test('a stylesheet that is only common relative to stylesheet-bearing pages is still filtered out, even when diluted by stylesheet-less pages', () => {
+		// Without excluding stylesheet-less pages from the document-frequency
+		// denominator, common.css's frequency would read as 2/10 = 20% (well
+		// below the threshold) instead of 2/2 = 100%, wrongly treating it as
+		// distinctive and merging two unrelated pages into one css: group.
+		const result = resolveBlockingGroupKeys([
+			{ paths: ['dept-a', 'about'], stylesheetHrefs: ['https://example.com/common.css'] },
+			{ paths: ['dept-b', 'about'], stylesheetHrefs: ['https://example.com/common.css'] },
+			...Array.from({ length: 8 }, (_, i) => ({
+				paths: ['dept-c', `page-${i}`],
+				stylesheetHrefs: [],
+			})),
+		]);
+
+		expect(result[0]).toBe('path:dept-a');
+		expect(result[1]).toBe('path:dept-b');
+	});
+
+	test('pages sharing a multi-href distinctive stylesheet set (not just a single href) get the same css: key', () => {
+		const result = resolveBlockingGroupKeys([
+			{
+				paths: ['dept-a', 'news', '1'],
+				stylesheetHrefs: ['https://example.com/a.css', 'https://example.com/b.css'],
+			},
+			{
+				paths: ['dept-a', 'news', '2'],
+				stylesheetHrefs: ['https://example.com/b.css', 'https://example.com/a.css'],
+			},
+			{ paths: ['dept-b', 'about'], stylesheetHrefs: ['https://example.com/c.css'] },
+		]);
+
+		expect(result[0]).toBe(result[1]);
+		expect(result[0]).toMatch(/^css:/);
+	});
+
+	test('pages with no stylesheets at all never share a css: bucket and fall back to their own path key', () => {
+		const result = resolveBlockingGroupKeys([
+			{ paths: ['dept-d', 'x'], stylesheetHrefs: [] },
+			{ paths: ['dept-e', 'y'], stylesheetHrefs: [] },
+		]);
+
+		expect(result).toEqual(['path:dept-d', 'path:dept-e']);
+	});
+
+	test('a stylesheet-based group smaller than minCssGroupSize falls back to the path key', () => {
+		// A third, unrelated page keeps a.css's document frequency at 2/3 (below
+		// the common-href threshold) instead of 2/2 = 100%, which would
+		// otherwise get a.css itself filtered out as non-discriminative.
+		const pages = [
+			{ paths: ['dept-a', 'news', '1'], stylesheetHrefs: ['https://example.com/a.css'] },
+			{ paths: ['dept-a', 'news', '2'], stylesheetHrefs: ['https://example.com/a.css'] },
+			{ paths: ['dept-b', 'about'], stylesheetHrefs: ['https://example.com/b.css'] },
+		];
+
+		expect(resolveBlockingGroupKeys(pages, { minCssGroupSize: 2 })[0]).toBe(
+			'css:ac39f3dbf4cdfdbf',
+		);
+		expect(resolveBlockingGroupKeys(pages, { minCssGroupSize: 3 })[0]).toBe(
+			'path:dept-a',
+		);
+		expect(resolveBlockingGroupKeys(pages, { minCssGroupSize: 3 })[1]).toBe(
+			'path:dept-a',
+		);
+	});
+
+	test('pathDepth is forwarded to derivePathGroupKey for fallback keys', () => {
+		const result = resolveBlockingGroupKeys(
+			[{ paths: ['dept-a', 'news', '1'], stylesheetHrefs: [] }],
+			{ pathDepth: 2 },
+		);
+
+		expect(result).toEqual(['path:dept-a/news']);
+	});
+
+	test('hrefCommonThreshold is forwarded to splitTokensByFrequency', () => {
+		// a.css is shared by 2 of 4 pages (50%). With a threshold lower than
+		// that (0.4), it counts as "common" too and both dept-a pages fall back
+		// to their path key instead of matching via css:.
+		const result = resolveBlockingGroupKeys(
+			[
+				{
+					paths: ['dept-a', 'news', '1'],
+					stylesheetHrefs: ['https://example.com/a.css'],
+				},
+				{
+					paths: ['dept-a', 'news', '2'],
+					stylesheetHrefs: ['https://example.com/a.css'],
+				},
+				{ paths: ['dept-b', 'about'], stylesheetHrefs: ['https://example.com/b.css'] },
+				{ paths: ['dept-c', 'contact'], stylesheetHrefs: ['https://example.com/c.css'] },
+			],
+			{ hrefCommonThreshold: 0.4 },
+		);
+
+		expect(result[0]).toBe('path:dept-a');
+		expect(result[1]).toBe('path:dept-a');
+	});
+
+	test.each([1, 0, -1, 0.5, Number.NaN])(
+		'rejects a minCssGroupSize below 2 (%s)',
+		(minCssGroupSize) => {
+			expect(() => resolveBlockingGroupKeys([], { minCssGroupSize })).toThrow(RangeError);
+		},
+	);
+
+	test('rejects an invalid pathDepth eagerly, even with an empty page list', () => {
+		expect(() => resolveBlockingGroupKeys([], { pathDepth: 0 })).toThrow(RangeError);
+	});
+
+	test('rejects an invalid hrefCommonThreshold eagerly, even with an empty page list', () => {
+		expect(() => resolveBlockingGroupKeys([], { hrefCommonThreshold: -1 })).toThrow(
+			RangeError,
+		);
+	});
+});
diff --git a/packages/@d-zero/page-cluster/src/resolve-blocking-group-keys.ts b/packages/@d-zero/page-cluster/src/resolve-blocking-group-keys.ts
new file mode 100644
index 00000000..4e9c3428
--- /dev/null
+++ b/packages/@d-zero/page-cluster/src/resolve-blocking-group-keys.ts
@@ -0,0 +1,167 @@
+import { computeDocumentFrequency } from './compute-document-frequency.js';
+import { derivePathGroupKey } from './derive-path-group-key.js';
+import { deriveStylesheetGroupKey } from './derive-stylesheet-group-key.js';
+import { splitTokensByFrequency } from './split-tokens-by-frequency.js';
+
+/**
+ * The two blocking signals {@link ./derive-path-group-key.js | derivePathGroupKey}
+ * and {@link ./derive-stylesheet-group-key.js | deriveStylesheetGroupKey} need,
+ * bundled per page so `resolveBlockingGroupKeys` can compute both without the
+ * caller re-deriving them
+ * separately.
+ */
+export type PageBlockingSignals = {
+	paths: readonly string[];
+	stylesheetHrefs: readonly string[];
+};
+
+/** Tuning options accepted by {@link resolveBlockingGroupKeys}. */
+export type ResolveBlockingGroupKeysOptions = {
+	/** Forwarded to `derivePathGroupKey` as-is. */
+	pathDepth?: number;
+	/**
+	 * Minimum number of pages that must share a stylesheet-derived key before
+	 * it's trusted as real evidence, rather than a coincidence. Must be at
+	 * least 2: a page always "shares" its own key with itself, so 1 would
+	 * accept every stylesheet-bearing page unconditionally and make this
+	 * check a no-op. This is a structural floor (below 2, no pair of distinct
+	 * pages can exist at all), not a statistically-derived
+	 * confidence threshold — entity-resolution blocking literature has no
+	 * closed-form value for "how many shared pages prove a true match", so
+	 * this is a starting default to be tuned against real corpora, not a
+	 * validated constant. Defaults to 2.
+	 */
+	minCssGroupSize?: number;
+	/** Forwarded to `splitTokensByFrequency` as-is. */
+	hrefCommonThreshold?: number;
+};
+
+const DEFAULT_MIN_CSS_GROUP_SIZE = 2;
+
+/**
+ * Resolves, per page, which of the two independent blocking signals — the
+ * exact stylesheet set or the URL path — to actually use as that page's
+ * grouping key. Returns one key per page, in the same order as `pages`.
+ *
+ * Literature on entity-resolution blocking (Michelson & Knoblock's DNF
+ * scheme, canopy clustering, ensemble blocking) combines independent
+ * blocking predicates with OR to generate *candidate pairs* for a later
+ * similarity/classification pass. This function instead commits each page to
+ * exactly one final key with no later refinement step, so OR-merging the two
+ * signals (via union-find over shared keys) doesn't apply here: a union of
+ * equivalence relations can only ever coarsen a partition, never split it,
+ * but the whole point of preferring the stylesheet signal is that it *splits*
+ * pages a URL-path-only grouping would otherwise lump together (confirmed
+ * against real crawl data: a single page embedded under an otherwise-uniform
+ * URL section, but loading a completely different stylesheet set, is exactly
+ * the case a path-only key misses and a stylesheet key catches). A
+ * priority-with-fallback decision — try the strong signal, fall back to the
+ * weak one — is the applicable pattern here, not OR-merge.
+ *
+ * Before comparing stylesheet sets, this reuses
+ * {@link ./compute-document-frequency.js | computeDocumentFrequency} and
+ * {@link ./split-tokens-by-frequency.js | splitTokensByFrequency} — originally
+ * built to separate a page's site-wide chrome from its page-specific HTML
+ * structure — to strip stylesheet hrefs that recur across most of `pages`
+ * (e.g. a shared reset/font stylesheet) before hashing. Without this, two
+ * pages from otherwise-unrelated sections that happen to load only that one
+ * shared stylesheet would satisfy `minCssGroupSize` and be wrongly treated as
+ * the same template family: the problem there isn't too few pages sharing
+ * the key (raising `minCssGroupSize` doesn't fix it), it's that the key
+ * itself carries no discriminative information. A page whose stylesheet set
+ * is empty, or becomes empty after this filtering, always falls back to the
+ * path key — loading no distinctive stylesheet is an absence of evidence,
+ * not evidence of a shared template, so it must never itself become a
+ * matching signal.
+ *
+ * Document frequency is computed only over pages that load at least one
+ * stylesheet: including stylesheet-less pages in the denominator would dilute
+ * every href's frequency ratio (e.g. a stylesheet loaded by 100% of the pages
+ * that load *any* stylesheet would read as a low, "distinctive" frequency if
+ * most pages in the batch load none), letting a genuinely non-discriminative,
+ * site-wide stylesheet slip through the common-href filter.
+ *
+ * Like `computeDocumentFrequency` itself, this expects `pages` to be a
+ * roughly homogeneous batch (one site, or one section of a large
+ * multi-template site) — see that function's JSDoc for why a federation of
+ * independently-templated sub-sections defeats frequency-based filtering.
+ * Splitting a heterogeneous crawl into sections before calling this function
+ * is the caller's responsibility.
+ *
+ * This filtering needs enough stylesheet-bearing pages to tell "loaded by
+ * every page that has any stylesheet" apart from "coincidentally the only
+ * stylesheet two pages happen to load": with only two stylesheet-bearing
+ * pages in the whole batch and nothing else to contrast against, any
+ * stylesheet they share reads as 100% common and gets filtered out,
+ * producing a path-key fallback even when the two pages are a genuine
+ * template match. A third, differently-styled page (as in the example below)
+ * gives the shared stylesheet a frequency below the common-href cutoff.
+ * @param pages - Path and stylesheet signals, one entry per page.
+ * @param options - Optional overrides; `minCssGroupSize` defaults to 2, the rest are forwarded as given.
+ * @returns One `css:<hash>` or `path:<key>` string per page, index-aligned with `pages`.
+ * @throws {RangeError} If `minCssGroupSize` is not an integer >= 2, or if
+ * `pathDepth` / `hrefCommonThreshold` is rejected by the helper that validates it.
+ * @example
+ * ```ts
+ * resolveBlockingGroupKeys([
+ *   { paths: ['dept-a', 'news', '1'], stylesheetHrefs: ['https://example.com/a.css', 'https://example.com/common.css'] },
+ *   { paths: ['dept-a', 'news', '2'], stylesheetHrefs: ['https://example.com/a.css', 'https://example.com/common.css'] },
+ *   { paths: ['dept-b', 'about'], stylesheetHrefs: ['https://example.com/common.css'] },
+ * ]);
+ * // ['css:<hash>', 'css:<hash>', 'path:dept-b']
+ * // common.css is loaded by all 3 pages and is filtered out as non-discriminative chrome.
+ * ```
+ */
+export function resolveBlockingGroupKeys(
+	pages: readonly PageBlockingSignals[],
+	options?: ResolveBlockingGroupKeysOptions,
+): string[] {
+	const pathDepth = options?.pathDepth;
+	const minCssGroupSize = options?.minCssGroupSize ?? DEFAULT_MIN_CSS_GROUP_SIZE;
+	const hrefCommonThreshold = options?.hrefCommonThreshold;
+
+	if (!(Number.isInteger(minCssGroupSize) && minCssGroupSize >= 2)) {
+		throw new RangeError(
+			`resolveBlockingGroupKeys: minCssGroupSize must be an integer >= 2, got ${minCssGroupSize}`,
+		);
+	}
+	// Validate pathDepth/hrefCommonThreshold eagerly via the helpers that own
+	// them, not only once some page's data happens to reach those calls below.
+	derivePathGroupKey([], pathDepth);
+	splitTokensByFrequency(
+		new Set(),
+		{ documentFrequency: new Map(), pageCount: 0 },
+		hrefCommonThreshold,
+	);
+
+	const hrefSets = pages.map((page) => new Set(page.stylesheetHrefs));
+	// Exclude stylesheet-less pages: they would dilute every href's frequency.
+	const corpusFrequency = computeDocumentFrequency(
+		hrefSets.filter((hrefSet) => hrefSet.size > 0),
+	);
+
+	const distinctiveHrefs = hrefSets.map(
+		(hrefSet) =>
+			splitTokensByFrequency(hrefSet, corpusFrequency, hrefCommonThreshold).contentTokens,
+	);
+
+	const cssKeys = distinctiveHrefs.map((hrefs) =>
+		hrefs.size === 0 ? undefined : deriveStylesheetGroupKey([...hrefs]),
+	);
+
+	// Tally pages per css key so groups below minCssGroupSize can be rejected.
+	const cssKeyCounts = new Map<string, number>();
+	for (const cssKey of cssKeys) {
+		if (cssKey !== undefined) {
+			cssKeyCounts.set(cssKey, (cssKeyCounts.get(cssKey) ?? 0) + 1);
+		}
+	}
+
+	return pages.map((page, index) => {
+		const cssKey = cssKeys[index];
+		if (cssKey !== undefined && (cssKeyCounts.get(cssKey) ?? 0) >= minCssGroupSize) {
+			return `css:${cssKey}`;
+		}
+		return `path:${derivePathGroupKey(page.paths, pathDepth)}`;
+	});
+}