From fb294645aef8337e738f8c77c88135b88f58b5e8 Mon Sep 17 00:00:00 2001 From: Yusuke Hirao Date: Fri, 3 Jul 2026 21:05:44 +0900 Subject: [PATCH 1/2] feat(page-cluster): add URL-path and stylesheet blocking-key derivation Add derivePathGroupKey and deriveStylesheetGroupKey, two independent "blocking key" functions (record-linkage sense) that turn a page's URL path segments and loaded-stylesheet list into coarse partition keys. These feed a future classifier's homogeneous-group discovery step, ahead of the expensive structural comparison tokenize()/jaccardSimilarity() already provide. Export HASH_LENGTH from hash-content.ts so deriveStylesheetGroupKey reuses the same hash truncation length instead of picking its own. Add "hrefs" to the cspell word list, used across the new files' JSDoc and tests. --- cspell.json | 1 + .../src/derive-path-group-key.spec.ts | 50 +++++++++++++ .../page-cluster/src/derive-path-group-key.ts | 50 +++++++++++++ .../src/derive-stylesheet-group-key.spec.ts | 74 +++++++++++++++++++ .../src/derive-stylesheet-group-key.ts | 43 +++++++++++ .../@d-zero/page-cluster/src/hash-content.ts | 6 +- 6 files changed, 223 insertions(+), 1 deletion(-) create mode 100644 packages/@d-zero/page-cluster/src/derive-path-group-key.spec.ts create mode 100644 packages/@d-zero/page-cluster/src/derive-path-group-key.ts create mode 100644 packages/@d-zero/page-cluster/src/derive-stylesheet-group-key.spec.ts create mode 100644 packages/@d-zero/page-cluster/src/derive-stylesheet-group-key.ts diff --git a/cspell.json b/cspell.json index 313ed5e3..a046e6d4 100644 --- a/cspell.json +++ b/cspell.json @@ -13,6 +13,7 @@ // page-cluster clustering/distance terminology "jaccard", "medoids", + "hrefs", // "gaxios", diff --git a/packages/@d-zero/page-cluster/src/derive-path-group-key.spec.ts b/packages/@d-zero/page-cluster/src/derive-path-group-key.spec.ts new file mode 100644 index 00000000..86a07ddc --- /dev/null +++ 
b/packages/@d-zero/page-cluster/src/derive-path-group-key.spec.ts @@ -0,0 +1,50 @@ +import { describe, expect, test } from 'vitest'; + +import { derivePathGroupKey } from './derive-path-group-key.js'; + +describe('derivePathGroupKey', () => { + test('an empty path array (root page) returns an empty string', () => { + expect(derivePathGroupKey([])).toBe(''); + }); + + test('defaults to depth 1, keeping only the top-level segment', () => { + expect(derivePathGroupKey(['dept-a', 'news', '123'])).toBe('dept-a'); + }); + + test('depth 2 keeps the first two segments joined by "/"', () => { + expect(derivePathGroupKey(['dept-a', 'news', '123'], 2)).toBe('dept-a/news'); + }); + + test('depth 3 keeps all segments when the array has exactly that many', () => { + expect(derivePathGroupKey(['dept-a', 'news', '123'], 3)).toBe('dept-a/news/123'); + }); + + test('a depth larger than the array length returns all available segments', () => { + expect(derivePathGroupKey(['about'], 5)).toBe('about'); + }); + + test('a single-segment path with the default depth returns that segment', () => { + expect(derivePathGroupKey(['about'])).toBe('about'); + }); + + test.each([0, -1, 0.5, Number.NaN])( + 'rejects a non-positive-integer depth (%s)', + (depth) => { + expect(() => derivePathGroupKey(['dept-a'], depth)).toThrow(RangeError); + }, + ); + + test('a trailing empty segment (from a directory-style URL) does not change the key', () => { + // `parseUrl('https://example.com/dept-a/').paths` is `['dept-a', '']`, + // while `parseUrl('https://example.com/dept-a').paths` is `['dept-a']`. 
+ expect(derivePathGroupKey(['dept-a', ''], 2)).toBe(derivePathGroupKey(['dept-a'], 2)); + }); + + test('a root path made of only an empty segment returns an empty string', () => { + expect(derivePathGroupKey([''])).toBe(''); + }); + + test('an empty segment anywhere in the array is filtered, not just a trailing one', () => { + expect(derivePathGroupKey(['dept-a', '', 'news'], 3)).toBe('dept-a/news'); + }); +}); diff --git a/packages/@d-zero/page-cluster/src/derive-path-group-key.ts b/packages/@d-zero/page-cluster/src/derive-path-group-key.ts new file mode 100644 index 00000000..08727b4d --- /dev/null +++ b/packages/@d-zero/page-cluster/src/derive-path-group-key.ts @@ -0,0 +1,50 @@ +/** + * Derives a coarse grouping key from a page's URL path segments (e.g. the + * `paths` field of `@d-zero/shared/parse-url`'s `ExURL`), keeping only the + * leading `depth` segments. This is a *blocking key* in the record-linkage + * sense: a cheap, coarse partition applied before any expensive structural + * comparison (`jaccardSimilarity`/`arrayEditDistance` on `tokenize()` + * output), not a similarity score by itself. Real-data validation on a large + * multi-section site found that a single site-wide document-frequency + * computation ({@link ./compute-document-frequency.js | computeDocumentFrequency}) + * fails when the site is actually a federation of independently-templated + * sub-sections; splitting pages by their top-level URL segment first and + * computing frequency per group recovered a working split. This function + * produces that split key. + * + * Deliberately returns only this one signal rather than merging it with + * other blocking signals (e.g. a stylesheet-based key) into a single + * composite key: literature on entity-resolution blocking (e.g. 
Michelson & + * Knoblock's DNF blocking scheme) finds that combining independent blocking + * predicates with AND into one key is inferior to keeping them independent + * and combining candidate pairs with OR — that combination decision belongs + * to the caller that actually groups pages, not to this function. + * + * Empty segments anywhere in `paths` are dropped before slicing, so the + * trailing `''` that `ExURL.paths` produces for a directory-style URL (one + * ending in `/`) doesn't fragment a section's key from the same section's + * non-trailing-slash URLs. + * @param paths + * @param depth + * @example + * ```ts + * derivePathGroupKey(['dept-a', 'news', '123']); + * // 'dept-a' + * derivePathGroupKey(['dept-a', 'news', '123'], 2); + * // 'dept-a/news' + * derivePathGroupKey([]); + * // '' + * ``` + */ +export function derivePathGroupKey(paths: readonly string[], depth: number = 1): string { + if (!(Number.isInteger(depth) && depth > 0)) { + throw new RangeError( + `derivePathGroupKey: depth must be a positive integer, got ${depth}`, + ); + } + + // See the trailing-slash note above. 
+ const segments = paths.filter((segment) => segment !== ''); + + return segments.slice(0, depth).join('/'); +} diff --git a/packages/@d-zero/page-cluster/src/derive-stylesheet-group-key.spec.ts b/packages/@d-zero/page-cluster/src/derive-stylesheet-group-key.spec.ts new file mode 100644 index 00000000..c1442b6e --- /dev/null +++ b/packages/@d-zero/page-cluster/src/derive-stylesheet-group-key.spec.ts @@ -0,0 +1,74 @@ +import { describe, expect, test } from 'vitest'; + +import { deriveStylesheetGroupKey } from './derive-stylesheet-group-key.js'; + +describe('deriveStylesheetGroupKey', () => { + test('an empty array returns a fixed, pinned key', () => { + expect(deriveStylesheetGroupKey([])).toBe('4f53cda18c2baa0c'); + }); + + test('a known two-href input hashes to a pinned literal key', () => { + expect( + deriveStylesheetGroupKey([ + 'https://example.com/a.css', + 'https://example.com/b.css', + ]), + ).toBe('9742767276316d59'); + }); + + test('is deterministic for the same input', () => { + const hrefs = [ + 'https://example.com/assets/site.css', + 'https://example.com/assets/theme.css', + ]; + expect(deriveStylesheetGroupKey(hrefs)).toBe(deriveStylesheetGroupKey(hrefs)); + }); + + test('is independent of input order', () => { + const a = deriveStylesheetGroupKey([ + 'https://example.com/a.css', + 'https://example.com/b.css', + ]); + const b = deriveStylesheetGroupKey([ + 'https://example.com/b.css', + 'https://example.com/a.css', + ]); + expect(a).toBe(b); + }); + + test('different stylesheet sets produce different keys', () => { + const a = deriveStylesheetGroupKey(['https://example.com/a.css']); + const b = deriveStylesheetGroupKey([ + 'https://example.com/a.css', + 'https://example.com/b.css', + ]); + expect(a).not.toBe(b); + }); + + test('duplicate hrefs do not change the key compared to the deduplicated set', () => { + const withDuplicates = deriveStylesheetGroupKey([ + 'https://example.com/a.css', + 'https://example.com/a.css', + 'https://example.com/b.css', 
+ ]); + const deduplicated = deriveStylesheetGroupKey([ + 'https://example.com/a.css', + 'https://example.com/b.css', + ]); + expect(withDuplicates).toBe(deduplicated); + }); + + test('a single href containing a space is not confused with two separately joined hrefs', () => { + // A naive "\n"-join followed by whitespace-collapsing normalization + // would make these two collide; JSON-serializing before hashing keeps + // them distinct. + const twoHrefs = deriveStylesheetGroupKey([ + 'https://example.com/a.css', + 'https://example.com/b.css', + ]); + const oneHrefWithSpace = deriveStylesheetGroupKey([ + 'https://example.com/a.css b.css', + ]); + expect(twoHrefs).not.toBe(oneHrefWithSpace); + }); +}); diff --git a/packages/@d-zero/page-cluster/src/derive-stylesheet-group-key.ts b/packages/@d-zero/page-cluster/src/derive-stylesheet-group-key.ts new file mode 100644 index 00000000..42a937b6 --- /dev/null +++ b/packages/@d-zero/page-cluster/src/derive-stylesheet-group-key.ts @@ -0,0 +1,43 @@ +import { hash } from '@d-zero/shared/hash'; + +import { HASH_LENGTH } from './hash-content.js'; + +/** + * Derives a coarse grouping key from the set of stylesheet URLs a page + * loads. This is a *blocking key* in the record-linkage sense (see + * {@link ./derive-path-group-key.js | derivePathGroupKey}): pages sharing + * the exact same stylesheet set are near-certainly the same template + * family, making this a strong but sparse signal — many pages load few or + * no stylesheets, so this key is meant to be used alongside, not instead + * of, weaker-but-always-present signals like a URL-path-based key. + * + * `stylesheetHrefs` must already be resolved to a form that is comparable + * across the whole corpus (e.g. absolute URLs). This function only compares + * the strings it is given: two pages that both reference the same + * unresolved relative href text (e.g. 
both link `href="style.css"`) but + * from different directories, and would therefore load different physical + * files, produce the same key here unless the caller has already resolved + * each href against its page's URL before calling. + * + * Input order does not affect the result: the arrangement of `<link>` tags + * in a document has no bearing on template identity, so hrefs are sorted + * (and deduplicated, since a repeated href contributes no extra information + * about what the page loads) before hashing. The sorted list is + * JSON-serialized before hashing rather than joined with a plain delimiter + * (e.g. `"\n"`) so that no character sequence inside one href can be + * mistaken for a boundary between two hrefs. Hashing (via SHA-256, reusing + * {@link ./hash-content.js | HASH_LENGTH} for the same truncation length the + * package's other hashed keys use) keeps the key a fixed size regardless of + * how many stylesheets a page loads or how long their URLs are. + * @param stylesheetHrefs + * @example + * ```ts + * deriveStylesheetGroupKey(['https://example.com/assets/site.css', 'https://example.com/assets/theme.css']); + * deriveStylesheetGroupKey(['https://example.com/assets/theme.css', 'https://example.com/assets/site.css']); + * // same result for both calls above — order-independent + * ``` + */ +export function deriveStylesheetGroupKey(stylesheetHrefs: readonly string[]): string { + const sorted = [...new Set(stylesheetHrefs)].toSorted(); + return hash(JSON.stringify(sorted)).slice(0, HASH_LENGTH); +} diff --git a/packages/@d-zero/page-cluster/src/hash-content.ts b/packages/@d-zero/page-cluster/src/hash-content.ts index 57c65fe8..0ff71b70 100644 --- a/packages/@d-zero/page-cluster/src/hash-content.ts +++ b/packages/@d-zero/page-cluster/src/hash-content.ts @@ -8,8 +8,12 @@ import { normalizeForHash } from './normalize-for-hash.js'; * to answer "did this script/style/svg/comment's content change", not * resist deliberate collision attacks, so the reduced
collision resistance * of a truncated digest is an acceptable trade-off. + * + * Exported so other hashed-key producers in this package (e.g. + * `deriveStylesheetGroupKey`) use the same truncation length instead of + * picking their own. */ -const HASH_LENGTH = 16; +export const HASH_LENGTH = 16; /** * Hashes `script`/`style`/`svg`/`noscript`/comment content instead of From 67e4fad2d7d3216cf0f705f8572ef5c39085c27f Mon Sep 17 00:00:00 2001 From: Yusuke Hirao Date: Fri, 3 Jul 2026 21:14:40 +0900 Subject: [PATCH 2/2] docs(page-cluster): document derivePathGroupKey's depth validation constraint --- packages/@d-zero/page-cluster/src/derive-path-group-key.ts | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/packages/@d-zero/page-cluster/src/derive-path-group-key.ts b/packages/@d-zero/page-cluster/src/derive-path-group-key.ts index 08727b4d..40d154b6 100644 --- a/packages/@d-zero/page-cluster/src/derive-path-group-key.ts +++ b/packages/@d-zero/page-cluster/src/derive-path-group-key.ts @@ -24,6 +24,11 @@ * trailing `''` that `ExURL.paths` produces for a directory-style URL (one * ending in `/`) doesn't fragment a section's key from the same section's * non-trailing-slash URLs. + * + * `depth` must be a positive integer. A non-positive or fractional depth has + * no sensible interpretation as "how many leading segments to keep", so it + * is rejected eagerly rather than silently coerced or left to produce a + * confusing result (e.g. slicing with a negative or fractional length). * @param paths * @param depth * @example