Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions cspell.json
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
// page-cluster clustering/distance terminology
"jaccard",
"medoids",
"hrefs",

//
"gaxios",
Expand Down
50 changes: 50 additions & 0 deletions packages/@d-zero/page-cluster/src/derive-path-group-key.spec.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
import { describe, expect, test } from 'vitest';

import { derivePathGroupKey } from './derive-path-group-key.js';

describe('derivePathGroupKey', () => {
	// Three-segment path shared by the cases that only vary `depth`.
	const deepPath = ['dept-a', 'news', '123'];
	// Depth values that are not positive integers and must be rejected.
	const invalidDepths = [0, -1, 0.5, Number.NaN];

	test('an empty path array (root page) returns an empty string', () => {
		const key = derivePathGroupKey([]);
		expect(key).toBe('');
	});

	test('defaults to depth 1, keeping only the top-level segment', () => {
		const key = derivePathGroupKey(deepPath);
		expect(key).toBe('dept-a');
	});

	test('depth 2 keeps the first two segments joined by "/"', () => {
		const key = derivePathGroupKey(deepPath, 2);
		expect(key).toBe('dept-a/news');
	});

	test('depth 3 keeps all segments when the array has exactly that many', () => {
		const key = derivePathGroupKey(deepPath, 3);
		expect(key).toBe('dept-a/news/123');
	});

	test('a depth larger than the array length returns all available segments', () => {
		const key = derivePathGroupKey(['about'], 5);
		expect(key).toBe('about');
	});

	test('a single-segment path with the default depth returns that segment', () => {
		const key = derivePathGroupKey(['about']);
		expect(key).toBe('about');
	});

	test.each(invalidDepths)('rejects a non-positive-integer depth (%s)', (depth) => {
		const call = () => derivePathGroupKey(['dept-a'], depth);
		expect(call).toThrow(RangeError);
	});

	test('a trailing empty segment (from a directory-style URL) does not change the key', () => {
		// A directory-style URL (`https://example.com/dept-a/`) parses to
		// `paths === ['dept-a', '']`, whereas the slash-less form
		// (`https://example.com/dept-a`) parses to `['dept-a']`. Both must
		// land in the same group.
		const withTrailingSlash = derivePathGroupKey(['dept-a', ''], 2);
		const withoutTrailingSlash = derivePathGroupKey(['dept-a'], 2);
		expect(withTrailingSlash).toBe(withoutTrailingSlash);
	});

	test('a root path made of only an empty segment returns an empty string', () => {
		const key = derivePathGroupKey(['']);
		expect(key).toBe('');
	});

	test('an empty segment anywhere in the array is filtered, not just a trailing one', () => {
		const key = derivePathGroupKey(['dept-a', '', 'news'], 3);
		expect(key).toBe('dept-a/news');
	});
});
55 changes: 55 additions & 0 deletions packages/@d-zero/page-cluster/src/derive-path-group-key.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
/**
 * Builds a coarse grouping key for a page from the first `depth` non-empty
 * segments of its URL path, joined with `/`.
 *
 * The key is a *blocking key* in the record-linkage sense: an inexpensive,
 * coarse partition that is applied before any costly structural comparison
 * (`jaccardSimilarity`/`arrayEditDistance` over `tokenize()` output). It is
 * not a similarity score. The motivation comes from validation against a
 * large multi-section site: computing document frequency once for the whole
 * site ({@link ./compute-document-frequency.js | computeDocumentFrequency})
 * broke down because the site was really a federation of sub-sections, each
 * with its own templates. Partitioning pages by top-level URL segment and
 * computing frequency per partition produced a usable split; this function
 * yields the partition key.
 *
 * Only this single signal is returned. It is intentionally not fused with
 * other blocking signals (such as a stylesheet-based key) into one composite
 * key: the entity-resolution blocking literature (e.g. Michelson &
 * Knoblock's DNF blocking scheme) reports that AND-ing independent blocking
 * predicates into a single key performs worse than keeping them separate
 * and OR-ing the resulting candidate pairs. How to combine signals is the
 * responsibility of the caller that groups pages.
 *
 * Empty segments are ignored wherever they occur in `paths`, and they do not
 * count towards `depth`. In particular, the trailing `''` that `ExURL.paths`
 * contains for a directory-style URL (one ending in `/`) does not separate a
 * section's key from that of the same section's URLs without a trailing
 * slash.
 *
 * `depth` is validated eagerly. A non-positive or fractional value cannot be
 * read as "number of leading segments to keep", so it is rejected instead of
 * being coerced silently or allowed to produce a confusing result.
 * @param paths URL path segments, e.g. the `paths` field of
 *   `@d-zero/shared/parse-url`'s `ExURL`. Not mutated.
 * @param depth How many leading non-empty segments to keep. Must be a
 *   positive integer; defaults to `1`.
 * @returns The kept segments joined with `/`, or `''` when `paths` has no
 *   non-empty segment.
 * @throws {RangeError} When `depth` is not a positive integer (zero,
 *   negative, fractional, `NaN`, or infinite).
 * @example
 * ```ts
 * derivePathGroupKey(['dept-a', 'news', '123']);
 * // 'dept-a'
 * derivePathGroupKey(['dept-a', 'news', '123'], 2);
 * // 'dept-a/news'
 * derivePathGroupKey([]);
 * // ''
 * ```
 */
export function derivePathGroupKey(paths: readonly string[], depth: number = 1): string {
	if (!Number.isInteger(depth) || depth <= 0) {
		throw new RangeError(
			`derivePathGroupKey: depth must be a positive integer, got ${depth}`,
		);
	}

	// Collect non-empty segments in order, stopping as soon as `depth` of
	// them have been gathered. Skipping `''` is what makes directory-style
	// URLs (trailing slash) share a key with their slash-less counterparts.
	const leading: string[] = [];
	for (const segment of paths) {
		if (segment === '') {
			continue;
		}
		leading.push(segment);
		if (leading.length === depth) {
			break;
		}
	}

	return leading.join('/');
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
import { describe, expect, test } from 'vitest';

import { deriveStylesheetGroupKey } from './derive-stylesheet-group-key.js';

describe('deriveStylesheetGroupKey', () => {
	// Hrefs reused across cases; the literal values matter because two of the
	// cases below pin the resulting hash.
	const A_CSS = 'https://example.com/a.css';
	const B_CSS = 'https://example.com/b.css';

	test('an empty array returns a fixed, pinned key', () => {
		const key = deriveStylesheetGroupKey([]);
		expect(key).toBe('4f53cda18c2baa0c');
	});

	test('a known two-href input hashes to a pinned literal key', () => {
		const key = deriveStylesheetGroupKey([A_CSS, B_CSS]);
		expect(key).toBe('9742767276316d59');
	});

	test('is deterministic for the same input', () => {
		const hrefs = [
			'https://example.com/assets/site.css',
			'https://example.com/assets/theme.css',
		];
		const first = deriveStylesheetGroupKey(hrefs);
		const second = deriveStylesheetGroupKey(hrefs);
		expect(first).toBe(second);
	});

	test('is independent of input order', () => {
		const forward = deriveStylesheetGroupKey([A_CSS, B_CSS]);
		const reversed = deriveStylesheetGroupKey([B_CSS, A_CSS]);
		expect(forward).toBe(reversed);
	});

	test('different stylesheet sets produce different keys', () => {
		const single = deriveStylesheetGroupKey([A_CSS]);
		const pair = deriveStylesheetGroupKey([A_CSS, B_CSS]);
		expect(single).not.toBe(pair);
	});

	test('duplicate hrefs do not change the key compared to the deduplicated set', () => {
		const withDuplicates = deriveStylesheetGroupKey([A_CSS, A_CSS, B_CSS]);
		const deduplicated = deriveStylesheetGroupKey([A_CSS, B_CSS]);
		expect(withDuplicates).toBe(deduplicated);
	});

	test('a single href containing a space is not confused with two separately joined hrefs', () => {
		// If hrefs were joined with "\n" and whitespace were then collapsed,
		// the two inputs below would collide. Serializing the list as JSON
		// before hashing keeps them apart.
		const twoHrefs = deriveStylesheetGroupKey([A_CSS, B_CSS]);
		const oneHrefWithSpace = deriveStylesheetGroupKey([
			'https://example.com/a.css b.css',
		]);
		expect(twoHrefs).not.toBe(oneHrefWithSpace);
	});
});
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
import { hash } from '@d-zero/shared/hash';

import { HASH_LENGTH } from './hash-content.js';

/**
 * Builds a coarse grouping key from the set of stylesheet URLs that a page
 * loads.
 *
 * Like {@link ./derive-path-group-key.js | derivePathGroupKey}, the result
 * is a *blocking key* in the record-linkage sense. Two pages with exactly
 * the same stylesheet set almost certainly belong to the same template
 * family, so the signal is strong; it is also sparse, because many pages
 * load few or no stylesheets. Use it together with weaker signals that are
 * always available (such as a URL-path-based key), not as a replacement for
 * them.
 *
 * The function compares the given strings verbatim. Callers must therefore
 * pass hrefs that are already comparable across the whole corpus (e.g.
 * absolute URLs). If two pages in different directories both carry the same
 * unresolved relative text (say `href="style.css"`), they would load
 * different physical files yet receive the same key here, unless each href
 * was resolved against its page's URL beforehand.
 *
 * The key does not depend on input order or on repetition: where `<link>`
 * tags sit in a document says nothing about template identity, and a
 * repeated href adds no information about what the page loads. Hrefs are
 * therefore deduplicated and sorted before hashing. The sorted list is
 * serialized as JSON instead of being joined with a plain delimiter (e.g.
 * `"\n"`), so no character sequence inside a single href can be read as a
 * boundary between two hrefs. The serialized list is then hashed (SHA-256)
 * and truncated to {@link ./hash-content.js | HASH_LENGTH}, the same length
 * used by the package's other hashed keys, so the key has a fixed size
 * however many stylesheets a page loads and however long their URLs are.
 * @param stylesheetHrefs Stylesheet hrefs of one page, already resolved to a
 *   corpus-wide comparable form. Not mutated.
 * @returns The hash of the canonicalized href set, truncated to
 *   `HASH_LENGTH` characters.
 * @example
 * ```ts
 * deriveStylesheetGroupKey(['https://example.com/assets/site.css', 'https://example.com/assets/theme.css']);
 * deriveStylesheetGroupKey(['https://example.com/assets/theme.css', 'https://example.com/assets/site.css']);
 * // same result for both calls above — order-independent
 * ```
 */
export function deriveStylesheetGroupKey(stylesheetHrefs: readonly string[]): string {
	// Canonicalize: drop repeats, then order the remaining hrefs.
	const uniqueHrefs = new Set(stylesheetHrefs);
	const canonicalHrefs = Array.from(uniqueHrefs).toSorted();

	// JSON keeps href boundaries unambiguous in the hashed payload.
	const payload = JSON.stringify(canonicalHrefs);
	const digest = hash(payload);

	return digest.slice(0, HASH_LENGTH);
}
6 changes: 5 additions & 1 deletion packages/@d-zero/page-cluster/src/hash-content.ts
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,12 @@ import { normalizeForHash } from './normalize-for-hash.js';
* to answer "did this script/style/svg/comment's content change", not
* resist deliberate collision attacks, so the reduced collision resistance
* of a truncated digest is an acceptable trade-off.
*
* Exported so other hashed-key producers in this package (e.g.
* `deriveStylesheetGroupKey`) use the same truncation length instead of
* picking their own.
*/
const HASH_LENGTH = 16;
export const HASH_LENGTH = 16;

/**
* Hashes `script`/`style`/`svg`/`noscript`/comment content instead of
Expand Down
Loading