diff --git a/ARCHITECTURE.md b/ARCHITECTURE.md index c6cba0cb..4f44c1e1 100644 --- a/ARCHITECTURE.md +++ b/ARCHITECTURE.md @@ -628,6 +628,25 @@ updatePage(pageData) の処理: # destPageId の anchors / images は「置き換え」で保存(→「再スクレイプ時の…」参照) ``` +### 被リンク/参照の redirect 透過解決(#71) + +被リンク(incoming links / referrers)は **読み取り時に redirect を透過解決**する。アンカーがリダイレクト元(例: `http://x` が `https://x` に 301)を指していても、そのリンクは最終宛先(canonical ページ)の被リンクとして集約される。これにより `http`/`https` の別や、同一ページへ至る複数のリダイレクト経路があっても、被リンクが正規ページに合算され分裂しない。 + +**解決規則:** `redirectDestId` は `#linkRedirectSources` が常に**最終宛先まで pre-flatten** する(`A → B → X` のとき A も B も `redirectDestId = X`)。そのため再帰的なチェーン走査は不要で、`COALESCE(target.redirectDestId, target.id)` の **1 ホップ**で最終宛先が求まる。これは `redirectTable()`(`A.redirectDestId = B.id` UNION identity)と同一セマンティクス。 + +**読み取り経路間の一貫性:** 以下はすべて同じ規則で解決する。 + +| 関数 | パッケージ | 用途 | +| ------------------------------------------------------- | ---------- | ------------------------------------------------ | +| `getPagesWithRels`(`redirect.from`/`fromId` = 経由元) | crawler | report(Google Sheets) | +| `getReferrersOfPage`(`through`/`throughId` = 経由元) | crawler | `Page.getReferrers`/`getRequests` フォールバック | +| `getPageDetail.inboundLinks` | query | viewer / mcp / cli | +| `listPageLinks.referrerCount` | query | viewer | + +`through` / `throughId` は「アンカーが実際に指したページ」の URL / ID(redirect 経由ならリダイレクト元、直リンクなら宛先ページ自身)で、report の `[REDIRECTED FROM]` 注記に使う。 + +**意図的な非対称性(発リンクは解決しない):** **inbound(被リンク)**は redirect 透過で canonical に集約する一方、**outbound(発リンク)**は `getPageDetail.outboundLinks` がアンカーの **raw な指し先**(例: `http://x`)をそのまま返す。これは「このページは古い/リダイレクトする URL にリンクしている」という監査シグナルを保持するための設計。**この非対称性を「統一」しようとしないこと**(発リンク側を解決すると監査情報が失われる)。 + ### 再スクレイプ時の anchors / images(置き換えセマンティクス) 同一ページは 1 クロール内でも複数回 `updatePage` されうる。最も多いのは **多対一リダイレクト**: 多数の旧 URL が 301 で 1 つの宛先ページ D に集約されると、クローラはリダイレクト元 URL を 1 つずつスクレイプし、そのたびに D を再取得して D の anchors / images を保存する(`crawl --resume` で実行をまたいでも同様)。 diff --git a/packages/@nitpicker/crawler/src/archive/database.spec.ts 
b/packages/@nitpicker/crawler/src/archive/database.spec.ts index a99305e1..7c96a1b6 100644 --- a/packages/@nitpicker/crawler/src/archive/database.spec.ts +++ b/packages/@nitpicker/crawler/src/archive/database.spec.ts @@ -730,6 +730,126 @@ describe('re-scrape: 同一ページの再 updatePage', () => { } }); + it('被リンクを redirect 越しに解決する: http 元へのリンクが https 宛先の被リンクに合算される (#71)', async () => { + const dbPath = path.resolve(workingDir, 'referrers-redirect-merge.sqlite'); + const db = await Database.connect({ workingDir, filename: dbPath }); + const destUrl = 'https://localhost/page'; + const srcUrl = 'http://localhost/page'; + + try { + // 1) https 宛先(実コンテンツ)。 + await db.updatePage( + { + url: parseUrl(destUrl)!, + redirectPaths: [], + isExternal: false, + status: 200, + statusText: 'OK', + contentLength: 100, + contentType: 'text/html', + responseHeaders: {}, + meta: { title: 'Page' }, + anchorList: [], + imageList: [], + html: '', + isSkipped: false, + }, + workingDir, + true, + ); + + // 2) http 元が https 宛先へ 301(src.redirectDestId = dest.id)。 + await db.updatePage( + { + url: parseUrl(srcUrl)!, + redirectPaths: [destUrl], + isExternal: false, + status: 200, + statusText: 'OK', + contentLength: 100, + contentType: 'text/html', + responseHeaders: {}, + meta: { title: 'Page (http)' }, + anchorList: [], + imageList: [], + html: '', + isSkipped: false, + }, + workingDir, + true, + ); + + // 3) 一方は https 宛先を直リンク、もう一方は http 元をリンク。 + await db.updatePage( + { + url: parseUrl('http://localhost/linker-https')!, + redirectPaths: [], + isExternal: false, + status: 200, + statusText: 'OK', + contentLength: 100, + contentType: 'text/html', + responseHeaders: {}, + meta: { title: 'Linker https' }, + anchorList: [ + { href: parseUrl(destUrl)!, textContent: 'direct', isExternal: false }, + ], + imageList: [], + html: '', + isSkipped: false, + }, + workingDir, + true, + ); + await db.updatePage( + { + url: parseUrl('http://localhost/linker-http')!, + redirectPaths: [], + isExternal: false, + 
status: 200, + statusText: 'OK', + contentLength: 100, + contentType: 'text/html', + responseHeaders: {}, + meta: { title: 'Linker http' }, + anchorList: [ + { href: parseUrl(srcUrl)!, textContent: 'via http', isExternal: false }, + ], + imageList: [], + html: '', + isSkipped: false, + }, + workingDir, + true, + ); + + const knex = db.getKnex(); + const [dest] = await knex.from('pages').select('id').where('url', destUrl); + + // 両リンクが宛先の被リンクに合算される(http/https で分裂しない)。 + const refs = await db.getReferrersOfPage(dest.id); + const urls = refs.map((r) => r.url).toSorted(); + expect(urls).toEqual([ + 'http://localhost/linker-http', + 'http://localhost/linker-https', + ]); + + // through はアンカーが実際に指した URL(直リンクなら宛先、redirect 経由なら元)を返す。 + const viaHttp = refs.find((r) => r.url === 'http://localhost/linker-http'); + const direct = refs.find((r) => r.url === 'http://localhost/linker-https'); + expect(viaHttp!.through).toBe(srcUrl); + expect(direct!.through).toBe(destUrl); + + // 元(http)ページ側の被リンクは空(宛先に付け替わるため二重計上しない)。 + const [src] = await knex.from('pages').select('id').where('url', srcUrl); + const srcRefs = await db.getReferrersOfPage(src.id); + expect(srcRefs).toHaveLength(0); + } finally { + await db.destroy(); + await remove(dbPath); + } + }); + it('ページ内に正当な同一リンク(ヘッダー/フッター重複)がある場合、再スクレイプでも件数を保持する', async () => { // 実アーカイブの「重複」の大半は、全ページのヘッダー/フッターに同じリンクが // 並ぶ正当なページ内重複。delete-then-insert は anchorList をそのまま入れ直す diff --git a/packages/@nitpicker/crawler/src/archive/database.ts b/packages/@nitpicker/crawler/src/archive/database.ts index fab489a6..95c9709d 100644 --- a/packages/@nitpicker/crawler/src/archive/database.ts +++ b/packages/@nitpicker/crawler/src/archive/database.ts @@ -498,6 +498,14 @@ export class Database extends EventEmitter { } /** * Retrieves pages that link to a specific page (incoming links / referrers). + * + * Incoming links are resolved **through redirects**: an anchor pointing at a + * redirect source (e.g. 
`http://x` that 301s to `https://x`) counts as a + * referrer of the redirect's final destination, not of the source. This keeps + * backlinks merged on the canonical page instead of splitting them across the + * `http`/`https` (or any redirect source/dest) pair. The resolution mirrors + * `redirectTable()` — `redirectDestId` is pre-flattened to the final + * destination, so `COALESCE(target.redirectDestId, target.id)` is a single hop. * @param pageId - The database ID of the target page. * @returns An array of referrer records with URL, hash, and text content. */ @@ -505,10 +513,21 @@ export class Database extends EventEmitter { @retry(retrySetting) async getReferrersOfPage(pageId: number) { const res = await this.#instance - .select('pages.url', 'anchors.hash', 'anchors.textContent') + .select( + 'referrer.url', + // `through` / `throughId` = URL / id of the page the anchor actually pointed at: + // the redirect source (e.g. `http://x`), or the target itself for a direct link. + // Mirrors `getPagesWithRels`' `redirect.from` / `redirect.fromId`, so report code + // can print the "[REDIRECTED FROM]" note even on this (non-preloaded) referrer path. 
+ 'target.url as through', + 'target.id as throughId', + 'anchors.hash', + 'anchors.textContent', + ) .from('anchors') - .join('pages', 'anchors.pageId', '=', 'pages.id') - .where('anchors.hrefId', pageId); + .join('pages as referrer', 'anchors.pageId', '=', 'referrer.id') + .join('pages as target', 'anchors.hrefId', '=', 'target.id') + .whereRaw('coalesce("target"."redirectDestId", "target"."id") = ?', [pageId]); return res; } /** diff --git a/packages/@nitpicker/crawler/src/archive/page.spec.ts b/packages/@nitpicker/crawler/src/archive/page.spec.ts index d3456190..89a3bc5e 100644 --- a/packages/@nitpicker/crawler/src/archive/page.spec.ts +++ b/packages/@nitpicker/crawler/src/archive/page.spec.ts @@ -341,6 +341,34 @@ describe('Page', () => { await page.getReferrers(); expect(archive.getReferrersOfPage).toHaveBeenCalledWith(7); }); + + it('プリロード無しのフォールバックでも through/throughId を含む Referrer 形状にマップする', async () => { + // getReferrersOfPage は redirect 解決済みの行(through = アンカーが実際に + // 指した URL)を返す。フォールバック経路でも #rawReferrers 経路と同じ形状に + // マップされ、report の "[REDIRECTED FROM]" 判定が機能することを保証する。 + const archive = createMockArchive({ + getReferrersOfPage: vi.fn().mockResolvedValue([ + { + url: 'https://example.com/linker', + through: 'http://example.com/page', + throughId: 9, + hash: null, + textContent: null, + }, + ]), + }); + const page = new Page(archive as never, createRawPage({ id: 7 })); + const referrers = await page.getReferrers(); + expect(referrers).toEqual([ + { + url: 'https://example.com/linker', + through: 'http://example.com/page', + throughId: 9, + hash: null, + textContent: '', + }, + ]); + }); }); describe('getHtml', () => { @@ -380,5 +408,30 @@ describe('Page', () => { await page.getRequests(); expect(archive.getReferrersOfPage).toHaveBeenCalledWith(3); }); + + it('through/throughId を含む Referrer 形状にマップする', async () => { + const archive = createMockArchive({ + getReferrersOfPage: vi.fn().mockResolvedValue([ + { + url: 'https://example.com/linker', + through: 
'http://example.com/page', + throughId: 9, + hash: 'sec', + textContent: 'text', + }, + ]), + }); + const page = new Page(archive as never, createRawPage({ id: 3 })); + const requests = await page.getRequests(); + expect(requests).toEqual([ + { + url: 'https://example.com/linker', + through: 'http://example.com/page', + throughId: 9, + hash: 'sec', + textContent: 'text', + }, + ]); + }); }); }); diff --git a/packages/@nitpicker/crawler/src/archive/page.ts b/packages/@nitpicker/crawler/src/archive/page.ts index 047faa9a..abea4cb6 100644 --- a/packages/@nitpicker/crawler/src/archive/page.ts +++ b/packages/@nitpicker/crawler/src/archive/page.ts @@ -296,7 +296,14 @@ export default class Page { textContent: r.textContent || '', })); } - return this.#archive.getReferrersOfPage(this.#raw.id); + const refs = await this.#archive.getReferrersOfPage(this.#raw.id); + return refs.map((r) => ({ + url: r.url, + through: r.through, + throughId: r.throughId, + hash: r.hash, + textContent: r.textContent || '', + })); } /** @@ -305,7 +312,14 @@ export default class Page { * @returns An array of {@link Referrer} objects. 
*/ async getRequests(): Promise { - return this.#archive.getReferrersOfPage(this.#raw.id); + const refs = await this.#archive.getReferrersOfPage(this.#raw.id); + return refs.map((r) => ({ + url: r.url, + through: r.through, + throughId: r.throughId, + hash: r.hash, + textContent: r.textContent || '', + })); } /** diff --git a/packages/@nitpicker/query/src/get-page-detail.spec.ts b/packages/@nitpicker/query/src/get-page-detail.spec.ts index dce38511..34082f35 100644 --- a/packages/@nitpicker/query/src/get-page-detail.spec.ts +++ b/packages/@nitpicker/query/src/get-page-detail.spec.ts @@ -162,3 +162,169 @@ describe('getPageDetail', () => { expect(result).toBeNull(); }); }); + +describe('getPageDetail: 被リンクを redirect 越しに解決する(http/https 合算, #71)', () => { + let archive: InstanceType; + const dir = path.resolve(__dirname, '__test_fixtures_get_page_detail_redirect__'); + const archiveFilePath = path.resolve(dir, 'page-detail-redirect.nitpicker'); + + /** + * Minimal empty metadata object shared by the redirect-resolution fixtures. + * Avoids repeating the full nullable meta shape in every `setPage` call. 
+ */ + const emptyMeta = { + lang: null, + title: null, + description: null, + keywords: null, + noindex: false, + nofollow: false, + noarchive: false, + canonical: null, + alternate: null, + 'og:type': null, + 'og:title': null, + 'og:site_name': null, + 'og:description': null, + 'og:url': null, + 'og:image': null, + 'twitter:card': null, + }; + + beforeAll(async () => { + const { mkdirSync } = await import('node:fs'); + mkdirSync(dir, { recursive: true }); + archive = await Archive.create({ filePath: archiveFilePath, cwd: dir }); + await archive.setConfig({ + baseUrl: 'https://example.com', + name: 'test', + version: '0.4.4', + recursive: true, + interval: 0, + image: true, + fetchExternal: false, + parallels: 1, + roots: ['https://example.com'], + excludes: [], + excludeKeywords: [], + excludeUrls: [], + maxExcludedDepth: 0, + retry: 3, + fromList: false, + disableQueries: false, + userAgent: 'test', + ignoreRobots: false, + }); + + // 1) Canonical destination — the https content page. + await archive.setPage({ + url: parseUrl('https://example.com/page')!, + redirectPaths: [], + isExternal: false, + isTarget: true, + status: 200, + statusText: 'OK', + contentType: 'text/html', + contentLength: 100, + responseHeaders: {}, + html: 'Page', + meta: { ...emptyMeta, title: 'Page' }, + anchorList: [], + imageList: [], + isSkipped: false, + }); + + // 2) http source that 301s to the https destination → http.redirectDestId = https.id. + await archive.setPage({ + url: parseUrl('http://example.com/page')!, + redirectPaths: ['https://example.com/page'], + isExternal: false, + isTarget: true, + status: 200, + statusText: 'OK', + contentType: 'text/html', + contentLength: 100, + responseHeaders: {}, + html: '', + meta: { ...emptyMeta }, + anchorList: [], + imageList: [], + isSkipped: false, + }); + + // 3) A page linking the https destination DIRECTLY. 
+ await archive.setPage({ + url: parseUrl('https://example.com/linker-https')!, + redirectPaths: [], + isExternal: false, + isTarget: true, + status: 200, + statusText: 'OK', + contentType: 'text/html', + contentLength: 100, + responseHeaders: {}, + html: '', + meta: { ...emptyMeta }, + anchorList: [ + { + href: parseUrl('https://example.com/page')!, + isExternal: false, + title: null, + textContent: 'direct https', + }, + ], + imageList: [], + isSkipped: false, + }); + + // 4) A page linking the http SOURCE (which redirects to the https destination). + await archive.setPage({ + url: parseUrl('https://example.com/linker-http')!, + redirectPaths: [], + isExternal: false, + isTarget: true, + status: 200, + statusText: 'OK', + contentType: 'text/html', + contentLength: 100, + responseHeaders: {}, + html: '', + meta: { ...emptyMeta }, + anchorList: [ + { + href: parseUrl('http://example.com/page')!, + isExternal: false, + title: null, + textContent: 'via http', + }, + ], + imageList: [], + isSkipped: false, + }); + }); + + afterAll(async () => { + if (archive) { + await archive.close(); + } + const { rmSync } = await import('node:fs'); + rmSync(dir, { recursive: true, force: true }); + }); + + it('http リンクと https リンクが宛先ページの被リンクに合算される(分裂しない)', async () => { + const result = await getPageDetail(archive, 'https://example.com/page'); + const inboundUrls = result!.inboundLinks.map((l) => l.url).toSorted(); + // 直リンク(https) と redirect 元(http)へのリンク、両方が宛先に集約される。 + expect(inboundUrls).toEqual([ + 'https://example.com/linker-http', + 'https://example.com/linker-https', + ]); + }); + + it('redirect 元ページ自身の被リンクは宛先側に付け替えられ、空になる', async () => { + // http://example.com/page を指すリンクは https 宛先の被リンクに解決されるため、 + // 元(http)ページの被リンクには現れない(二重計上を防ぐ)。 + const result = await getPageDetail(archive, 'http://example.com/page'); + expect(result!.inboundLinks).toHaveLength(0); + }); +}); diff --git a/packages/@nitpicker/query/src/get-page-detail.ts b/packages/@nitpicker/query/src/get-page-detail.ts 
index 774a2e36..f2023b71 100644 --- a/packages/@nitpicker/query/src/get-page-detail.ts +++ b/packages/@nitpicker/query/src/get-page-detail.ts @@ -4,6 +4,9 @@ import type { ArchiveAccessor } from '@nitpicker/crawler'; /** * Retrieves detailed information about a single page by URL. * Includes all metadata, outbound links, inbound links, and redirect sources. + * Inbound links are resolved through redirects, so links to a redirect source + * (e.g. `http://x` 301-ing to `https://x`) count as backlinks of the final + * destination — they stay merged on the canonical page instead of splitting (#71). * @param accessor - The archive accessor to query. * @param url - The URL of the page to retrieve. * @returns Detailed page information, or null if the page is not found. @@ -28,6 +31,12 @@ export async function getPageDetail( console.warn(`Failed to parse responseHeaders for ${url}:`, error); } + // Outbound links intentionally do NOT resolve through redirects (asymmetric + // with inboundLinks below): they show the RAW anchor target — e.g. a link to + // `http://x` that 301s to `https://x` is reported as pointing at `http://x`. + // This preserves the audit signal "this page links to a redirecting URL". + // Inbound is the opposite: it merges backlinks onto the canonical destination + // (#71). See ARCHITECTURE.md「被リンク/参照の redirect 透過解決(#71)」. const outboundRows = await knex('anchors') .select('pages.url', 'anchors.textContent', 'pages.status', 'pages.isExternal') .join('pages', 'anchors.hrefId', '=', 'pages.id') @@ -47,10 +56,18 @@ export async function getPageDetail( }), ); + // Inbound links are resolved THROUGH redirects: an anchor pointing at a + // redirect source (e.g. `http://x` 301-ing to `https://x`) is counted as an + // incoming link to the redirect's final destination, not the source — so + // backlinks stay merged on the canonical page instead of splitting across the + // `http`/`https` pair (#71). 
`redirectDestId` is pre-flattened to the final + // destination, so `COALESCE(target.redirectDestId, target.id)` is a single hop + // (same semantics as crawler's `redirectTable()`). const inboundRows = await knex('anchors') - .select('pages.url', 'anchors.textContent') - .join('pages', 'anchors.pageId', '=', 'pages.id') - .where('anchors.hrefId', page.id); + .select('referrer.url', 'anchors.textContent') + .join('pages as referrer', 'anchors.pageId', '=', 'referrer.id') + .join('pages as target', 'anchors.hrefId', '=', 'target.id') + .whereRaw('coalesce("target"."redirectDestId", "target"."id") = ?', [page.id]); const inboundLinks = inboundRows.map( (row: { url: string; textContent: string | null }) => ({ diff --git a/packages/@nitpicker/query/src/list-page-links.spec.ts b/packages/@nitpicker/query/src/list-page-links.spec.ts index cb18d847..6cd07026 100644 --- a/packages/@nitpicker/query/src/list-page-links.spec.ts +++ b/packages/@nitpicker/query/src/list-page-links.spec.ts @@ -138,3 +138,134 @@ describe('listPageLinks', () => { expect(about?.hasResponseHeaders).toBe(false); }); }); + +describe('listPageLinks: referrerCount を redirect 越しに合算する(http/https, #71)', () => { + let archive: InstanceType; + const dir = path.resolve(dirname, '__test_fixtures_page_links_redirect__'); + const archiveFilePath = path.resolve(dir, 'page-links-redirect.nitpicker'); + + beforeAll(async () => { + const { mkdirSync } = await import('node:fs'); + mkdirSync(dir, { recursive: true }); + archive = await Archive.create({ filePath: archiveFilePath, cwd: dir }); + await archive.setConfig({ + baseUrl: 'https://example.com', + name: 'test', + version: '0.4.4', + recursive: true, + interval: 0, + image: true, + fetchExternal: false, + parallels: 1, + roots: ['https://example.com'], + excludes: [], + excludeKeywords: [], + excludeUrls: [], + maxExcludedDepth: 0, + retry: 3, + fromList: false, + disableQueries: false, + userAgent: 'test', + ignoreRobots: false, + }); + + // Canonical https 
destination. + await archive.setPage({ + url: parseUrl('https://example.com/page')!, + redirectPaths: [], + isExternal: false, + isTarget: true, + status: 200, + statusText: 'OK', + contentType: 'text/html', + contentLength: 100, + responseHeaders: {}, + html: '', + meta: { ...META, title: 'Page' }, + anchorList: [], + imageList: [], + isSkipped: false, + }); + + // http source 301-ing to the https destination. + await archive.setPage({ + url: parseUrl('http://example.com/page')!, + redirectPaths: ['https://example.com/page'], + isExternal: false, + isTarget: true, + status: 200, + statusText: 'OK', + contentType: 'text/html', + contentLength: 100, + responseHeaders: {}, + html: '', + meta: { ...META }, + anchorList: [], + imageList: [], + isSkipped: false, + }); + + // One page links the https destination directly, another links the http source. + await archive.setPage({ + url: parseUrl('https://example.com/linker-https')!, + redirectPaths: [], + isExternal: false, + isTarget: true, + status: 200, + statusText: 'OK', + contentType: 'text/html', + contentLength: 100, + responseHeaders: {}, + html: '', + meta: { ...META }, + anchorList: [ + { + href: parseUrl('https://example.com/page')!, + isExternal: false, + title: null, + textContent: 'direct https', + }, + ], + imageList: [], + isSkipped: false, + }); + await archive.setPage({ + url: parseUrl('https://example.com/linker-http')!, + redirectPaths: [], + isExternal: false, + isTarget: true, + status: 200, + statusText: 'OK', + contentType: 'text/html', + contentLength: 100, + responseHeaders: {}, + html: '', + meta: { ...META }, + anchorList: [ + { + href: parseUrl('http://example.com/page')!, + isExternal: false, + title: null, + textContent: 'via http', + }, + ], + imageList: [], + isSkipped: false, + }); + }); + + afterAll(async () => { + if (archive) { + await archive.close(); + } + const { rmSync } = await import('node:fs'); + rmSync(dir, { recursive: true, force: true }); + }); + + it('http リンクと https 
リンクの両方が宛先の referrerCount に合算される', async () => { + const result = await listPageLinks(archive); + const page = result.items.find((i) => i.url === 'https://example.com/page'); + // 直リンク(https) + redirect 元(http)へのリンク = 2 が宛先に集約される(分裂しない)。 + expect(page?.referrerCount).toBe(2); + }); +}); diff --git a/packages/@nitpicker/query/src/list-page-links.ts b/packages/@nitpicker/query/src/list-page-links.ts index e65c89b1..f663b246 100644 --- a/packages/@nitpicker/query/src/list-page-links.ts +++ b/packages/@nitpicker/query/src/list-page-links.ts @@ -15,7 +15,10 @@ import { paginateQuery } from './paginate-query.js'; * * Unlike `listLinks` (which analyzes anchors for broken/external/orphaned * links), this lists every page. Redirect-from and referrer counts use - * correlated subqueries so they do not perturb the pagination COUNT. + * correlated subqueries so they do not perturb the pagination COUNT. The + * referrer count is resolved THROUGH redirects, so a link to a redirect source + * (e.g. `http://x` 301-ing to `https://x`) counts toward the final destination + * — backlinks stay merged on the canonical page instead of splitting (#71). * @param accessor - The archive accessor to query. * @param options - Filter and pagination options. * @returns A paginated list of per-page network entries. @@ -72,8 +75,15 @@ export async function listPageLinks( knex.raw( '(select count(*) from "pages" as "r" where "r"."redirectDestId" = "pages"."id") as redirectFromCount', ), + // Referrer count is resolved THROUGH redirects: an anchor pointing at a + // redirect source (e.g. `http://x` 301-ing to `https://x`) counts toward + // the final destination, not the source — so backlinks stay merged on the + // canonical page instead of splitting across the `http`/`https` pair (#71). + // `redirectDestId` is pre-flattened to the final destination, so + // `COALESCE(t.redirectDestId, t.id)` is a single hop (same semantics as + // crawler's `redirectTable()`). 
knex.raw( - '(select count(*) from "anchors" where "anchors"."hrefId" = "pages"."id") as referrerCount', + '(select count(*) from "anchors" join "pages" as "t" on "anchors"."hrefId" = "t"."id" where coalesce("t"."redirectDestId", "t"."id") = "pages"."id") as referrerCount', ), ) .orderBy('url'), diff --git a/packages/test-server/src/__tests__/e2e/redirect.e2e.ts b/packages/test-server/src/__tests__/e2e/redirect.e2e.ts index 8e2db633..aa111d6d 100644 --- a/packages/test-server/src/__tests__/e2e/redirect.e2e.ts +++ b/packages/test-server/src/__tests__/e2e/redirect.e2e.ts @@ -48,6 +48,22 @@ describe('Redirect handling', () => { const anchorUrls = anchors.map((a) => a.url); expect(anchorUrls.some((u) => u.includes('/redirect/start'))).toBe(true); }); + + it('被リンクが redirect 元経由で宛先に合算される(end-to-end, #71)', async () => { + // /redirect/ は /redirect/start にリンクし、start は dest へ 301→302。 + // 被リンクを redirect 越しに解決するため、/redirect/start を指す /redirect/ は + // 最終宛先 /redirect/dest の被リンクとして現れる(http→https と同じ機構を + // http→http で end-to-end 検証)。解決しないと dest の被リンクは 0 になる。 + const pages = await result.accessor.getPages('internal-page'); + const dest = pages.find((p) => p.url.pathname === '/redirect/dest'); + expect(dest).toBeDefined(); + + const referrers = await dest!.getReferrers(); + const fromTop = referrers.find((r) => new URL(r.url).pathname === '/redirect/'); + expect(fromTop).toBeDefined(); + // through はアンカーが実際に指した URL(リダイレクト元 /redirect/start)。 + expect(new URL(fromTop!.through).pathname).toBe('/redirect/start'); + }); }); describe('Redirect convergence (#73): 多対一リダイレクト先を1回だけ描画する', () => {