d-zero-dev · YusukeHirao · Jun 14, 2026 · Jun 14, 2026 · Jun 14, 2026 · Jun 14, 2026
@@ -628,6 +628,25 @@ updatePage(pageData) の処理:
   # destPageId の anchors / images は「置き換え」で保存（→「再スクレイプ時の…」参照）
 ```
 
+### 被リンク/参照の redirect 透過解決（#71）
+
+被リンク（incoming links / referrers）は **読み取り時に redirect を透過解決**する。アンカーがリダイレクト元（例: `http://x` が `https://x` に 301）を指していても、そのリンクは最終宛先（canonical ページ）の被リンクとして集約される。これにより `http`/`https` の別や、同一ページへ至る複数のリダイレクト経路があっても、被リンクが正規ページに合算され分裂しない。
+
+**解決規則:** `redirectDestId` は `#linkRedirectSources` が常に**最終宛先まで pre-flatten** する（`A → B → X` のとき A も B も `redirectDestId = X`）。そのため再帰的なチェーン走査は不要で、`COALESCE(target.redirectDestId, target.id)` の **1 ホップ**で最終宛先が求まる。これは `redirectTable()`（`A.redirectDestId = B.id` UNION identity）と同一セマンティクス。
+
+**読み取り経路間の一貫性:** 以下はすべて同じ規則で解決する。
+
+| 関数                                                    | パッケージ | 用途                                             |
+| ------------------------------------------------------- | ---------- | ------------------------------------------------ |
+| `getPagesWithRels`（`redirect.from`/`fromId` = 経由元） | crawler    | report（Google Sheets）                          |
+| `getReferrersOfPage`（`through`/`throughId` = 経由元）  | crawler    | `Page.getReferrers`/`getRequests` フォールバック |
+| `getPageDetail.inboundLinks`                            | query      | viewer / mcp / cli                               |
+| `listPageLinks.referrerCount`                           | query      | viewer                                           |
+
+`through` / `throughId` は「アンカーが実際に指したページの URL / ID」（redirect 経由ならリダイレクト元、直リンクなら宛先ページ自身）で、report の `[REDIRECTED FROM]` 注記に使う。
+
+**意図的な非対称性（発リンクは解決しない）:** **inbound（被リンク）**は redirect 透過で canonical に集約する一方、**outbound（発リンク）**は `getPageDetail.outboundLinks` がアンカーの **raw な指し先**（例: `http://x`）をそのまま返す。これは「このページは古い/リダイレクトする URL にリンクしている」という監査シグナルを保持するための設計。**この非対称性を「統一」しようとしないこと**（発リンク側を解決すると監査情報が失われる）。
+
 ### 再スクレイプ時の anchors / images（置き換えセマンティクス）
 
 同一ページは 1 クロール内でも複数回 `updatePage` されうる。最も多いのは **多対一リダイレクト**: 多数の旧 URL が 301 で 1 つの宛先ページ D に集約されると、クローラはリダイレクト元 URL を 1 つずつスクレイプし、そのたびに D を再取得して D の anchors / images を保存する（`crawl --resume` で実行をまたいでも同様）。

@@ -730,6 +730,126 @@ describe('re-scrape: 同一ページの再 updatePage', () => {
 		}
 	});
 
+	it('被リンクを redirect 越しに解決する: http 元へのリンクが https 宛先の被リンクに合算される (#71)', async () => {
+		const dbPath = path.resolve(workingDir, 'referrers-redirect-merge.sqlite');
+		const db = await Database.connect({ workingDir, filename: dbPath });
+		const destUrl = 'https://localhost/page';
+		const srcUrl = 'http://localhost/page';
+
+		try {
+			// 1) The https destination (the page with the real content).
+			await db.updatePage(
+				{
+					url: parseUrl(destUrl)!,
+					redirectPaths: [],
+					isExternal: false,
+					status: 200,
+					statusText: 'OK',
+					contentLength: 100,
+					contentType: 'text/html',
+					responseHeaders: {},
+					meta: { title: 'Page' },
+					anchorList: [],
+					imageList: [],
+					html: '<html></html>',
+					isSkipped: false,
+				},
+				workingDir,
+				true,
+			);
+
+			// 2) The http source 301-redirects to the https destination (src.redirectDestId = dest.id).
+			await db.updatePage(
+				{
+					url: parseUrl(srcUrl)!,
+					redirectPaths: [destUrl],
+					isExternal: false,
+					status: 200,
+					statusText: 'OK',
+					contentLength: 100,
+					contentType: 'text/html',
+					responseHeaders: {},
+					meta: { title: 'Page (http)' },
+					anchorList: [],
+					imageList: [],
+					html: '<html></html>',
+					isSkipped: false,
+				},
+				workingDir,
+				true,
+			);
+
+			// 3) One linker points directly at the https destination, the other at the http source.
+			await db.updatePage(
+				{
+					url: parseUrl('http://localhost/linker-https')!,
+					redirectPaths: [],
+					isExternal: false,
+					status: 200,
+					statusText: 'OK',
+					contentLength: 100,
+					contentType: 'text/html',
+					responseHeaders: {},
+					meta: { title: 'Linker https' },
+					anchorList: [
+						{ href: parseUrl(destUrl)!, textContent: 'direct', isExternal: false },
+					],
+					imageList: [],
+					html: '<html></html>',
+					isSkipped: false,
+				},
+				workingDir,
+				true,
+			);
+			await db.updatePage(
+				{
+					url: parseUrl('http://localhost/linker-http')!,
+					redirectPaths: [],
+					isExternal: false,
+					status: 200,
+					statusText: 'OK',
+					contentLength: 100,
+					contentType: 'text/html',
+					responseHeaders: {},
+					meta: { title: 'Linker http' },
+					anchorList: [
+						{ href: parseUrl(srcUrl)!, textContent: 'via http', isExternal: false },
+					],
+					imageList: [],
+					html: '<html></html>',
+					isSkipped: false,
+				},
+				workingDir,
+				true,
+			);
+
+			const knex = db.getKnex();
+			const [dest] = await knex.from('pages').select('id').where('url', destUrl);
+
+			// Both links are merged into the destination's referrers (no http/https split).
+			const refs = await db.getReferrersOfPage(dest.id);
+			const urls = refs.map((r) => r.url).toSorted();
+			expect(urls).toEqual([
+				'http://localhost/linker-http',
+				'http://localhost/linker-https',
+			]);
+
+			// through is the URL the anchor actually pointed at (the destination for a direct link, the source when it went via a redirect).
+			const viaHttp = refs.find((r) => r.url === 'http://localhost/linker-http');
+			const direct = refs.find((r) => r.url === 'http://localhost/linker-https');
+			expect(viaHttp!.through).toBe(srcUrl);
+			expect(direct!.through).toBe(destUrl);
+
+			// The source (http) page has no referrers of its own (they are reassigned to the destination, so nothing is double-counted).
+			const [src] = await knex.from('pages').select('id').where('url', srcUrl);
+			const srcRefs = await db.getReferrersOfPage(src.id);
+			expect(srcRefs).toHaveLength(0);
+		} finally {
+			await db.destroy();
+			await remove(dbPath);
+		}
+	});
+
 	it('ページ内に正当な同一リンク（ヘッダー/フッター重複）がある場合、再スクレイプでも件数を保持する', async () => {
 		// 実アーカイブの「重複」の大半は、全ページのヘッダー/フッターに同じリンクが
 		// 並ぶ正当なページ内重複。delete-then-insert は anchorList をそのまま入れ直す

@@ -498,17 +498,36 @@ export class Database extends EventEmitter<DatabaseEvent> {
 	}
 	/**
 	 * Retrieves pages that link to a specific page (incoming links / referrers).
+	 *
+	 * Incoming links are resolved **through redirects**: an anchor pointing at a
+	 * redirect source (e.g. `http://x` that 301s to `https://x`) counts as a
+	 * referrer of the redirect's final destination, not of the source. This keeps
+	 * backlinks merged on the canonical page instead of splitting them across the
+	 * `http`/`https` (or any redirect source/dest) pair. The resolution mirrors
+	 * `redirectTable()` — `redirectDestId` is pre-flattened to the final
+	 * destination, so `COALESCE(target.redirectDestId, target.id)` is a single hop.
 	 * @param pageId - The database ID of the target page.
-	 * @returns An array of referrer records with URL, hash, and text content.
+	 * @returns An array of referrer records: referrer `url`, `through` / `throughId` (URL / id of the page the anchor actually pointed at), `hash`, and `textContent`.
 	 */
 	@ErrorEmitter()
 	@retry(retrySetting)
 	async getReferrersOfPage(pageId: number) {
 		const res = await this.#instance
-			.select('pages.url', 'anchors.hash', 'anchors.textContent')
+			.select(
+				'referrer.url',
+				// `through` / `throughId` = URL / id of the page the anchor actually pointed at:
+				// the redirect source (e.g. `http://x`), or the page itself for a direct link.
+				// Mirrors `getPagesWithRels`' `redirect.from` / `redirect.fromId`; lets report code
+				// print the "[REDIRECTED FROM]" note even on this (non-preloaded) referrer path.
+				'target.url as through',
+				'target.id as throughId',
+				'anchors.hash',
+				'anchors.textContent',
+			)
 			.from('anchors')
-			.join('pages', 'anchors.pageId', '=', 'pages.id')
-			.where('anchors.hrefId', pageId);
+			.join('pages as referrer', 'anchors.pageId', '=', 'referrer.id')
+			.join('pages as target', 'anchors.hrefId', '=', 'target.id')
+			.whereRaw('coalesce("target"."redirectDestId", "target"."id") = ?', [pageId]);
 		return res;
 	}
 	/**

@@ -341,6 +341,34 @@ describe('Page', () => {
 			await page.getReferrers();
 			expect(archive.getReferrersOfPage).toHaveBeenCalledWith(7);
 		});
+
+		it('プリロード無しのフォールバックでも through/throughId を含む Referrer 形状にマップする', async () => {
+			// getReferrersOfPage returns redirect-resolved rows (through = the URL the
+			// anchor actually pointed at). Ensures the fallback path maps them to the same
+			// shape as the #rawReferrers path so the report's "[REDIRECTED FROM]" check works.
+			const archive = createMockArchive({
+				getReferrersOfPage: vi.fn().mockResolvedValue([
+					{
+						url: 'https://example.com/linker',
+						through: 'http://example.com/page',
+						throughId: 9,
+						hash: null,
+						textContent: null,
+					},
+				]),
+			});
+			const page = new Page(archive as never, createRawPage({ id: 7 }));
+			const referrers = await page.getReferrers();
+			expect(referrers).toEqual([
+				{
+					url: 'https://example.com/linker',
+					through: 'http://example.com/page',
+					throughId: 9,
+					hash: null,
+					textContent: '',
+				},
+			]);
+		});
 	});
 
 	describe('getHtml', () => {
@@ -380,5 +408,30 @@ describe('Page', () => {
 			await page.getRequests();
 			expect(archive.getReferrersOfPage).toHaveBeenCalledWith(3);
 		});
+
+		it('through/throughId を含む Referrer 形状にマップする', async () => {
+			const archive = createMockArchive({
+				getReferrersOfPage: vi.fn().mockResolvedValue([
+					{
+						url: 'https://example.com/linker',
+						through: 'http://example.com/page',
+						throughId: 9,
+						hash: 'sec',
+						textContent: 'text',
+					},
+				]),
+			});
+			const page = new Page(archive as never, createRawPage({ id: 3 }));
+			const requests = await page.getRequests(); // rows come from the mocked getReferrersOfPage above
+			expect(requests).toEqual([
+				{
+					url: 'https://example.com/linker',
+					through: 'http://example.com/page',
+					throughId: 9,
+					hash: 'sec', // non-empty values are expected to pass through unchanged
+					textContent: 'text',
+				},
+			]);
+		});
 	});
 });
@@ -296,7 +296,14 @@ export default class Page {
 				textContent: r.textContent || '',
 			}));
 		}
-		return this.#archive.getReferrersOfPage(this.#raw.id);
+		const refs = await this.#archive.getReferrersOfPage(this.#raw.id);
+		return refs.map((r) => ({
+			url: r.url,
+			through: r.through,
+			throughId: r.throughId,
+			hash: r.hash,
+			textContent: r.textContent || '',
+		}));
 	}
 
 	/**
@@ -305,7 +312,14 @@ export default class Page {
 	 * @returns An array of {@link Referrer} objects.
 	 */
 	async getRequests(): Promise<Referrer[]> {
-		return this.#archive.getReferrersOfPage(this.#raw.id);
+		const refs = await this.#archive.getReferrersOfPage(this.#raw.id);
+		return refs.map((r) => ({ // project each archive row onto the Referrer fields
+			url: r.url,
+			through: r.through,
+			throughId: r.throughId,
+			hash: r.hash,
+			textContent: r.textContent || '', // falsy text (e.g. null) is normalized to ''
+		}));
 	}
 
 	/**