From fc38f6a70110991938ab0da7abe4a0fa1b022dfc Mon Sep 17 00:00:00 2001 From: myoshizumi Date: Tue, 4 Nov 2025 20:44:10 +0900 Subject: [PATCH] SQL: Basic Select 619. Biggest Single Number --- .../Biggest_Single_Number_mysql.ipynb | 238 +++++++++++++++++ .../Biggest_Single_Number_pandas.ipynb | 233 ++++++++++++++++ .../Biggest_Single_Number_posgres.ipynb | 252 ++++++++++++++++++ 3 files changed, 723 insertions(+) create mode 100644 SQL/Leetcode/Basic select/619. Biggest Single Number/Biggest_Single_Number_mysql.ipynb create mode 100644 SQL/Leetcode/Basic select/619. Biggest Single Number/Biggest_Single_Number_pandas.ipynb create mode 100644 SQL/Leetcode/Basic select/619. Biggest Single Number/Biggest_Single_Number_posgres.ipynb diff --git a/SQL/Leetcode/Basic select/619. Biggest Single Number/Biggest_Single_Number_mysql.ipynb b/SQL/Leetcode/Basic select/619. Biggest Single Number/Biggest_Single_Number_mysql.ipynb new file mode 100644 index 00000000..19cc2a17 --- /dev/null +++ b/SQL/Leetcode/Basic select/619. 
Biggest Single Number/Biggest_Single_Number_mysql.ipynb @@ -0,0 +1,238 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "a12bd06c", + "metadata": {}, + "source": [ + "# MySQL 8.0.40\n", + "\n", + "## 0) 前提\n", + "\n", + "* エンジン: **MySQL 8**\n", + "* 並び順: 任意(`ORDER BY` を付けない)\n", + "* `NOT IN` は NULL 罠のため回避\n", + "* 判定は **ID 基準**、表示は仕様どおりの列名と順序\n", + "\n", + "## 1) 問題\n", + "\n", + "* `MyNumbers` から **ちょうど1回だけ出現する数(single number)**のうち **最大の数**を1行で返す。存在しなければ `null` を返す。\n", + "\n", + "* 入力テーブル例:\n", + "\n", + " ```\n", + " Table: MyNumbers\n", + " +-------------+------+\n", + " | Column Name | Type |\n", + " +-------------+------+\n", + " | num | int |\n", + " +-------------+------+\n", + " -- 重複あり得る\n", + " ```\n", + "\n", + "* 出力仕様:\n", + "\n", + " ```\n", + " +-----+\n", + " | num |\n", + " +-----+\n", + " | 6 | -- single number の最大。存在しなければ NULL\n", + " +-----+\n", + " ```\n", + "\n", + "## 2) 最適解(単一クエリ)\n", + "\n", + "> ウィンドウ関数で「出現回数」を各行に載せ、そのうち `cnt = 1` の `num` の **最大値**を投影。\n", + "\n", + "```sql\n", + "WITH win AS (\n", + " SELECT\n", + " num,\n", + " COUNT(*) OVER (PARTITION BY num) AS cnt\n", + " FROM MyNumbers\n", + ")\n", + "SELECT\n", + " MAX(num) AS num\n", + "FROM win\n", + "WHERE cnt = 1;\n", + "\n", + "Runtime 392 ms\n", + "Beats 64.64%\n", + "\n", + "```\n", + "\n", + "* `MAX(num)` により **並び替え不要**で最大の single number を1行で取得\n", + "* single number が存在しない場合、`MAX` の母集団が空になり **`NULL` を返す**(要件どおり)\n", + "\n", + "## 3) 代替解\n", + "\n", + "> 集約のみで十分なサイズなら、`GROUP BY ... HAVING` → その最大値を返す。\n", + "\n", + "```sql\n", + "SELECT\n", + " MAX(num) AS num\n", + "FROM (\n", + " SELECT num\n", + " FROM MyNumbers\n", + " GROUP BY num\n", + " HAVING COUNT(*) = 1\n", + ") s;\n", + "\n", + "Runtime 418 ms\n", + "Beats 39.99%\n", + "\n", + "```\n", + "\n", + "* `NOT IN` 不要、`NULL` でも安全\n", + "* インデックスがないと全表スキャンになる点はウィンドウ版と同様\n", + "\n", + "## 4) 要点解説\n", + "\n", + "* **方針**:\n", + "\n", + " 1. 各 `num` の出現回数を計算(ウィンドウ or 集約)\n", + " 2. `= 1`(single)に絞る\n", + " 3. 
**最大値**だけを返す → `ORDER BY`・`LIMIT` 不要\n", + "* **NULL / 重複**:\n", + "\n", + " * `num` に `NULL` があっても `COUNT(*)` は `NULL` を数えるため、`num IS NULL` は single になり得る。ただし問題の意図は整数なので通常は非NULL前提。もし `NULL` 行があっても `MAX(num)` は `NULL` を無視するため影響しない。\n", + "* **安定性**:\n", + "\n", + " * 出力は1行のみで順序不要。`ORDER BY` を付けないほうが速い。\n", + "\n", + "## 5) 計算量(概算)\n", + "\n", + "* ウィンドウ版: `COUNT() OVER (PARTITION BY num)` は **O(N)**~**O(N log N)**(実装依存・ソート/ハッシュ)\n", + "* 集約版: `GROUP BY num` は **O(N)**~**O(N log N)**\n", + "* 推奨インデックス: `INDEX(num)` があればハッシュ/ツリー集約が効きやすい\n", + "\n", + "## 6) 図解(Mermaid 超保守版)\n", + "\n", + "```mermaid\n", + "flowchart TD\n", + " A[入力 MyNumbers] --> B[出現回数を算出 cnt]\n", + " B --> C[cnt が 1 の行に絞る]\n", + " C --> D[最大 num を求める]\n", + " D --> E[出力 列 num だけ]\n", + "```\n", + "\n", + "いい感じの結果です(特にウィンドウ版で ~65% 上回り)が、**もう少し縮められる余地**はあります。要点だけ手短に👇\n", + "\n", + "---\n", + "\n", + "## まずはインデックス\n", + "\n", + "```sql\n", + "CREATE INDEX ix_mynumbers_num ON MyNumbers(num);\n", + "```\n", + "\n", + "* `GROUP BY num` / `COUNT(*)` が**インデックス順走査**でまとまりやすくなり、\n", + " 一時テーブルやファイルソートの発生を抑制できます(環境次第で体感差が大きいところ)。\n", + "\n", + "---\n", + "\n", + "## 速度重視の実戦解(早期終了を効かせる)\n", + "\n", + "> 並び順が任意という仕様でしたが、**パフォーマンス最優先**なら `ORDER BY ... 
DESC LIMIT 1` による **早期終了**が効きます。`INDEX(num)` があると特に強いです。\n", + "\n", + "```sql\n", + "-- 早いことが多い版(上位1件だけ取りに行く)\n", + "SELECT num\n", + "FROM MyNumbers\n", + "GROUP BY num\n", + "HAVING COUNT(*) = 1\n", + "ORDER BY num DESC\n", + "LIMIT 1;\n", + "\n", + "Wrong Answer\n", + "13 / 18 testcases passed\n", + "```\n", + "\n", + "* 右端(最大値側)から**逆順インデックス走査**し、最初に見つかった「出現1回」のグループで終わるため、\n", + " データ分布によっては **大幅短縮**します(特に「大きい値ほどユニークが出やすい」分布)。\n", + "\n", + "> 注意: single number が1つも無い場合、このクエリは **0 行**を返し、期待される「`NULL` の1行」にならないため Wrong Answer になります。`SELECT ( ... ) AS num` のスカラサブクエリで包む(または外側に `MAX` をかける)ことで常に1行を返してください。\n", + "\n", + "---\n", + "\n", + "## `ORDER BY` を避けたい場合の最適形\n", + "\n", + "あなたの代替解は正攻法で、インデックス追加だけでも十分効きます。書式はそのままでOK:\n", + "\n", + "```sql\n", + "-- あなたの代替解(INDEXあり想定)\n", + "SELECT\n", + " MAX(num) AS num\n", + "FROM (\n", + " SELECT num\n", + " FROM MyNumbers\n", + " GROUP BY num\n", + " HAVING COUNT(*) = 1\n", + ") s;\n", + "\n", + "Runtime 396 ms\n", + "Beats 60.49%\n", + "\n", + "```\n", + "\n", + "* ウィンドウ版より **`GROUP BY` 直集約**のほうが MySQL では速く出ることが多いです(特に `INDEX(num)` あり)。\n", + "\n", + "---\n", + "\n", + "## 重複が極端に多い場合の小技(重複集合を先に作る)\n", + "\n", + "> 「ほとんどが重複で、ユニークが少ない」分布なら、**重複集合だけ先に抽出**して引き算すると速いことがあります。\n", + "\n", + "```sql\n", + "WITH dup AS (\n", + " SELECT num\n", + " FROM MyNumbers\n", + " GROUP BY num\n", + " HAVING COUNT(*) > 1\n", + "),\n", + "uniq AS (\n", + " SELECT DISTINCT num\n", + " FROM MyNumbers\n", + " LEFT JOIN dup USING (num)\n", + " WHERE dup.num IS NULL\n", + ")\n", + "SELECT MAX(num) AS num\n", + "FROM uniq;\n", + "\n", + "Runtime 414 ms\n", + "Beats 43.42%\n", + "\n", + "```\n", + "\n", + "* `dup` のサイズが小さくなる分、以降の結合・探索が軽くなります(分布依存)。\n", + "\n", + "---\n", + "\n", + "## 実務メモ\n", + "\n", + "* `EXPLAIN ANALYZE` で\n", + " *「Using index」でグループ化できているか*、\n", + " *一時テーブル/ファイルソートが消えているか* を確認。\n", + "* `ANALYZE TABLE MyNumbers;` で統計を更新しておくとプランが安定。\n", + "* `num` に `NULL` が混じっていても、`GROUP BY` + `HAVING COUNT(*)=1` は正しく動き、\n", + " `MAX(num)` も `NULL` を無視するので問題ありません。\n", + "\n", + "---\n", + "\n", + "### まとめ\n", + "\n", + "* まずは `INDEX(num)` を追加。\n", + "* 速度をさらに取りに行くなら、スカラサブクエリ `SELECT ( ... ) AS num` で包んだ 
**`GROUP BY ... HAVING COUNT(*)=1 ORDER BY num DESC LIMIT 1`**。\n", + "* `ORDER BY` を使わない方針を守るなら、あなたの **集約版 + インデックス** が最善に近いです。\n", + "\n" + ] + } + ], + "metadata": { + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/SQL/Leetcode/Basic select/619. Biggest Single Number/Biggest_Single_Number_pandas.ipynb b/SQL/Leetcode/Basic select/619. Biggest Single Number/Biggest_Single_Number_pandas.ipynb new file mode 100644 index 00000000..1c8ac071 --- /dev/null +++ b/SQL/Leetcode/Basic select/619. Biggest Single Number/Biggest_Single_Number_pandas.ipynb @@ -0,0 +1,233 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "63101f51", + "metadata": {}, + "source": [ + "# Pandas 2.2.2用\n", + "\n", + "## 0) 前提\n", + "\n", + "* 環境: **Python 3.10.15 / pandas 2.2.2**\n", + "* **指定シグネチャ厳守**(関数名・引数名・返却列・順序)\n", + "* I/O 禁止、不要な `print` や `sort_values` 禁止\n", + "\n", + "## 1) 問題\n", + "\n", + "* `MyNumbers` から **ちょうど1回だけ出現する数(single number)** のうち **最大の数**を 1 行で返す。存在しなければ `NULL` を返す。\n", + "* 入力 DF: `MyNumbers(num: int)`\n", + "* 出力: `num`(1行1列、single number の最大。なければ `NULL`)\n", + "\n", + "## 2) 実装(指定シグネチャ厳守)\n", + "\n", + "> 列最小化 → 出現回数(`value_counts`)→ 出現1回の要素だけ残す → `max`。`sort_values` は使わず、`LIMIT` 相当の早期終了も不要。\n", + "\n", + "```python\n", + "import pandas as pd\n", + "\n", + "def largest_single_number(my_numbers: pd.DataFrame) -> pd.DataFrame:\n", + " \"\"\"\n", + " Args:\n", + " my_numbers (pd.DataFrame): 列 'num' を持つ DataFrame(重複可)\n", + " Returns:\n", + " pd.DataFrame: 列名と順序は ['num'](1行1列)。single number がなければ NULL(pd.NA)を返す。\n", + " \"\"\"\n", + " # 対象列のみ(列最小化)\n", + " s = my_numbers['num']\n", + "\n", + " # 各値の出現回数(NaNを数える必要がなければ dropna=True でもよいが、max は NaN を無視するため既定のままでOK)\n", + " vc = s.value_counts(dropna=False)\n", + "\n", + " # 出現1回である行だけを抽出(セミジョイン相当)\n", + " mask_single = s.map(vc).eq(1)\n", + " candidates = s[mask_single]\n", + "\n", + " # 最大値を1行で返す。候補が空なら NULL(pd.NA)\n", + " if candidates.empty:\n", + " out = 
pd.DataFrame({'num': [pd.NA]})\n", + " else:\n", + " out = pd.DataFrame({'num': [candidates.max()]})\n", + "\n", + " return out\n", + "\n", + "Analyze Complexity\n", + "Runtime 298 ms\n", + "Beats 33.33%\n", + "Memory 67.06 MB\n", + "Beats 47.82%\n", + "\n", + "```\n", + "\n", + "## 3) アルゴリズム説明\n", + "\n", + "* 使用 API\n", + "\n", + " * `Series.value_counts(dropna=False)`: 値ごとの出現回数を計算(ハッシュベース)\n", + " * `Series.map(vc)`: 各行に出現回数を付与(軽量結合)\n", + " * ブールフィルタで `== 1` を抽出\n", + " * `Series.max()`: single 値の中の最大値を取得(`NaN` は既定で無視)\n", + "* **NULL / 重複 / 型**\n", + "\n", + " * `NaN` が1回だけでも `Series.max()` は無視するため、非NULLの single があればそちらが選ばれる(SQL の `MAX` と同等の直感)。\n", + " * 全ての single が存在しない場合(= 候補空)は `pd.NA` を返す。\n", + " * 返却列は1列のみ `num`。`pd.NA` を含み得るため dtype は `Int64` 互換か object になるが、仕様上は列名と値が重要。\n", + "\n", + "## 4) 計算量(概算)\n", + "\n", + "* `value_counts`(ハッシュ集計): **O(N)**\n", + "* `map` による付番+フィルタ: **O(N)**\n", + "* `max`: **O(K)**(K は single の件数)\n", + " 合計 **O(N)** 近似、追加メモリは一時ハッシュ(一意値数に比例)。\n", + "\n", + "## 5) 図解(Mermaid 超保守版)\n", + "\n", + "```mermaid\n", + "flowchart TD\n", + " A[入力 データフレーム]\n", + " B[前処理 列最小化]\n", + " C[出現回数を計算 value_counts]\n", + " D[map で行に回数を付与]\n", + " E[回数が1回の行だけ抽出]\n", + " F[max で最大の数を取得]\n", + " G[出力 列 num だけ]\n", + " A --> B\n", + " B --> C\n", + " C --> D\n", + " D --> E\n", + " E --> F\n", + " F --> G\n", + "```\n", + "\n", + "ポイントは **「全行サイズの中間 Series を作らない」** ことと、**余計なソートを避ける**ことです。\n", + "\n", + "---\n", + "\n", + "## 改善ポイント(効果順)\n", + "\n", + "1. **`value_counts(sort=False)` を使ってソートを抑止**\n", + " 既定では並べ替えが走ることがありコスト増。`sort=False` で純粋なハッシュ集計にします。\n", + "\n", + "2. **`map`/全行マスクをやめて「ユニーク値側だけ」で最大値を決める**\n", + " `value_counts` の結果(= ユニーク値ごとの回数)から **出現1回のキー**だけ抽出し、その **キー集合の最大**を取れば、元 Series 全体に戻してブールマスクを作る必要がありません。\n", + " → メモリも CPU も削減。\n", + "\n", + "3. 
(オプション)**超大規模なら NumPy 直利用**\n", + " `np.unique(..., return_counts=True)` で同様のことができます。pandas 生成物を最小化できるので、極端に大きいデータで効くことがあります。\n", + "\n", + "---\n", + "\n", + "## 改良版(純 pandas・シグネチャ厳守)\n", + "\n", + "```python\n", + "import pandas as pd\n", + "\n", + "def largest_single_number(my_numbers: pd.DataFrame) -> pd.DataFrame:\n", + " \"\"\"\n", + " Args:\n", + " my_numbers (pd.DataFrame): 列 'num' を持つ DataFrame(重複可)\n", + " Returns:\n", + " pd.DataFrame: 列名と順序は ['num'](1行1列)。single number がなければ NULL(pd.NA)を返す。\n", + " \"\"\"\n", + " s = my_numbers['num']\n", + "\n", + " # ユニーク値ごとの出現回数。sort=False で余計な並び替えを抑止\n", + " vc = s.value_counts(dropna=False, sort=False)\n", + "\n", + " # 出現1回の値だけ(Index)\n", + " singles = vc.index[vc.eq(1)]\n", + "\n", + " if len(singles) == 0:\n", + " return pd.DataFrame({'num': [pd.NA]})\n", + "\n", + " # 最大値(NaN/NA は無視)。すべてが NaN のときは NaN が返る → pd.NA に正規化\n", + " max_single = pd.Series(singles).max(skipna=True)\n", + " if pd.isna(max_single):\n", + " return pd.DataFrame({'num': [pd.NA]})\n", + "\n", + " return pd.DataFrame({'num': [max_single]})\n", + "```\n", + "\n", + "**変更点の狙い**\n", + "\n", + "* `map(vc)` と `mask_single`(長さ N の中間 Series)を **作成しない**ため、**メモリ削減**と **CPU 削減**。\n", + "* `sort=False` で **集計のみ**に徹して高速化。\n", + "* `max(skipna=True)` で **NULL 安全**(SQL の `MAX` 同等の直感)。\n", + "\n", + "---\n", + "\n", + "## さらに攻める版(NumPy 直利用・大規模向け)\n", + "\n", + "```python\n", + "import numpy as np\n", + "import pandas as pd\n", + "\n", + "def largest_single_number(my_numbers: pd.DataFrame) -> pd.DataFrame:\n", + " s = my_numbers['num']\n", + " vals = s.to_numpy(copy=False)\n", + "\n", + " # unique と counts を同時取得(unique は dtype 依存でソートされますが影響なし)\n", + " uniq, cnts = np.unique(vals, return_counts=True)\n", + "\n", + " # 出現1回の候補のみ\n", + " cand = uniq[cnts == 1]\n", + "\n", + " if cand.size == 0:\n", + " return pd.DataFrame({'num': [pd.NA]})\n", + "\n", + " # NaN を弾いて最大を取る(数値前提)\n", + " # 数値でない(object)可能性もあるので、pandas 経由で安全に最大を取る\n", + " max_single = 
pd.Series(cand).max(skipna=True)\n", + " if pd.isna(max_single):\n", + " return pd.DataFrame({'num': [pd.NA]})\n", + "\n", + " return pd.DataFrame({'num': [max_single]})\n", + "\n", + "Analyze Complexity\n", + "Runtime 242 ms\n", + "Beats 98.12%\n", + "Memory 66.85 MB\n", + "Beats 62.31%\n", + "\n", + "```\n", + "\n", + "> 備考: `np.unique` は戻り値をソートしますが、**最大値を取るだけ**なので問題になりません(`sort_values` は未使用)。\n", + "\n", + "---\n", + "\n", + "## なぜ速く・軽くなるか\n", + "\n", + "* 以前の案は `map(vc)`→**長さ N の中間配列**を作っていました。\n", + " 改良案は **ユニーク値個数(≪N のことが多い)だけ**を扱い、`max` もそこから計算します。\n", + "* `value_counts(sort=False)` で **集計のみ**に限定し、不要な並び替えコストを排除。\n", + "\n", + "---\n", + "\n", + "## 概算計算量とメモリ\n", + "\n", + "* 集計: **O(N)**(ハッシュ)\n", + "* 以降は **O(U)**(U=ユニーク値数)\n", + "* 追加メモリ: ハッシュテーブル(U に比例)+小さな中間(Series(singles))\n", + "\n", + "---\n", + "\n", + "## 仕上げの小ネタ\n", + "\n", + "* `my_numbers['num']` の **dtype を整数系(nullable Int64)や適切な数値型に揃える**と、`value_counts` の内部ハッシュが効きやすいケースがあります。\n", + "* 事前に `my_numbers = my_numbers[['num']]` と **列最小化**してから渡す(既に満たしていれば不要)。\n", + "\n", + "---\n", + "\n", + "これで **メモリ 1 枚分の中間 Series を削り**つつ、**余計な並べ替えをカット**できるので、`Runtime`/`Memory` ともに改善が見込めます。\n", + "\n" + ] + } + ], + "metadata": { + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/SQL/Leetcode/Basic select/619. Biggest Single Number/Biggest_Single_Number_posgres.ipynb b/SQL/Leetcode/Basic select/619. Biggest Single Number/Biggest_Single_Number_posgres.ipynb new file mode 100644 index 00000000..699bc088 --- /dev/null +++ b/SQL/Leetcode/Basic select/619. Biggest Single Number/Biggest_Single_Number_posgres.ipynb @@ -0,0 +1,252 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "3d8041f3", + "metadata": {}, + "source": [ + "# PostgreSQL 16.6+\n", + "\n", + "## 0) 前提\n", + "\n", + "* エンジン: **PostgreSQL 16.6+**\n", + "* 並び順: 任意\n", + "* `NOT IN` 回避(`EXISTS` / `LEFT JOIN ... 
IS NULL` を推奨)\n", + "* 判定は ID 基準、表示は仕様どおり\n", + "\n", + "## 1) 問題\n", + "\n", + "* `MyNumbers` から **ちょうど1回だけ出現する数(single number)** のうち **最大の数**を 1 行で返す。存在しなければ `NULL` を返す。\n", + "\n", + "* 入力:\n", + "\n", + " ```\n", + " Table: MyNumbers(num int) -- 重複あり\n", + " ```\n", + "\n", + "* 出力:\n", + "\n", + " ```\n", + " 列: num int -- single number の最大。存在しなければ NULL\n", + " ```\n", + "\n", + "## 2) 最適解(単一クエリ)\n", + "\n", + "> PostgreSQL では **ウィンドウ COUNT** で single 判定を付与し、最後に `MAX` で 1 行化するのがシンプルかつ高速です(並び不要)。\n", + "\n", + "```sql\n", + "WITH win AS (\n", + " SELECT\n", + " num,\n", + " COUNT(*) OVER (PARTITION BY num) AS cnt\n", + " FROM MyNumbers\n", + ")\n", + "SELECT\n", + " MAX(num) AS num\n", + "FROM win\n", + "WHERE cnt = 1;\n", + "\n", + "Runtime 190 ms\n", + "Beats 67.88%\n", + "\n", + "```\n", + "\n", + "* `cnt = 1` が single。`MAX` の入力が空なら `NULL` になるため要件を満たします。\n", + "* `num` に `NULL` が混在しても、`MAX(num)` は `NULL` を無視するため安全。\n", + "\n", + "### 代替(LATERAL で上位 1 件のみを直接取得)\n", + "\n", + "> **インデックス `CREATE INDEX ON MyNumbers(num);`** がある場合、逆順で上位 1 件を取ることで**早期終了**が期待できます。\n", + "\n", + "```sql\n", + "-- single な num を降順で 1 件だけ取り出す\n", + "SELECT s.num\n", + "FROM (VALUES (1)) v(dummy)\n", + "CROSS JOIN LATERAL (\n", + " SELECT num\n", + " FROM MyNumbers\n", + " GROUP BY num\n", + " HAVING COUNT(*) = 1\n", + " ORDER BY num DESC\n", + " LIMIT 1\n", + ") AS s;\n", + "\n", + "Wrong Answer\n", + "13 / 18 testcases passed\n", + "```\n", + "\n", + "* 出力は 0 または 1 行。0 行の場合は結果セット空になるので、**常に 1 行(NULL を含む)を返したい**なら次のように包みます:\n", + "\n", + "```sql\n", + "SELECT (\n", + " SELECT num\n", + " FROM MyNumbers\n", + " GROUP BY num\n", + " HAVING COUNT(*) = 1\n", + " ORDER BY num DESC\n", + " LIMIT 1\n", + ") AS num;\n", + "\n", + "Runtime 196 ms\n", + "Beats 54.87%\n", + "\n", + "```\n", + "\n", + "> サブクエリが行を返さなければ `NULL` が返り、要件どおり 1 行固定になります。\n", + "\n", + "## 3) 要点解説\n", + "\n", + "* **方針**:\n", + "\n", + " 1. `num` ごとの出現回数を求める(`COUNT(*) OVER (PARTITION BY num)` または `GROUP BY`)。\n", + " 2. 
`= 1`(single)に絞る。\n", + " 3. 最大だけ返す → `MAX(num)` か、`ORDER BY num DESC LIMIT 1` の早期終了。\n", + "* **ウィンドウ vs 集約**:\n", + "\n", + " * 単純集約でも可:`SELECT MAX(num) FROM (SELECT num FROM MyNumbers GROUP BY num HAVING COUNT(*)=1) t;`\n", + " * 大規模でインデックスありなら **`ORDER BY num DESC LIMIT 1`** の形が実務で速いことが多いです。\n", + "* **NULL**:\n", + "\n", + " * `num IS NULL` が single でも、最大値比較では無視される(`MAX` は `NULL` を無視)。\n", + "* **インデックス推奨**:\n", + "\n", + " * `CREATE INDEX ON MyNumbers(num);` により `GROUP BY`/`ORDER BY` が効率化し、`LIMIT 1` で早期終了が期待できます。\n", + "\n", + "## 4) 計算量(概算)\n", + "\n", + "* ウィンドウ版: **O(N log N)** 近辺(パーティション単位のソート/ハッシュ)\n", + "* 集約+`MAX` 版: **O(N)**~**O(N log N)**\n", + "* 早期終了(`ORDER BY num DESC LIMIT 1`): **最良で O(log N)** 探索+ごく少数検査(分布・インデックス次第)\n", + "\n", + "## 5) 図解(Mermaid 超保守版)\n", + "\n", + "```mermaid\n", + "flowchart TD\n", + " A[入力 MyNumbers]\n", + " B[出現回数を算出 COUNT OVER or GROUP BY]\n", + " C[cnt = 1 に絞る]\n", + " D[最大だけ取得 MAX or ORDER BY DESC LIMIT 1]\n", + " E[出力 列 num 1行]\n", + " A --> B\n", + " B --> C\n", + " C --> D\n", + " D --> E\n", + "```\n", + "ナイス計測です!結論から言うと、\n", + "\n", + "* **ウィンドウ版(190ms)は正解&速い**\n", + "* **`ORDER BY ... 
DESC LIMIT 1` 版の「Wrong Answer」は「該当なしのとき 0 行を返す」ことが原因**です\n", + "\n", + "LATERAL 版は single number が存在しないと結果が **0 行**になり、期待される「`NULL` の1行」と一致しません。同じ `ORDER BY num DESC LIMIT 1` をスカラサブクエリで包んだだけの版(`NULLS LAST` なし)が通っていることが裏付けです。\n", + "なお `PostgreSQL` は `ORDER BY num DESC` だと **`NULL` が先頭に来る(NULLS FIRST)** 挙動なので、`num` に `NULL` が入り得る表では「`num IS NULL` がちょうど1回だけある」ケースで `NULL` を拾ってしまいます。`NULLS LAST` はその保険です。\n", + "\n", + "---\n", + "\n", + "## 修正クエリ(早期終了 × 常に1行返す × `NULL` 安全)\n", + "\n", + "```sql\n", + "-- 最大の single number を 1 行で返す(なければ NULL)\n", + "SELECT (\n", + " SELECT num\n", + " FROM MyNumbers\n", + " GROUP BY num\n", + " HAVING COUNT(*) = 1\n", + " ORDER BY num DESC NULLS LAST -- NULL 混在時の保険\n", + " LIMIT 1\n", + ") AS num;\n", + "\n", + "Runtime 195 ms\n", + "Beats 56.73%\n", + "\n", + "```\n", + "\n", + "* `NULLS LAST` により、`NULL` は常に末尾へ。\n", + "\n", + " * 非NULL の single が1つでもあれば、それが先に選ばれます。\n", + " * 非NULLの single が存在しない場合のみ `NULL`(single で唯一なのが `NULL` だけ)を返します。\n", + "* サブクエリをスカラで包むので **必ず1行**(`NULL` を含み得る)を返し、ジャッジ仕様に適合。これが誤答の直接の修正点です。\n", + "\n", + "> もし「`NULL` は single と見なさない」仕様に寄せたいなら、`WHERE num IS NOT NULL` を追加してもOKです:\n", + ">\n", + "> ```sql\n", + "> SELECT (\n", + "> SELECT num\n", + "> FROM MyNumbers\n", + "> WHERE num IS NOT NULL\n", + "> GROUP BY num\n", + "> HAVING COUNT(*) = 1\n", + "> ORDER BY num DESC\n", + "> LIMIT 1\n", + "> ) AS num;\n", + "> ```\n", + "\n", + "---\n", + "\n", + "## 参考:あなたの2案の所感\n", + "\n", + "* **ウィンドウ版**\n", + "\n", + "```sql\n", + "WITH win AS (\n", + "SELECT num, COUNT(*) OVER (PARTITION BY num) AS cnt\n", + "FROM MyNumbers\n", + ")\n", + "SELECT MAX(num) AS num\n", + "FROM win\n", + "WHERE cnt = 1;\n", + "\n", + "Runtime 202 ms\n", + "Beats 43.69%\n", + "\n", + "```\n", + "\n", + " * `MAX` は `NULL` を無視するため、**`NULL` に強い** → 正答性◎。\n", + " * 実測 190ms(67.88%)は良い結果です。\n", + "\n", + "* **LATERAL 版(誤答)**\n", + "\n", + " * single number が無いと **0 行**を返すため誤答化(期待は `NULL` の1行)。\n", + " * スカラサブクエリで包んで常に1行返すことで解決(`NULLS LAST` / `WHERE num IS NOT NULL` は `NULL` 対策)。\n", + "\n", + "---\n", + "\n", + "## 追加の微最適化(環境依存)\n", + "\n", + "1. 
**インデックス**\n", + "\n", + "```sql\n", + "CREATE INDEX IF NOT EXISTS ix_mynumbers_num ON MyNumbers(num);\n", + "```\n", + "\n", + "* `GROUP BY num` と `ORDER BY num DESC LIMIT 1` のコストを下げ、**早期終了**が効きやすくなります(インデックス逆順走査)。\n", + "\n", + "2. **統計と並列**\n", + "\n", + "```sql\n", + "ANALYZE MyNumbers; -- 統計更新でプラン安定\n", + "-- 大きい表なら並列集約が効くことあり(設定:max_parallel_workers_per_gather 等)\n", + "```\n", + "\n", + "3. **メモリ**\n", + "\n", + "* `work_mem` を適切に(過度は禁物)→ ハッシュ集約がメモリ内で収まると安定。\n", + "\n", + "---\n", + "\n", + "## まとめ\n", + "\n", + "* 誤答の原因は、single number が無いときに **0 行を返していた**こと(期待は `NULL` の1行)。\n", + "* **スカラサブクエリで包んで**正答化。`NULLS LAST`(または `WHERE num IS NOT NULL`)は `NULL` 混在時の保険。\n", + "* パフォーマンスは **インデックス + 早期終了**が効く分、修正版のスカラサブクエリが最有力。\n", + "* 既存のウィンドウ版(190ms)も十分速く、安定して正しいです。\n" + ] + } + ], + "metadata": { + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}