w7-mgfcode · w7-mgfcode · Jun 1, 2026 · Jun 1, 2026 · Jun 1, 2026
@@ -371,3 +371,49 @@ def requires_approval(action_name: str) -> bool:
 - Never bypass safety checks or approval requirements
 - Log all significant decisions and their reasoning
 """
+
+# Generalized read-only intent guard. Embedded in the experiment-agent prompt to
+# stop a read-only question (list/rank/summarize/compare/report) from derailing
+# into a scenario / write / experiment tool — especially on an output-format
+# validation retry, where a weak local model tends to start a brand-new action
+# instead of just reformatting the data it already fetched (issue #347). Every
+# `tool_*` name referenced here is registered on the experiment agent, so the
+# `test_prompts_only_reference_registered_tool_names` invariant still holds.
+READ_ONLY_INTENT_GUARD = """
+READ-ONLY INTENT GUARD (apply this before every turn):
+Many requests are READ-ONLY — the user wants you to look something up and report
+it, not to change anything. Treat a request as READ-ONLY when it asks you to list,
+show, rank, summarize, compare, or report. Examples that are ALWAYS read-only
+unless the user explicitly asks to change something:
+- listing or ranking stores or products (e.g. "top products")
+- sales, revenue, or units-sold summaries
+- forecast summaries, or which products have the highest forecasted demand
+- model runs and metric comparisons, including WAPE, MAE, or RMSE
+- registry aliases and deployment status
+- backtest metrics
+- RAG / document / knowledge questions
+
+For a READ-ONLY request you MUST:
+- Use ONLY read-only tools: tool_list_runs, tool_get_run, tool_compare_runs,
+  tool_compare_backtest_results.
+- NEVER call tool_propose_scenario, tool_save_scenario, tool_create_alias,
+  tool_archive_run, or tool_run_backtest. Those create, save, promote, archive,
+  run, or plan something — they are NOT allowed for a read-only question.
+- Call a mutating / planning / experiment tool ONLY when the user EXPLICITLY asks
+  to create, save, promote, archive, run a backtest, or run an experiment.
+- Answer directly in the ExperimentReport `summary` field, grounded in tool output.
+
+OUTPUT-FORMAT RETRIES:
+- If your previous reply failed schema validation (e.g. "summary: Field required"),
+  DO NOT call any new tool. Only reformat the data you already obtained into a
+  valid ExperimentReport with a concise `summary`. A validation retry is a
+  formatting fix, never a reason to start a new action.
+
+WHEN A TOOL IS MISSING OR THE REQUEST IS AMBIGUOUS:
+- If a ranking is ambiguous (e.g. "top products"), ask a clarifying question such
+  as: "Top by revenue, units sold, forecasted demand, or model error?" — do not guess.
+- If no read-only tool exists for the requested metric, say plainly that this agent
+  does not have a tool for that metric. Do NOT invent data.
+- NEVER invent or guess a store_id, product_id, or run_id. Use only IDs returned by
+  a tool or explicitly supplied by the user.
+"""
@@ -16,6 +16,7 @@
 from pydantic_ai import Agent, PromptedOutput, RunContext
 
 from app.features.agents.agents.base import (
+    READ_ONLY_INTENT_GUARD,
     SAFETY_INSTRUCTIONS,
     SYSTEM_PROMPT_HEADER,
     TOOL_USAGE_INSTRUCTIONS,
@@ -68,6 +69,8 @@
   would like to experiment on. Do NOT call any tools until you have a specific
   objective (a store and product plus a date range, or an explicit request).
 
+{READ_ONLY_INTENT_GUARD}
+
 {TOOL_USAGE_INSTRUCTIONS}
 
 {SAFETY_INSTRUCTIONS}

@@ -0,0 +1,172 @@
+"""Deterministic tests for the experiment-agent read-only intent guard (#347).
+
+The guard stops a read-only question ("list the runs and tell me the lowest
+WAPE", "top products", "current deployment alias") from derailing into a
+scenario / write / experiment tool — especially on an output-format validation
+retry, where a weak local model tends to start a brand-new action instead of
+reformatting the data it already fetched.
+
+These tests are deterministic and require **no live model call**: they assert
+that the guard text exists, names the right tools, and governs each named
+read-only intent, and that the guard is actually delivered to the model in the
+system prompt.
+"""
+
+from __future__ import annotations
+
+from collections.abc import Iterator
+from unittest.mock import AsyncMock
+
+import pytest
+from pydantic_ai.messages import ModelMessage, ModelResponse, TextPart
+from pydantic_ai.models.function import AgentInfo, FunctionModel
+
+from app.core.config import get_settings
+from app.features.agents.agents.base import READ_ONLY_INTENT_GUARD
+from app.features.agents.agents.experiment import (
+    EXPERIMENT_SYSTEM_PROMPT,
+    create_experiment_agent,
+)
+from app.features.agents.deps import AgentDeps
+
+# Tools that must NEVER be called for a read-only intent.
+PROHIBITED_TOOLS = (
+    "tool_propose_scenario",
+    "tool_save_scenario",
+    "tool_create_alias",
+    "tool_archive_run",
+    "tool_run_backtest",
+)
+
+# Read-only tools the guard steers the model toward.
+ALLOWED_READ_TOOLS = (
+    "tool_list_runs",
+    "tool_get_run",
+    "tool_compare_runs",
+)
+
+
+@pytest.fixture(autouse=True)
+def _reset_settings() -> Iterator[None]:
+    """Reset the settings cache so model mutations do not leak across tests."""
+    get_settings.cache_clear()
+    yield
+    get_settings.cache_clear()
+
+
+def test_guard_is_embedded_in_experiment_prompt() -> None:
+    """The experiment system prompt embeds the read-only intent guard."""
+    assert READ_ONLY_INTENT_GUARD.strip() in EXPERIMENT_SYSTEM_PROMPT
+    assert "READ-ONLY INTENT GUARD" in EXPERIMENT_SYSTEM_PROMPT
+
+
+@pytest.mark.parametrize("tool_name", PROHIBITED_TOOLS)
+def test_guard_names_prohibited_tools(tool_name: str) -> None:
+    """The guard explicitly forbids each scenario/write/experiment tool."""
+    assert tool_name in READ_ONLY_INTENT_GUARD
+    # And the prohibition is unambiguous.
+    assert "NEVER call" in READ_ONLY_INTENT_GUARD
+
+
+@pytest.mark.parametrize("tool_name", ALLOWED_READ_TOOLS)
+def test_guard_names_allowed_read_tools(tool_name: str) -> None:
+    """The guard points the model at the read-only tools to use instead."""
+    assert tool_name in READ_ONLY_INTENT_GUARD
+    assert "Use ONLY read-only tools" in READ_ONLY_INTENT_GUARD
+
+
+def test_guard_forbids_new_tools_on_validation_retry() -> None:
+    """On an output-format retry the model must reformat, not call new tools."""
+    guard = READ_ONLY_INTENT_GUARD
+    assert "OUTPUT-FORMAT RETRIES" in guard
+    assert "DO NOT call any new tool" in guard
+    assert "reformat" in guard
+    # The exact validation-error string that triggered the original derail.
+    assert "summary: Field required" in guard
+
+
+def test_guard_requires_clarification_for_ambiguous_top_products() -> None:
+    """An ambiguous "top products" ranking gets a clarifying question, not a guess."""
+    guard = READ_ONLY_INTENT_GUARD
+    assert "top products" in guard
+    assert "Top by revenue, units sold, forecasted demand, or model error?" in guard
+
+
+def test_guard_prohibits_invented_ids() -> None:
+    """The guard forbids inventing store_id / product_id / run_id values."""
+    guard = READ_ONLY_INTENT_GUARD
+    assert "NEVER invent" in guard
+    for token in ("store_id", "product_id", "run_id"):
+        assert token in guard
+
+
+def test_guard_states_limitation_when_no_tool_exists() -> None:
+    """The guard tells the model to state a missing-tool limitation, not fabricate."""
+    guard = READ_ONLY_INTENT_GUARD
+    assert "does not have a tool for that metric" in guard
+    assert "Do NOT invent data" in guard
+
+
+# Each example read-only prompt and the guard substring that proves the guard
+# governs that intent (so the named query class can never silently lose
+# coverage). Deterministic — no model is invoked.
+@pytest.mark.parametrize(
+    ("prompt", "covered_intent"),
+    [
+        (
+            "List the most recent model runs and tell me which has the lowest WAPE.",
+            "WAPE",
+        ),
+        ("List the top products.", "top products"),
+        (
+            "Which products have the highest forecasted demand?",
+            "highest forecasted demand",
+        ),
+        ("Show the current deployment alias.", "registry aliases and deployment status"),
+        ("Summarize total revenue and units sold.", "units-sold summaries"),
+        ("Show the backtest metrics for this grain.", "backtest metrics"),
+    ],
+)
+def test_read_only_intents_are_covered_by_guard(prompt: str, covered_intent: str) -> None:
+    """Every named read-only intent is enumerated in the guard's read-only list.
+
+    This is the routing contract: each of these prompts is a read-only request,
+    and the guard's read-only example list names its intent — so the model is
+    told to answer it with read tools only and never with a scenario/write tool.
+    """
+    assert covered_intent in READ_ONLY_INTENT_GUARD
+
+
+def test_guard_is_delivered_in_system_prompt_to_model() -> None:
+    """The guard actually reaches the model in the delivered system prompt.
+
+    Builds the real experiment agent against a stub FunctionModel (no live
+    call), captures the system prompt the framework sends, and asserts the guard
+    is present. Regression for #347 — a guard that never reaches the model is
+    worthless.
+    """
+    settings = get_settings()
+    settings.agent_default_model = "ollama:llama3.1"
+    agent = create_experiment_agent()
+
+    captured: dict[str, str] = {}
+
+    def respond(messages: list[ModelMessage], info: AgentInfo) -> ModelResponse:
+        for message in messages:
+            for part in getattr(message, "parts", []):
+                if getattr(part, "part_kind", None) == "system-prompt":
+                    captured["system_prompt"] = part.content
+        # End the run with a PromptedOutput-parseable text reply.
+        return ModelResponse(parts=[TextPart(content='{"summary": "noop"}')])
+
+    agent.run_sync(
+        "List the most recent model runs and tell me which has the lowest WAPE.",
+        model=FunctionModel(respond),
+        deps=AgentDeps(db=AsyncMock(), session_id="test-read-only-guard"),
+    )
+
+    system_prompt = captured.get("system_prompt", "")
+    assert "READ-ONLY INTENT GUARD" in system_prompt
+    assert "DO NOT call any new tool" in system_prompt
+    for tool_name in PROHIBITED_TOOLS:
+        assert tool_name in system_prompt
@@ -26,7 +26,7 @@
 from sqlalchemy import select
 from sqlalchemy.ext.asyncio import AsyncSession
 
-from app.features.data_platform.models import SalesDaily
+from app.features.data_platform.models import Product, SalesDaily, Store
 from app.features.scenarios.models import SCENARIO_SOURCE_AGENT
 from app.features.scenarios.schemas import (
     CreateScenarioRequest,
@@ -52,6 +52,27 @@
 AGENT_SAVE_APPROVED_BY = "operator"
 
 
+async def _grain_exists(db: AsyncSession, store_id: int, product_id: int) -> tuple[bool, bool]:
+    """Report whether the store and product dimension rows exist.
+
+    Read-only existence probe against the ``store`` / ``product`` dimension
+    tables. Used to reject a proposal for a grain that does not exist (issue
+    #347 — a weak model derailed into proposing a what-if for a hallucinated
+    ``store_id=123`` / ``product_id=456``).
+
+    Args:
+        db: Database session.
+        store_id: Candidate store id.
+        product_id: Candidate product id.
+
+    Returns:
+        ``(store_exists, product_exists)``.
+    """
+    store_exists = await db.scalar(select(Store.id).where(Store.id == store_id)) is not None
+    product_exists = await db.scalar(select(Product.id).where(Product.id == product_id)) is not None
+    return store_exists, product_exists
+
+
 async def propose_scenario(
     db: AsyncSession,
     store_id: int,
@@ -79,6 +100,12 @@ async def propose_scenario(
         the candidate ``assumptions`` (JSON-mode dump so dates are ISO strings,
         ready to pass straight back into ``save_scenario``), and a
         plain-language ``recommendation``.
+
+        When the ``(store_id, product_id)`` grain does not exist, returns a
+        non-persistable validation error instead — ``{"valid": False,
+        "persistable": False, "error": ..., "missing": [...]}`` — so a
+        hallucinated grain (e.g. store 123 / product 456) never yields a normal
+        proposal (issue #347).
     """
     logger.info(
         "agents.scenario_tool.propose_scenario_called",
@@ -87,6 +114,36 @@ async def propose_scenario(
         horizon=horizon,
     )
 
+    # Reject a grain that does not exist before drafting anything. A weak model
+    # can hallucinate placeholder ids (store 123 / product 456); a proposal for a
+    # non-existent grain is meaningless and must never look like a normal,
+    # save-able candidate (issue #347). This is read-only and persists nothing.
+    store_exists, product_exists = await _grain_exists(db, store_id, product_id)
+    if not (store_exists and product_exists):
+        missing: list[str] = []
+        if not store_exists:
+            missing.append(f"store_id={store_id}")
+        if not product_exists:
+            missing.append(f"product_id={product_id}")
+        logger.info(
+            "agents.scenario_tool.propose_scenario_rejected_unknown_grain",
+            store_id=store_id,
+            product_id=product_id,
+            missing=missing,
+        )
+        return {
+            "valid": False,
+            "persistable": False,
+            "store_id": store_id,
+            "product_id": product_id,
+            "missing": missing,
+            "error": (
+                f"Cannot propose a scenario: {' and '.join(missing)} "
+                "do not exist. Use a real store/product pair (look one up with a "
+                "read-only tool) — do not invent identifiers."
+            ),
+        }
+
     # Read the most recent unit price for a grounded recommendation. Read-only.
     latest_price = await db.scalar(
         select(SalesDaily.unit_price)

@@ -66,6 +66,29 @@ async def db_session() -> AsyncGenerator[AsyncSession, None]:
     await engine.dispose()
 
 
+@pytest.fixture
+async def existing_grain(db_session: AsyncSession) -> AsyncGenerator[tuple[int, int], None]:
+    """Insert the Store + Product dimension rows for the test grain; clean up after.
+
+    ``propose_scenario`` now rejects a grain whose store/product do not exist
+    (#347). ``TEST_STORE_ID`` / ``TEST_PRODUCT_ID`` are deliberately high IDs no
+    seeder uses, so a read-only proposal for them needs the dimension rows seeded
+    explicitly. Removed on teardown so the grain stays absent for the
+    rejection-path tests.
+    """
+    from app.features.data_platform.models import Product, Store
+
+    db_session.add(Store(id=TEST_STORE_ID, code=f"S{TEST_STORE_ID}", name="Test Store"))
+    db_session.add(Product(id=TEST_PRODUCT_ID, sku=f"SKU{TEST_PRODUCT_ID}", name="Test Product"))
+    await db_session.commit()
+    try:
+        yield (TEST_STORE_ID, TEST_PRODUCT_ID)
+    finally:
+        await db_session.execute(delete(Product).where(Product.id == TEST_PRODUCT_ID))
+        await db_session.execute(delete(Store).where(Store.id == TEST_STORE_ID))
+        await db_session.commit()
+
+
 @pytest.fixture
 async def client(db_session: AsyncSession) -> AsyncGenerator[AsyncClient, None]:
     """Create a test client with the database dependency overridden."""