diff --git a/app/features/agents/agents/base.py b/app/features/agents/agents/base.py index 90c433b6..d098b567 100644 --- a/app/features/agents/agents/base.py +++ b/app/features/agents/agents/base.py @@ -371,3 +371,49 @@ def requires_approval(action_name: str) -> bool: - Never bypass safety checks or approval requirements - Log all significant decisions and their reasoning """ + +# Generalized read-only intent guard. Embedded in the experiment-agent prompt to +# stop a read-only question (list/rank/summarize/compare/report) from derailing +# into a scenario / write / experiment tool — especially on an output-format +# validation retry, where a weak local model tends to start a brand-new action +# instead of just reformatting the data it already fetched (issue #347). Every +# `tool_*` name referenced here is registered on the experiment agent, so the +# `test_prompts_only_reference_registered_tool_names` invariant still holds. +READ_ONLY_INTENT_GUARD = """ +READ-ONLY INTENT GUARD (apply this before every turn): +Many requests are READ-ONLY — the user wants you to look something up and report +it, not to change anything. Treat a request as READ-ONLY when it asks you to list, +show, rank, summarize, compare, or report. Examples that are ALWAYS read-only +unless the user explicitly asks to change something: +- listing or ranking stores or products (e.g. "top products") +- sales, revenue, or units-sold summaries +- forecast summaries, or which products have the highest forecasted demand +- model runs and metric comparisons, including WAPE, MAE, or RMSE +- registry aliases and deployment status +- backtest metrics +- RAG / document / knowledge questions + +For a READ-ONLY request you MUST: +- Use ONLY read-only tools: tool_list_runs, tool_get_run, tool_compare_runs, + tool_compare_backtest_results. +- NEVER call tool_propose_scenario, tool_save_scenario, tool_create_alias, + tool_archive_run, or tool_run_backtest. Those create, save, promote, archive, + run, or plan something — they are NOT allowed for a read-only question. +- Call a mutating / planning / experiment tool ONLY when the user EXPLICITLY asks + to create, save, promote, archive, run a backtest, or run an experiment. +- Answer directly in the ExperimentReport `summary` field, grounded in tool output. + +OUTPUT-FORMAT RETRIES: +- If your previous reply failed schema validation (e.g. "summary: Field required"), + DO NOT call any new tool. Only reformat the data you already obtained into a + valid ExperimentReport with a concise `summary`. A validation retry is a + formatting fix, never a reason to start a new action. + +WHEN A TOOL IS MISSING OR THE REQUEST IS AMBIGUOUS: +- If a ranking is ambiguous (e.g. "top products"), ask a clarifying question such + as: "Top by revenue, units sold, forecasted demand, or model error?" — do not guess. +- If no read-only tool exists for the requested metric, say plainly that this agent + does not have a tool for that metric. Do NOT invent data. +- NEVER invent or guess a store_id, product_id, or run_id. Use only IDs returned by + a tool or explicitly supplied by the user. +""" diff --git a/app/features/agents/agents/experiment.py b/app/features/agents/agents/experiment.py index 1b0139f6..d58b53de 100644 --- a/app/features/agents/agents/experiment.py +++ b/app/features/agents/agents/experiment.py @@ -16,6 +16,7 @@ from pydantic_ai import Agent, PromptedOutput, RunContext from app.features.agents.agents.base import ( + READ_ONLY_INTENT_GUARD, SAFETY_INSTRUCTIONS, SYSTEM_PROMPT_HEADER, TOOL_USAGE_INSTRUCTIONS, @@ -68,6 +69,8 @@ would like to experiment on. Do NOT call any tools until you have a specific objective (a store and product plus a date range, or an explicit request). +{READ_ONLY_INTENT_GUARD} + {TOOL_USAGE_INSTRUCTIONS} {SAFETY_INSTRUCTIONS} diff --git a/app/features/agents/tests/test_read_only_guard.py b/app/features/agents/tests/test_read_only_guard.py new file mode 100644 index 00000000..c59e232d --- /dev/null +++ b/app/features/agents/tests/test_read_only_guard.py @@ -0,0 +1,172 @@ +"""Deterministic tests for the experiment-agent read-only intent guard (#347). + +The guard stops a read-only question ("list the runs and tell me the lowest +WAPE", "top products", "current deployment alias") from derailing into a +scenario / write / experiment tool — especially on an output-format validation +retry, where a weak local model tends to start a brand-new action instead of +reformatting the data it already fetched. + +These tests are deterministic and require **no live model call**: they assert +that the guard text exists, names the right tools, and governs each named +read-only intent, and that the guard is actually delivered to the model in the +system prompt. +""" + +from __future__ import annotations + +from collections.abc import Iterator +from unittest.mock import AsyncMock + +import pytest +from pydantic_ai.messages import ModelMessage, ModelResponse, TextPart +from pydantic_ai.models.function import AgentInfo, FunctionModel + +from app.core.config import get_settings +from app.features.agents.agents.base import READ_ONLY_INTENT_GUARD +from app.features.agents.agents.experiment import ( + EXPERIMENT_SYSTEM_PROMPT, + create_experiment_agent, +) +from app.features.agents.deps import AgentDeps + +# Tools that must NEVER be called for a read-only intent. +PROHIBITED_TOOLS = ( + "tool_propose_scenario", + "tool_save_scenario", + "tool_create_alias", + "tool_archive_run", + "tool_run_backtest", +) + +# Read-only tools the guard steers the model toward. +ALLOWED_READ_TOOLS = ( + "tool_list_runs", + "tool_get_run", + "tool_compare_runs", +) + + +@pytest.fixture(autouse=True) +def _reset_settings() -> Iterator[None]: + """Reset the settings cache so model mutations do not leak across tests.""" + get_settings.cache_clear() + yield + get_settings.cache_clear() + + +def test_guard_is_embedded_in_experiment_prompt() -> None: + """The experiment system prompt embeds the read-only intent guard.""" + assert READ_ONLY_INTENT_GUARD.strip() in EXPERIMENT_SYSTEM_PROMPT + assert "READ-ONLY INTENT GUARD" in EXPERIMENT_SYSTEM_PROMPT + + +@pytest.mark.parametrize("tool_name", PROHIBITED_TOOLS) +def test_guard_names_prohibited_tools(tool_name: str) -> None: + """The guard explicitly forbids each scenario/write/experiment tool.""" + assert tool_name in READ_ONLY_INTENT_GUARD + # And the prohibition is unambiguous. + assert "NEVER call" in READ_ONLY_INTENT_GUARD + + +@pytest.mark.parametrize("tool_name", ALLOWED_READ_TOOLS) +def test_guard_names_allowed_read_tools(tool_name: str) -> None: + """The guard points the model at the read-only tools to use instead.""" + assert tool_name in READ_ONLY_INTENT_GUARD + assert "Use ONLY read-only tools" in READ_ONLY_INTENT_GUARD + + +def test_guard_forbids_new_tools_on_validation_retry() -> None: + """On an output-format retry the model must reformat, not call new tools.""" + guard = READ_ONLY_INTENT_GUARD + assert "OUTPUT-FORMAT RETRIES" in guard + assert "DO NOT call any new tool" in guard + assert "reformat" in guard + # The exact validation-error string that triggered the original derail. + assert "summary: Field required" in guard + + +def test_guard_requires_clarification_for_ambiguous_top_products() -> None: + """An ambiguous "top products" ranking gets a clarifying question, not a guess.""" + guard = READ_ONLY_INTENT_GUARD + assert "top products" in guard + assert "Top by revenue, units sold, forecasted demand, or model error?" in guard + + +def test_guard_prohibits_invented_ids() -> None: + """The guard forbids inventing store_id / product_id / run_id values.""" + guard = READ_ONLY_INTENT_GUARD + assert "NEVER invent" in guard + for token in ("store_id", "product_id", "run_id"): + assert token in guard + + +def test_guard_states_limitation_when_no_tool_exists() -> None: + """The guard tells the model to state a missing-tool limitation, not fabricate.""" + guard = READ_ONLY_INTENT_GUARD + assert "does not have a tool for that metric" in guard + assert "Do NOT invent data" in guard + + +# Each example read-only prompt and the guard substring that proves the guard +# governs that intent (so the named query class can never silently lose +# coverage). Deterministic — no model is invoked. +@pytest.mark.parametrize( + ("prompt", "covered_intent"), + [ + ( + "List the most recent model runs and tell me which has the lowest WAPE.", + "WAPE", + ), + ("List the top products.", "top products"), + ( + "Which products have the highest forecasted demand?", + "highest forecasted demand", + ), + ("Show the current deployment alias.", "registry aliases and deployment status"), + ("Summarize total revenue and units sold.", "units-sold summaries"), + ("Show the backtest metrics for this grain.", "backtest metrics"), + ], +) +def test_read_only_intents_are_covered_by_guard(prompt: str, covered_intent: str) -> None: + """Every named read-only intent is enumerated in the guard's read-only list. + + This is the routing contract: each of these prompts is a read-only request, + and the guard's read-only example list names its intent — so the model is + told to answer it with read tools only and never with a scenario/write tool. + """ + assert covered_intent in READ_ONLY_INTENT_GUARD + + +def test_guard_is_delivered_in_system_prompt_to_model() -> None: + """The guard actually reaches the model in the delivered system prompt. + + Builds the real experiment agent against a stub FunctionModel (no live + call), captures the system prompt the framework sends, and asserts the guard + is present. Regression for #347 — a guard that never reaches the model is + worthless. + """ + settings = get_settings() + settings.agent_default_model = "ollama:llama3.1" + agent = create_experiment_agent() + + captured: dict[str, str] = {} + + def respond(messages: list[ModelMessage], info: AgentInfo) -> ModelResponse: + for message in messages: + for part in getattr(message, "parts", []): + if getattr(part, "part_kind", None) == "system-prompt": + captured["system_prompt"] = part.content + # End the run with a PromptedOutput-parseable text reply. + return ModelResponse(parts=[TextPart(content='{"summary": "noop"}')]) + + agent.run_sync( + "List the most recent model runs and tell me which has the lowest WAPE.", + model=FunctionModel(respond), + deps=AgentDeps(db=AsyncMock(), session_id="test-read-only-guard"), + ) + + system_prompt = captured.get("system_prompt", "") + assert "READ-ONLY INTENT GUARD" in system_prompt + assert "DO NOT call any new tool" in system_prompt + for tool_name in PROHIBITED_TOOLS: + assert tool_name in system_prompt diff --git a/app/features/scenarios/agent_tools.py b/app/features/scenarios/agent_tools.py index ab99c17c..b987afca 100644 --- a/app/features/scenarios/agent_tools.py +++ b/app/features/scenarios/agent_tools.py @@ -26,7 +26,7 @@ from sqlalchemy import select from sqlalchemy.ext.asyncio import AsyncSession -from app.features.data_platform.models import SalesDaily +from app.features.data_platform.models import Product, SalesDaily, Store from app.features.scenarios.models import SCENARIO_SOURCE_AGENT from app.features.scenarios.schemas import ( CreateScenarioRequest, @@ -52,6 +52,27 @@ AGENT_SAVE_APPROVED_BY = "operator" +async def _grain_exists(db: AsyncSession, store_id: int, product_id: int) -> tuple[bool, bool]: + """Report whether the store and product dimension rows exist. + + Read-only existence probe against the ``store`` / ``product`` dimension + tables. Used to reject a proposal for a grain that does not exist (issue + #347 — a weak model derailed into proposing a what-if for a hallucinated + ``store_id=123`` / ``product_id=456``). + + Args: + db: Database session. + store_id: Candidate store id. + product_id: Candidate product id. + + Returns: + ``(store_exists, product_exists)``. + """ + store_exists = await db.scalar(select(Store.id).where(Store.id == store_id)) is not None + product_exists = await db.scalar(select(Product.id).where(Product.id == product_id)) is not None + return store_exists, product_exists + + async def propose_scenario( db: AsyncSession, store_id: int, @@ -79,6 +100,12 @@ async def propose_scenario( the candidate ``assumptions`` (JSON-mode dump so dates are ISO strings, ready to pass straight back into ``save_scenario``), and a plain-language ``recommendation``. + + When the ``(store_id, product_id)`` grain does not exist, returns a + non-persistable validation error instead — ``{"valid": False, + "persistable": False, "error": ..., "missing": [...]}`` — so a + hallucinated grain (e.g. store 123 / product 456) never yields a normal + proposal (issue #347). """ logger.info( "agents.scenario_tool.propose_scenario_called", @@ -87,6 +114,36 @@ async def propose_scenario( horizon=horizon, ) + # Reject a grain that does not exist before drafting anything. A weak model + # can hallucinate placeholder ids (store 123 / product 456); a proposal for a + # non-existent grain is meaningless and must never look like a normal, + # save-able candidate (issue #347). This is read-only and persists nothing. + store_exists, product_exists = await _grain_exists(db, store_id, product_id) + if not (store_exists and product_exists): + missing: list[str] = [] + if not store_exists: + missing.append(f"store_id={store_id}") + if not product_exists: + missing.append(f"product_id={product_id}") + logger.info( + "agents.scenario_tool.propose_scenario_rejected_unknown_grain", + store_id=store_id, + product_id=product_id, + missing=missing, + ) + return { + "valid": False, + "persistable": False, + "store_id": store_id, + "product_id": product_id, + "missing": missing, + "error": ( + f"Cannot propose a scenario: {' and '.join(missing)} " + "do not exist. Use a real store/product pair (look one up with a " + "read-only tool) — do not invent identifiers." + ), + } + # Read the most recent unit price for a grounded recommendation. Read-only. latest_price = await db.scalar( select(SalesDaily.unit_price) diff --git a/app/features/scenarios/tests/conftest.py b/app/features/scenarios/tests/conftest.py index c5ebc731..0ef3ca06 100644 --- a/app/features/scenarios/tests/conftest.py +++ b/app/features/scenarios/tests/conftest.py @@ -66,6 +66,29 @@ async def db_session() -> AsyncGenerator[AsyncSession, None]: await engine.dispose() +@pytest.fixture +async def existing_grain(db_session: AsyncSession) -> AsyncGenerator[tuple[int, int], None]: + """Insert the Store + Product dimension rows for the test grain; clean up after. + + ``propose_scenario`` now rejects a grain whose store/product do not exist + (#347). ``TEST_STORE_ID`` / ``TEST_PRODUCT_ID`` are deliberately high IDs no + seeder uses, so a read-only proposal for them needs the dimension rows seeded + explicitly. Removed on teardown so the grain stays absent for the + rejection-path tests. + """ + from app.features.data_platform.models import Product, Store + + db_session.add(Store(id=TEST_STORE_ID, code=f"S{TEST_STORE_ID}", name="Test Store")) + db_session.add(Product(id=TEST_PRODUCT_ID, sku=f"SKU{TEST_PRODUCT_ID}", name="Test Product")) + await db_session.commit() + try: + yield (TEST_STORE_ID, TEST_PRODUCT_ID) + finally: + await db_session.execute(delete(Product).where(Product.id == TEST_PRODUCT_ID)) + await db_session.execute(delete(Store).where(Store.id == TEST_STORE_ID)) + await db_session.commit() + + @pytest.fixture async def client(db_session: AsyncSession) -> AsyncGenerator[AsyncClient, None]: """Create a test client with the database dependency overridden.""" diff --git a/app/features/scenarios/tests/test_agent_tools.py b/app/features/scenarios/tests/test_agent_tools.py index 5f1ba739..95400c92 100644 --- a/app/features/scenarios/tests/test_agent_tools.py +++ b/app/features/scenarios/tests/test_agent_tools.py @@ -14,6 +14,7 @@ import uuid from datetime import UTC, datetime, timedelta +from unittest.mock import AsyncMock import pytest from sqlalchemy import delete, func, select @@ -35,18 +36,85 @@ def test_save_scenario_requires_approval() -> None: assert requires_approval("save_scenario") is True +class TestProposeScenarioEntityValidation: + """propose_scenario rejects a grain whose store/product do not exist (#347). + + Unit-level (mocked DB): the entity-existence probe (`_grain_exists`) issues + `db.scalar(...)` for the store id, then the product id, then — only on the + valid path — the latest unit price. Driving `db.scalar` with a `side_effect` + list exercises every branch with no database. These run in the fast + ``not integration`` gate. + """ + + @staticmethod + def _mock_db(scalar_returns: list[object]) -> AsyncMock: + """An AsyncSession-shaped mock whose `scalar` yields the given values.""" + db = AsyncMock(spec=AsyncSession) + db.scalar = AsyncMock(side_effect=scalar_returns) + return db + + async def test_existing_grain_returns_a_proposal(self) -> None: + """A real store/product pair yields a normal, save-able proposal.""" + db = self._mock_db([5, 8, 12.5]) # store id, product id, latest price + + result = await propose_scenario( + db, store_id=5, product_id=8, horizon=14, objective="grow demand" + ) + + assert result.get("valid") is not False + assert "assumptions" in result + assert "recommendation" in result + ScenarioAssumptions.model_validate(result["assumptions"]) + + async def test_nonexistent_grain_is_rejected(self) -> None: + """A grain with no store and no product row is rejected, not proposed.""" + db = self._mock_db([None, None]) + + result = await propose_scenario( + db, store_id=77001, product_id=77002, horizon=14, objective="x" + ) + + assert result["valid"] is False + assert result["persistable"] is False + assert "assumptions" not in result + assert "store_id=77001" in result["missing"] + assert "product_id=77002" in result["missing"] + + async def test_hallucinated_123_456_is_rejected(self) -> None: + """The exact hallucinated store 123 / product 456 case never proposes.""" + db = self._mock_db([None, None]) + + result = await propose_scenario(db, store_id=123, product_id=456, horizon=14, objective="") + + assert result["valid"] is False + assert "assumptions" not in result + # Read-only rejection — nothing is written. + db.add.assert_not_called() + db.commit.assert_not_awaited() + + async def test_missing_product_only_is_rejected(self) -> None: + """A real store but a missing product is still rejected.""" + db = self._mock_db([5, None]) + + result = await propose_scenario(db, store_id=5, product_id=999999, horizon=7, objective="") + + assert result["valid"] is False + assert result["missing"] == ["product_id=999999"] + + @pytest.mark.integration class TestProposeScenario: - """propose_scenario drafts a candidate and persists nothing.""" + """propose_scenario drafts a candidate (for a real grain) and persists nothing.""" async def test_returns_valid_assumptions_and_recommendation( - self, db_session: AsyncSession + self, db_session: AsyncSession, existing_grain: tuple[int, int] ) -> None: """A default objective yields a valid price-cut candidate.""" + store_id, product_id = existing_grain result = await propose_scenario( db_session, - store_id=TEST_STORE_ID, - product_id=TEST_PRODUCT_ID, + store_id=store_id, + product_id=product_id, horizon=14, objective="grow demand for the summer range", ) @@ -58,12 +126,15 @@ async def test_returns_valid_assumptions_and_recommendation( assert isinstance(result["recommendation"], str) assert result["recommendation"] - async def test_promotion_keyword_proposes_a_promotion(self, db_session: AsyncSession) -> None: + async def test_promotion_keyword_proposes_a_promotion( + self, db_session: AsyncSession, existing_grain: tuple[int, int] + ) -> None: """An objective mentioning a promotion steers the candidate accordingly.""" + store_id, product_id = existing_grain result = await propose_scenario( db_session, - store_id=TEST_STORE_ID, - product_id=TEST_PRODUCT_ID, + store_id=store_id, + product_id=product_id, horizon=7, objective="run a promotion next week", ) @@ -72,12 +143,15 @@ async def test_promotion_keyword_proposes_a_promotion(self, db_session: AsyncSes assert assumptions.promotion is not None assert assumptions.price is None - async def test_persists_no_row(self, db_session: AsyncSession) -> None: + async def test_persists_no_row( + self, db_session: AsyncSession, existing_grain: tuple[int, int] + ) -> None: """propose_scenario is read-only — it never writes a scenario_plan row.""" + store_id, product_id = existing_grain await propose_scenario( db_session, - store_id=TEST_STORE_ID, - product_id=TEST_PRODUCT_ID, + store_id=store_id, + product_id=product_id, horizon=10, objective="test", ) @@ -85,6 +159,31 @@ async def test_persists_no_row(self, db_session: AsyncSession) -> None: assert count == 0 +@pytest.mark.integration +class TestProposeScenarioRejectsUnknownGrain: + """propose_scenario rejects a non-existent grain against a real DB (#347).""" + + async def test_rejects_nonexistent_grain_and_persists_nothing( + self, db_session: AsyncSession + ) -> None: + """A grain with no dimension rows is rejected and writes no plan.""" + # IDs far above any seeded range — guaranteed absent. + result = await propose_scenario( + db_session, + store_id=9_999_001, + product_id=9_999_002, + horizon=14, + objective="grow demand", + ) + + assert result["valid"] is False + assert result["persistable"] is False + assert "assumptions" not in result + + count = await db_session.scalar(select(func.count()).select_from(ScenarioPlan)) + assert count == 0 + + @pytest.mark.integration class TestSaveScenario: """save_scenario persists a plan stamped with agent provenance."""