Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
46 changes: 46 additions & 0 deletions app/features/agents/agents/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -371,3 +371,49 @@ def requires_approval(action_name: str) -> bool:
- Never bypass safety checks or approval requirements
- Log all significant decisions and their reasoning
"""

# Generalized read-only intent guard. Embedded in the experiment-agent prompt to
# stop a read-only question (list/rank/summarize/compare/report) from derailing
# into a scenario / write / experiment tool — especially on an output-format
# validation retry, where a weak local model tends to start a brand-new action
# instead of just reformatting the data it already fetched (issue #347). Every
# `tool_*` name referenced here is registered on the experiment agent, so the
# `test_prompts_only_reference_registered_tool_names` invariant still holds.
READ_ONLY_INTENT_GUARD = """
READ-ONLY INTENT GUARD (apply this before every turn):
Many requests are READ-ONLY — the user wants you to look something up and report
it, not to change anything. Treat a request as READ-ONLY when it asks you to list,
show, rank, summarize, compare, or report. Examples that are ALWAYS read-only
unless the user explicitly asks to change something:
- listing or ranking stores or products (e.g. "top products")
- sales, revenue, or units-sold summaries
- forecast summaries, or which products have the highest forecasted demand
- model runs and metric comparisons, including WAPE, MAE, or RMSE
- registry aliases and deployment status
- backtest metrics
- RAG / document / knowledge questions

For a READ-ONLY request you MUST:
- Use ONLY read-only tools: tool_list_runs, tool_get_run, tool_compare_runs,
tool_compare_backtest_results.
- NEVER call tool_propose_scenario, tool_save_scenario, tool_create_alias,
tool_archive_run, or tool_run_backtest. Those create, save, promote, archive,
run, or plan something — they are NOT allowed for a read-only question.
- Call a mutating / planning / experiment tool ONLY when the user EXPLICITLY asks
to create, save, promote, archive, run a backtest, or run an experiment.
- Answer directly in the ExperimentReport `summary` field, grounded in tool output.

OUTPUT-FORMAT RETRIES:
- If your previous reply failed schema validation (e.g. "summary: Field required"),
DO NOT call any new tool. Only reformat the data you already obtained into a
valid ExperimentReport with a concise `summary`. A validation retry is a
formatting fix, never a reason to start a new action.

WHEN A TOOL IS MISSING OR THE REQUEST IS AMBIGUOUS:
- If a ranking is ambiguous (e.g. "top products"), ask a clarifying question such
as: "Top by revenue, units sold, forecasted demand, or model error?" — do not guess.
- If no read-only tool exists for the requested metric, say plainly that this agent
does not have a tool for that metric. Do NOT invent data.
- NEVER invent or guess a store_id, product_id, or run_id. Use only IDs returned by
a tool or explicitly supplied by the user.
"""
3 changes: 3 additions & 0 deletions app/features/agents/agents/experiment.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
from pydantic_ai import Agent, PromptedOutput, RunContext

from app.features.agents.agents.base import (
READ_ONLY_INTENT_GUARD,
SAFETY_INSTRUCTIONS,
SYSTEM_PROMPT_HEADER,
TOOL_USAGE_INSTRUCTIONS,
Expand Down Expand Up @@ -68,6 +69,8 @@
would like to experiment on. Do NOT call any tools until you have a specific
objective (a store and product plus a date range, or an explicit request).

{READ_ONLY_INTENT_GUARD}

{TOOL_USAGE_INSTRUCTIONS}

{SAFETY_INSTRUCTIONS}
Expand Down
172 changes: 172 additions & 0 deletions app/features/agents/tests/test_read_only_guard.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,172 @@
"""Deterministic tests for the experiment-agent read-only intent guard (#347).

The guard stops a read-only question ("list the runs and tell me the lowest
WAPE", "top products", "current deployment alias") from derailing into a
scenario / write / experiment tool — especially on an output-format validation
retry, where a weak local model tends to start a brand-new action instead of
reformatting the data it already fetched.

These tests are deterministic and require **no live model call**: they assert
that the guard text exists, names the right tools, and governs each named
read-only intent, and that the guard is actually delivered to the model in the
system prompt.
"""

from __future__ import annotations

from collections.abc import Iterator
from unittest.mock import AsyncMock

import pytest
from pydantic_ai.messages import ModelMessage, ModelResponse, TextPart
from pydantic_ai.models.function import AgentInfo, FunctionModel

from app.core.config import get_settings
from app.features.agents.agents.base import READ_ONLY_INTENT_GUARD
from app.features.agents.agents.experiment import (
EXPERIMENT_SYSTEM_PROMPT,
create_experiment_agent,
)
from app.features.agents.deps import AgentDeps

# Tools that must NEVER be called for a read-only intent.
PROHIBITED_TOOLS = (
"tool_propose_scenario",
"tool_save_scenario",
"tool_create_alias",
"tool_archive_run",
"tool_run_backtest",
)

# Read-only tools the guard steers the model toward.
ALLOWED_READ_TOOLS = (
"tool_list_runs",
"tool_get_run",
"tool_compare_runs",
)


@pytest.fixture(autouse=True)
def _reset_settings() -> Iterator[None]:
"""Reset the settings cache so model mutations do not leak across tests."""
get_settings.cache_clear()
yield
get_settings.cache_clear()


def test_guard_is_embedded_in_experiment_prompt() -> None:
"""The experiment system prompt embeds the read-only intent guard."""
assert READ_ONLY_INTENT_GUARD.strip() in EXPERIMENT_SYSTEM_PROMPT
assert "READ-ONLY INTENT GUARD" in EXPERIMENT_SYSTEM_PROMPT


@pytest.mark.parametrize("tool_name", PROHIBITED_TOOLS)
def test_guard_names_prohibited_tools(tool_name: str) -> None:
"""The guard explicitly forbids each scenario/write/experiment tool."""
assert tool_name in READ_ONLY_INTENT_GUARD
# And the prohibition is unambiguous.
assert "NEVER call" in READ_ONLY_INTENT_GUARD


@pytest.mark.parametrize("tool_name", ALLOWED_READ_TOOLS)
def test_guard_names_allowed_read_tools(tool_name: str) -> None:
"""The guard points the model at the read-only tools to use instead."""
assert tool_name in READ_ONLY_INTENT_GUARD
assert "Use ONLY read-only tools" in READ_ONLY_INTENT_GUARD


def test_guard_forbids_new_tools_on_validation_retry() -> None:
"""On an output-format retry the model must reformat, not call new tools."""
guard = READ_ONLY_INTENT_GUARD
assert "OUTPUT-FORMAT RETRIES" in guard
assert "DO NOT call any new tool" in guard
assert "reformat" in guard
# The exact validation-error string that triggered the original derail.
assert "summary: Field required" in guard


def test_guard_requires_clarification_for_ambiguous_top_products() -> None:
"""An ambiguous "top products" ranking gets a clarifying question, not a guess."""
guard = READ_ONLY_INTENT_GUARD
assert "top products" in guard
assert "Top by revenue, units sold, forecasted demand, or model error?" in guard


def test_guard_prohibits_invented_ids() -> None:
"""The guard forbids inventing store_id / product_id / run_id values."""
guard = READ_ONLY_INTENT_GUARD
assert "NEVER invent" in guard
for token in ("store_id", "product_id", "run_id"):
assert token in guard


def test_guard_states_limitation_when_no_tool_exists() -> None:
"""The guard tells the model to state a missing-tool limitation, not fabricate."""
guard = READ_ONLY_INTENT_GUARD
assert "does not have a tool for that metric" in guard
assert "Do NOT invent data" in guard


# Each example read-only prompt and the guard substring that proves the guard
# governs that intent (so the named query class can never silently lose
# coverage). Deterministic — no model is invoked.
@pytest.mark.parametrize(
("prompt", "covered_intent"),
[
(
"List the most recent model runs and tell me which has the lowest WAPE.",
"WAPE",
),
("List the top products.", "top products"),
(
"Which products have the highest forecasted demand?",
"highest forecasted demand",
),
("Show the current deployment alias.", "registry aliases and deployment status"),
("Summarize total revenue and units sold.", "units-sold summaries"),
("Show the backtest metrics for this grain.", "backtest metrics"),
],
)
def test_read_only_intents_are_covered_by_guard(prompt: str, covered_intent: str) -> None:
"""Every named read-only intent is enumerated in the guard's read-only list.

This is the routing contract: each of these prompts is a read-only request,
and the guard's read-only example list names its intent — so the model is
told to answer it with read tools only and never with a scenario/write tool.
"""
assert covered_intent in READ_ONLY_INTENT_GUARD


def test_guard_is_delivered_in_system_prompt_to_model() -> None:
"""The guard actually reaches the model in the delivered system prompt.

Builds the real experiment agent against a stub FunctionModel (no live
call), captures the system prompt the framework sends, and asserts the guard
is present. Regression for #347 — a guard that never reaches the model is
worthless.
"""
settings = get_settings()
settings.agent_default_model = "ollama:llama3.1"
agent = create_experiment_agent()

captured: dict[str, str] = {}

def respond(messages: list[ModelMessage], info: AgentInfo) -> ModelResponse:
for message in messages:
for part in getattr(message, "parts", []):
if getattr(part, "part_kind", None) == "system-prompt":
captured["system_prompt"] = part.content
# End the run with a PromptedOutput-parseable text reply.
return ModelResponse(parts=[TextPart(content='{"summary": "noop"}')])

agent.run_sync(
"List the most recent model runs and tell me which has the lowest WAPE.",
model=FunctionModel(respond),
deps=AgentDeps(db=AsyncMock(), session_id="test-read-only-guard"),
)

system_prompt = captured.get("system_prompt", "")
assert "READ-ONLY INTENT GUARD" in system_prompt
assert "DO NOT call any new tool" in system_prompt
for tool_name in PROHIBITED_TOOLS:
assert tool_name in system_prompt
59 changes: 58 additions & 1 deletion app/features/scenarios/agent_tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
from sqlalchemy import select
from sqlalchemy.ext.asyncio import AsyncSession

from app.features.data_platform.models import SalesDaily
from app.features.data_platform.models import Product, SalesDaily, Store
from app.features.scenarios.models import SCENARIO_SOURCE_AGENT
from app.features.scenarios.schemas import (
CreateScenarioRequest,
Expand All @@ -52,6 +52,27 @@
AGENT_SAVE_APPROVED_BY = "operator"


async def _grain_exists(db: AsyncSession, store_id: int, product_id: int) -> tuple[bool, bool]:
"""Report whether the store and product dimension rows exist.

Read-only existence probe against the ``store`` / ``product`` dimension
tables. Used to reject a proposal for a grain that does not exist (issue
#347 — a weak model derailed into proposing a what-if for a hallucinated
``store_id=123`` / ``product_id=456``).

Args:
db: Database session.
store_id: Candidate store id.
product_id: Candidate product id.

Returns:
``(store_exists, product_exists)``.
"""
store_exists = await db.scalar(select(Store.id).where(Store.id == store_id)) is not None
product_exists = await db.scalar(select(Product.id).where(Product.id == product_id)) is not None
return store_exists, product_exists


async def propose_scenario(
db: AsyncSession,
store_id: int,
Expand Down Expand Up @@ -79,6 +100,12 @@ async def propose_scenario(
the candidate ``assumptions`` (JSON-mode dump so dates are ISO strings,
ready to pass straight back into ``save_scenario``), and a
plain-language ``recommendation``.

When the ``(store_id, product_id)`` grain does not exist, returns a
non-persistable validation error instead — ``{"valid": False,
"persistable": False, "error": ..., "missing": [...]}`` — so a
hallucinated grain (e.g. store 123 / product 456) never yields a normal
proposal (issue #347).
"""
logger.info(
"agents.scenario_tool.propose_scenario_called",
Expand All @@ -87,6 +114,36 @@ async def propose_scenario(
horizon=horizon,
)

# Reject a grain that does not exist before drafting anything. A weak model
# can hallucinate placeholder ids (store 123 / product 456); a proposal for a
# non-existent grain is meaningless and must never look like a normal,
# save-able candidate (issue #347). This is read-only and persists nothing.
store_exists, product_exists = await _grain_exists(db, store_id, product_id)
if not (store_exists and product_exists):
missing: list[str] = []
if not store_exists:
missing.append(f"store_id={store_id}")
if not product_exists:
missing.append(f"product_id={product_id}")
logger.info(
"agents.scenario_tool.propose_scenario_rejected_unknown_grain",
store_id=store_id,
product_id=product_id,
missing=missing,
)
return {
"valid": False,
"persistable": False,
"store_id": store_id,
"product_id": product_id,
"missing": missing,
"error": (
f"Cannot propose a scenario: {' and '.join(missing)} "
"do not exist. Use a real store/product pair (look one up with a "
"read-only tool) — do not invent identifiers."
),
}

# Read the most recent unit price for a grounded recommendation. Read-only.
latest_price = await db.scalar(
select(SalesDaily.unit_price)
Expand Down
23 changes: 23 additions & 0 deletions app/features/scenarios/tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,29 @@ async def db_session() -> AsyncGenerator[AsyncSession, None]:
await engine.dispose()


@pytest.fixture
async def existing_grain(db_session: AsyncSession) -> AsyncGenerator[tuple[int, int], None]:
"""Insert the Store + Product dimension rows for the test grain; clean up after.

``propose_scenario`` now rejects a grain whose store/product do not exist
(#347). ``TEST_STORE_ID`` / ``TEST_PRODUCT_ID`` are deliberately high IDs no
seeder uses, so a read-only proposal for them needs the dimension rows seeded
explicitly. Removed on teardown so the grain stays absent for the
rejection-path tests.
"""
from app.features.data_platform.models import Product, Store

db_session.add(Store(id=TEST_STORE_ID, code=f"S{TEST_STORE_ID}", name="Test Store"))
db_session.add(Product(id=TEST_PRODUCT_ID, sku=f"SKU{TEST_PRODUCT_ID}", name="Test Product"))
await db_session.commit()
try:
yield (TEST_STORE_ID, TEST_PRODUCT_ID)
finally:
await db_session.execute(delete(Product).where(Product.id == TEST_PRODUCT_ID))
await db_session.execute(delete(Store).where(Store.id == TEST_STORE_ID))
await db_session.commit()


@pytest.fixture
async def client(db_session: AsyncSession) -> AsyncGenerator[AsyncClient, None]:
"""Create a test client with the database dependency overridden."""
Expand Down
Loading