Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 20 additions & 0 deletions apps/pii/server.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
endpoints so a single PRESIDIO_URL serves both.
"""

import logging
import time
from typing import Any

from fastapi import FastAPI
Expand Down Expand Up @@ -133,6 +135,9 @@ def build_analyzer() -> AnalyzerEngine:
analyzer = build_analyzer()
anonymizer = AnonymizerEngine()

# Propagates to uvicorn's root handler, so timing lands in the container log stream.
logger = logging.getLogger("sim.pii")

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P1 Timing Logger Is Unhandled

Under the sidecar's default uvicorn server:app startup, uvicorn configures its own named loggers rather than a root handler for sim.pii. The new /analyze and /anonymize logger.info calls can be dropped silently, so the timing logs described by the PR never reach the container log stream.

Note: If this suggestion doesn't match your team's coding style, reply to this and let me know. I'll remember it for next time!


app = FastAPI(title="Sim Presidio", docs_url=None, redoc_url=None)


Expand Down Expand Up @@ -163,18 +168,27 @@ def supported_entities(language: str = "en") -> list[str]:

@app.post("/analyze")
def analyze(req: AnalyzeRequest) -> list[dict[str, Any]]:
started = time.perf_counter()
results = analyzer.analyze(
text=req.text,
language=req.language,
entities=req.entities or None,
score_threshold=req.score_threshold,
Comment on lines 172 to 176

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 Analyze Failures Skip Timing

If analyzer.analyze(...) raises for a request, FastAPI returns an error before the new duration_ms log is emitted. Failed or slow /analyze calls then disappear from the sidecar timing stream, which weakens the latency signal during Presidio failures.

return_decision_process=req.return_decision_process,
)
logger.info(
"analyze lang=%s chars=%d entities=%d duration_ms=%.1f",
req.language,
len(req.text),
len(results),
(time.perf_counter() - started) * 1000,
)
return [r.to_dict() for r in results]


@app.post("/anonymize")
def anonymize(req: AnonymizeRequest) -> dict[str, Any]:
started = time.perf_counter()
analyzer_results = [
RecognizerResult(
entity_type=r["entity_type"],
Expand All @@ -197,6 +211,12 @@ def anonymize(req: AnonymizeRequest) -> dict[str, Any]:
analyzer_results=analyzer_results,
operators=operators,
)
logger.info(
"anonymize chars=%d spans=%d duration_ms=%.1f",
len(req.text),
len(analyzer_results),
(time.perf_counter() - started) * 1000,
)
return {
"text": result.text,
"items": [
Expand Down
6 changes: 5 additions & 1 deletion apps/sim/app/api/guardrails/mask-batch/route.ts
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,12 @@ export const POST = withRouteHandler(async (request: NextRequest) => {
const { texts, entityTypes, language } = parsed.data.body

try {
const startedAt = performance.now()
const masked = await maskPIIBatch(texts, entityTypes, language)
logger.info('Masked PII batch', { count: texts.length })
logger.info('Masked PII batch', {
count: texts.length,
durationMs: Math.round(performance.now() - startedAt),
})
return NextResponse.json({ masked })
} catch (error) {
// An unreachable/misconfigured Presidio sidecar makes maskPIIBatch throw; fail
Expand Down
11 changes: 11 additions & 0 deletions apps/sim/lib/logs/execution/pii-redaction.ts
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,7 @@ export async function redactPIIFromExecution(
): Promise<RedactablePayload> {
const { entityTypes } = options
const language = options.language ?? 'en'
const startedAt = performance.now()

const units = REDACTABLE_KEYS.filter((key) => payload[key] !== undefined).map((key) => ({
key,
Expand All @@ -151,12 +152,14 @@ export async function redactPIIFromExecution(
if (collected.length === 0) return payload

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P1 No-String Executions Skip Metrics

When an execution payload has no collected strings, this early return skips the new PII redaction completed log. Those valid no-op executions become indistinguishable from cases where the redaction path did not run or logging failed, so per-execution dashboards undercount the persist path.

Suggested change
if (collected.length === 0) return payload
if (collected.length === 0) {
logger.info('PII redaction completed', {
stringCount: 0,
totalBytes,
durationMs: Math.round(performance.now() - startedAt),
scrubbed: false,
})
return payload
}


let masked: string[]
let scrubbed = false
if (totalBytes > PII_MAX_TOTAL_BYTES) {
logger.warn('Execution exceeds PII redaction ceiling; scrubbing text', {
totalBytes,
ceiling: PII_MAX_TOTAL_BYTES,
})
masked = collected.map(() => REDACTION_FAILED_MARKER)
scrubbed = true
} else {
try {
// Presidio runs only in the app container; the persist path also runs in
Expand All @@ -168,6 +171,7 @@ export async function redactPIIFromExecution(
stringCount: collected.length,
})
masked = collected.map(() => REDACTION_FAILED_MARKER)
scrubbed = true
}
}

Expand All @@ -176,5 +180,12 @@ export async function redactPIIFromExecution(
for (const unit of units) {
result[unit.key] = transformUnit(unit.key, unit.value, () => masked[index++])
}

logger.info('PII redaction completed', {
stringCount: collected.length,
totalBytes,
durationMs: Math.round(performance.now() - startedAt),
scrubbed,
})
Comment on lines +184 to +189

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P1 Production Suppresses Completion Logs

The app logger's production default suppresses info messages unless the deployment lowers LOG_LEVEL. In that common production state, successful redactions never emit PII redaction completed, so the per-execution timing metric added here is not queryable after deployment.

Note: If this suggestion doesn't match your team's coding style, reply to this and let me know. I'll remember it for next time!

return result
}
Loading