-
Notifications
You must be signed in to change notification settings - Fork 3.7k
feat(pii): add redaction timing metrics across sidecar and persist path #5264
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -5,6 +5,8 @@ | |
| endpoints so a single PRESIDIO_URL serves both. | ||
| """ | ||
|
|
||
| import logging | ||
| import time | ||
| from typing import Any | ||
|
|
||
| from fastapi import FastAPI | ||
|
|
@@ -133,6 +135,9 @@ def build_analyzer() -> AnalyzerEngine: | |
| analyzer = build_analyzer() | ||
| anonymizer = AnonymizerEngine() | ||
|
|
||
| # Propagates to uvicorn's root handler, so timing lands in the container log stream. | ||
| logger = logging.getLogger("sim.pii") | ||
|
|
||
| app = FastAPI(title="Sim Presidio", docs_url=None, redoc_url=None) | ||
|
|
||
|
|
||
|
|
@@ -163,18 +168,27 @@ def supported_entities(language: str = "en") -> list[str]: | |
|
|
||
| @app.post("/analyze") | ||
| def analyze(req: AnalyzeRequest) -> list[dict[str, Any]]: | ||
| started = time.perf_counter() | ||
| results = analyzer.analyze( | ||
| text=req.text, | ||
| language=req.language, | ||
| entities=req.entities or None, | ||
| score_threshold=req.score_threshold, | ||
|
Comment on lines
172
to
176
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. |
||
| return_decision_process=req.return_decision_process, | ||
| ) | ||
| logger.info( | ||
| "analyze lang=%s chars=%d entities=%d duration_ms=%.1f", | ||
| req.language, | ||
| len(req.text), | ||
| len(results), | ||
| (time.perf_counter() - started) * 1000, | ||
| ) | ||
| return [r.to_dict() for r in results] | ||
|
|
||
|
|
||
| @app.post("/anonymize") | ||
| def anonymize(req: AnonymizeRequest) -> dict[str, Any]: | ||
| started = time.perf_counter() | ||
| analyzer_results = [ | ||
| RecognizerResult( | ||
| entity_type=r["entity_type"], | ||
|
|
@@ -197,6 +211,12 @@ def anonymize(req: AnonymizeRequest) -> dict[str, Any]: | |
| analyzer_results=analyzer_results, | ||
| operators=operators, | ||
| ) | ||
| logger.info( | ||
| "anonymize chars=%d spans=%d duration_ms=%.1f", | ||
| len(req.text), | ||
| len(analyzer_results), | ||
| (time.perf_counter() - started) * 1000, | ||
| ) | ||
| return { | ||
| "text": result.text, | ||
| "items": [ | ||
|
|
||
| Original file line number | Diff line number | Diff line change | ||||||||||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
|
@@ -132,6 +132,7 @@ export async function redactPIIFromExecution( | |||||||||||||||||||||
| ): Promise<RedactablePayload> { | ||||||||||||||||||||||
| const { entityTypes } = options | ||||||||||||||||||||||
| const language = options.language ?? 'en' | ||||||||||||||||||||||
| const startedAt = performance.now() | ||||||||||||||||||||||
|
|
||||||||||||||||||||||
| const units = REDACTABLE_KEYS.filter((key) => payload[key] !== undefined).map((key) => ({ | ||||||||||||||||||||||
| key, | ||||||||||||||||||||||
|
|
@@ -151,12 +152,14 @@ export async function redactPIIFromExecution( | |||||||||||||||||||||
| if (collected.length === 0) return payload | ||||||||||||||||||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
When an execution payload has no collected strings, this early return skips the new
Suggested change
|
||||||||||||||||||||||
|
|
||||||||||||||||||||||
| let masked: string[] | ||||||||||||||||||||||
| let scrubbed = false | ||||||||||||||||||||||
| if (totalBytes > PII_MAX_TOTAL_BYTES) { | ||||||||||||||||||||||
| logger.warn('Execution exceeds PII redaction ceiling; scrubbing text', { | ||||||||||||||||||||||
| totalBytes, | ||||||||||||||||||||||
| ceiling: PII_MAX_TOTAL_BYTES, | ||||||||||||||||||||||
| }) | ||||||||||||||||||||||
| masked = collected.map(() => REDACTION_FAILED_MARKER) | ||||||||||||||||||||||
| scrubbed = true | ||||||||||||||||||||||
| } else { | ||||||||||||||||||||||
| try { | ||||||||||||||||||||||
| // Presidio runs only in the app container; the persist path also runs in | ||||||||||||||||||||||
|
|
@@ -168,6 +171,7 @@ export async function redactPIIFromExecution( | |||||||||||||||||||||
| stringCount: collected.length, | ||||||||||||||||||||||
| }) | ||||||||||||||||||||||
| masked = collected.map(() => REDACTION_FAILED_MARKER) | ||||||||||||||||||||||
| scrubbed = true | ||||||||||||||||||||||
| } | ||||||||||||||||||||||
| } | ||||||||||||||||||||||
|
|
||||||||||||||||||||||
|
|
@@ -176,5 +180,12 @@ export async function redactPIIFromExecution( | |||||||||||||||||||||
| for (const unit of units) { | ||||||||||||||||||||||
| result[unit.key] = transformUnit(unit.key, unit.value, () => masked[index++]) | ||||||||||||||||||||||
| } | ||||||||||||||||||||||
|
|
||||||||||||||||||||||
| logger.info('PII redaction completed', { | ||||||||||||||||||||||
| stringCount: collected.length, | ||||||||||||||||||||||
| totalBytes, | ||||||||||||||||||||||
| durationMs: Math.round(performance.now() - startedAt), | ||||||||||||||||||||||
| scrubbed, | ||||||||||||||||||||||
| }) | ||||||||||||||||||||||
|
Comment on lines
+184
to
+189
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
The app logger's production default suppresses Note: If this suggestion doesn't match your team's coding style, reply to this and let me know. I'll remember it for next time! |
||||||||||||||||||||||
| return result | ||||||||||||||||||||||
| } | ||||||||||||||||||||||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Under the sidecar's default
uvicorn server:appstartup, uvicorn configures its own named loggers rather than a root handler forsim.pii. The new/analyzeand/anonymizelogger.infocalls can be dropped silently, so the timing logs described by the PR never reach the container log stream.Note: If this suggestion doesn't match your team's coding style, reply to this and let me know. I'll remember it for next time!