From f7ba1b6499afdef5fe6bc3ef82f784cf0c793e0b Mon Sep 17 00:00:00 2001 From: "fengduzhen.666" Date: Mon, 8 Jun 2026 20:50:35 +0800 Subject: [PATCH] feat: add trace content config --- config.yaml.full | 1 + .../docs/framework/configuration.en.mdx | 1 + docs/content/docs/framework/configuration.mdx | 1 + .../framework/observability/overview.en.mdx | 3 +- .../docs/framework/observability/overview.mdx | 3 +- .../observability/span-attributes.en.mdx | 24 +- .../observability/span-attributes.mdx | 24 +- .../framework/observability/tracing.en.mdx | 2 + .../docs/framework/observability/tracing.mdx | 4 +- tests/test_tracing_content.py | 263 ++++++++++++++++++ veadk/config.py | 4 + veadk/configs/tracing_configs.py | 7 + .../telemetry/attributes/attributes.py | 32 +++ veadk/tracing/telemetry/content_tracing.py | 34 +++ .../telemetry/exporters/apmplus_exporter.py | 37 +-- veadk/tracing/telemetry/telemetry.py | 24 +- 16 files changed, 415 insertions(+), 49 deletions(-) create mode 100644 tests/test_tracing_content.py create mode 100644 veadk/tracing/telemetry/content_tracing.py diff --git a/config.yaml.full b/config.yaml.full index 89fcf664..02f6edb9 100644 --- a/config.yaml.full +++ b/config.yaml.full @@ -110,6 +110,7 @@ tool: observability: # [optional] for exporting tracing data to Volcengine CozeLoop and APMPlus platform opentelemetry: + trace_content: true # true | false, collect agent/LLM/tool input and output content in traces apmplus: endpoint: http://apmplus-cn-beijing.volces.com:4317 api_key: diff --git a/docs/content/docs/framework/configuration.en.mdx b/docs/content/docs/framework/configuration.en.mdx index 07b0bf15..a095ed2a 100644 --- a/docs/content/docs/framework/configuration.en.mdx +++ b/docs/content/docs/framework/configuration.en.mdx @@ -104,6 +104,7 @@ Unified prefix: `OBSERVABILITY_` | Subcategory | Variable | Description | | :- | :- | :- | +| OpenTelemetry | `OBSERVABILITY_OPENTELEMETRY_TRACE_CONTENT` | Whether to collect Agent, LLM, and tool input/output 
content, default `true`; set to `false` to keep non-content trace data such as spans, latency, model metadata, token usage, and tool names | | APMPlus | `OBSERVABILITY_OPENTELEMETRY_APMPLUS_ENDPOINT` | APMPlus reporting URL | | | `OBSERVABILITY_OPENTELEMETRY_APMPLUS_API_KEY` | APMPlus auth key | | | `OBSERVABILITY_OPENTELEMETRY_APMPLUS_SERVICE_NAME` | APMPlus service name | diff --git a/docs/content/docs/framework/configuration.mdx b/docs/content/docs/framework/configuration.mdx index 17f1c06e..b7cd943c 100644 --- a/docs/content/docs/framework/configuration.mdx +++ b/docs/content/docs/framework/configuration.mdx @@ -104,6 +104,7 @@ volcengine: | 子类 | 环境变量名称 | 释义 | | :- | :- | :- | +| OpenTelemetry | `OBSERVABILITY_OPENTELEMETRY_TRACE_CONTENT` | 是否采集 Agent、LLM 与工具调用的输入输出内容,默认 `true`;设为 `false` 后仍保留 Span、耗时、模型、Token、工具名等非内容 Trace 信息 | | APMPlus | `OBSERVABILITY_OPENTELEMETRY_APMPLUS_ENDPOINT` | APMPlus 上报地址 | | | `OBSERVABILITY_OPENTELEMETRY_APMPLUS_API_KEY` | APMPlus 鉴权密钥 | | | `OBSERVABILITY_OPENTELEMETRY_APMPLUS_SERVICE_NAME` | APMPlus 服务名称 | diff --git a/docs/content/docs/framework/observability/overview.en.mdx b/docs/content/docs/framework/observability/overview.en.mdx index 1decc23f..1d501698 100644 --- a/docs/content/docs/framework/observability/overview.en.mdx +++ b/docs/content/docs/framework/observability/overview.en.mdx @@ -12,7 +12,8 @@ VeADK's observability follows the [OpenTelemetry](https://opentelemetry.io/docs/ - **Visualized traces**: The complete flow of each request, from intake to response, can be traced; - **Structured data**: Records context, state, event types, latency, and more; - **Cross-component tracing**: Covers the agent, tools, memory modules, knowledge bases, and external interfaces; -- **Multi-agent coordination**: Analyzes the invocation relationships and data flow between agents in multi-agent scenarios. 
+- **Multi-agent coordination**: Analyzes the invocation relationships and data flow between agents in multi-agent scenarios; +- **Content collection control**: Disable Agent, LLM, and tool input/output content collection with `observability.opentelemetry.trace_content` while keeping non-content trace data. ## Supported Platforms diff --git a/docs/content/docs/framework/observability/overview.mdx b/docs/content/docs/framework/observability/overview.mdx index 992aa1e8..27cdd0ea 100644 --- a/docs/content/docs/framework/observability/overview.mdx +++ b/docs/content/docs/framework/observability/overview.mdx @@ -12,7 +12,8 @@ VeADK 的可观测能力遵循 [OpenTelemetry](https://opentelemetry.io/docs/spe - **可视化链路**:每次请求从接入到响应的完整流程都可被追踪; - **结构化数据**:记录上下文、状态、事件类型、耗时等信息; - **跨组件追踪**:覆盖 Agent、工具、记忆模块、知识库以及外部接口; -- **多 Agent 协调**:在多 Agent 协作场景中分析各 Agent 的调用关系与数据流向。 +- **多 Agent 协调**:在多 Agent 协作场景中分析各 Agent 的调用关系与数据流向; +- **内容采集控制**:可通过 `observability.opentelemetry.trace_content` 关闭 Agent、LLM 与工具输入输出内容采集,仅保留非内容 Trace 信息。 ## 支持的平台 diff --git a/docs/content/docs/framework/observability/span-attributes.en.mdx b/docs/content/docs/framework/observability/span-attributes.en.mdx index 2a1f3868..bcc2104d 100644 --- a/docs/content/docs/framework/observability/span-attributes.en.mdx +++ b/docs/content/docs/framework/observability/span-attributes.en.mdx @@ -29,6 +29,10 @@ VeADK's span attribute naming and value conventions follow the [OpenTelemetry](h Because the OpenTelemetry community's field conventions for generative AI are still evolving, the meaning of some fields may change. + + `observability.opentelemetry.trace_content` or the `OBSERVABILITY_OPENTELEMETRY_TRACE_CONTENT` environment variable controls Agent, LLM, and tool input/output content collection. When set to `false`, fields marked as controlled by content collection in the tables below are not written to spans; non-content data such as trace structure, model parameters, token usage, and tool names is still retained. 
+ + ### Common | No. | Attribute Name | Meaning | Notes | @@ -65,15 +69,15 @@ VeADK's span attribute naming and value conventions follow the [OpenTelemetry](h | 10 | `gen_ai.is_streaming` | Whether the response is streaming | Returns `None` | | 11 | `gen_ai.operation.name` | The operation name | Always returns `chat`, used to uniformly identify the operation type | | 12 | `gen_ai.span.kind` | The span kind | Always returns `llm`, conforming to OpenTelemetry semantic conventions | -| 13 | `gen_ai.prompt` | Structured information of the request input content | Records role, content, function calls, images, and other inputs in message order | -| 14 | `gen_ai.completion` | Structured information of the model response content | Records the text, function calls, and other output generated by the model | +| 13 | `gen_ai.prompt` | Structured information of the request input content | Records role, content, function calls, images, and other inputs in message order; controlled by content collection | +| 14 | `gen_ai.completion` | Structured information of the model response content | Records the text, function calls, and other output generated by the model; controlled by content collection | | 15 | `gen_ai.usage.input_tokens` | The number of input tokens | Extracted from `params.llm_response.usage_metadata.prompt_token_count` | | 16 | `gen_ai.usage.output_tokens` | The number of output tokens | Extracted from `params.llm_response.usage_metadata.candidates_token_count` | | 17 | `gen_ai.usage.total_tokens` | The total number of tokens | Extracted from `params.llm_response.usage_metadata.total_token_count` | | 18 | `gen_ai.usage.cache_creation_input_tokens` | The number of tokens used to create the cache | Extracted from `params.llm_response.usage_metadata.cached_content_token_count` | | 19 | `gen_ai.usage.cache_read_input_tokens` | The number of tokens used to read the cache | Extracted from `params.llm_response.usage_metadata.cached_content_token_count` | -| 20 | 
`gen_ai.messages` | The complete conversation message events | Includes the structured event sequence of system instructions, user messages, tool responses, and assistant replies | -| 21 | `gen_ai.choice` | The model choice event | Represents the candidate response generated by the model (including function calls or text content) | +| 20 | `gen_ai.messages` | The complete conversation message events | Includes the structured event sequence of system instructions, user messages, tool responses, and assistant replies; controlled by content collection | +| 21 | `gen_ai.choice` | The model choice event | Represents the candidate response generated by the model (including function calls or text content); controlled by content collection | | 22 | `input.value` | The complete LLM request body | *(for debugging)* The serialized output request object | | 23 | `output.value` | The complete LLM response body | *(for debugging)* The serialized output response object | @@ -83,10 +87,10 @@ VeADK's span attribute naming and value conventions follow the [OpenTelemetry](h | - | - | - | - | | 1 | `gen_ai.operation.name` | The operation name | Always returns `execute_tool`, uniformly identifying tool-call operations | | 2 | `gen_ai.tool.name` | The tool name | Obtained from `params.tool.name`; if absent, it is ``; used for the TLS platform | -| 3 | `gen_ai.tool.input` | The tool input content | JSON serialization includes `name`, `description`, `parameters`, used to record tool-call arguments; used for the TLS platform | -| 4 | `gen_ai.tool.output` | The tool output content | JSON serialization includes `id`, `name`, `response`, recording tool execution results; used for the TLS platform | -| 5 | `cozeloop.input` | The tool input | Same as `gen_ai.tool.input`; used for the CozeLoop platform | -| 6 | `cozeloop.output` | The tool output | Same as `gen_ai.tool.output`; used for the CozeLoop platform | +| 3 | `gen_ai.tool.input` | The tool input content | JSON serialization includes 
`name`, `description`, `parameters`, used to record tool-call arguments; used for the TLS platform; controlled by content collection | +| 4 | `gen_ai.tool.output` | The tool output content | JSON serialization includes `id`, `name`, `response`, recording tool execution results; used for the TLS platform; controlled by content collection | +| 5 | `cozeloop.input` | The tool input | Same as `gen_ai.tool.input`; used for the CozeLoop platform; controlled by content collection | +| 6 | `cozeloop.output` | The tool output | Same as `gen_ai.tool.output`; used for the CozeLoop platform; controlled by content collection | | 7 | `gen_ai.span.kind` | The span kind | Always returns `tool`, following OpenTelemetry semantic conventions; used for the APMPlus platform | -| 8 | `gen_ai.input` | The tool input | Same as `gen_ai.tool.input`; used for the APMPlus platform | -| 9 | `gen_ai.output` | The tool output | Same as `gen_ai.tool.output`; used for the APMPlus platform | +| 8 | `gen_ai.input` | The tool input | Same as `gen_ai.tool.input`; used for the APMPlus platform; controlled by content collection | +| 9 | `gen_ai.output` | The tool output | Same as `gen_ai.tool.output`; used for the APMPlus platform; controlled by content collection | diff --git a/docs/content/docs/framework/observability/span-attributes.mdx b/docs/content/docs/framework/observability/span-attributes.mdx index 6a06f0bd..ffd9d6d5 100644 --- a/docs/content/docs/framework/observability/span-attributes.mdx +++ b/docs/content/docs/framework/observability/span-attributes.mdx @@ -29,6 +29,10 @@ VeADK 的 Span 属性命名和值规范遵循 [OpenTelemetry](https://openteleme 由于 OpenTelemetry 社区对生成式 AI 的字段规范还在发展完善中,因此部分字段含义可能会发生变化。 + + `observability.opentelemetry.trace_content` 或环境变量 `OBSERVABILITY_OPENTELEMETRY_TRACE_CONTENT` 控制 Agent、LLM 与 Tool 输入输出内容采集。设为 `false` 后,下表中标记“受内容采集开关控制”的字段不会写入 Span;普通链路、模型参数、Token 用量、工具名等非内容信息仍会保留。 + + ### 通用类 | 序号 | 埋点字段名 | 含义 | 注释 | @@ -65,15 +69,15 @@ VeADK 的 Span 属性命名和值规范遵循 
[OpenTelemetry](https://openteleme | 10 | `gen_ai.is_streaming` | 是否为流式响应 | 返回 `None` | | 11 | `gen_ai.operation.name` | 操作名称 | 固定返回 `chat`,用于统一标识操作类型 | | 12 | `gen_ai.span.kind` | Span 类型 | 固定返回 `llm`,符合 OpenTelemetry 语义约定 | -| 13 | `gen_ai.prompt` | 请求输入内容结构化信息 | 按消息顺序记录角色、内容、函数调用、图片等输入 | -| 14 | `gen_ai.completion` | 模型响应内容结构化信息 | 记录模型生成的文本、函数调用等输出内容 | +| 13 | `gen_ai.prompt` | 请求输入内容结构化信息 | 按消息顺序记录角色、内容、函数调用、图片等输入;受内容采集开关控制 | +| 14 | `gen_ai.completion` | 模型响应内容结构化信息 | 记录模型生成的文本、函数调用等输出内容;受内容采集开关控制 | | 15 | `gen_ai.usage.input_tokens` | 输入 token 数量 | 从 `params.llm_response.usage_metadata.prompt_token_count` 提取 | | 16 | `gen_ai.usage.output_tokens` | 输出 token 数量 | 从 `params.llm_response.usage_metadata.candidates_token_count` 提取 | | 17 | `gen_ai.usage.total_tokens` | 总 token 数量 | 从 `params.llm_response.usage_metadata.total_token_count` 提取 | | 18 | `gen_ai.usage.cache_creation_input_tokens` | 缓存创建所用 token 数量 | 从 `params.llm_response.usage_metadata.cached_content_token_count` 提取 | | 19 | `gen_ai.usage.cache_read_input_tokens` | 缓存读取所用 token 数量 | 从 `params.llm_response.usage_metadata.cached_content_token_count` 提取 | -| 20 | `gen_ai.messages` | 完整对话消息事件 | 包括系统指令、用户消息、工具响应和助手回复的结构化事件序列 | -| 21 | `gen_ai.choice` | 模型选择事件 | 表示模型生成的候选响应(含函数调用或文本内容) | +| 20 | `gen_ai.messages` | 完整对话消息事件 | 包括系统指令、用户消息、工具响应和助手回复的结构化事件序列;受内容采集开关控制 | +| 21 | `gen_ai.choice` | 模型选择事件 | 表示模型生成的候选响应(含函数调用或文本内容);受内容采集开关控制 | | 22 | `input.value` | 完整 LLM 请求体 | *(供调试使用)* 序列化输出请求对象 | | 23 | `output.value` | 完整 LLM 响应体 | *(供调试使用)* 序列化输出响应对象 | @@ -83,10 +87,10 @@ VeADK 的 Span 属性命名和值规范遵循 [OpenTelemetry](https://openteleme | - | - | - | - | | 1 | `gen_ai.operation.name` | 操作名称 | 固定返回 `execute_tool`,统一标识工具调用操作 | | 2 | `gen_ai.tool.name` | 工具名称 | 从 `params.tool.name` 获取;若无则为 ``,用于 TLS 平台 | -| 3 | `gen_ai.tool.input` | 工具输入内容 | JSON 序列化包含:`name`、`description`、`parameters`,用于记录工具调用参数,用于 TLS 平台 | -| 4 | `gen_ai.tool.output` | 工具输出内容 | JSON 序列化包含:`id`、`name`、`response`,记录工具执行结果,用于 TLS 平台 | -| 5 | 
`cozeloop.input` | 工具输入 | 同 `gen_ai.tool.input`,用于 CozeLoop 平台 | -| 6 | `cozeloop.output` | 工具输出 | 同 `gen_ai.tool.output`,用于 CozeLoop 平台 | +| 3 | `gen_ai.tool.input` | 工具输入内容 | JSON 序列化包含:`name`、`description`、`parameters`,用于记录工具调用参数,用于 TLS 平台;受内容采集开关控制 | +| 4 | `gen_ai.tool.output` | 工具输出内容 | JSON 序列化包含:`id`、`name`、`response`,记录工具执行结果,用于 TLS 平台;受内容采集开关控制 | +| 5 | `cozeloop.input` | 工具输入 | 同 `gen_ai.tool.input`,用于 CozeLoop 平台;受内容采集开关控制 | +| 6 | `cozeloop.output` | 工具输出 | 同 `gen_ai.tool.output`,用于 CozeLoop 平台;受内容采集开关控制 | | 7 | `gen_ai.span.kind` | Span 类型 | 固定返回 `tool`,遵循 OpenTelemetry 语义约定,用于 APMPlus 平台 | -| 8 | `gen_ai.input` | 工具输入 | 同 `gen_ai.tool.input`,用于 APMPlus 平台 | -| 9 | `gen_ai.output` | 工具输出 | 同 `gen_ai.tool.output`,用于 APMPlus 平台 | +| 8 | `gen_ai.input` | 工具输入 | 同 `gen_ai.tool.input`,用于 APMPlus 平台;受内容采集开关控制 | +| 9 | `gen_ai.output` | 工具输出 | 同 `gen_ai.tool.output`,用于 APMPlus 平台;受内容采集开关控制 | diff --git a/docs/content/docs/framework/observability/tracing.en.mdx b/docs/content/docs/framework/observability/tracing.en.mdx index b381d722..849e4e14 100644 --- a/docs/content/docs/framework/observability/tracing.en.mdx +++ b/docs/content/docs/framework/observability/tracing.en.mdx @@ -35,12 +35,14 @@ To enable full-chain observability, attach an `OpentelemetryTracer` with one or - `OBSERVABILITY_OPENTELEMETRY_APMPLUS_API_KEY`: The API key of the APM service - `OBSERVABILITY_OPENTELEMETRY_APMPLUS_ENDPOINT`: The endpoint of the APM service, e.g. `http://apmplus-cn-beijing.volces.com:4317` - `OBSERVABILITY_OPENTELEMETRY_APMPLUS_SERVICE_NAME`: The service name for APM, e.g. `python_coder_agent` + - `OBSERVABILITY_OPENTELEMETRY_TRACE_CONTENT`: Whether to collect Agent, LLM, and tool input/output content, default `true` Or define them in `config.yaml`: ```yaml title="config.yaml" observability: opentelemetry: + trace_content: true apmplus: endpoint: ... api_key: ... 
diff --git a/docs/content/docs/framework/observability/tracing.mdx b/docs/content/docs/framework/observability/tracing.mdx index e9a6b77e..b30e6dda 100644 --- a/docs/content/docs/framework/observability/tracing.mdx +++ b/docs/content/docs/framework/observability/tracing.mdx @@ -34,13 +34,15 @@ Tracing(链路追踪)对智能体执行过程进行**全链路记录**,是 - `OBSERVABILITY_OPENTELEMETRY_APMPLUS_API_KEY`:APM 服务的 API Key - `OBSERVABILITY_OPENTELEMETRY_APMPLUS_ENDPOINT`:APM 服务的 Endpoint,例如 `http://apmplus-cn-beijing.volces.com:4317` - `OBSERVABILITY_OPENTELEMETRY_APMPLUS_SERVICE_NAME`:APM 的 Service Name,例如 `python_coder_agent` + - `OBSERVABILITY_OPENTELEMETRY_TRACE_CONTENT`:是否采集 Agent、LLM 与工具输入输出内容,默认 `true` 或在 `config.yaml` 中定义: ```yaml title="config.yaml" observability: opentelemetry: + trace_content: true apmplus: endpoint: ... api_key: ... diff --git a/tests/test_tracing_content.py b/tests/test_tracing_content.py new file mode 100644 index 00000000..367ac6eb --- /dev/null +++ b/tests/test_tracing_content.py @@ -0,0 +1,263 @@ +# Copyright (c) 2025 Beijing Volcano Engine Technology Co., Ltd. and/or its affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from dataclasses import dataclass + +from opentelemetry import context as context_api +from opentelemetry.sdk import trace as trace_sdk + +from veadk.config import settings +from veadk.tracing.telemetry import telemetry +from veadk.tracing.telemetry.content_tracing import should_trace_content +from veadk.tracing.telemetry.exporters.apmplus_exporter import MeterUploader + + +@dataclass +class _FakePart: + text: str | None = None + function_call: object | None = None + function_response: object | None = None + inline_data: object | None = None + + +class _FakeContent: + def __init__(self, role: str, parts: list[_FakePart]): + self.role = role + self.parts = parts + + def model_dump(self, exclude_none: bool = True): + return { + "role": self.role, + "parts": [{"text": part.text} for part in self.parts if part.text], + } + + +class _FakeConfig: + max_output_tokens = 128 + temperature = 0.5 + top_p = 0.9 + system_instruction = "system secret" + + +class _FakeUsageMetadata: + prompt_token_count = 11 + candidates_token_count = 7 + total_token_count = 18 + cached_content_token_count = 0 + + +class _FakeLlmRequest: + model = "test-model" + config = _FakeConfig() + contents = [_FakeContent("user", [_FakePart(text="user secret")])] + tools_dict = {} + + def model_dump(self, exclude_none: bool = True): + return {"model": self.model} + + +class _FakeLlmResponse: + content = _FakeContent("model", [_FakePart(text="assistant secret")]) + usage_metadata = _FakeUsageMetadata() + error_code = None + + def model_dump(self, exclude_none: bool = True): + return {"content": self.content.model_dump(exclude_none=exclude_none)} + + +class _FakeSession: + app_name = "app" + id = "session" + + +class _FakeAgent: + name = "agent" + model_provider = "provider" + model_name = "model" + model_api_base = "http://model.test" + + +class _FakeInvocationContext: + agent = _FakeAgent() + app_name = "app" + user_id = "user" + session = _FakeSession() + invocation_id = "invocation" + run_config = None 
+ user_content = _FakeContent("user", [_FakePart(text="root user secret")]) + + +class _FakeFunctionResponse: + def model_dump(self): + return { + "id": "call-1", + "name": "lookup", + "response": {"result": "tool secret"}, + } + + +class _FakeFunctionResponseEvent: + def get_function_responses(self): + return [_FakeFunctionResponse()] + + +class _ExplodingFunctionResponseEvent: + def get_function_responses(self): + raise AssertionError("tool output content should not be read") + + +class _FakeTool: + name = "lookup" + description = "looks up private data" + custom_metadata = {} + + +class _FakeMetricRecorder: + def __init__(self): + self.records = [] + + def record(self, value, attributes=None): + self.records.append((value, attributes)) + + +def _start_test_span(name: str): + provider = trace_sdk.TracerProvider() + tracer = provider.get_tracer(__name__) + return tracer.start_as_current_span(name) + + +def _event_names(span): + return [event.name for event in span.events] + + +def setup_function(): + telemetry.meter_uploader = None + + +def test_trace_call_llm_records_content_by_default(monkeypatch): + monkeypatch.delenv("OBSERVABILITY_OPENTELEMETRY_TRACE_CONTENT", raising=False) + + with _start_test_span("call_llm") as span: + telemetry.trace_call_llm( + _FakeInvocationContext(), + "event-id", + _FakeLlmRequest(), + _FakeLlmResponse(), + ) + + assert span.attributes["gen_ai.request.model"] == "test-model" + assert span.attributes["gen_ai.prompt.0.content"] == "user secret" + assert span.attributes["gen_ai.completion.0.content"] == "assistant secret" + assert "gen_ai.system.message" in _event_names(span) + assert "gen_ai.user.message" in _event_names(span) + assert "gen_ai.choice" in _event_names(span) + + +def test_content_tracing_uses_veadk_config_when_env_missing(monkeypatch): + monkeypatch.delenv("OBSERVABILITY_OPENTELEMETRY_TRACE_CONTENT", raising=False) + monkeypatch.setattr(settings.opentelemetry_config, "trace_content", False) + + assert 
should_trace_content() is False + + +def test_trace_call_llm_skips_content_when_env_false(monkeypatch): + monkeypatch.setenv("OBSERVABILITY_OPENTELEMETRY_TRACE_CONTENT", "false") + + with _start_test_span("call_llm") as span: + telemetry.trace_call_llm( + _FakeInvocationContext(), + "event-id", + _FakeLlmRequest(), + _FakeLlmResponse(), + ) + + assert span.attributes["gen_ai.request.model"] == "test-model" + assert span.attributes["gen_ai.usage.total_tokens"] == 18 + assert not any(k.startswith("gen_ai.prompt.") for k in span.attributes) + assert not any(k.startswith("gen_ai.completion.") for k in span.attributes) + assert "gen_ai.system.message" not in _event_names(span) + assert "gen_ai.user.message" not in _event_names(span) + assert "gen_ai.tool.message" not in _event_names(span) + assert "gen_ai.assistant.message" not in _event_names(span) + assert "gen_ai.choice" not in _event_names(span) + + +def test_trace_tool_call_skips_content_when_env_false(monkeypatch): + monkeypatch.setenv("OBSERVABILITY_OPENTELEMETRY_TRACE_CONTENT", "false") + + with _start_test_span("execute_tool lookup") as span: + telemetry.trace_tool_call( + _FakeTool(), + {"query": "tool input secret"}, + _FakeFunctionResponseEvent(), + ) + + assert span.attributes["gen_ai.operation.name"] == "execute_tool" + assert span.attributes["gen_ai.tool.name"] == "lookup" + assert span.attributes["gen_ai.span.kind"] == "tool" + assert "gen_ai.tool.input" not in span.attributes + assert "gen_ai.tool.output" not in span.attributes + assert "cozeloop.input" not in span.attributes + assert "cozeloop.output" not in span.attributes + assert "gen_ai.input" not in span.attributes + assert "gen_ai.output" not in span.attributes + + +def test_apmplus_tool_metrics_skip_token_usage_when_tool_content_missing(): + meter_uploader = object.__new__(MeterUploader) + meter_uploader.apmplus_span_latency = _FakeMetricRecorder() + meter_uploader.apmplus_tool_token_usage = _FakeMetricRecorder() + + with 
_start_test_span("execute_tool lookup"): + meter_uploader.record_tool_call( + _FakeTool(), + {"query": "tool input secret"}, + _ExplodingFunctionResponseEvent(), + ) + + assert len(meter_uploader.apmplus_span_latency.records) == 1 + assert meter_uploader.apmplus_tool_token_usage.records == [] + + +def test_agent_root_span_skips_content_when_env_false(monkeypatch): + monkeypatch.setenv("OBSERVABILITY_OPENTELEMETRY_TRACE_CONTENT", "false") + + with _start_test_span("invocation") as span: + telemetry._set_agent_input_attribute(span, _FakeInvocationContext()) + telemetry._set_agent_output_attribute(span, _FakeLlmResponse()) + + assert "gen_ai.input" not in span.attributes + assert "gen_ai.output" not in span.attributes + assert "gen_ai.user.message" not in _event_names(span) + assert "gen_ai.choice" not in _event_names(span) + + +def test_content_tracing_context_override_allows_content(monkeypatch): + monkeypatch.setenv("OBSERVABILITY_OPENTELEMETRY_TRACE_CONTENT", "false") + token = context_api.attach( + context_api.set_value("override_enable_content_tracing", True) + ) + try: + with _start_test_span("call_llm") as span: + telemetry.trace_call_llm( + _FakeInvocationContext(), + "event-id", + _FakeLlmRequest(), + _FakeLlmResponse(), + ) + + assert span.attributes["gen_ai.prompt.0.content"] == "user secret" + assert span.attributes["gen_ai.completion.0.content"] == "assistant secret" + finally: + context_api.detach(token) diff --git a/veadk/config.py b/veadk/config.py index fdacae82..3ad0cc4c 100644 --- a/veadk/config.py +++ b/veadk/config.py @@ -32,6 +32,7 @@ from veadk.configs.tracing_configs import ( APMPlusConfig, CozeloopConfig, + OpenTelemetryConfig, PrometheusConfig, TLSConfig, ) @@ -66,6 +67,9 @@ class VeADKConfig(BaseModel): tool: BuiltinToolConfigs = Field(default_factory=BuiltinToolConfigs) prompt_pilot: PromptPilotConfig = Field(default_factory=PromptPilotConfig) + opentelemetry_config: OpenTelemetryConfig = Field( + default_factory=OpenTelemetryConfig + ) 
apmplus_config: APMPlusConfig = Field(default_factory=APMPlusConfig) cozeloop_config: CozeloopConfig = Field(default_factory=CozeloopConfig) tls_config: TLSConfig = Field(default_factory=TLSConfig) diff --git a/veadk/configs/tracing_configs.py b/veadk/configs/tracing_configs.py index 6f224e14..2b236573 100644 --- a/veadk/configs/tracing_configs.py +++ b/veadk/configs/tracing_configs.py @@ -31,6 +31,13 @@ from veadk.integrations.ve_tls.ve_tls import VeTLS +class OpenTelemetryConfig(BaseSettings): + trace_content: bool = Field( + default=True, + alias="OBSERVABILITY_OPENTELEMETRY_TRACE_CONTENT", + ) + + class APMPlusConfig(BaseSettings): otel_exporter_endpoint: str = Field( default=DEFAULT_APMPLUS_OTEL_EXPORTER_ENDPOINT, diff --git a/veadk/tracing/telemetry/attributes/attributes.py b/veadk/tracing/telemetry/attributes/attributes.py index 19ced9d9..edfcde40 100644 --- a/veadk/tracing/telemetry/attributes/attributes.py +++ b/veadk/tracing/telemetry/attributes/attributes.py @@ -21,9 +21,41 @@ from veadk.tracing.telemetry.attributes.extractors.tool_attributes_extractors import ( TOOL_ATTRIBUTES, ) +from veadk.tracing.telemetry.content_tracing import should_trace_content ATTRIBUTES = { "common": COMMON_ATTRIBUTES, "llm": LLM_ATTRIBUTES, "tool": TOOL_ATTRIBUTES, } + +CONTENT_ATTRIBUTES = { + "llm": { + "gen_ai.prompt", + "gen_ai.completion", + "gen_ai.messages", + "gen_ai.choice", + }, + "tool": { + "gen_ai.tool.input", + "gen_ai.tool.output", + "cozeloop.input", + "cozeloop.output", + "gen_ai.input", + "gen_ai.output", + }, +} + + +def get_attributes(kind: str) -> dict: + """Return trace attributes, excluding content fields when configured.""" + attributes = ATTRIBUTES.get(kind, {}) + content_attributes = CONTENT_ATTRIBUTES.get(kind) + if not content_attributes or should_trace_content(): + return attributes + + return { + attr_name: attr_extractor + for attr_name, attr_extractor in attributes.items() + if attr_name not in content_attributes + } diff --git 
a/veadk/tracing/telemetry/content_tracing.py b/veadk/tracing/telemetry/content_tracing.py new file mode 100644 index 00000000..0a3063ef --- /dev/null +++ b/veadk/tracing/telemetry/content_tracing.py @@ -0,0 +1,34 @@ +# Copyright (c) 2025 Beijing Volcano Engine Technology Co., Ltd. and/or its affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os + +from opentelemetry import context as context_api + +OVERRIDE_ENABLE_CONTENT_TRACING = "override_enable_content_tracing" +TRACE_CONTENT_ENV_VAR = "OBSERVABILITY_OPENTELEMETRY_TRACE_CONTENT" + + +def should_trace_content() -> bool: + """Return whether prompt/completion/tool content should be added to spans.""" + from veadk.config import settings + + trace_content = settings.opentelemetry_config.trace_content + # VeADK flattens config.yaml into environment variables during startup. + # Reading the env var here keeps system/.env/config.yaml override behavior + # aligned with other config fields, and also supports runtime test overrides. 
+ trace_content = os.getenv(TRACE_CONTENT_ENV_VAR, str(trace_content)) + return str(trace_content).lower() == "true" or bool( + context_api.get_value(OVERRIDE_ENABLE_CONTENT_TRACING) + ) diff --git a/veadk/tracing/telemetry/exporters/apmplus_exporter.py b/veadk/tracing/telemetry/exporters/apmplus_exporter.py index e21eb168..2b096aa4 100644 --- a/veadk/tracing/telemetry/exporters/apmplus_exporter.py +++ b/veadk/tracing/telemetry/exporters/apmplus_exporter.py @@ -431,23 +431,26 @@ def record_tool_call( self.apmplus_span_latency.record(duration, attributes=attributes) if self.apmplus_tool_token_usage and hasattr(span, "attributes"): - tool_input = span.attributes["gen_ai.tool.input"] - tool_token_usage_input = ( - len(tool_input) / 4 - ) # tool token 数量,使用文本长度/4 - input_tool_token_attributes = {**attributes, "token_type": "input"} - self.apmplus_tool_token_usage.record( - tool_token_usage_input, attributes=input_tool_token_attributes - ) - - tool_output = span.attributes["gen_ai.tool.output"] - tool_token_usage_output = ( - len(tool_output) / 4 - ) # tool token 数量,使用文本长度/4 - output_tool_token_attributes = {**attributes, "token_type": "output"} - self.apmplus_tool_token_usage.record( - tool_token_usage_output, attributes=output_tool_token_attributes - ) + span_attributes = span.attributes or {} + tool_input = span_attributes.get("gen_ai.tool.input") + if tool_input: + tool_token_usage_input = ( + len(tool_input) / 4 + ) # tool token 数量,使用文本长度/4 + input_tool_token_attributes = {**attributes, "token_type": "input"} + self.apmplus_tool_token_usage.record( + tool_token_usage_input, attributes=input_tool_token_attributes + ) + + tool_output = span_attributes.get("gen_ai.tool.output") + if tool_output: + tool_token_usage_output = ( + len(tool_output) / 4 + ) # tool token 数量,使用文本长度/4 + output_tool_token_attributes = {**attributes, "token_type": "output"} + self.apmplus_tool_token_usage.record( + tool_token_usage_output, attributes=output_tool_token_attributes + ) class 
APMPlusExporterConfig(BaseModel): diff --git a/veadk/tracing/telemetry/telemetry.py b/veadk/tracing/telemetry/telemetry.py index 3f46f538..dfabd5f5 100644 --- a/veadk/tracing/telemetry/telemetry.py +++ b/veadk/tracing/telemetry/telemetry.py @@ -23,12 +23,13 @@ from opentelemetry.context import get_value from opentelemetry.sdk.trace import Span, _Span -from veadk.tracing.telemetry.attributes.attributes import ATTRIBUTES +from veadk.tracing.telemetry.attributes.attributes import ATTRIBUTES, get_attributes from veadk.tracing.telemetry.attributes.extractors.types import ( ExtractorResponse, LLMAttributesParams, ToolAttributesParams, ) +from veadk.tracing.telemetry.content_tracing import should_trace_content from veadk.utils.logger import get_logger from veadk.utils.misc import safe_json_serialize @@ -130,6 +131,9 @@ def _set_agent_input_attribute( - Supports multimodal content (text and images) - Follows gen_ai attribute conventions """ + if not should_trace_content(): + return + event_names = [event.name for event in span.events] if "gen_ai.user.message" in event_names: return @@ -146,11 +150,6 @@ def _set_agent_input_attribute( user_content = invocation_context.user_content if user_content and user_content.parts: - # set gen_ai.input attribute required by APMPlus - span.set_attribute( - "gen_ai.input", - safe_json_serialize(user_content.model_dump(exclude_none=True)), - ) span.add_event( "gen_ai.user.message", { @@ -160,6 +159,12 @@ def _set_agent_input_attribute( "session_id": invocation_context.session.id, }, ) + + # set gen_ai.input attribute required by APMPlus + span.set_attribute( + "gen_ai.input", + safe_json_serialize(user_content.model_dump(exclude_none=True)), + ) for idx, part in enumerate(user_content.parts): if part.text: span.add_event( @@ -202,6 +207,9 @@ def _set_agent_output_attribute(span: Span, llm_response: LlmResponse) -> None: - Follows gen_ai attribute conventions - Handles multipart responses with proper indexing """ + if not 
should_trace_content(): + return + content = llm_response.content if content and content.parts: # set gen_ai.output attribute required by APMPlus @@ -347,7 +355,7 @@ def trace_tool_call( set_common_attributes_on_tool_span(current_span=span) # type: ignore - tool_attributes_mapping = ATTRIBUTES.get("tool", {}) + tool_attributes_mapping = get_attributes("tool") params = ToolAttributesParams(tool, args, function_response_event) for attr_name, attr_extractor in tool_attributes_mapping.items(): @@ -417,7 +425,7 @@ def trace_call_llm( ), ) - llm_attributes_mapping = ATTRIBUTES.get("llm", {}) + llm_attributes_mapping = get_attributes("llm") params = LLMAttributesParams( invocation_context=invocation_context, event_id=event_id,