diff --git a/langfuse/api/__init__.py b/langfuse/api/__init__.py index 0e036263a..46985c0b9 100644 --- a/langfuse/api/__init__.py +++ b/langfuse/api/__init__.py @@ -30,6 +30,7 @@ scim, score_configs, scores, + scores_v3, sessions, trace, unstable, @@ -297,6 +298,31 @@ GetScoresResponseData_Text, GetScoresResponseTraceData, ) + from .scores_v3 import ( + BaseScoreV3, + BooleanScoreV3, + CategoricalScoreV3, + CorrectionScoreV3, + GetScoresV3Meta, + GetScoresV3Response, + NumericScoreV3, + ScoreSubjectExperimentV3, + ScoreSubjectObservationV3, + ScoreSubjectSessionV3, + ScoreSubjectTraceV3, + ScoreSubjectV3, + ScoreSubjectV3_Experiment, + ScoreSubjectV3_Observation, + ScoreSubjectV3_Session, + ScoreSubjectV3_Trace, + ScoreV3, + ScoreV3_Boolean, + ScoreV3_Categorical, + ScoreV3_Correction, + ScoreV3_Numeric, + ScoreV3_Text, + TextScoreV3, + ) from .sessions import PaginatedSessions from .trace import DeleteTraceResponse, Sort, Traces _dynamic_imports: typing.Dict[str, str] = { @@ -316,6 +342,7 @@ "BasePrompt": ".prompts", "BaseScore": ".commons", "BaseScoreV1": ".commons", + "BaseScoreV3": ".scores_v3", "BlobStorageExportFieldGroup": ".blob_storage_integrations", "BlobStorageExportFrequency": ".blob_storage_integrations", "BlobStorageExportMode": ".blob_storage_integrations", @@ -329,9 +356,11 @@ "BlobStorageSyncStatus": ".blob_storage_integrations", "BooleanScore": ".commons", "BooleanScoreV1": ".commons", + "BooleanScoreV3": ".scores_v3", "BulkConfig": ".scim", "CategoricalScore": ".commons", "CategoricalScoreV1": ".commons", + "CategoricalScoreV3": ".scores_v3", "ChatMessage": ".prompts", "ChatMessageType": ".prompts", "ChatMessageWithPlaceholders": ".prompts", @@ -340,6 +369,7 @@ "CommentObjectType": ".commons", "ConfigCategory": ".commons", "CorrectionScore": ".commons", + "CorrectionScoreV3": ".scores_v3", "CreateAnnotationQueueAssignmentResponse": ".annotation_queues", "CreateAnnotationQueueItemRequest": ".annotation_queues", "CreateAnnotationQueueRequest": ".annotation_queues", @@ -397,6 +427,8 @@ "GetScoresResponseData_Numeric": ".scores", "GetScoresResponseData_Text": ".scores", "GetScoresResponseTraceData": ".scores", + "GetScoresV3Meta": ".scores_v3", + "GetScoresV3Response": ".scores_v3", "HealthResponse": ".health", "IngestionError": ".ingestion", "IngestionEvent": ".ingestion", @@ -431,6 +463,7 @@ "NotFoundError": ".commons", "NumericScore": ".commons", "NumericScoreV1": ".commons", + "NumericScoreV3": ".scores_v3", "Observation": ".commons", "ObservationBody": ".ingestion", "ObservationLevel": ".commons", @@ -500,11 +533,26 @@ "ScoreDataType": ".commons", "ScoreEvent": ".ingestion", "ScoreSource": ".commons", + "ScoreSubjectExperimentV3": ".scores_v3", + "ScoreSubjectObservationV3": ".scores_v3", + "ScoreSubjectSessionV3": ".scores_v3", + "ScoreSubjectTraceV3": ".scores_v3", + "ScoreSubjectV3": ".scores_v3", + "ScoreSubjectV3_Experiment": ".scores_v3", + "ScoreSubjectV3_Observation": ".scores_v3", + "ScoreSubjectV3_Session": ".scores_v3", + "ScoreSubjectV3_Trace": ".scores_v3", "ScoreV1": ".commons", "ScoreV1_Boolean": ".commons", "ScoreV1_Categorical": ".commons", "ScoreV1_Numeric": ".commons", "ScoreV1_Text": ".commons", + "ScoreV3": ".scores_v3", + "ScoreV3_Boolean": ".scores_v3", + "ScoreV3_Categorical": ".scores_v3", + "ScoreV3_Correction": ".scores_v3", + "ScoreV3_Numeric": ".scores_v3", + "ScoreV3_Text": ".scores_v3", "Score_Boolean": ".commons", "Score_Categorical": ".commons", "Score_Correction": ".commons", @@ -520,6 +568,7 @@ "TextPrompt": ".prompts", "TextScore": ".commons", "TextScoreV1": ".commons", + "TextScoreV3": ".scores_v3", "Trace": ".commons", "TraceBody": ".ingestion", "TraceEvent": ".ingestion", @@ -562,6 +611,7 @@ "scim": ".scim", "score_configs": ".score_configs", "scores": ".scores", + "scores_v3": ".scores_v3", "sessions": ".sessions", "trace": ".trace", "unstable": ".unstable", @@ -613,6 +663,7 @@ def __dir__(): "BasePrompt", "BaseScore", "BaseScoreV1", + "BaseScoreV3", "BlobStorageExportFieldGroup", "BlobStorageExportFrequency", "BlobStorageExportMode", @@ -626,9 +677,11 @@ def __dir__(): "BlobStorageSyncStatus", "BooleanScore", "BooleanScoreV1", + "BooleanScoreV3", "BulkConfig", "CategoricalScore", "CategoricalScoreV1", + "CategoricalScoreV3", "ChatMessage", "ChatMessageType", "ChatMessageWithPlaceholders", @@ -637,6 +690,7 @@ def __dir__(): "CommentObjectType", "ConfigCategory", "CorrectionScore", + "CorrectionScoreV3", "CreateAnnotationQueueAssignmentResponse", "CreateAnnotationQueueItemRequest", "CreateAnnotationQueueRequest", @@ -694,6 +748,8 @@ def __dir__(): "GetScoresResponseData_Numeric", "GetScoresResponseData_Text", "GetScoresResponseTraceData", + "GetScoresV3Meta", + "GetScoresV3Response", "HealthResponse", "IngestionError", "IngestionEvent", @@ -728,6 +784,7 @@ def __dir__(): "NotFoundError", "NumericScore", "NumericScoreV1", + "NumericScoreV3", "Observation", "ObservationBody", "ObservationLevel", @@ -797,11 +854,26 @@ def __dir__(): "ScoreDataType", "ScoreEvent", "ScoreSource", + "ScoreSubjectExperimentV3", + "ScoreSubjectObservationV3", + "ScoreSubjectSessionV3", + "ScoreSubjectTraceV3", + "ScoreSubjectV3", + "ScoreSubjectV3_Experiment", + "ScoreSubjectV3_Observation", + "ScoreSubjectV3_Session", + "ScoreSubjectV3_Trace", "ScoreV1", "ScoreV1_Boolean", "ScoreV1_Categorical", "ScoreV1_Numeric", "ScoreV1_Text", + "ScoreV3", + "ScoreV3_Boolean", + "ScoreV3_Categorical", + "ScoreV3_Correction", + "ScoreV3_Numeric", + "ScoreV3_Text", "Score_Boolean", "Score_Categorical", "Score_Correction", @@ -817,6 +889,7 @@ def __dir__(): "TextPrompt", "TextScore", "TextScoreV1", + "TextScoreV3", "Trace", "TraceBody", "TraceEvent", @@ -859,6 +932,7 @@ def __dir__(): "scim", "score_configs", "scores", + "scores_v3", "sessions", "trace", "unstable", diff --git a/langfuse/api/client.py b/langfuse/api/client.py index c0413704b..a72aede85 100644 --- a/langfuse/api/client.py +++ b/langfuse/api/client.py @@ -39,6 +39,7 @@ from .scim.client import AsyncScimClient, ScimClient from .score_configs.client import AsyncScoreConfigsClient, ScoreConfigsClient from .scores.client import AsyncScoresClient, ScoresClient + from .scores_v3.client import AsyncScoresV3Client, ScoresV3Client from .sessions.client import AsyncSessionsClient, SessionsClient from .trace.client import AsyncTraceClient, TraceClient from .unstable.client import AsyncUnstableClient, UnstableClient @@ -145,6 +146,7 @@ def __init__( self._prompts: typing.Optional[PromptsClient] = None self._scim: typing.Optional[ScimClient] = None self._score_configs: typing.Optional[ScoreConfigsClient] = None + self._scores_v3: typing.Optional[ScoresV3Client] = None self._scores: typing.Optional[ScoresClient] = None self._sessions: typing.Optional[SessionsClient] = None self._trace: typing.Optional[TraceClient] = None @@ -336,6 +338,14 @@ def score_configs(self): ) return self._score_configs + @property + def scores_v3(self): + if self._scores_v3 is None: + from .scores_v3.client import ScoresV3Client # noqa: E402 + + self._scores_v3 = ScoresV3Client(client_wrapper=self._client_wrapper) + return self._scores_v3 + @property def scores(self): if self._scores is None: @@ -470,6 +480,7 @@ def __init__( self._prompts: typing.Optional[AsyncPromptsClient] = None self._scim: typing.Optional[AsyncScimClient] = None self._score_configs: typing.Optional[AsyncScoreConfigsClient] = None + self._scores_v3: typing.Optional[AsyncScoresV3Client] = None self._scores: typing.Optional[AsyncScoresClient] = None self._sessions: typing.Optional[AsyncSessionsClient] = None self._trace: typing.Optional[AsyncTraceClient] = None @@ -665,6 +676,14 @@ def score_configs(self): ) return self._score_configs + @property + def scores_v3(self): + if self._scores_v3 is None: + from .scores_v3.client import AsyncScoresV3Client # noqa: E402 + + self._scores_v3 = AsyncScoresV3Client(client_wrapper=self._client_wrapper) + return self._scores_v3 + @property def scores(self): if self._scores is None: diff --git a/langfuse/api/scores_v3/__init__.py b/langfuse/api/scores_v3/__init__.py new file mode 100644 index 000000000..855868335 --- /dev/null +++ b/langfuse/api/scores_v3/__init__.py @@ -0,0 +1,112 @@ +# This file was auto-generated by Fern from our API Definition. + +# isort: skip_file + +import typing +from importlib import import_module + +if typing.TYPE_CHECKING: + from .types import ( + BaseScoreV3, + BooleanScoreV3, + CategoricalScoreV3, + CorrectionScoreV3, + GetScoresV3Meta, + GetScoresV3Response, + NumericScoreV3, + ScoreSubjectExperimentV3, + ScoreSubjectObservationV3, + ScoreSubjectSessionV3, + ScoreSubjectTraceV3, + ScoreSubjectV3, + ScoreSubjectV3_Experiment, + ScoreSubjectV3_Observation, + ScoreSubjectV3_Session, + ScoreSubjectV3_Trace, + ScoreV3, + ScoreV3_Boolean, + ScoreV3_Categorical, + ScoreV3_Correction, + ScoreV3_Numeric, + ScoreV3_Text, + TextScoreV3, + ) +_dynamic_imports: typing.Dict[str, str] = { + "BaseScoreV3": ".types", + "BooleanScoreV3": ".types", + "CategoricalScoreV3": ".types", + "CorrectionScoreV3": ".types", + "GetScoresV3Meta": ".types", + "GetScoresV3Response": ".types", + "NumericScoreV3": ".types", + "ScoreSubjectExperimentV3": ".types", + "ScoreSubjectObservationV3": ".types", + "ScoreSubjectSessionV3": ".types", + "ScoreSubjectTraceV3": ".types", + "ScoreSubjectV3": ".types", + "ScoreSubjectV3_Experiment": ".types", + "ScoreSubjectV3_Observation": ".types", + "ScoreSubjectV3_Session": ".types", + "ScoreSubjectV3_Trace": ".types", + "ScoreV3": ".types", + "ScoreV3_Boolean": ".types", + "ScoreV3_Categorical": ".types", + "ScoreV3_Correction": ".types", + "ScoreV3_Numeric": ".types", + "ScoreV3_Text": ".types", + "TextScoreV3": ".types", +} + + +def __getattr__(attr_name: str) -> typing.Any: + module_name = _dynamic_imports.get(attr_name) + if module_name is None: + raise AttributeError( + f"No {attr_name} found in _dynamic_imports for module name -> {__name__}" + ) + try: + module = import_module(module_name, __package__) + if module_name == f".{attr_name}": + return module + else: + return getattr(module, attr_name) + except ImportError as e: + raise ImportError( + f"Failed to import {attr_name} from {module_name}: {e}" + ) from e + except AttributeError as e: + raise AttributeError( + f"Failed to get {attr_name} from {module_name}: {e}" + ) from e + + +def __dir__(): + lazy_attrs = list(_dynamic_imports.keys()) + return sorted(lazy_attrs) + + +__all__ = [ + "BaseScoreV3", + "BooleanScoreV3", + "CategoricalScoreV3", + "CorrectionScoreV3", + "GetScoresV3Meta", + "GetScoresV3Response", + "NumericScoreV3", + "ScoreSubjectExperimentV3", + "ScoreSubjectObservationV3", + "ScoreSubjectSessionV3", + "ScoreSubjectTraceV3", + "ScoreSubjectV3", + "ScoreSubjectV3_Experiment", + "ScoreSubjectV3_Observation", + "ScoreSubjectV3_Session", + "ScoreSubjectV3_Trace", + "ScoreV3", + "ScoreV3_Boolean", + "ScoreV3_Categorical", + "ScoreV3_Correction", + "ScoreV3_Numeric", + "ScoreV3_Text", + "TextScoreV3", +] diff --git a/langfuse/api/scores_v3/client.py b/langfuse/api/scores_v3/client.py new file mode 100644 index 000000000..2755d3e74 --- /dev/null +++ b/langfuse/api/scores_v3/client.py @@ -0,0 +1,341 @@ +# This file was auto-generated by Fern from our API Definition. + +import datetime as dt +import typing + +from ..core.client_wrapper import AsyncClientWrapper, SyncClientWrapper +from ..core.request_options import RequestOptions +from .raw_client import AsyncRawScoresV3Client, RawScoresV3Client +from .types.get_scores_v3response import GetScoresV3Response + + +class ScoresV3Client: + def __init__(self, *, client_wrapper: SyncClientWrapper): + self._raw_client = RawScoresV3Client(client_wrapper=client_wrapper) + + @property + def with_raw_response(self) -> RawScoresV3Client: + """ + Retrieves a raw implementation of this client that returns raw responses. + + Returns + ------- + RawScoresV3Client + """ + return self._raw_client + + def get_many_v3( + self, + *, + limit: typing.Optional[int] = None, + cursor: typing.Optional[str] = None, + fields: typing.Optional[str] = None, + id: typing.Optional[str] = None, + name: typing.Optional[str] = None, + source: typing.Optional[str] = None, + data_type: typing.Optional[str] = None, + environment: typing.Optional[str] = None, + config_id: typing.Optional[str] = None, + queue_id: typing.Optional[str] = None, + author_user_id: typing.Optional[str] = None, + value: typing.Optional[str] = None, + value_min: typing.Optional[float] = None, + value_max: typing.Optional[float] = None, + trace_id: typing.Optional[str] = None, + session_id: typing.Optional[str] = None, + observation_id: typing.Optional[str] = None, + experiment_id: typing.Optional[str] = None, + from_timestamp: typing.Optional[dt.datetime] = None, + to_timestamp: typing.Optional[dt.datetime] = None, + request_options: typing.Optional[RequestOptions] = None, + ) -> GetScoresV3Response: + """ + Get a list of scores with a polymorphic `value` field (v3). + + This endpoint requires Langfuse v4 or later. + + The `value` field type depends on `dataType`: + - `NUMERIC` → number + - `BOOLEAN` → boolean + - `CATEGORICAL`, `TEXT`, `CORRECTION` → string + + Use the `fields` parameter to include optional field groups beyond the + default `core`. Unknown group names return HTTP 400. + + Parameters + ---------- + limit : typing.Optional[int] + Number of items per page. Maximum 100, default 50. Requests with a limit greater than 100 return HTTP 400. + + cursor : typing.Optional[str] + URL-safe base64 (base64url) cursor for pagination. Use the cursor from the previous response to get the next page. Absent on the final page. + + fields : typing.Optional[str] + Comma-separated field groups to include. Allowed: core, details, subject, annotation. Defaults to "core". Unknown names return HTTP 400. + + id : typing.Optional[str] + Comma-separated list of score IDs to filter by (OR within, AND across filters). + + name : typing.Optional[str] + Comma-separated list of score names to filter by. + + source : typing.Optional[str] + Comma-separated list of score sources to filter by (e.g. API, ANNOTATION, EVAL). Case-insensitive — `api` and `API` are equivalent. + + data_type : typing.Optional[str] + Comma-separated list of data types to filter by (NUMERIC, BOOLEAN, CATEGORICAL, TEXT, CORRECTION). Case-insensitive — `numeric` and `NUMERIC` are equivalent. Must be a single value when used with value, valueMin, or valueMax; otherwise the request returns HTTP 400. Must be NUMERIC when used with valueMin or valueMax. + + environment : typing.Optional[str] + Comma-separated list of environments to filter by. + + config_id : typing.Optional[str] + Comma-separated list of score config IDs to filter by. + + queue_id : typing.Optional[str] + Comma-separated list of annotation queue IDs to filter by. + + author_user_id : typing.Optional[str] + Comma-separated list of author user IDs to filter by. + + value : typing.Optional[str] + Comma-separated list of exact values to filter by. Requires a single dataType from NUMERIC, BOOLEAN, or CATEGORICAL; any other dataType, multiple dataTypes, or omitting dataType returns HTTP 400. For BOOLEAN, each value must be "true" or "false"; for NUMERIC, each value must be a finite number. Otherwise the request returns HTTP 400. + + value_min : typing.Optional[float] + Inclusive lower bound on the numeric value. Requires dataType=NUMERIC as a single value; otherwise the request returns HTTP 400. + + value_max : typing.Optional[float] + Inclusive upper bound on the numeric value. Requires dataType=NUMERIC as a single value; otherwise the request returns HTTP 400. + + trace_id : typing.Optional[str] + Comma-separated list of trace IDs to filter by. Mutually exclusive with sessionId, experimentId. May be combined with observationId to scope the observation lookup to a specific trace. + + session_id : typing.Optional[str] + Comma-separated list of session IDs to filter by. Mutually exclusive with traceId, observationId, experimentId. + + observation_id : typing.Optional[str] + Comma-separated list of observation IDs to filter by. Requires traceId to be specified, because observation IDs are scoped to a trace. Mutually exclusive with sessionId, experimentId. Returns HTTP 400 when used without traceId. + + experiment_id : typing.Optional[str] + Comma-separated list of dataset run IDs (experiment IDs) to filter by. Mutually exclusive with traceId, sessionId, observationId. + + from_timestamp : typing.Optional[dt.datetime] + Inclusive lower bound on the score timestamp. + + to_timestamp : typing.Optional[dt.datetime] + Exclusive upper bound on the score timestamp. + + request_options : typing.Optional[RequestOptions] + Request-specific configuration. + + Returns + ------- + GetScoresV3Response + + Examples + -------- + from langfuse import LangfuseAPI + + client = LangfuseAPI( + x_langfuse_sdk_name="YOUR_X_LANGFUSE_SDK_NAME", + x_langfuse_sdk_version="YOUR_X_LANGFUSE_SDK_VERSION", + x_langfuse_public_key="YOUR_X_LANGFUSE_PUBLIC_KEY", + username="YOUR_USERNAME", + password="YOUR_PASSWORD", + base_url="https://yourhost.com/path/to/api", + ) + client.scores_v3.get_many_v3() + """ + _response = self._raw_client.get_many_v3( + limit=limit, + cursor=cursor, + fields=fields, + id=id, + name=name, + source=source, + data_type=data_type, + environment=environment, + config_id=config_id, + queue_id=queue_id, + author_user_id=author_user_id, + value=value, + value_min=value_min, + value_max=value_max, + trace_id=trace_id, + session_id=session_id, + observation_id=observation_id, + experiment_id=experiment_id, + from_timestamp=from_timestamp, + to_timestamp=to_timestamp, + request_options=request_options, + ) + return _response.data + + +class AsyncScoresV3Client: + def __init__(self, *, client_wrapper: AsyncClientWrapper): + self._raw_client = AsyncRawScoresV3Client(client_wrapper=client_wrapper) + + @property + def with_raw_response(self) -> AsyncRawScoresV3Client: + """ + Retrieves a raw implementation of this client that returns raw responses. + + Returns + ------- + AsyncRawScoresV3Client + """ + return self._raw_client + + async def get_many_v3( + self, + *, + limit: typing.Optional[int] = None, + cursor: typing.Optional[str] = None, + fields: typing.Optional[str] = None, + id: typing.Optional[str] = None, + name: typing.Optional[str] = None, + source: typing.Optional[str] = None, + data_type: typing.Optional[str] = None, + environment: typing.Optional[str] = None, + config_id: typing.Optional[str] = None, + queue_id: typing.Optional[str] = None, + author_user_id: typing.Optional[str] = None, + value: typing.Optional[str] = None, + value_min: typing.Optional[float] = None, + value_max: typing.Optional[float] = None, + trace_id: typing.Optional[str] = None, + session_id: typing.Optional[str] = None, + observation_id: typing.Optional[str] = None, + experiment_id: typing.Optional[str] = None, + from_timestamp: typing.Optional[dt.datetime] = None, + to_timestamp: typing.Optional[dt.datetime] = None, + request_options: typing.Optional[RequestOptions] = None, + ) -> GetScoresV3Response: + """ + Get a list of scores with a polymorphic `value` field (v3). + + This endpoint requires Langfuse v4 or later. + + The `value` field type depends on `dataType`: + - `NUMERIC` → number + - `BOOLEAN` → boolean + - `CATEGORICAL`, `TEXT`, `CORRECTION` → string + + Use the `fields` parameter to include optional field groups beyond the + default `core`. Unknown group names return HTTP 400. + + Parameters + ---------- + limit : typing.Optional[int] + Number of items per page. Maximum 100, default 50. Requests with a limit greater than 100 return HTTP 400. + + cursor : typing.Optional[str] + URL-safe base64 (base64url) cursor for pagination. Use the cursor from the previous response to get the next page. Absent on the final page. + + fields : typing.Optional[str] + Comma-separated field groups to include. Allowed: core, details, subject, annotation. Defaults to "core". Unknown names return HTTP 400. + + id : typing.Optional[str] + Comma-separated list of score IDs to filter by (OR within, AND across filters). + + name : typing.Optional[str] + Comma-separated list of score names to filter by. + + source : typing.Optional[str] + Comma-separated list of score sources to filter by (e.g. API, ANNOTATION, EVAL). Case-insensitive — `api` and `API` are equivalent. + + data_type : typing.Optional[str] + Comma-separated list of data types to filter by (NUMERIC, BOOLEAN, CATEGORICAL, TEXT, CORRECTION). Case-insensitive — `numeric` and `NUMERIC` are equivalent. Must be a single value when used with value, valueMin, or valueMax; otherwise the request returns HTTP 400. Must be NUMERIC when used with valueMin or valueMax. + + environment : typing.Optional[str] + Comma-separated list of environments to filter by. + + config_id : typing.Optional[str] + Comma-separated list of score config IDs to filter by. + + queue_id : typing.Optional[str] + Comma-separated list of annotation queue IDs to filter by. + + author_user_id : typing.Optional[str] + Comma-separated list of author user IDs to filter by. + + value : typing.Optional[str] + Comma-separated list of exact values to filter by. Requires a single dataType from NUMERIC, BOOLEAN, or CATEGORICAL; any other dataType, multiple dataTypes, or omitting dataType returns HTTP 400. For BOOLEAN, each value must be "true" or "false"; for NUMERIC, each value must be a finite number. Otherwise the request returns HTTP 400. + + value_min : typing.Optional[float] + Inclusive lower bound on the numeric value. Requires dataType=NUMERIC as a single value; otherwise the request returns HTTP 400. + + value_max : typing.Optional[float] + Inclusive upper bound on the numeric value. Requires dataType=NUMERIC as a single value; otherwise the request returns HTTP 400. + + trace_id : typing.Optional[str] + Comma-separated list of trace IDs to filter by. Mutually exclusive with sessionId, experimentId. May be combined with observationId to scope the observation lookup to a specific trace. + + session_id : typing.Optional[str] + Comma-separated list of session IDs to filter by. Mutually exclusive with traceId, observationId, experimentId. + + observation_id : typing.Optional[str] + Comma-separated list of observation IDs to filter by. Requires traceId to be specified, because observation IDs are scoped to a trace. Mutually exclusive with sessionId, experimentId. Returns HTTP 400 when used without traceId. + + experiment_id : typing.Optional[str] + Comma-separated list of dataset run IDs (experiment IDs) to filter by. Mutually exclusive with traceId, sessionId, observationId. + + from_timestamp : typing.Optional[dt.datetime] + Inclusive lower bound on the score timestamp. + + to_timestamp : typing.Optional[dt.datetime] + Exclusive upper bound on the score timestamp. + + request_options : typing.Optional[RequestOptions] + Request-specific configuration. + + Returns + ------- + GetScoresV3Response + + Examples + -------- + import asyncio + + from langfuse import AsyncLangfuseAPI + + client = AsyncLangfuseAPI( + x_langfuse_sdk_name="YOUR_X_LANGFUSE_SDK_NAME", + x_langfuse_sdk_version="YOUR_X_LANGFUSE_SDK_VERSION", + x_langfuse_public_key="YOUR_X_LANGFUSE_PUBLIC_KEY", + username="YOUR_USERNAME", + password="YOUR_PASSWORD", + base_url="https://yourhost.com/path/to/api", + ) + + + async def main() -> None: + await client.scores_v3.get_many_v3() + + + asyncio.run(main()) + """ + _response = await self._raw_client.get_many_v3( + limit=limit, + cursor=cursor, + fields=fields, + id=id, + name=name, + source=source, + data_type=data_type, + environment=environment, + config_id=config_id, + queue_id=queue_id, + author_user_id=author_user_id, + value=value, + value_min=value_min, + value_max=value_max, + trace_id=trace_id, + session_id=session_id, + observation_id=observation_id, + experiment_id=experiment_id, + from_timestamp=from_timestamp, + to_timestamp=to_timestamp, + request_options=request_options, + ) + return _response.data diff --git a/langfuse/api/scores_v3/raw_client.py b/langfuse/api/scores_v3/raw_client.py new file mode 100644 index 000000000..47c9f3f8d --- /dev/null +++ b/langfuse/api/scores_v3/raw_client.py @@ -0,0 +1,460 @@ +# This file was auto-generated by Fern from our API Definition. + +import datetime as dt +import typing +from json.decoder import JSONDecodeError + +from ..commons.errors.access_denied_error import AccessDeniedError +from ..commons.errors.error import Error +from ..commons.errors.method_not_allowed_error import MethodNotAllowedError +from ..commons.errors.not_found_error import NotFoundError +from ..commons.errors.unauthorized_error import UnauthorizedError +from ..core.api_error import ApiError +from ..core.client_wrapper import AsyncClientWrapper, SyncClientWrapper +from ..core.datetime_utils import serialize_datetime +from ..core.http_response import AsyncHttpResponse, HttpResponse +from ..core.pydantic_utilities import parse_obj_as +from ..core.request_options import RequestOptions +from .types.get_scores_v3response import GetScoresV3Response + + +class RawScoresV3Client: + def __init__(self, *, client_wrapper: SyncClientWrapper): + self._client_wrapper = client_wrapper + + def get_many_v3( + self, + *, + limit: typing.Optional[int] = None, + cursor: typing.Optional[str] = None, + fields: typing.Optional[str] = None, + id: typing.Optional[str] = None, + name: typing.Optional[str] = None, + source: typing.Optional[str] = None, + data_type: typing.Optional[str] = None, + environment: typing.Optional[str] = None, + config_id: typing.Optional[str] = None, + queue_id: typing.Optional[str] = None, + author_user_id: typing.Optional[str] = None, + value: typing.Optional[str] = None, + value_min: typing.Optional[float] = None, + value_max: typing.Optional[float] = None, + trace_id: typing.Optional[str] = None, + session_id: typing.Optional[str] = None, + observation_id: typing.Optional[str] = None, + experiment_id: typing.Optional[str] = None, + from_timestamp: typing.Optional[dt.datetime] = None, + to_timestamp: typing.Optional[dt.datetime] = None, + request_options: typing.Optional[RequestOptions] = None, + ) -> HttpResponse[GetScoresV3Response]: + """ + Get a list of scores with a polymorphic `value` field (v3). + + This endpoint requires Langfuse v4 or later. + + The `value` field type depends on `dataType`: + - `NUMERIC` → number + - `BOOLEAN` → boolean + - `CATEGORICAL`, `TEXT`, `CORRECTION` → string + + Use the `fields` parameter to include optional field groups beyond the + default `core`. Unknown group names return HTTP 400. + + Parameters + ---------- + limit : typing.Optional[int] + Number of items per page. Maximum 100, default 50. Requests with a limit greater than 100 return HTTP 400. + + cursor : typing.Optional[str] + URL-safe base64 (base64url) cursor for pagination. Use the cursor from the previous response to get the next page. Absent on the final page. + + fields : typing.Optional[str] + Comma-separated field groups to include. Allowed: core, details, subject, annotation. Defaults to "core". Unknown names return HTTP 400. + + id : typing.Optional[str] + Comma-separated list of score IDs to filter by (OR within, AND across filters). + + name : typing.Optional[str] + Comma-separated list of score names to filter by. + + source : typing.Optional[str] + Comma-separated list of score sources to filter by (e.g. API, ANNOTATION, EVAL). Case-insensitive — `api` and `API` are equivalent. + + data_type : typing.Optional[str] + Comma-separated list of data types to filter by (NUMERIC, BOOLEAN, CATEGORICAL, TEXT, CORRECTION). Case-insensitive — `numeric` and `NUMERIC` are equivalent. Must be a single value when used with value, valueMin, or valueMax; otherwise the request returns HTTP 400. Must be NUMERIC when used with valueMin or valueMax. + + environment : typing.Optional[str] + Comma-separated list of environments to filter by. + + config_id : typing.Optional[str] + Comma-separated list of score config IDs to filter by. + + queue_id : typing.Optional[str] + Comma-separated list of annotation queue IDs to filter by. + + author_user_id : typing.Optional[str] + Comma-separated list of author user IDs to filter by. + + value : typing.Optional[str] + Comma-separated list of exact values to filter by. Requires a single dataType from NUMERIC, BOOLEAN, or CATEGORICAL; any other dataType, multiple dataTypes, or omitting dataType returns HTTP 400. For BOOLEAN, each value must be "true" or "false"; for NUMERIC, each value must be a finite number. Otherwise the request returns HTTP 400. + + value_min : typing.Optional[float] + Inclusive lower bound on the numeric value. Requires dataType=NUMERIC as a single value; otherwise the request returns HTTP 400. + + value_max : typing.Optional[float] + Inclusive upper bound on the numeric value. Requires dataType=NUMERIC as a single value; otherwise the request returns HTTP 400. + + trace_id : typing.Optional[str] + Comma-separated list of trace IDs to filter by. Mutually exclusive with sessionId, experimentId. May be combined with observationId to scope the observation lookup to a specific trace. + + session_id : typing.Optional[str] + Comma-separated list of session IDs to filter by. Mutually exclusive with traceId, observationId, experimentId. + + observation_id : typing.Optional[str] + Comma-separated list of observation IDs to filter by. Requires traceId to be specified, because observation IDs are scoped to a trace. Mutually exclusive with sessionId, experimentId. Returns HTTP 400 when used without traceId. + + experiment_id : typing.Optional[str] + Comma-separated list of dataset run IDs (experiment IDs) to filter by. Mutually exclusive with traceId, sessionId, observationId. + + from_timestamp : typing.Optional[dt.datetime] + Inclusive lower bound on the score timestamp. + + to_timestamp : typing.Optional[dt.datetime] + Exclusive upper bound on the score timestamp. + + request_options : typing.Optional[RequestOptions] + Request-specific configuration. + + Returns + ------- + HttpResponse[GetScoresV3Response] + """ + _response = self._client_wrapper.httpx_client.request( + "api/public/v3/scores", + method="GET", + params={ + "limit": limit, + "cursor": cursor, + "fields": fields, + "id": id, + "name": name, + "source": source, + "dataType": data_type, + "environment": environment, + "configId": config_id, + "queueId": queue_id, + "authorUserId": author_user_id, + "value": value, + "valueMin": value_min, + "valueMax": value_max, + "traceId": trace_id, + "sessionId": session_id, + "observationId": observation_id, + "experimentId": experiment_id, + "fromTimestamp": serialize_datetime(from_timestamp) + if from_timestamp is not None + else None, + "toTimestamp": serialize_datetime(to_timestamp) + if to_timestamp is not None + else None, + }, + request_options=request_options, + ) + try: + if 200 <= _response.status_code < 300: + _data = typing.cast( + GetScoresV3Response, + parse_obj_as( + type_=GetScoresV3Response, # type: ignore + object_=_response.json(), + ), + ) + return HttpResponse(response=_response, data=_data) + if _response.status_code == 400: + raise Error( + headers=dict(_response.headers), + body=typing.cast( + typing.Any, + parse_obj_as( + type_=typing.Any, # type: ignore + object_=_response.json(), + ), + ), + ) + if _response.status_code == 401: + raise UnauthorizedError( + headers=dict(_response.headers), + body=typing.cast( + typing.Any, + parse_obj_as( + type_=typing.Any, # type: ignore + object_=_response.json(), + ), + ), + ) + if _response.status_code == 403: + raise AccessDeniedError( + headers=dict(_response.headers), + body=typing.cast( + typing.Any, + parse_obj_as( + type_=typing.Any, # type: ignore + object_=_response.json(), + ), + ), + ) + if _response.status_code == 405: + raise MethodNotAllowedError( + headers=dict(_response.headers), + body=typing.cast( + typing.Any, + parse_obj_as( + type_=typing.Any, # type: ignore + object_=_response.json(), + ), + ), + ) + if _response.status_code == 404: + raise NotFoundError( + headers=dict(_response.headers), + body=typing.cast( + typing.Any, + parse_obj_as( + type_=typing.Any, # type: ignore + object_=_response.json(), + ), + ), + ) + _response_json = _response.json() + except JSONDecodeError: + raise ApiError( + status_code=_response.status_code, + headers=dict(_response.headers), + body=_response.text, + ) + raise ApiError( + status_code=_response.status_code, + headers=dict(_response.headers), + body=_response_json, + ) + + +class AsyncRawScoresV3Client: + def __init__(self, *, client_wrapper: AsyncClientWrapper): + self._client_wrapper = client_wrapper + + async def get_many_v3( + self, + *, + limit: typing.Optional[int] = None, + cursor: typing.Optional[str] = None, + fields: typing.Optional[str] = None, + id: typing.Optional[str] = None, + name: typing.Optional[str] = None, + source: typing.Optional[str] = None, + data_type: typing.Optional[str] = None, + environment: typing.Optional[str] = None, + config_id: typing.Optional[str] = None, + queue_id: typing.Optional[str] = None, + author_user_id: typing.Optional[str] = None, + value: typing.Optional[str] = None, + value_min: typing.Optional[float] = None, + value_max: typing.Optional[float] = None, + trace_id: typing.Optional[str] = None, + session_id: typing.Optional[str] = None, + observation_id: typing.Optional[str] = None, + experiment_id: typing.Optional[str] = None, + from_timestamp: typing.Optional[dt.datetime] = None, + to_timestamp: typing.Optional[dt.datetime] = None, + request_options: typing.Optional[RequestOptions] = None, + ) -> AsyncHttpResponse[GetScoresV3Response]: + """ + Get a list of scores with a polymorphic `value` field (v3). + + This endpoint requires Langfuse v4 or later. + + The `value` field type depends on `dataType`: + - `NUMERIC` → number + - `BOOLEAN` → boolean + - `CATEGORICAL`, `TEXT`, `CORRECTION` → string + + Use the `fields` parameter to include optional field groups beyond the + default `core`. Unknown group names return HTTP 400. + + Parameters + ---------- + limit : typing.Optional[int] + Number of items per page. Maximum 100, default 50. Requests with a limit greater than 100 return HTTP 400. + + cursor : typing.Optional[str] + URL-safe base64 (base64url) cursor for pagination. Use the cursor from the previous response to get the next page. Absent on the final page. + + fields : typing.Optional[str] + Comma-separated field groups to include. Allowed: core, details, subject, annotation. Defaults to "core". Unknown names return HTTP 400. + + id : typing.Optional[str] + Comma-separated list of score IDs to filter by (OR within, AND across filters). + + name : typing.Optional[str] + Comma-separated list of score names to filter by. + + source : typing.Optional[str] + Comma-separated list of score sources to filter by (e.g. API, ANNOTATION, EVAL). Case-insensitive — `api` and `API` are equivalent. + + data_type : typing.Optional[str] + Comma-separated list of data types to filter by (NUMERIC, BOOLEAN, CATEGORICAL, TEXT, CORRECTION). Case-insensitive — `numeric` and `NUMERIC` are equivalent. Must be a single value when used with value, valueMin, or valueMax; otherwise the request returns HTTP 400. Must be NUMERIC when used with valueMin or valueMax. + + environment : typing.Optional[str] + Comma-separated list of environments to filter by. + + config_id : typing.Optional[str] + Comma-separated list of score config IDs to filter by. + + queue_id : typing.Optional[str] + Comma-separated list of annotation queue IDs to filter by. + + author_user_id : typing.Optional[str] + Comma-separated list of author user IDs to filter by. + + value : typing.Optional[str] + Comma-separated list of exact values to filter by. Requires a single dataType from NUMERIC, BOOLEAN, or CATEGORICAL; any other dataType, multiple dataTypes, or omitting dataType returns HTTP 400. For BOOLEAN, each value must be "true" or "false"; for NUMERIC, each value must be a finite number. Otherwise the request returns HTTP 400. + + value_min : typing.Optional[float] + Inclusive lower bound on the numeric value. Requires dataType=NUMERIC as a single value; otherwise the request returns HTTP 400. + + value_max : typing.Optional[float] + Inclusive upper bound on the numeric value. Requires dataType=NUMERIC as a single value; otherwise the request returns HTTP 400. + + trace_id : typing.Optional[str] + Comma-separated list of trace IDs to filter by. Mutually exclusive with sessionId, experimentId. May be combined with observationId to scope the observation lookup to a specific trace. + + session_id : typing.Optional[str] + Comma-separated list of session IDs to filter by. Mutually exclusive with traceId, observationId, experimentId. + + observation_id : typing.Optional[str] + Comma-separated list of observation IDs to filter by. Requires traceId to be specified, because observation IDs are scoped to a trace. Mutually exclusive with sessionId, experimentId. Returns HTTP 400 when used without traceId. + + experiment_id : typing.Optional[str] + Comma-separated list of dataset run IDs (experiment IDs) to filter by. Mutually exclusive with traceId, sessionId, observationId. + + from_timestamp : typing.Optional[dt.datetime] + Inclusive lower bound on the score timestamp. + + to_timestamp : typing.Optional[dt.datetime] + Exclusive upper bound on the score timestamp. + + request_options : typing.Optional[RequestOptions] + Request-specific configuration. + + Returns + ------- + AsyncHttpResponse[GetScoresV3Response] + """ + _response = await self._client_wrapper.httpx_client.request( + "api/public/v3/scores", + method="GET", + params={ + "limit": limit, + "cursor": cursor, + "fields": fields, + "id": id, + "name": name, + "source": source, + "dataType": data_type, + "environment": environment, + "configId": config_id, + "queueId": queue_id, + "authorUserId": author_user_id, + "value": value, + "valueMin": value_min, + "valueMax": value_max, + "traceId": trace_id, + "sessionId": session_id, + "observationId": observation_id, + "experimentId": experiment_id, + "fromTimestamp": serialize_datetime(from_timestamp) + if from_timestamp is not None + else None, + "toTimestamp": serialize_datetime(to_timestamp) + if to_timestamp is not None + else None, + }, + request_options=request_options, + ) + try: + if 200 <= _response.status_code < 300: + _data = typing.cast( + GetScoresV3Response, + parse_obj_as( + type_=GetScoresV3Response, # type: ignore + object_=_response.json(), + ), + ) + return AsyncHttpResponse(response=_response, data=_data) + if _response.status_code == 400: + raise Error( + headers=dict(_response.headers), + body=typing.cast( + typing.Any, + parse_obj_as( + type_=typing.Any, # type: ignore + object_=_response.json(), + ), + ), + ) + if _response.status_code == 401: + raise UnauthorizedError( + headers=dict(_response.headers), + body=typing.cast( + typing.Any, + parse_obj_as( + type_=typing.Any, # type: ignore + object_=_response.json(), + ), + ), + ) + if _response.status_code == 403: + raise AccessDeniedError( + headers=dict(_response.headers), + body=typing.cast( + typing.Any, + parse_obj_as( + type_=typing.Any, # type: ignore + object_=_response.json(), + ), + ), + ) + if _response.status_code == 405: + raise MethodNotAllowedError( + headers=dict(_response.headers), + body=typing.cast( + typing.Any, + parse_obj_as( + type_=typing.Any, # type: ignore + object_=_response.json(), + ), + ), + ) + if _response.status_code == 404: + raise NotFoundError( + headers=dict(_response.headers), + body=typing.cast( + typing.Any, + parse_obj_as( + type_=typing.Any, # type: ignore + object_=_response.json(), + ), + ), + ) + _response_json = _response.json() + except JSONDecodeError: + raise ApiError( + status_code=_response.status_code, + headers=dict(_response.headers), + body=_response.text, + ) + raise ApiError( + status_code=_response.status_code, + headers=dict(_response.headers), + body=_response_json, + ) diff --git a/langfuse/api/scores_v3/types/__init__.py b/langfuse/api/scores_v3/types/__init__.py new file mode 100644 index 000000000..14da0ca73 --- /dev/null +++ b/langfuse/api/scores_v3/types/__init__.py @@ -0,0 +1,114 @@ +# This file was auto-generated by Fern from our API Definition. + +# isort: skip_file + +import typing +from importlib import import_module + +if typing.TYPE_CHECKING: + from .base_score_v3 import BaseScoreV3 + from .boolean_score_v3 import BooleanScoreV3 + from .categorical_score_v3 import CategoricalScoreV3 + from .correction_score_v3 import CorrectionScoreV3 + from .get_scores_v3meta import GetScoresV3Meta + from .get_scores_v3response import GetScoresV3Response + from .numeric_score_v3 import NumericScoreV3 + from .score_subject_experiment_v3 import ScoreSubjectExperimentV3 + from .score_subject_observation_v3 import ScoreSubjectObservationV3 + from .score_subject_session_v3 import ScoreSubjectSessionV3 + from .score_subject_trace_v3 import ScoreSubjectTraceV3 + from .score_subject_v3 import ( + ScoreSubjectV3, + ScoreSubjectV3_Experiment, + ScoreSubjectV3_Observation, + ScoreSubjectV3_Session, + ScoreSubjectV3_Trace, + ) + from .score_v3 import ( + ScoreV3, + ScoreV3_Boolean, + ScoreV3_Categorical, + ScoreV3_Correction, + ScoreV3_Numeric, + ScoreV3_Text, + ) + from .text_score_v3 import TextScoreV3 +_dynamic_imports: typing.Dict[str, str] = { + "BaseScoreV3": ".base_score_v3", + "BooleanScoreV3": ".boolean_score_v3", + "CategoricalScoreV3": ".categorical_score_v3", + "CorrectionScoreV3": ".correction_score_v3", + "GetScoresV3Meta": ".get_scores_v3meta", + "GetScoresV3Response": ".get_scores_v3response", + "NumericScoreV3": ".numeric_score_v3", + "ScoreSubjectExperimentV3": ".score_subject_experiment_v3", + "ScoreSubjectObservationV3": ".score_subject_observation_v3", + "ScoreSubjectSessionV3": ".score_subject_session_v3", + "ScoreSubjectTraceV3": ".score_subject_trace_v3", + "ScoreSubjectV3": ".score_subject_v3", + "ScoreSubjectV3_Experiment": ".score_subject_v3", + "ScoreSubjectV3_Observation": ".score_subject_v3", + "ScoreSubjectV3_Session": ".score_subject_v3", + "ScoreSubjectV3_Trace": ".score_subject_v3", + "ScoreV3": ".score_v3", + "ScoreV3_Boolean": ".score_v3", + "ScoreV3_Categorical": ".score_v3", + "ScoreV3_Correction": ".score_v3", + "ScoreV3_Numeric": ".score_v3", + "ScoreV3_Text": ".score_v3", + "TextScoreV3": ".text_score_v3", +} + + +def __getattr__(attr_name: str) -> typing.Any: + module_name = _dynamic_imports.get(attr_name) + if module_name is None: + raise AttributeError( + f"No {attr_name} found in _dynamic_imports for module name -> {__name__}" + ) + try: + module = import_module(module_name, __package__) + if module_name == f".{attr_name}": + return module + else: + return getattr(module, attr_name) + except ImportError as e: + raise ImportError( + f"Failed to import {attr_name} from {module_name}: {e}" + ) from e + except AttributeError as e: + raise AttributeError( + f"Failed to get {attr_name} from {module_name}: {e}" + ) from e + + +def __dir__(): + lazy_attrs = list(_dynamic_imports.keys()) + return sorted(lazy_attrs) + + +__all__ = [ + "BaseScoreV3", + "BooleanScoreV3", + "CategoricalScoreV3", + "CorrectionScoreV3", + "GetScoresV3Meta", + "GetScoresV3Response", + "NumericScoreV3", + "ScoreSubjectExperimentV3", + "ScoreSubjectObservationV3", + "ScoreSubjectSessionV3", + "ScoreSubjectTraceV3", + "ScoreSubjectV3", + "ScoreSubjectV3_Experiment", + "ScoreSubjectV3_Observation", + "ScoreSubjectV3_Session", + "ScoreSubjectV3_Trace", + "ScoreV3", + "ScoreV3_Boolean", + "ScoreV3_Categorical", + "ScoreV3_Correction", + "ScoreV3_Numeric", + "ScoreV3_Text", + "TextScoreV3", +] diff --git a/langfuse/api/scores_v3/types/base_score_v3.py b/langfuse/api/scores_v3/types/base_score_v3.py new file mode 100644 index 000000000..3d5394f95 --- /dev/null +++ b/langfuse/api/scores_v3/types/base_score_v3.py @@ -0,0 +1,71 @@ +# This file was auto-generated by Fern from our API Definition. + +import datetime as dt +import typing + +import pydantic +import typing_extensions +from ...commons.types.score_source import ScoreSource +from ...core.pydantic_utilities import UniversalBaseModel +from ...core.serialization import FieldMetadata +from .score_subject_v3 import ScoreSubjectV3 + + +class BaseScoreV3(UniversalBaseModel): + id: str + project_id: typing_extensions.Annotated[str, FieldMetadata(alias="projectId")] + name: str + source: ScoreSource + timestamp: dt.datetime + environment: str = pydantic.Field() + """ + The environment from which this score originated. + """ + + created_at: typing_extensions.Annotated[ + dt.datetime, FieldMetadata(alias="createdAt") + ] + updated_at: typing_extensions.Annotated[ + dt.datetime, FieldMetadata(alias="updatedAt") + ] + comment: typing.Optional[str] = pydantic.Field(default=None) + """ + Optional comment attached to the score. Present when "details" is included in the fields parameter. + """ + + config_id: typing_extensions.Annotated[ + typing.Optional[str], FieldMetadata(alias="configId") + ] = pydantic.Field(default=None) + """ + The score config ID, if this score was created from a config. Present when "details" is included in the fields parameter. + """ + + metadata: typing.Optional[typing.Dict[str, typing.Any]] = pydantic.Field( + default=None + ) + """ + Arbitrary metadata attached to the score. Present when "details" is included in the fields parameter. + """ + + author_user_id: typing_extensions.Annotated[ + typing.Optional[str], FieldMetadata(alias="authorUserId") + ] = pydantic.Field(default=None) + """ + The user who created this score, if available. Present when "annotation" is included in the fields parameter. + """ + + queue_id: typing_extensions.Annotated[ + typing.Optional[str], FieldMetadata(alias="queueId") + ] = pydantic.Field(default=None) + """ + The annotation queue this score belongs to, if any. Present when "annotation" is included in the fields parameter. + """ + + subject: typing.Optional[ScoreSubjectV3] = pydantic.Field(default=None) + """ + The entity this score is attached to (trace, observation, session, or experiment). Present when "subject" is included in the fields parameter. + """ + + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict( + extra="allow", frozen=True + ) diff --git a/langfuse/api/scores_v3/types/boolean_score_v3.py b/langfuse/api/scores_v3/types/boolean_score_v3.py new file mode 100644 index 000000000..5b94bc1d1 --- /dev/null +++ b/langfuse/api/scores_v3/types/boolean_score_v3.py @@ -0,0 +1,17 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +import pydantic +from .base_score_v3 import BaseScoreV3 + + +class BooleanScoreV3(BaseScoreV3): + value: bool = pydantic.Field() + """ + The boolean value of the score. + """ + + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict( + extra="allow", frozen=True + ) diff --git a/langfuse/api/scores_v3/types/categorical_score_v3.py b/langfuse/api/scores_v3/types/categorical_score_v3.py new file mode 100644 index 000000000..975b1f64c --- /dev/null +++ b/langfuse/api/scores_v3/types/categorical_score_v3.py @@ -0,0 +1,17 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +import pydantic +from .base_score_v3 import BaseScoreV3 + + +class CategoricalScoreV3(BaseScoreV3): + value: str = pydantic.Field() + """ + The string category value of the score. + """ + + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict( + extra="allow", frozen=True + ) diff --git a/langfuse/api/scores_v3/types/correction_score_v3.py b/langfuse/api/scores_v3/types/correction_score_v3.py new file mode 100644 index 000000000..1717a6e67 --- /dev/null +++ b/langfuse/api/scores_v3/types/correction_score_v3.py @@ -0,0 +1,17 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +import pydantic +from .base_score_v3 import BaseScoreV3 + + +class CorrectionScoreV3(BaseScoreV3): + value: str = pydantic.Field() + """ + The correction content of the score. Empty string if not set. + """ + + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict( + extra="allow", frozen=True + ) diff --git a/langfuse/api/scores_v3/types/get_scores_v3meta.py b/langfuse/api/scores_v3/types/get_scores_v3meta.py new file mode 100644 index 000000000..7dfcfe0e1 --- /dev/null +++ b/langfuse/api/scores_v3/types/get_scores_v3meta.py @@ -0,0 +1,18 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +import pydantic +from ...core.pydantic_utilities import UniversalBaseModel + + +class GetScoresV3Meta(UniversalBaseModel): + limit: int + cursor: typing.Optional[str] = pydantic.Field(default=None) + """ + URL-safe base64 (base64url) cursor for the next page. Absent when there are no more results. + """ + + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict( + extra="allow", frozen=True + ) diff --git a/langfuse/api/scores_v3/types/get_scores_v3response.py b/langfuse/api/scores_v3/types/get_scores_v3response.py new file mode 100644 index 000000000..4d625b29a --- /dev/null +++ b/langfuse/api/scores_v3/types/get_scores_v3response.py @@ -0,0 +1,17 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +import pydantic +from ...core.pydantic_utilities import UniversalBaseModel +from .get_scores_v3meta import GetScoresV3Meta +from .score_v3 import ScoreV3 + + +class GetScoresV3Response(UniversalBaseModel): + data: typing.List[ScoreV3] + meta: GetScoresV3Meta + + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict( + extra="allow", frozen=True + ) diff --git a/langfuse/api/scores_v3/types/numeric_score_v3.py b/langfuse/api/scores_v3/types/numeric_score_v3.py new file mode 100644 index 000000000..10df001a4 --- /dev/null +++ b/langfuse/api/scores_v3/types/numeric_score_v3.py @@ -0,0 +1,17 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +import pydantic +from .base_score_v3 import BaseScoreV3 + + +class NumericScoreV3(BaseScoreV3): + value: float = pydantic.Field() + """ + The numeric value of the score. + """ + + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict( + extra="allow", frozen=True + ) diff --git a/langfuse/api/scores_v3/types/score_subject_experiment_v3.py b/langfuse/api/scores_v3/types/score_subject_experiment_v3.py new file mode 100644 index 000000000..a71a49241 --- /dev/null +++ b/langfuse/api/scores_v3/types/score_subject_experiment_v3.py @@ -0,0 +1,17 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +import pydantic +from ...core.pydantic_utilities import UniversalBaseModel + + +class ScoreSubjectExperimentV3(UniversalBaseModel): + id: str = pydantic.Field() + """ + The dataset run ID (experiment ID). + """ + + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict( + extra="allow", frozen=True + ) diff --git a/langfuse/api/scores_v3/types/score_subject_observation_v3.py b/langfuse/api/scores_v3/types/score_subject_observation_v3.py new file mode 100644 index 000000000..1bc2edf20 --- /dev/null +++ b/langfuse/api/scores_v3/types/score_subject_observation_v3.py @@ -0,0 +1,26 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +import pydantic +import typing_extensions +from ...core.pydantic_utilities import UniversalBaseModel +from ...core.serialization import FieldMetadata + + +class ScoreSubjectObservationV3(UniversalBaseModel): + id: str = pydantic.Field() + """ + The observation ID. + """ + + trace_id: typing_extensions.Annotated[ + typing.Optional[str], FieldMetadata(alias="traceId") + ] = pydantic.Field(default=None) + """ + The parent trace ID, if available. + """ + + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict( + extra="allow", frozen=True + ) diff --git a/langfuse/api/scores_v3/types/score_subject_session_v3.py b/langfuse/api/scores_v3/types/score_subject_session_v3.py new file mode 100644 index 000000000..cb9347583 --- /dev/null +++ b/langfuse/api/scores_v3/types/score_subject_session_v3.py @@ -0,0 +1,17 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +import pydantic +from ...core.pydantic_utilities import UniversalBaseModel + + +class ScoreSubjectSessionV3(UniversalBaseModel): + id: str = pydantic.Field() + """ + The session ID. + """ + + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict( + extra="allow", frozen=True + ) diff --git a/langfuse/api/scores_v3/types/score_subject_trace_v3.py b/langfuse/api/scores_v3/types/score_subject_trace_v3.py new file mode 100644 index 000000000..26aab7f07 --- /dev/null +++ b/langfuse/api/scores_v3/types/score_subject_trace_v3.py @@ -0,0 +1,17 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +import pydantic +from ...core.pydantic_utilities import UniversalBaseModel + + +class ScoreSubjectTraceV3(UniversalBaseModel): + id: str = pydantic.Field() + """ + The trace ID. + """ + + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict( + extra="allow", frozen=True + ) diff --git a/langfuse/api/scores_v3/types/score_subject_v3.py b/langfuse/api/scores_v3/types/score_subject_v3.py new file mode 100644 index 000000000..7464fda55 --- /dev/null +++ b/langfuse/api/scores_v3/types/score_subject_v3.py @@ -0,0 +1,76 @@ +# This file was auto-generated by Fern from our API Definition. + +from __future__ import annotations + +import typing + +import pydantic +import typing_extensions +from ...core.pydantic_utilities import UniversalBaseModel +from ...core.serialization import FieldMetadata + + +class ScoreSubjectV3_Trace(UniversalBaseModel): + """ + A reference to the entity this score is attached to. Discriminated by "kind" — one of trace, observation, session, or experiment. + """ + + kind: typing.Literal["trace"] = "trace" + id: str + + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict( + extra="allow", frozen=True + ) + + +class ScoreSubjectV3_Observation(UniversalBaseModel): + """ + A reference to the entity this score is attached to. Discriminated by "kind" — one of trace, observation, session, or experiment. + """ + + kind: typing.Literal["observation"] = "observation" + id: str + trace_id: typing_extensions.Annotated[ + typing.Optional[str], FieldMetadata(alias="traceId") + ] = None + + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict( + extra="allow", frozen=True + ) + + +class ScoreSubjectV3_Session(UniversalBaseModel): + """ + A reference to the entity this score is attached to. Discriminated by "kind" — one of trace, observation, session, or experiment. + """ + + kind: typing.Literal["session"] = "session" + id: str + + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict( + extra="allow", frozen=True + ) + + +class ScoreSubjectV3_Experiment(UniversalBaseModel): + """ + A reference to the entity this score is attached to. Discriminated by "kind" — one of trace, observation, session, or experiment. + """ + + kind: typing.Literal["experiment"] = "experiment" + id: str + + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict( + extra="allow", frozen=True + ) + + +ScoreSubjectV3 = typing_extensions.Annotated[ + typing.Union[ + ScoreSubjectV3_Trace, + ScoreSubjectV3_Observation, + ScoreSubjectV3_Session, + ScoreSubjectV3_Experiment, + ], + pydantic.Field(discriminator="kind"), +] diff --git a/langfuse/api/scores_v3/types/score_v3.py b/langfuse/api/scores_v3/types/score_v3.py new file mode 100644 index 000000000..9921d1bda --- /dev/null +++ b/langfuse/api/scores_v3/types/score_v3.py @@ -0,0 +1,200 @@ +# This file was auto-generated by Fern from our API Definition. + +from __future__ import annotations + +import datetime as dt +import typing + +import pydantic +import typing_extensions +from ...commons.types.score_source import ScoreSource +from ...core.pydantic_utilities import UniversalBaseModel +from ...core.serialization import FieldMetadata +from .score_subject_v3 import ScoreSubjectV3 + + +class ScoreV3_Numeric(UniversalBaseModel): + data_type: typing_extensions.Annotated[ + typing.Literal["NUMERIC"], FieldMetadata(alias="dataType") + ] = "NUMERIC" + value: float + id: str + project_id: typing_extensions.Annotated[str, FieldMetadata(alias="projectId")] + name: str + source: ScoreSource + timestamp: dt.datetime + environment: str + created_at: typing_extensions.Annotated[ + dt.datetime, FieldMetadata(alias="createdAt") + ] + updated_at: typing_extensions.Annotated[ + dt.datetime, FieldMetadata(alias="updatedAt") + ] + comment: typing.Optional[str] = None + config_id: typing_extensions.Annotated[ + typing.Optional[str], FieldMetadata(alias="configId") + ] = None + metadata: typing.Optional[typing.Dict[str, typing.Any]] = None + author_user_id: typing_extensions.Annotated[ + typing.Optional[str], FieldMetadata(alias="authorUserId") + ] = None + queue_id: typing_extensions.Annotated[ + typing.Optional[str], FieldMetadata(alias="queueId") + ] = None + subject: typing.Optional[ScoreSubjectV3] = None + + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict( + extra="allow", frozen=True + ) + + +class ScoreV3_Boolean(UniversalBaseModel): + data_type: typing_extensions.Annotated[ + typing.Literal["BOOLEAN"], FieldMetadata(alias="dataType") + ] = "BOOLEAN" + value: bool + id: str + project_id: typing_extensions.Annotated[str, FieldMetadata(alias="projectId")] + name: str + source: ScoreSource + timestamp: dt.datetime + environment: str + created_at: typing_extensions.Annotated[ + dt.datetime, FieldMetadata(alias="createdAt") + ] + updated_at: typing_extensions.Annotated[ + dt.datetime, FieldMetadata(alias="updatedAt") + ] + comment: typing.Optional[str] = None + config_id: typing_extensions.Annotated[ + typing.Optional[str], FieldMetadata(alias="configId") + ] = None + metadata: typing.Optional[typing.Dict[str, typing.Any]] = None + author_user_id: typing_extensions.Annotated[ + typing.Optional[str], FieldMetadata(alias="authorUserId") + ] = None + queue_id: typing_extensions.Annotated[ + typing.Optional[str], FieldMetadata(alias="queueId") + ] = None + subject: typing.Optional[ScoreSubjectV3] = None + + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict( + extra="allow", frozen=True + ) + + +class ScoreV3_Categorical(UniversalBaseModel): + data_type: typing_extensions.Annotated[ + typing.Literal["CATEGORICAL"], FieldMetadata(alias="dataType") + ] = "CATEGORICAL" + value: str + id: str + project_id: typing_extensions.Annotated[str, FieldMetadata(alias="projectId")] + name: str + source: ScoreSource + timestamp: dt.datetime + environment: str + created_at: typing_extensions.Annotated[ + dt.datetime, FieldMetadata(alias="createdAt") + ] + updated_at: typing_extensions.Annotated[ + dt.datetime, FieldMetadata(alias="updatedAt") + ] + comment: typing.Optional[str] = None + config_id: typing_extensions.Annotated[ + typing.Optional[str], FieldMetadata(alias="configId") + ] = None + metadata: typing.Optional[typing.Dict[str, typing.Any]] = None + author_user_id: typing_extensions.Annotated[ + typing.Optional[str], FieldMetadata(alias="authorUserId") + ] = None + queue_id: typing_extensions.Annotated[ + typing.Optional[str], FieldMetadata(alias="queueId") + ] = None + subject: typing.Optional[ScoreSubjectV3] = None + + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict( + extra="allow", frozen=True + ) + + +class ScoreV3_Text(UniversalBaseModel): + data_type: typing_extensions.Annotated[ + typing.Literal["TEXT"], FieldMetadata(alias="dataType") + ] = "TEXT" + value: str + id: str + project_id: typing_extensions.Annotated[str, FieldMetadata(alias="projectId")] + name: str + source: ScoreSource + timestamp: dt.datetime + environment: str + created_at: typing_extensions.Annotated[ + dt.datetime, FieldMetadata(alias="createdAt") + ] + updated_at: typing_extensions.Annotated[ + dt.datetime, FieldMetadata(alias="updatedAt") + ] + comment: typing.Optional[str] = None + config_id: typing_extensions.Annotated[ + typing.Optional[str], FieldMetadata(alias="configId") + ] = None + metadata: typing.Optional[typing.Dict[str, typing.Any]] = None + author_user_id: typing_extensions.Annotated[ + typing.Optional[str], FieldMetadata(alias="authorUserId") + ] = None + queue_id: typing_extensions.Annotated[ + typing.Optional[str], FieldMetadata(alias="queueId") + ] = None + subject: typing.Optional[ScoreSubjectV3] = None + + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict( + extra="allow", frozen=True + ) + + +class ScoreV3_Correction(UniversalBaseModel): + data_type: typing_extensions.Annotated[ + typing.Literal["CORRECTION"], FieldMetadata(alias="dataType") + ] = "CORRECTION" + value: str + id: str + project_id: typing_extensions.Annotated[str, FieldMetadata(alias="projectId")] + name: str + source: ScoreSource + timestamp: dt.datetime + environment: str + created_at: typing_extensions.Annotated[ + dt.datetime, FieldMetadata(alias="createdAt") + ] + updated_at: typing_extensions.Annotated[ + dt.datetime, FieldMetadata(alias="updatedAt") + ] + comment: typing.Optional[str] = None + config_id: typing_extensions.Annotated[ + typing.Optional[str], FieldMetadata(alias="configId") + ] = None + metadata: typing.Optional[typing.Dict[str, typing.Any]] = None + author_user_id: typing_extensions.Annotated[ + typing.Optional[str], FieldMetadata(alias="authorUserId") + ] = None + queue_id: typing_extensions.Annotated[ + typing.Optional[str], FieldMetadata(alias="queueId") + ] = None + subject: typing.Optional[ScoreSubjectV3] = None + + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict( + extra="allow", frozen=True + ) + + +ScoreV3 = typing_extensions.Annotated[ + typing.Union[ + ScoreV3_Numeric, + ScoreV3_Boolean, + ScoreV3_Categorical, + ScoreV3_Text, + ScoreV3_Correction, + ], + pydantic.Field(discriminator="data_type"), +] diff --git a/langfuse/api/scores_v3/types/text_score_v3.py b/langfuse/api/scores_v3/types/text_score_v3.py new file mode 100644 index 000000000..3d658972c --- /dev/null +++ b/langfuse/api/scores_v3/types/text_score_v3.py @@ -0,0 +1,17 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +import pydantic +from .base_score_v3 import BaseScoreV3 + + +class TextScoreV3(BaseScoreV3): + value: str = pydantic.Field() + """ + The text content of the score. + """ + + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict( + extra="allow", frozen=True + ) diff --git a/langfuse/api/unstable/__init__.py b/langfuse/api/unstable/__init__.py index 75aafdc24..e5356c235 100644 --- a/langfuse/api/unstable/__init__.py +++ b/langfuse/api/unstable/__init__.py @@ -26,6 +26,7 @@ ArrayOptionsEvaluationRuleFilter, BooleanEvaluationRuleFilter, CategoryOptionsEvaluationRuleFilter, + CodeEvaluatorSourceCodeLanguage, DateTimeEvaluationRuleFilter, EvaluationRuleArrayOptionsFilterOperator, EvaluationRuleBooleanFilterOperator, @@ -73,24 +74,51 @@ StringOptionsEvaluationRuleFilter, ) from .evaluation_rules import ( + CodeEvaluationRuleEvaluatorReference, + CreateCodeEvaluationRuleRequest, CreateEvaluationRuleRequest, + CreateLlmAsJudgeEvaluationRuleRequest, DeleteEvaluationRuleResponse, EvaluationRule, EvaluationRuleEvaluator, EvaluationRuleEvaluatorReference, EvaluationRules, + LlmAsJudgeEvaluationRuleEvaluatorReference, + LlmAsJudgeEvaluatorType, UpdateEvaluationRuleRequest, ) - from .evaluators import CreateEvaluatorRequest, Evaluator, Evaluators + from .evaluators import ( + CodeEvaluator, + CreateCodeEvaluatorRequest, + CreateEvaluatorRequest, + CreateEvaluatorRequest_Code, + CreateEvaluatorRequest_LlmAsJudge, + CreateLlmAsJudgeEvaluatorRequest, + Evaluator, + EvaluatorBase, + Evaluator_Code, + Evaluator_LlmAsJudge, + Evaluators, + LlmAsJudgeEvaluator, + ) _dynamic_imports: typing.Dict[str, str] = { "AccessDeniedError": ".errors", "ArrayOptionsEvaluationRuleFilter": ".commons", "BadRequestError": ".errors", "BooleanEvaluationRuleFilter": ".commons", "CategoryOptionsEvaluationRuleFilter": ".commons", + "CodeEvaluationRuleEvaluatorReference": ".evaluation_rules", + "CodeEvaluator": ".evaluators", + "CodeEvaluatorSourceCodeLanguage": ".commons", "ConflictError": ".errors", + "CreateCodeEvaluationRuleRequest": ".evaluation_rules", + "CreateCodeEvaluatorRequest": ".evaluators", "CreateEvaluationRuleRequest": ".evaluation_rules", "CreateEvaluatorRequest": ".evaluators", + "CreateEvaluatorRequest_Code": ".evaluators", + "CreateEvaluatorRequest_LlmAsJudge": ".evaluators", + "CreateLlmAsJudgeEvaluationRuleRequest": ".evaluation_rules", + "CreateLlmAsJudgeEvaluatorRequest": ".evaluators", "DateTimeEvaluationRuleFilter": ".commons", "DeleteEvaluationRuleResponse": ".evaluation_rules", "EvaluationRule": ".evaluation_rules", @@ -119,6 +147,7 @@ "EvaluationRuleTarget": ".commons", "EvaluationRules": ".evaluation_rules", "Evaluator": ".evaluators", + "EvaluatorBase": ".evaluators", "EvaluatorModelConfig": ".commons", "EvaluatorOutputDataType": ".commons", "EvaluatorOutputDefinition": ".commons", @@ -128,8 +157,13 @@ "EvaluatorOutputFieldDefinition": ".commons", "EvaluatorScope": ".commons", "EvaluatorType": ".commons", + "Evaluator_Code": ".evaluators", + "Evaluator_LlmAsJudge": ".evaluators", "Evaluators": ".evaluators", "InternalServerError": ".errors", + "LlmAsJudgeEvaluationRuleEvaluatorReference": ".evaluation_rules", + "LlmAsJudgeEvaluator": ".evaluators", + "LlmAsJudgeEvaluatorType": ".evaluation_rules", "MethodNotAllowedError": ".errors", "NotFoundError": ".errors", "NullEvaluationRuleFilter": ".commons", @@ -194,9 +228,18 @@ def __dir__(): "BadRequestError", "BooleanEvaluationRuleFilter", "CategoryOptionsEvaluationRuleFilter", + "CodeEvaluationRuleEvaluatorReference", + "CodeEvaluator", + "CodeEvaluatorSourceCodeLanguage", "ConflictError", + "CreateCodeEvaluationRuleRequest", + "CreateCodeEvaluatorRequest", "CreateEvaluationRuleRequest", "CreateEvaluatorRequest", + "CreateEvaluatorRequest_Code", + "CreateEvaluatorRequest_LlmAsJudge", + "CreateLlmAsJudgeEvaluationRuleRequest", + "CreateLlmAsJudgeEvaluatorRequest", "DateTimeEvaluationRuleFilter", "DeleteEvaluationRuleResponse", "EvaluationRule", @@ -225,6 +268,7 @@ def __dir__(): "EvaluationRuleTarget", "EvaluationRules", "Evaluator", + "EvaluatorBase", "EvaluatorModelConfig", "EvaluatorOutputDataType", "EvaluatorOutputDefinition", @@ -234,8 +278,13 @@ def __dir__(): "EvaluatorOutputFieldDefinition", "EvaluatorScope", "EvaluatorType", + "Evaluator_Code", + "Evaluator_LlmAsJudge", "Evaluators", "InternalServerError", + "LlmAsJudgeEvaluationRuleEvaluatorReference", + "LlmAsJudgeEvaluator", + "LlmAsJudgeEvaluatorType", "MethodNotAllowedError", "NotFoundError", "NullEvaluationRuleFilter", diff --git a/langfuse/api/unstable/commons/__init__.py b/langfuse/api/unstable/commons/__init__.py index 13d9571ff..c617b53c7 100644 --- a/langfuse/api/unstable/commons/__init__.py +++ b/langfuse/api/unstable/commons/__init__.py @@ -10,6 +10,7 @@ ArrayOptionsEvaluationRuleFilter, BooleanEvaluationRuleFilter, CategoryOptionsEvaluationRuleFilter, + CodeEvaluatorSourceCodeLanguage, DateTimeEvaluationRuleFilter, EvaluationRuleArrayOptionsFilterOperator, EvaluationRuleBooleanFilterOperator, @@ -60,6 +61,7 @@ "ArrayOptionsEvaluationRuleFilter": ".types", "BooleanEvaluationRuleFilter": ".types", "CategoryOptionsEvaluationRuleFilter": ".types", + "CodeEvaluatorSourceCodeLanguage": ".types", "DateTimeEvaluationRuleFilter": ".types", "EvaluationRuleArrayOptionsFilterOperator": ".types", "EvaluationRuleBooleanFilterOperator": ".types", @@ -139,6 +141,7 @@ def __dir__(): "ArrayOptionsEvaluationRuleFilter", "BooleanEvaluationRuleFilter", "CategoryOptionsEvaluationRuleFilter", + "CodeEvaluatorSourceCodeLanguage", "DateTimeEvaluationRuleFilter", "EvaluationRuleArrayOptionsFilterOperator", "EvaluationRuleBooleanFilterOperator", diff --git a/langfuse/api/unstable/commons/types/__init__.py b/langfuse/api/unstable/commons/types/__init__.py index a0e7d9f9d..487480da4 100644 --- a/langfuse/api/unstable/commons/types/__init__.py +++ b/langfuse/api/unstable/commons/types/__init__.py @@ -11,6 +11,7 @@ from .category_options_evaluation_rule_filter import ( CategoryOptionsEvaluationRuleFilter, ) + from .code_evaluator_source_code_language import CodeEvaluatorSourceCodeLanguage from .date_time_evaluation_rule_filter import DateTimeEvaluationRuleFilter from .evaluation_rule_array_options_filter_operator import ( EvaluationRuleArrayOptionsFilterOperator, @@ -84,6 +85,7 @@ "ArrayOptionsEvaluationRuleFilter": ".array_options_evaluation_rule_filter", "BooleanEvaluationRuleFilter": ".boolean_evaluation_rule_filter", "CategoryOptionsEvaluationRuleFilter": ".category_options_evaluation_rule_filter", + "CodeEvaluatorSourceCodeLanguage": ".code_evaluator_source_code_language", "DateTimeEvaluationRuleFilter": ".date_time_evaluation_rule_filter", "EvaluationRuleArrayOptionsFilterOperator": ".evaluation_rule_array_options_filter_operator", "EvaluationRuleBooleanFilterOperator": ".evaluation_rule_boolean_filter_operator", @@ -163,6 +165,7 @@ def __dir__(): "ArrayOptionsEvaluationRuleFilter", "BooleanEvaluationRuleFilter", "CategoryOptionsEvaluationRuleFilter", + "CodeEvaluatorSourceCodeLanguage", "DateTimeEvaluationRuleFilter", "EvaluationRuleArrayOptionsFilterOperator", "EvaluationRuleBooleanFilterOperator", diff --git a/langfuse/api/unstable/commons/types/code_evaluator_source_code_language.py b/langfuse/api/unstable/commons/types/code_evaluator_source_code_language.py new file mode 100644 index 000000000..7071a317c --- /dev/null +++ b/langfuse/api/unstable/commons/types/code_evaluator_source_code_language.py @@ -0,0 +1,26 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +from ....core import enum + +T_Result = typing.TypeVar("T_Result") + + +class CodeEvaluatorSourceCodeLanguage(enum.StrEnum): + """ + Code evaluator runtime language. + """ + + PYTHON = "PYTHON" + TYPESCRIPT = "TYPESCRIPT" + + def visit( + self, + python: typing.Callable[[], T_Result], + typescript: typing.Callable[[], T_Result], + ) -> T_Result: + if self is CodeEvaluatorSourceCodeLanguage.PYTHON: + return python() + if self is CodeEvaluatorSourceCodeLanguage.TYPESCRIPT: + return typescript() diff --git a/langfuse/api/unstable/commons/types/evaluation_rule_mapping.py b/langfuse/api/unstable/commons/types/evaluation_rule_mapping.py index 1c407819c..cd08e8b33 100644 --- a/langfuse/api/unstable/commons/types/evaluation_rule_mapping.py +++ b/langfuse/api/unstable/commons/types/evaluation_rule_mapping.py @@ -11,7 +11,9 @@ class EvaluationRuleMapping(UniversalBaseModel): """ - Maps one evaluator prompt variable to one source field from the target object. + Maps one evaluator variable to one source field from the target object. + + Manual mappings are used for `llm_as_judge` evaluators. `code` evaluators use a fixed runtime mapping managed by Langfuse. How to build a valid mapping list: 1. Create the evaluator or fetch it with `GET /evaluators/{id}`. @@ -24,7 +26,7 @@ class EvaluationRuleMapping(UniversalBaseModel): Recovery guidance: - `invalid_variable_mapping`: the variable name is unknown for this evaluator, or the selected `source` is not valid for the chosen `target` - - `missing_variable_mapping`: one or more evaluator variables are not mapped yet + - `missing_variable_mapping`: one or more LLM-as-judge evaluator variables are not mapped yet - `duplicate_variable_mapping`: the same evaluator variable appears more than once - `invalid_json_path`: the JSONPath expression is malformed. Remove it or correct it. diff --git a/langfuse/api/unstable/commons/types/evaluator_type.py b/langfuse/api/unstable/commons/types/evaluator_type.py index d411d6111..f219fb7e1 100644 --- a/langfuse/api/unstable/commons/types/evaluator_type.py +++ b/langfuse/api/unstable/commons/types/evaluator_type.py @@ -11,11 +11,18 @@ class EvaluatorType(enum.StrEnum): """ The evaluator engine type. - The unstable public API currently supports only LLM-as-a-judge evaluators. + The unstable public API supports LLM-as-a-judge and code evaluators. """ LLM_AS_JUDGE = "llm_as_judge" + CODE = "code" - def visit(self, llm_as_judge: typing.Callable[[], T_Result]) -> T_Result: + def visit( + self, + llm_as_judge: typing.Callable[[], T_Result], + code: typing.Callable[[], T_Result], + ) -> T_Result: if self is EvaluatorType.LLM_AS_JUDGE: return llm_as_judge() + if self is EvaluatorType.CODE: + return code() diff --git a/langfuse/api/unstable/evaluation_rules/__init__.py b/langfuse/api/unstable/evaluation_rules/__init__.py index f0c007231..8541bdcc8 100644 --- a/langfuse/api/unstable/evaluation_rules/__init__.py +++ b/langfuse/api/unstable/evaluation_rules/__init__.py @@ -7,21 +7,31 @@ if typing.TYPE_CHECKING: from .types import ( + CodeEvaluationRuleEvaluatorReference, + CreateCodeEvaluationRuleRequest, CreateEvaluationRuleRequest, + CreateLlmAsJudgeEvaluationRuleRequest, DeleteEvaluationRuleResponse, EvaluationRule, EvaluationRuleEvaluator, EvaluationRuleEvaluatorReference, EvaluationRules, + LlmAsJudgeEvaluationRuleEvaluatorReference, + LlmAsJudgeEvaluatorType, UpdateEvaluationRuleRequest, ) _dynamic_imports: typing.Dict[str, str] = { + "CodeEvaluationRuleEvaluatorReference": ".types", + "CreateCodeEvaluationRuleRequest": ".types", "CreateEvaluationRuleRequest": ".types", + "CreateLlmAsJudgeEvaluationRuleRequest": ".types", "DeleteEvaluationRuleResponse": ".types", "EvaluationRule": ".types", "EvaluationRuleEvaluator": ".types", "EvaluationRuleEvaluatorReference": ".types", "EvaluationRules": ".types", + "LlmAsJudgeEvaluationRuleEvaluatorReference": ".types", + "LlmAsJudgeEvaluatorType": ".types", "UpdateEvaluationRuleRequest": ".types", } @@ -54,11 +64,16 @@ def __dir__(): __all__ = [ + "CodeEvaluationRuleEvaluatorReference", + "CreateCodeEvaluationRuleRequest", "CreateEvaluationRuleRequest", + "CreateLlmAsJudgeEvaluationRuleRequest", "DeleteEvaluationRuleResponse", "EvaluationRule", "EvaluationRuleEvaluator", "EvaluationRuleEvaluatorReference", "EvaluationRules", + "LlmAsJudgeEvaluationRuleEvaluatorReference", + "LlmAsJudgeEvaluatorType", "UpdateEvaluationRuleRequest", ] diff --git a/langfuse/api/unstable/evaluation_rules/client.py b/langfuse/api/unstable/evaluation_rules/client.py index 20e56e6c3..aa0cefbdf 100644 --- a/langfuse/api/unstable/evaluation_rules/client.py +++ b/langfuse/api/unstable/evaluation_rules/client.py @@ -8,6 +8,7 @@ from ..commons.types.evaluation_rule_mapping import EvaluationRuleMapping from ..commons.types.evaluation_rule_target import EvaluationRuleTarget from .raw_client import AsyncRawEvaluationRulesClient, RawEvaluationRulesClient +from .types.create_evaluation_rule_request import CreateEvaluationRuleRequest from .types.delete_evaluation_rule_response import DeleteEvaluationRuleResponse from .types.evaluation_rule import EvaluationRule from .types.evaluation_rule_evaluator_reference import EvaluationRuleEvaluatorReference @@ -35,13 +36,7 @@ def with_raw_response(self) -> RawEvaluationRulesClient: def create( self, *, - name: str, - evaluator: EvaluationRuleEvaluatorReference, - target: EvaluationRuleTarget, - enabled: bool, - mapping: typing.Sequence[EvaluationRuleMapping], - sampling: typing.Optional[float] = OMIT, - filter: typing.Optional[typing.Sequence[EvaluationRuleFilter]] = OMIT, + request: CreateEvaluationRuleRequest, request_options: typing.Optional[RequestOptions] = None, ) -> EvaluationRule: """ @@ -57,8 +52,9 @@ def create( - `evaluator.name` + `evaluator.scope` must identify an existing evaluator family returned by the evaluator endpoints - Langfuse resolves that family to its latest version before saving the evaluation rule - for `target=experiment`, use dataset `id` values from `GET /api/public/v2/datasets` when filtering by `datasetId` - - every evaluator prompt variable must be mapped exactly once - - `expected_output` and `experiment_item_metadata` mappings are only valid for `target=experiment` + - for `llm_as_judge` evaluators, every evaluator prompt variable must be mapped exactly once + - for `code` evaluators, Langfuse uses the fixed code runtime mapping; omit `mapping` in create and update requests + - for user-provided `llm_as_judge` mappings, `expected_output` and `experiment_item_metadata` are only valid for `target=experiment` - if `enabled=true`, Langfuse validates that the referenced evaluator can currently run - at most 50 evaluation rules can be effectively active in one project at the same time @@ -75,44 +71,15 @@ def create( Recovery guidance: - `400 invalid_filter_value`: fix the filter `column` or `value` using `details.column`, `details.invalidValues`, and `details.allowedValues` - `400 invalid_filter_value` with `details.column=datasetId`: call `GET /api/public/v2/datasets`, then retry with dataset `id` values from that response - - `400 missing_variable_mapping`: fetch the evaluator again and make sure every variable in `variables` appears exactly once in `mapping` + - `400 missing_variable_mapping`: for `llm_as_judge` evaluators, fetch the evaluator again and make sure every variable in `variables` appears exactly once in `mapping` - `400 duplicate_variable_mapping`: remove repeated mappings for the same variable - - `400 invalid_variable_mapping`: switch to a valid `source` for the selected `target`, or fix the variable name + - `400 invalid_variable_mapping`: for `llm_as_judge`, switch to a valid `source` for the selected `target`, or fix the variable name - `400 invalid_json_path`: remove or correct the `jsonPath` - `422 evaluator_preflight_failed`: the selected evaluator cannot run with the resolved model configuration. Fix the evaluator/default model setup, then retry the create request. Parameters ---------- - name : str - Human-readable deployment name. - - evaluator : EvaluationRuleEvaluatorReference - Evaluator family to use. - - Use `name` and `scope` from the evaluator endpoints. - Langfuse resolves that family to its latest version before saving the rule. - - target : EvaluationRuleTarget - Target object type to evaluate. - - enabled : bool - Whether the deployment should be active immediately after creation. - - mapping : typing.Sequence[EvaluationRuleMapping] - Required variable mappings. - - Every evaluator variable must appear exactly once. - Build this list from the evaluator `variables` array returned by the evaluator endpoints. - - sampling : typing.Optional[float] - Optional sampling fraction. Defaults to `1`. - - filter : typing.Optional[typing.Sequence[EvaluationRuleFilter]] - Optional filter list. - - Omit or pass an empty list to evaluate all matching targets for the selected `target`. - Each filter object must use a column that is valid for that `target`. - For `target=experiment`, `column=datasetId` expects dataset `id` values from `GET /api/public/v2/datasets`, not dataset names. + request : CreateEvaluationRuleRequest request_options : typing.Optional[RequestOptions] Request-specific configuration. @@ -132,7 +99,11 @@ def create( EvaluationRuleTarget, EvaluatorScope, ) - from langfuse.unstable.evaluation_rules import EvaluationRuleEvaluatorReference + from langfuse.unstable.evaluation_rules import ( + CreateLlmAsJudgeEvaluationRuleRequest, + LlmAsJudgeEvaluationRuleEvaluatorReference, + LlmAsJudgeEvaluatorType, + ) client = LangfuseAPI( x_langfuse_sdk_name="YOUR_X_LANGFUSE_SDK_NAME", @@ -143,42 +114,38 @@ def create( base_url="https://yourhost.com/path/to/api", ) client.unstable.evaluation_rules.create( - name="answer-correctness-live", - evaluator=EvaluationRuleEvaluatorReference( - name="answer-correctness", - scope=EvaluatorScope.PROJECT, - ), - target=EvaluationRuleTarget.OBSERVATION, - enabled=True, - sampling=1.0, - filter=[ - EvaluationRuleFilter_StringOptions( - column="type", - operator=EvaluationRuleOptionsFilterOperator.ANY_OF, - value=["GENERATION"], - ) - ], - mapping=[ - EvaluationRuleMapping( - variable="input", - source=EvaluationRuleMappingSource.INPUT, - ), - EvaluationRuleMapping( - variable="output", - source=EvaluationRuleMappingSource.OUTPUT, + request=CreateLlmAsJudgeEvaluationRuleRequest( + name="answer-correctness-live", + evaluator=LlmAsJudgeEvaluationRuleEvaluatorReference( + name="answer-correctness", + scope=EvaluatorScope.PROJECT, + type=LlmAsJudgeEvaluatorType.LLM_AS_JUDGE, ), - ], + target=EvaluationRuleTarget.OBSERVATION, + enabled=True, + sampling=1.0, + filter=[ + EvaluationRuleFilter_StringOptions( + column="type", + operator=EvaluationRuleOptionsFilterOperator.ANY_OF, + value=["GENERATION"], + ) + ], + mapping=[ + EvaluationRuleMapping( + variable="input", + source=EvaluationRuleMappingSource.INPUT, + ), + EvaluationRuleMapping( + variable="output", + source=EvaluationRuleMappingSource.OUTPUT, + ), + ], + ), ) """ _response = self._raw_client.create( - name=name, - evaluator=evaluator, - target=target, - enabled=enabled, - mapping=mapping, - sampling=sampling, - filter=filter, - request_options=request_options, + request=request, request_options=request_options ) return _response.data @@ -293,18 +260,19 @@ def update( - switch to another evaluator - adjust sampling - change filters - - update variable mappings + - update LLM-as-judge variable mappings Important behavior: - provide only the fields you want to change - if you provide `evaluator`, Langfuse resolves that evaluator family to its latest version before saving - - changing `target`, `filter`, or `mapping` must still produce a valid target-specific configuration - - if you change `target`, also send a compatible `filter` and `mapping` in the same request unless the existing ones are still valid for the new target + - changing `target`, `filter`, or an LLM-as-judge `mapping` must still produce a valid target-specific configuration + - if you change `target` for an LLM-as-judge rule, also send a compatible `filter` and `mapping` in the same request unless the existing ones are still valid for the new target + - for `code` evaluator rules, omit `mapping`; Langfuse stores the fixed code runtime mapping automatically - if the resulting config is enabled, Langfuse re-validates that the selected evaluator can run - if the update would move a non-active evaluation rule into the active state and the project already has 50 active evaluation rules, the API returns `409` Recovery guidance: - - if the update fails with `missing_variable_mapping` or `invalid_variable_mapping` after changing `evaluator` or `target`, resend the request with a complete new `mapping` + - if an LLM-as-judge update fails with `missing_variable_mapping` or `invalid_variable_mapping` after changing `evaluator` or `target`, resend the request with a complete new `mapping` - if the update fails with `invalid_filter_value` after changing `target`, resend the request with a target-compatible `filter` Parameters @@ -319,6 +287,7 @@ def update( Updated evaluator family. Langfuse resolves the provided evaluator family to its latest version before saving the rule. + A rule's evaluator type cannot be changed: provide `name` and `scope` for an evaluator family of the rule's current type. To use a different evaluator type, create a new rule. target : typing.Optional[EvaluationRuleTarget] Updated target object type. @@ -335,7 +304,9 @@ def update( For `target=experiment`, `column=datasetId` expects dataset `id` values from `GET /api/public/v2/datasets`, not dataset names. mapping : typing.Optional[typing.Sequence[EvaluationRuleMapping]] - Updated variable mappings. + Updated LLM-as-judge variable mappings. + + Do not send this field for code evaluator rules. Langfuse stores the fixed code runtime mapping automatically and returns it in the response. request_options : typing.Optional[RequestOptions] Request-specific configuration. @@ -436,13 +407,7 @@ def with_raw_response(self) -> AsyncRawEvaluationRulesClient: async def create( self, *, - name: str, - evaluator: EvaluationRuleEvaluatorReference, - target: EvaluationRuleTarget, - enabled: bool, - mapping: typing.Sequence[EvaluationRuleMapping], - sampling: typing.Optional[float] = OMIT, - filter: typing.Optional[typing.Sequence[EvaluationRuleFilter]] = OMIT, + request: CreateEvaluationRuleRequest, request_options: typing.Optional[RequestOptions] = None, ) -> EvaluationRule: """ @@ -458,8 +423,9 @@ async def create( - `evaluator.name` + `evaluator.scope` must identify an existing evaluator family returned by the evaluator endpoints - Langfuse resolves that family to its latest version before saving the evaluation rule - for `target=experiment`, use dataset `id` values from `GET /api/public/v2/datasets` when filtering by `datasetId` - - every evaluator prompt variable must be mapped exactly once - - `expected_output` and `experiment_item_metadata` mappings are only valid for `target=experiment` + - for `llm_as_judge` evaluators, every evaluator prompt variable must be mapped exactly once + - for `code` evaluators, Langfuse uses the fixed code runtime mapping; omit `mapping` in create and update requests + - for user-provided `llm_as_judge` mappings, `expected_output` and `experiment_item_metadata` are only valid for `target=experiment` - if `enabled=true`, Langfuse validates that the referenced evaluator can currently run - at most 50 evaluation rules can be effectively active in one project at the same time @@ -476,44 +442,15 @@ async def create( Recovery guidance: - `400 invalid_filter_value`: fix the filter `column` or `value` using `details.column`, `details.invalidValues`, and `details.allowedValues` - `400 invalid_filter_value` with `details.column=datasetId`: call `GET /api/public/v2/datasets`, then retry with dataset `id` values from that response - - `400 missing_variable_mapping`: fetch the evaluator again and make sure every variable in `variables` appears exactly once in `mapping` + - `400 missing_variable_mapping`: for `llm_as_judge` evaluators, fetch the evaluator again and make sure every variable in `variables` appears exactly once in `mapping` - `400 duplicate_variable_mapping`: remove repeated mappings for the same variable - - `400 invalid_variable_mapping`: switch to a valid `source` for the selected `target`, or fix the variable name + - `400 invalid_variable_mapping`: for `llm_as_judge`, switch to a valid `source` for the selected `target`, or fix the variable name - `400 invalid_json_path`: remove or correct the `jsonPath` - `422 evaluator_preflight_failed`: the selected evaluator cannot run with the resolved model configuration. Fix the evaluator/default model setup, then retry the create request. Parameters ---------- - name : str - Human-readable deployment name. - - evaluator : EvaluationRuleEvaluatorReference - Evaluator family to use. - - Use `name` and `scope` from the evaluator endpoints. - Langfuse resolves that family to its latest version before saving the rule. - - target : EvaluationRuleTarget - Target object type to evaluate. - - enabled : bool - Whether the deployment should be active immediately after creation. - - mapping : typing.Sequence[EvaluationRuleMapping] - Required variable mappings. - - Every evaluator variable must appear exactly once. - Build this list from the evaluator `variables` array returned by the evaluator endpoints. - - sampling : typing.Optional[float] - Optional sampling fraction. Defaults to `1`. - - filter : typing.Optional[typing.Sequence[EvaluationRuleFilter]] - Optional filter list. - - Omit or pass an empty list to evaluate all matching targets for the selected `target`. - Each filter object must use a column that is valid for that `target`. - For `target=experiment`, `column=datasetId` expects dataset `id` values from `GET /api/public/v2/datasets`, not dataset names. + request : CreateEvaluationRuleRequest request_options : typing.Optional[RequestOptions] Request-specific configuration. @@ -535,7 +472,11 @@ async def create( EvaluationRuleTarget, EvaluatorScope, ) - from langfuse.unstable.evaluation_rules import EvaluationRuleEvaluatorReference + from langfuse.unstable.evaluation_rules import ( + CreateLlmAsJudgeEvaluationRuleRequest, + LlmAsJudgeEvaluationRuleEvaluatorReference, + LlmAsJudgeEvaluatorType, + ) client = AsyncLangfuseAPI( x_langfuse_sdk_name="YOUR_X_LANGFUSE_SDK_NAME", @@ -549,45 +490,41 @@ async def create( async def main() -> None: await client.unstable.evaluation_rules.create( - name="answer-correctness-live", - evaluator=EvaluationRuleEvaluatorReference( - name="answer-correctness", - scope=EvaluatorScope.PROJECT, - ), - target=EvaluationRuleTarget.OBSERVATION, - enabled=True, - sampling=1.0, - filter=[ - EvaluationRuleFilter_StringOptions( - column="type", - operator=EvaluationRuleOptionsFilterOperator.ANY_OF, - value=["GENERATION"], - ) - ], - mapping=[ - EvaluationRuleMapping( - variable="input", - source=EvaluationRuleMappingSource.INPUT, - ), - EvaluationRuleMapping( - variable="output", - source=EvaluationRuleMappingSource.OUTPUT, + request=CreateLlmAsJudgeEvaluationRuleRequest( + name="answer-correctness-live", + evaluator=LlmAsJudgeEvaluationRuleEvaluatorReference( + name="answer-correctness", + scope=EvaluatorScope.PROJECT, + type=LlmAsJudgeEvaluatorType.LLM_AS_JUDGE, ), - ], + target=EvaluationRuleTarget.OBSERVATION, + enabled=True, + sampling=1.0, + filter=[ + EvaluationRuleFilter_StringOptions( + column="type", + operator=EvaluationRuleOptionsFilterOperator.ANY_OF, + value=["GENERATION"], + ) + ], + mapping=[ + EvaluationRuleMapping( + variable="input", + source=EvaluationRuleMappingSource.INPUT, + ), + EvaluationRuleMapping( + variable="output", + source=EvaluationRuleMappingSource.OUTPUT, + ), + ], + ), ) asyncio.run(main()) """ _response = await self._raw_client.create( - name=name, - evaluator=evaluator, - target=target, - enabled=enabled, - mapping=mapping, - sampling=sampling, - filter=filter, - request_options=request_options, + request=request, request_options=request_options ) return _response.data @@ -718,18 +655,19 @@ async def update( - switch to another evaluator - adjust sampling - change filters - - update variable mappings + - update LLM-as-judge variable mappings Important behavior: - provide only the fields you want to change - if you provide `evaluator`, Langfuse resolves that evaluator family to its latest version before saving - - changing `target`, `filter`, or `mapping` must still produce a valid target-specific configuration - - if you change `target`, also send a compatible `filter` and `mapping` in the same request unless the existing ones are still valid for the new target + - changing `target`, `filter`, or an LLM-as-judge `mapping` must still produce a valid target-specific configuration + - if you change `target` for an LLM-as-judge rule, also send a compatible `filter` and `mapping` in the same request unless the existing ones are still valid for the new target + - for `code` evaluator rules, omit `mapping`; Langfuse stores the fixed code runtime mapping automatically - if the resulting config is enabled, Langfuse re-validates that the selected evaluator can run - if the update would move a non-active evaluation rule into the active state and the project already has 50 active evaluation rules, the API returns `409` Recovery guidance: - - if the update fails with `missing_variable_mapping` or `invalid_variable_mapping` after changing `evaluator` or `target`, resend the request with a complete new `mapping` + - if an LLM-as-judge update fails with `missing_variable_mapping` or `invalid_variable_mapping` after changing `evaluator` or `target`, resend the request with a complete new `mapping` - if the update fails with `invalid_filter_value` after changing `target`, resend the request with a target-compatible `filter` Parameters @@ -744,6 +682,7 @@ async def update( Updated evaluator family. Langfuse resolves the provided evaluator family to its latest version before saving the rule. + A rule's evaluator type cannot be changed: provide `name` and `scope` for an evaluator family of the rule's current type. To use a different evaluator type, create a new rule. target : typing.Optional[EvaluationRuleTarget] Updated target object type. @@ -760,7 +699,9 @@ async def update( For `target=experiment`, `column=datasetId` expects dataset `id` values from `GET /api/public/v2/datasets`, not dataset names. mapping : typing.Optional[typing.Sequence[EvaluationRuleMapping]] - Updated variable mappings. + Updated LLM-as-judge variable mappings. + + Do not send this field for code evaluator rules. Langfuse stores the fixed code runtime mapping automatically and returns it in the response. request_options : typing.Optional[RequestOptions] Request-specific configuration. diff --git a/langfuse/api/unstable/evaluation_rules/raw_client.py b/langfuse/api/unstable/evaluation_rules/raw_client.py index f99aba663..7115cbe70 100644 --- a/langfuse/api/unstable/evaluation_rules/raw_client.py +++ b/langfuse/api/unstable/evaluation_rules/raw_client.py @@ -44,6 +44,7 @@ ) from ..errors.errors.unprocessable_content_error import UnprocessableContentError from ..errors.types.public_api_error import PublicApiError +from .types.create_evaluation_rule_request import CreateEvaluationRuleRequest from .types.delete_evaluation_rule_response import DeleteEvaluationRuleResponse from .types.evaluation_rule import EvaluationRule from .types.evaluation_rule_evaluator_reference import EvaluationRuleEvaluatorReference @@ -60,13 +61,7 @@ def __init__(self, *, client_wrapper: SyncClientWrapper): def create( self, *, - name: str, - evaluator: EvaluationRuleEvaluatorReference, - target: EvaluationRuleTarget, - enabled: bool, - mapping: typing.Sequence[EvaluationRuleMapping], - sampling: typing.Optional[float] = OMIT, - filter: typing.Optional[typing.Sequence[EvaluationRuleFilter]] = OMIT, + request: CreateEvaluationRuleRequest, request_options: typing.Optional[RequestOptions] = None, ) -> HttpResponse[EvaluationRule]: """ @@ -82,8 +77,9 @@ def create( - `evaluator.name` + `evaluator.scope` must identify an existing evaluator family returned by the evaluator endpoints - Langfuse resolves that family to its latest version before saving the evaluation rule - for `target=experiment`, use dataset `id` values from `GET /api/public/v2/datasets` when filtering by `datasetId` - - every evaluator prompt variable must be mapped exactly once - - `expected_output` and `experiment_item_metadata` mappings are only valid for `target=experiment` + - for `llm_as_judge` evaluators, every evaluator prompt variable must be mapped exactly once + - for `code` evaluators, Langfuse uses the fixed code runtime mapping; omit `mapping` in create and update requests + - for user-provided `llm_as_judge` mappings, `expected_output` and `experiment_item_metadata` are only valid for `target=experiment` - if `enabled=true`, Langfuse validates that the referenced evaluator can currently run - at most 50 evaluation rules can be effectively active in one project at the same time @@ -100,44 +96,15 @@ def create( Recovery guidance: - `400 invalid_filter_value`: fix the filter `column` or `value` using `details.column`, `details.invalidValues`, and `details.allowedValues` - `400 invalid_filter_value` with `details.column=datasetId`: call `GET /api/public/v2/datasets`, then retry with dataset `id` values from that response - - `400 missing_variable_mapping`: fetch the evaluator again and make sure every variable in `variables` appears exactly once in `mapping` + - `400 missing_variable_mapping`: for `llm_as_judge` evaluators, fetch the evaluator again and make sure every variable in `variables` appears exactly once in `mapping` - `400 duplicate_variable_mapping`: remove repeated mappings for the same variable - - `400 invalid_variable_mapping`: switch to a valid `source` for the selected `target`, or fix the variable name + - `400 invalid_variable_mapping`: for `llm_as_judge`, switch to a valid `source` for the selected `target`, or fix the variable name - `400 invalid_json_path`: remove or correct the `jsonPath` - `422 evaluator_preflight_failed`: the selected evaluator cannot run with the resolved model configuration. Fix the evaluator/default model setup, then retry the create request. Parameters ---------- - name : str - Human-readable deployment name. - - evaluator : EvaluationRuleEvaluatorReference - Evaluator family to use. - - Use `name` and `scope` from the evaluator endpoints. - Langfuse resolves that family to its latest version before saving the rule. - - target : EvaluationRuleTarget - Target object type to evaluate. - - enabled : bool - Whether the deployment should be active immediately after creation. - - mapping : typing.Sequence[EvaluationRuleMapping] - Required variable mappings. - - Every evaluator variable must appear exactly once. - Build this list from the evaluator `variables` array returned by the evaluator endpoints. - - sampling : typing.Optional[float] - Optional sampling fraction. Defaults to `1`. - - filter : typing.Optional[typing.Sequence[EvaluationRuleFilter]] - Optional filter list. - - Omit or pass an empty list to evaluate all matching targets for the selected `target`. - Each filter object must use a column that is valid for that `target`. - For `target=experiment`, `column=datasetId` expects dataset `id` values from `GET /api/public/v2/datasets`, not dataset names. + request : CreateEvaluationRuleRequest request_options : typing.Optional[RequestOptions] Request-specific configuration. @@ -149,27 +116,11 @@ def create( _response = self._client_wrapper.httpx_client.request( "api/public/unstable/evaluation-rules", method="POST", - json={ - "name": name, - "evaluator": convert_and_respect_annotation_metadata( - object_=evaluator, - annotation=EvaluationRuleEvaluatorReference, - direction="write", - ), - "target": target, - "enabled": enabled, - "sampling": sampling, - "filter": convert_and_respect_annotation_metadata( - object_=filter, - annotation=typing.Sequence[EvaluationRuleFilter], - direction="write", - ), - "mapping": convert_and_respect_annotation_metadata( - object_=mapping, - annotation=typing.Sequence[EvaluationRuleMapping], - direction="write", - ), - }, + json=convert_and_respect_annotation_metadata( + object_=request, + annotation=CreateEvaluationRuleRequest, + direction="write", + ), request_options=request_options, omit=OMIT, ) @@ -734,18 +685,19 @@ def update( - switch to another evaluator - adjust sampling - change filters - - update variable mappings + - update LLM-as-judge variable mappings Important behavior: - provide only the fields you want to change - if you provide `evaluator`, Langfuse resolves that evaluator family to its latest version before saving - - changing `target`, `filter`, or `mapping` must still produce a valid target-specific configuration - - if you change `target`, also send a compatible `filter` and `mapping` in the same request unless the existing ones are still valid for the new target + - changing `target`, `filter`, or an LLM-as-judge `mapping` must still produce a valid target-specific configuration + - if you change `target` for an LLM-as-judge rule, also send a compatible `filter` and `mapping` in the same request unless the existing ones are still valid for the new target + - for `code` evaluator rules, omit `mapping`; Langfuse stores the fixed code runtime mapping automatically - if the resulting config is enabled, Langfuse re-validates that the selected evaluator can run - if the update would move a non-active evaluation rule into the active state and the project already has 50 active evaluation rules, the API returns `409` Recovery guidance: - - if the update fails with `missing_variable_mapping` or `invalid_variable_mapping` after changing `evaluator` or `target`, resend the request with a complete new `mapping` + - if an LLM-as-judge update fails with `missing_variable_mapping` or `invalid_variable_mapping` after changing `evaluator` or `target`, resend the request with a complete new `mapping` - if the update fails with `invalid_filter_value` after changing `target`, resend the request with a target-compatible `filter` Parameters @@ -760,6 +712,7 @@ def update( Updated evaluator family. Langfuse resolves the provided evaluator family to its latest version before saving the rule. + A rule's evaluator type cannot be changed: provide `name` and `scope` for an evaluator family of the rule's current type. To use a different evaluator type, create a new rule. target : typing.Optional[EvaluationRuleTarget] Updated target object type. @@ -776,7 +729,9 @@ def update( For `target=experiment`, `column=datasetId` expects dataset `id` values from `GET /api/public/v2/datasets`, not dataset names. mapping : typing.Optional[typing.Sequence[EvaluationRuleMapping]] - Updated variable mappings. + Updated LLM-as-judge variable mappings. + + Do not send this field for code evaluator rules. Langfuse stores the fixed code runtime mapping automatically and returns it in the response. request_options : typing.Optional[RequestOptions] Request-specific configuration. @@ -1169,13 +1124,7 @@ def __init__(self, *, client_wrapper: AsyncClientWrapper): async def create( self, *, - name: str, - evaluator: EvaluationRuleEvaluatorReference, - target: EvaluationRuleTarget, - enabled: bool, - mapping: typing.Sequence[EvaluationRuleMapping], - sampling: typing.Optional[float] = OMIT, - filter: typing.Optional[typing.Sequence[EvaluationRuleFilter]] = OMIT, + request: CreateEvaluationRuleRequest, request_options: typing.Optional[RequestOptions] = None, ) -> AsyncHttpResponse[EvaluationRule]: """ @@ -1191,8 +1140,9 @@ async def create( - `evaluator.name` + `evaluator.scope` must identify an existing evaluator family returned by the evaluator endpoints - Langfuse resolves that family to its latest version before saving the evaluation rule - for `target=experiment`, use dataset `id` values from `GET /api/public/v2/datasets` when filtering by `datasetId` - - every evaluator prompt variable must be mapped exactly once - - `expected_output` and `experiment_item_metadata` mappings are only valid for `target=experiment` + - for `llm_as_judge` evaluators, every evaluator prompt variable must be mapped exactly once + - for `code` evaluators, Langfuse uses the fixed code runtime mapping; omit `mapping` in create and update requests + - for user-provided `llm_as_judge` mappings, `expected_output` and `experiment_item_metadata` are only valid for `target=experiment` - if `enabled=true`, Langfuse validates that the referenced evaluator can currently run - at most 50 evaluation rules can be effectively active in one project at the same time @@ -1209,44 +1159,15 @@ async def create( Recovery guidance: - `400 invalid_filter_value`: fix the filter `column` or `value` using `details.column`, `details.invalidValues`, and `details.allowedValues` - `400 invalid_filter_value` with `details.column=datasetId`: call `GET /api/public/v2/datasets`, then retry with dataset `id` values from that response - - `400 missing_variable_mapping`: fetch the evaluator again and make sure every variable in `variables` appears exactly once in `mapping` + - `400 missing_variable_mapping`: for `llm_as_judge` evaluators, fetch the evaluator again and make sure every variable in `variables` appears exactly once in `mapping` - `400 duplicate_variable_mapping`: remove repeated mappings for the same variable - - `400 invalid_variable_mapping`: switch to a valid `source` for the selected `target`, or fix the variable name + - `400 invalid_variable_mapping`: for `llm_as_judge`, switch to a valid `source` for the selected `target`, or fix the variable name - `400 invalid_json_path`: remove or correct the `jsonPath` - `422 evaluator_preflight_failed`: the selected evaluator cannot run with the resolved model configuration. Fix the evaluator/default model setup, then retry the create request. Parameters ---------- - name : str - Human-readable deployment name. - - evaluator : EvaluationRuleEvaluatorReference - Evaluator family to use. - - Use `name` and `scope` from the evaluator endpoints. - Langfuse resolves that family to its latest version before saving the rule. - - target : EvaluationRuleTarget - Target object type to evaluate. - - enabled : bool - Whether the deployment should be active immediately after creation. - - mapping : typing.Sequence[EvaluationRuleMapping] - Required variable mappings. - - Every evaluator variable must appear exactly once. - Build this list from the evaluator `variables` array returned by the evaluator endpoints. - - sampling : typing.Optional[float] - Optional sampling fraction. Defaults to `1`. - - filter : typing.Optional[typing.Sequence[EvaluationRuleFilter]] - Optional filter list. - - Omit or pass an empty list to evaluate all matching targets for the selected `target`. - Each filter object must use a column that is valid for that `target`. - For `target=experiment`, `column=datasetId` expects dataset `id` values from `GET /api/public/v2/datasets`, not dataset names. + request : CreateEvaluationRuleRequest request_options : typing.Optional[RequestOptions] Request-specific configuration. @@ -1258,27 +1179,11 @@ async def create( _response = await self._client_wrapper.httpx_client.request( "api/public/unstable/evaluation-rules", method="POST", - json={ - "name": name, - "evaluator": convert_and_respect_annotation_metadata( - object_=evaluator, - annotation=EvaluationRuleEvaluatorReference, - direction="write", - ), - "target": target, - "enabled": enabled, - "sampling": sampling, - "filter": convert_and_respect_annotation_metadata( - object_=filter, - annotation=typing.Sequence[EvaluationRuleFilter], - direction="write", - ), - "mapping": convert_and_respect_annotation_metadata( - object_=mapping, - annotation=typing.Sequence[EvaluationRuleMapping], - direction="write", - ), - }, + json=convert_and_respect_annotation_metadata( + object_=request, + annotation=CreateEvaluationRuleRequest, + direction="write", + ), request_options=request_options, omit=OMIT, ) @@ -1843,18 +1748,19 @@ async def update( - switch to another evaluator - adjust sampling - change filters - - update variable mappings + - update LLM-as-judge variable mappings Important behavior: - provide only the fields you want to change - if you provide `evaluator`, Langfuse resolves that evaluator family to its latest version before saving - - changing `target`, `filter`, or `mapping` must still produce a valid target-specific configuration - - if you change `target`, also send a compatible `filter` and `mapping` in the same request unless the existing ones are still valid for the new target + - changing `target`, `filter`, or an LLM-as-judge `mapping` must still produce a valid target-specific configuration + - if you change `target` for an LLM-as-judge rule, also send a compatible `filter` and `mapping` in the same request unless the existing ones are still valid for the new target + - for `code` evaluator rules, omit `mapping`; Langfuse stores the fixed code runtime mapping automatically - if the resulting config is enabled, Langfuse re-validates that the selected evaluator can run - if the update would move a non-active evaluation rule into the active state and the project already has 50 active evaluation rules, the API returns `409` Recovery guidance: - - if the update fails with `missing_variable_mapping` or `invalid_variable_mapping` after changing `evaluator` or `target`, resend the request with a complete new `mapping` + - if an LLM-as-judge update fails with `missing_variable_mapping` or `invalid_variable_mapping` after changing `evaluator` or `target`, resend the request with a complete new `mapping` - if the update fails with `invalid_filter_value` after changing `target`, resend the request with a target-compatible `filter` Parameters @@ -1869,6 +1775,7 @@ async def update( Updated evaluator family. Langfuse resolves the provided evaluator family to its latest version before saving the rule. + A rule's evaluator type cannot be changed: provide `name` and `scope` for an evaluator family of the rule's current type. To use a different evaluator type, create a new rule. target : typing.Optional[EvaluationRuleTarget] Updated target object type. @@ -1885,7 +1792,9 @@ async def update( For `target=experiment`, `column=datasetId` expects dataset `id` values from `GET /api/public/v2/datasets`, not dataset names. mapping : typing.Optional[typing.Sequence[EvaluationRuleMapping]] - Updated variable mappings. + Updated LLM-as-judge variable mappings. + + Do not send this field for code evaluator rules. Langfuse stores the fixed code runtime mapping automatically and returns it in the response. request_options : typing.Optional[RequestOptions] Request-specific configuration. diff --git a/langfuse/api/unstable/evaluation_rules/types/__init__.py b/langfuse/api/unstable/evaluation_rules/types/__init__.py index 2854b1237..a1cdeb967 100644 --- a/langfuse/api/unstable/evaluation_rules/types/__init__.py +++ b/langfuse/api/unstable/evaluation_rules/types/__init__.py @@ -6,20 +6,36 @@ from importlib import import_module if typing.TYPE_CHECKING: + from .code_evaluation_rule_evaluator_reference import ( + CodeEvaluationRuleEvaluatorReference, + ) + from .create_code_evaluation_rule_request import CreateCodeEvaluationRuleRequest from .create_evaluation_rule_request import CreateEvaluationRuleRequest + from .create_llm_as_judge_evaluation_rule_request import ( + CreateLlmAsJudgeEvaluationRuleRequest, + ) from .delete_evaluation_rule_response import DeleteEvaluationRuleResponse from .evaluation_rule import EvaluationRule from .evaluation_rule_evaluator import EvaluationRuleEvaluator from .evaluation_rule_evaluator_reference import EvaluationRuleEvaluatorReference from .evaluation_rules import EvaluationRules + from .llm_as_judge_evaluation_rule_evaluator_reference import ( + LlmAsJudgeEvaluationRuleEvaluatorReference, + ) + from .llm_as_judge_evaluator_type import LlmAsJudgeEvaluatorType from .update_evaluation_rule_request import UpdateEvaluationRuleRequest _dynamic_imports: typing.Dict[str, str] = { + "CodeEvaluationRuleEvaluatorReference": ".code_evaluation_rule_evaluator_reference", + "CreateCodeEvaluationRuleRequest": ".create_code_evaluation_rule_request", "CreateEvaluationRuleRequest": ".create_evaluation_rule_request", + "CreateLlmAsJudgeEvaluationRuleRequest": ".create_llm_as_judge_evaluation_rule_request", "DeleteEvaluationRuleResponse": ".delete_evaluation_rule_response", "EvaluationRule": ".evaluation_rule", "EvaluationRuleEvaluator": ".evaluation_rule_evaluator", "EvaluationRuleEvaluatorReference": ".evaluation_rule_evaluator_reference", "EvaluationRules": ".evaluation_rules", + "LlmAsJudgeEvaluationRuleEvaluatorReference": ".llm_as_judge_evaluation_rule_evaluator_reference", + "LlmAsJudgeEvaluatorType": ".llm_as_judge_evaluator_type", "UpdateEvaluationRuleRequest": ".update_evaluation_rule_request", } @@ -52,11 +68,16 @@ def __dir__(): __all__ = [ + "CodeEvaluationRuleEvaluatorReference", + "CreateCodeEvaluationRuleRequest", "CreateEvaluationRuleRequest", + "CreateLlmAsJudgeEvaluationRuleRequest", "DeleteEvaluationRuleResponse", "EvaluationRule", "EvaluationRuleEvaluator", "EvaluationRuleEvaluatorReference", "EvaluationRules", + "LlmAsJudgeEvaluationRuleEvaluatorReference", + "LlmAsJudgeEvaluatorType", "UpdateEvaluationRuleRequest", ] diff --git a/langfuse/api/unstable/evaluation_rules/types/code_evaluation_rule_evaluator_reference.py b/langfuse/api/unstable/evaluation_rules/types/code_evaluation_rule_evaluator_reference.py new file mode 100644 index 000000000..1c259bab8 --- /dev/null +++ b/langfuse/api/unstable/evaluation_rules/types/code_evaluation_rule_evaluator_reference.py @@ -0,0 +1,32 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +import pydantic +from ....core.pydantic_utilities import UniversalBaseModel +from ...commons.types.evaluator_scope import EvaluatorScope + + +class CodeEvaluationRuleEvaluatorReference(UniversalBaseModel): + """ + Code evaluator family reference used when creating an evaluation rule. + """ + + name: str = pydantic.Field() + """ + Evaluator family name. + """ + + scope: EvaluatorScope = pydantic.Field() + """ + Whether the evaluator family is project-owned or Langfuse-managed. + """ + + type: typing.Literal["code"] = pydantic.Field(default="code") + """ + Must be `code`. + """ + + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict( + extra="allow", frozen=True + ) diff --git a/langfuse/api/unstable/evaluation_rules/types/create_code_evaluation_rule_request.py b/langfuse/api/unstable/evaluation_rules/types/create_code_evaluation_rule_request.py new file mode 100644 index 000000000..08df1f78a --- /dev/null +++ b/langfuse/api/unstable/evaluation_rules/types/create_code_evaluation_rule_request.py @@ -0,0 +1,56 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +import pydantic +from ....core.pydantic_utilities import UniversalBaseModel +from ...commons.types.evaluation_rule_filter import EvaluationRuleFilter +from ...commons.types.evaluation_rule_target import EvaluationRuleTarget +from .code_evaluation_rule_evaluator_reference import ( + CodeEvaluationRuleEvaluatorReference, +) + + +class CreateCodeEvaluationRuleRequest(UniversalBaseModel): + name: str = pydantic.Field() + """ + Human-readable deployment name. + """ + + evaluator: CodeEvaluationRuleEvaluatorReference = pydantic.Field() + """ + Code evaluator family to use. + + Use `name`, `scope`, and `type` from the evaluator endpoints. + Langfuse resolves that family to its latest version before saving the rule. + """ + + target: EvaluationRuleTarget = pydantic.Field() + """ + Target object type to evaluate. + """ + + enabled: bool = pydantic.Field() + """ + Whether the deployment should be active immediately after creation. + """ + + sampling: typing.Optional[float] = pydantic.Field(default=None) + """ + Optional sampling fraction. Defaults to `1`. + """ + + filter: typing.Optional[typing.List[EvaluationRuleFilter]] = pydantic.Field( + default=None + ) + """ + Optional filter list. + + Omit or pass an empty list to evaluate all matching targets for the selected `target`. + Each filter object must use a column that is valid for that `target`. + For `target=experiment`, `column=datasetId` expects dataset `id` values from `GET /api/public/v2/datasets`, not dataset names. + """ + + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict( + extra="allow", frozen=True + ) diff --git a/langfuse/api/unstable/evaluation_rules/types/create_evaluation_rule_request.py b/langfuse/api/unstable/evaluation_rules/types/create_evaluation_rule_request.py index 9a90b227a..a6504934d 100644 --- a/langfuse/api/unstable/evaluation_rules/types/create_evaluation_rule_request.py +++ b/langfuse/api/unstable/evaluation_rules/types/create_evaluation_rule_request.py @@ -2,74 +2,11 @@ import typing -import pydantic -from ....core.pydantic_utilities import UniversalBaseModel -from ...commons.types.evaluation_rule_filter import EvaluationRuleFilter -from ...commons.types.evaluation_rule_mapping import EvaluationRuleMapping -from ...commons.types.evaluation_rule_target import EvaluationRuleTarget -from .evaluation_rule_evaluator_reference import EvaluationRuleEvaluatorReference - - -class CreateEvaluationRuleRequest(UniversalBaseModel): - """ - Request body for creating an evaluation rule. - - Checklist for agents and SDK clients: - - reference an existing evaluator family by `evaluator.name` and `evaluator.scope` - - choose `target=observation` or `target=experiment` - - if `target=experiment` and you want a dataset filter, call `GET /api/public/v2/datasets` first and use dataset `id` values in `filter[].value` - - fetch or inspect the evaluator first, then provide a complete variable mapping for every evaluator variable listed in `variables` - - optionally narrow execution with `filter` - - set `enabled=true` only when you want live execution immediately - """ - - name: str = pydantic.Field() - """ - Human-readable deployment name. - """ - - evaluator: EvaluationRuleEvaluatorReference = pydantic.Field() - """ - Evaluator family to use. - - Use `name` and `scope` from the evaluator endpoints. - Langfuse resolves that family to its latest version before saving the rule. - """ - - target: EvaluationRuleTarget = pydantic.Field() - """ - Target object type to evaluate. - """ - - enabled: bool = pydantic.Field() - """ - Whether the deployment should be active immediately after creation. - """ - - sampling: typing.Optional[float] = pydantic.Field(default=None) - """ - Optional sampling fraction. Defaults to `1`. - """ - - filter: typing.Optional[typing.List[EvaluationRuleFilter]] = pydantic.Field( - default=None - ) - """ - Optional filter list. - - Omit or pass an empty list to evaluate all matching targets for the selected `target`. - Each filter object must use a column that is valid for that `target`. - For `target=experiment`, `column=datasetId` expects dataset `id` values from `GET /api/public/v2/datasets`, not dataset names. - """ - - mapping: typing.List[EvaluationRuleMapping] = pydantic.Field() - """ - Required variable mappings. - - Every evaluator variable must appear exactly once. - Build this list from the evaluator `variables` array returned by the evaluator endpoints. - """ - - model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict( - extra="allow", frozen=True - ) +from .create_code_evaluation_rule_request import CreateCodeEvaluationRuleRequest +from .create_llm_as_judge_evaluation_rule_request import ( + CreateLlmAsJudgeEvaluationRuleRequest, +) + +CreateEvaluationRuleRequest = typing.Union[ + CreateLlmAsJudgeEvaluationRuleRequest, CreateCodeEvaluationRuleRequest +] diff --git a/langfuse/api/unstable/evaluation_rules/types/create_llm_as_judge_evaluation_rule_request.py b/langfuse/api/unstable/evaluation_rules/types/create_llm_as_judge_evaluation_rule_request.py new file mode 100644 index 000000000..b511b4353 --- /dev/null +++ b/langfuse/api/unstable/evaluation_rules/types/create_llm_as_judge_evaluation_rule_request.py @@ -0,0 +1,65 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +import pydantic +from ....core.pydantic_utilities import UniversalBaseModel +from ...commons.types.evaluation_rule_filter import EvaluationRuleFilter +from ...commons.types.evaluation_rule_mapping import EvaluationRuleMapping +from ...commons.types.evaluation_rule_target import EvaluationRuleTarget +from .llm_as_judge_evaluation_rule_evaluator_reference import ( + LlmAsJudgeEvaluationRuleEvaluatorReference, +) + + +class CreateLlmAsJudgeEvaluationRuleRequest(UniversalBaseModel): + name: str = pydantic.Field() + """ + Human-readable deployment name. + """ + + evaluator: LlmAsJudgeEvaluationRuleEvaluatorReference = pydantic.Field() + """ + LLM-as-judge evaluator family to use. + + Use `name`, `scope`, and `type` from the evaluator endpoints. If `type` is omitted, Langfuse defaults it to `llm_as_judge` for backwards compatibility. + Langfuse resolves that family to its latest version before saving the rule. + """ + + target: EvaluationRuleTarget = pydantic.Field() + """ + Target object type to evaluate. + """ + + enabled: bool = pydantic.Field() + """ + Whether the deployment should be active immediately after creation. + """ + + sampling: typing.Optional[float] = pydantic.Field(default=None) + """ + Optional sampling fraction. Defaults to `1`. + """ + + filter: typing.Optional[typing.List[EvaluationRuleFilter]] = pydantic.Field( + default=None + ) + """ + Optional filter list. + + Omit or pass an empty list to evaluate all matching targets for the selected `target`. + Each filter object must use a column that is valid for that `target`. + For `target=experiment`, `column=datasetId` expects dataset `id` values from `GET /api/public/v2/datasets`, not dataset names. + """ + + mapping: typing.List[EvaluationRuleMapping] = pydantic.Field() + """ + LLM-as-judge variable mappings. + + Every evaluator variable must appear exactly once. + Build this list from the evaluator `variables` array returned by the evaluator endpoints. + """ + + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict( + extra="allow", frozen=True + ) diff --git a/langfuse/api/unstable/evaluation_rules/types/evaluation_rule.py b/langfuse/api/unstable/evaluation_rules/types/evaluation_rule.py index d8baee407..418004090 100644 --- a/langfuse/api/unstable/evaluation_rules/types/evaluation_rule.py +++ b/langfuse/api/unstable/evaluation_rules/types/evaluation_rule.py @@ -42,6 +42,7 @@ class EvaluationRule(UniversalBaseModel): EvaluationRuleStatus, EvaluationRuleTarget, EvaluatorScope, + EvaluatorType, ) from langfuse.unstable.evaluation_rules import ( EvaluationRule, @@ -55,6 +56,7 @@ class EvaluationRule(UniversalBaseModel): id="evaltmpl_123", name="answer-correctness", scope=EvaluatorScope.PROJECT, + type=EvaluatorType.LLM_AS_JUDGE, ), target=EvaluationRuleTarget.OBSERVATION, enabled=True, @@ -150,7 +152,7 @@ class EvaluationRule(UniversalBaseModel): mapping: typing.List[EvaluationRuleMapping] = pydantic.Field() """ - Variable mappings used to populate the evaluator prompt from the live target object. + Variable mappings used to populate evaluator runtime variables from the live target object. """ created_at: typing_extensions.Annotated[ diff --git a/langfuse/api/unstable/evaluation_rules/types/evaluation_rule_evaluator.py b/langfuse/api/unstable/evaluation_rules/types/evaluation_rule_evaluator.py index 9d1be79de..c27497c9d 100644 --- a/langfuse/api/unstable/evaluation_rules/types/evaluation_rule_evaluator.py +++ b/langfuse/api/unstable/evaluation_rules/types/evaluation_rule_evaluator.py @@ -5,6 +5,7 @@ import pydantic from ....core.pydantic_utilities import UniversalBaseModel from ...commons.types.evaluator_scope import EvaluatorScope +from ...commons.types.evaluator_type import EvaluatorType class EvaluationRuleEvaluator(UniversalBaseModel): @@ -12,7 +13,7 @@ class EvaluationRuleEvaluator(UniversalBaseModel): Resolved evaluator currently used by the evaluation rule. `id` is the exact active evaluator version. - `name` and `scope` identify the evaluator family conceptually. + `name`, `scope`, and `type` identify the evaluator family conceptually. """ id: str = pydantic.Field() @@ -30,6 +31,11 @@ class EvaluationRuleEvaluator(UniversalBaseModel): Whether the evaluator family is project-owned or Langfuse-managed. """ + type: EvaluatorType = pydantic.Field() + """ + Evaluator engine type. + """ + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict( extra="allow", frozen=True ) diff --git a/langfuse/api/unstable/evaluation_rules/types/evaluation_rule_evaluator_reference.py b/langfuse/api/unstable/evaluation_rules/types/evaluation_rule_evaluator_reference.py index 25253182f..a2a38723d 100644 --- a/langfuse/api/unstable/evaluation_rules/types/evaluation_rule_evaluator_reference.py +++ b/langfuse/api/unstable/evaluation_rules/types/evaluation_rule_evaluator_reference.py @@ -9,9 +9,10 @@ class EvaluationRuleEvaluatorReference(UniversalBaseModel): """ - Evaluator family reference used when creating or updating an evaluation rule. + Evaluator family reference used when updating an evaluation rule. - `name` and `scope` are enough to identify the evaluator family in the authenticated project context. + `name` and `scope` identify the evaluator family in the authenticated project context. + A rule's evaluator type cannot be changed, so this reference does not accept a `type`; the family must match the rule's current evaluator type. """ name: str = pydantic.Field() diff --git a/langfuse/api/unstable/evaluation_rules/types/llm_as_judge_evaluation_rule_evaluator_reference.py b/langfuse/api/unstable/evaluation_rules/types/llm_as_judge_evaluation_rule_evaluator_reference.py new file mode 100644 index 000000000..ca57fe517 --- /dev/null +++ b/langfuse/api/unstable/evaluation_rules/types/llm_as_judge_evaluation_rule_evaluator_reference.py @@ -0,0 +1,33 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +import pydantic +from ....core.pydantic_utilities import UniversalBaseModel +from ...commons.types.evaluator_scope import EvaluatorScope +from .llm_as_judge_evaluator_type import LlmAsJudgeEvaluatorType + + +class LlmAsJudgeEvaluationRuleEvaluatorReference(UniversalBaseModel): + """ + LLM-as-judge evaluator family reference used when creating an evaluation rule. + """ + + name: str = pydantic.Field() + """ + Evaluator family name. + """ + + scope: EvaluatorScope = pydantic.Field() + """ + Whether the evaluator family is project-owned or Langfuse-managed. + """ + + type: typing.Optional[LlmAsJudgeEvaluatorType] = pydantic.Field(default=None) + """ + Evaluator engine type. Defaults to `llm_as_judge` when omitted. + """ + + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict( + extra="allow", frozen=True + ) diff --git a/langfuse/api/unstable/evaluation_rules/types/llm_as_judge_evaluator_type.py b/langfuse/api/unstable/evaluation_rules/types/llm_as_judge_evaluator_type.py new file mode 100644 index 000000000..b18856d22 --- /dev/null +++ b/langfuse/api/unstable/evaluation_rules/types/llm_as_judge_evaluator_type.py @@ -0,0 +1,15 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +from ....core import enum + +T_Result = typing.TypeVar("T_Result") + + +class LlmAsJudgeEvaluatorType(enum.StrEnum): + LLM_AS_JUDGE = "llm_as_judge" + + def visit(self, llm_as_judge: typing.Callable[[], T_Result]) -> T_Result: + if self is LlmAsJudgeEvaluatorType.LLM_AS_JUDGE: + return llm_as_judge() diff --git a/langfuse/api/unstable/evaluation_rules/types/update_evaluation_rule_request.py b/langfuse/api/unstable/evaluation_rules/types/update_evaluation_rule_request.py index 51e2d9288..40e5043a6 100644 --- a/langfuse/api/unstable/evaluation_rules/types/update_evaluation_rule_request.py +++ b/langfuse/api/unstable/evaluation_rules/types/update_evaluation_rule_request.py @@ -19,8 +19,9 @@ class UpdateEvaluationRuleRequest(UniversalBaseModel): Practical guidance: - If you only want to rename the rule or change sampling, send just those fields. - - If you change `evaluator`, send a fresh `mapping` unless you are certain the existing mapping still matches the evaluator variables. - - If you change `target`, usually send both `filter` and `mapping` in the same request. + - If you change to an LLM-as-judge `evaluator`, send a fresh `mapping` unless you are certain the existing mapping still matches the evaluator variables. + - If you change `target` for an LLM-as-judge rule, usually send both `filter` and `mapping` in the same request. + - For code evaluator rules, omit `mapping`; Langfuse stores the fixed code runtime mapping automatically. - If you change an experiment `datasetId` filter, call `GET /api/public/v2/datasets` and use dataset `id` values from that response. """ @@ -36,6 +37,7 @@ class UpdateEvaluationRuleRequest(UniversalBaseModel): Updated evaluator family. Langfuse resolves the provided evaluator family to its latest version before saving the rule. + A rule's evaluator type cannot be changed: provide `name` and `scope` for an evaluator family of the rule's current type. To use a different evaluator type, create a new rule. """ target: typing.Optional[EvaluationRuleTarget] = pydantic.Field(default=None) @@ -66,7 +68,9 @@ class UpdateEvaluationRuleRequest(UniversalBaseModel): default=None ) """ - Updated variable mappings. + Updated LLM-as-judge variable mappings. + + Do not send this field for code evaluator rules. Langfuse stores the fixed code runtime mapping automatically and returns it in the response. """ model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict( diff --git a/langfuse/api/unstable/evaluators/__init__.py b/langfuse/api/unstable/evaluators/__init__.py index 942109740..20a72ef82 100644 --- a/langfuse/api/unstable/evaluators/__init__.py +++ b/langfuse/api/unstable/evaluators/__init__.py @@ -6,11 +6,33 @@ from importlib import import_module if typing.TYPE_CHECKING: - from .types import CreateEvaluatorRequest, Evaluator, Evaluators + from .types import ( + CodeEvaluator, + CreateCodeEvaluatorRequest, + CreateEvaluatorRequest, + CreateEvaluatorRequest_Code, + CreateEvaluatorRequest_LlmAsJudge, + CreateLlmAsJudgeEvaluatorRequest, + Evaluator, + EvaluatorBase, + Evaluator_Code, + Evaluator_LlmAsJudge, + Evaluators, + LlmAsJudgeEvaluator, + ) _dynamic_imports: typing.Dict[str, str] = { + "CodeEvaluator": ".types", + "CreateCodeEvaluatorRequest": ".types", "CreateEvaluatorRequest": ".types", + "CreateEvaluatorRequest_Code": ".types", + "CreateEvaluatorRequest_LlmAsJudge": ".types", + "CreateLlmAsJudgeEvaluatorRequest": ".types", "Evaluator": ".types", + "EvaluatorBase": ".types", + "Evaluator_Code": ".types", + "Evaluator_LlmAsJudge": ".types", "Evaluators": ".types", + "LlmAsJudgeEvaluator": ".types", } @@ -41,4 +63,17 @@ def __dir__(): return sorted(lazy_attrs) -__all__ = ["CreateEvaluatorRequest", "Evaluator", "Evaluators"] +__all__ = [ + "CodeEvaluator", + "CreateCodeEvaluatorRequest", + "CreateEvaluatorRequest", + "CreateEvaluatorRequest_Code", + "CreateEvaluatorRequest_LlmAsJudge", + "CreateLlmAsJudgeEvaluatorRequest", + "Evaluator", + "EvaluatorBase", + "Evaluator_Code", + "Evaluator_LlmAsJudge", + "Evaluators", + "LlmAsJudgeEvaluator", +] diff --git a/langfuse/api/unstable/evaluators/client.py b/langfuse/api/unstable/evaluators/client.py index b7f25532a..ac63e2da9 100644 --- a/langfuse/api/unstable/evaluators/client.py +++ b/langfuse/api/unstable/evaluators/client.py @@ -4,9 +4,8 @@ from ...core.client_wrapper import AsyncClientWrapper, SyncClientWrapper from ...core.request_options import RequestOptions -from ..commons.types.evaluator_model_config import EvaluatorModelConfig -from ..commons.types.evaluator_output_definition import EvaluatorOutputDefinition from .raw_client import AsyncRawEvaluatorsClient, RawEvaluatorsClient +from .types.create_evaluator_request import CreateEvaluatorRequest from .types.evaluator import Evaluator from .types.evaluators import Evaluators @@ -32,16 +31,15 @@ def with_raw_response(self) -> RawEvaluatorsClient: def create( self, *, - name: str, - prompt: str, - output_definition: EvaluatorOutputDefinition, - model_config: typing.Optional[EvaluatorModelConfig] = OMIT, + request: CreateEvaluatorRequest, request_options: typing.Optional[RequestOptions] = None, ) -> Evaluator: """ Create an evaluator in the authenticated project. - Use evaluators to define **how** Langfuse should score data: the prompt, the expected structured output, and the optional model configuration. + Use evaluators to define **how** Langfuse should score data. + LLM-as-a-judge evaluators define a prompt, expected structured output, and optional model configuration. + Code evaluators define source code and a runtime language. Naming behavior: - If this is a new evaluator name in your project, Langfuse creates version `1`. @@ -54,30 +52,22 @@ def create( 3. Read the returned `outputDefinition.dataType` so the client knows whether future scores will be numeric, boolean, or categorical. 4. Create one or more evaluation rules that reference the returned evaluator family using `name` and `scope`. + Code evaluator validation: + - At creation, Langfuse only validates the request shape + - The `sourceCode` itself is not executed here. It is first run (preflight-tested against a sample observation) when you link the evaluator to an evaluation rule, so runtime errors in the code surface at evaluation-rule creation, not at evaluator creation. + Recovery guidance: - `422` with `code=evaluator_preflight_failed`: the evaluator cannot run with the resolved model configuration. Add a valid explicit `modelConfig`, or configure the project's default evaluation model, then retry the same request. - `400` with `code=invalid_body`: the request shape is malformed. Use the structured `details.issues` array to fix the specific fields and retry. - - `400` with `code=invalid_body` on `outputDefinition`: send `dataType`, `reasoning.description`, and `score.description`. Do not send `version`; it is not part of the public request shape. + - `400` with `code=invalid_body` on `outputDefinition`: for `type=llm_as_judge`, send `dataType`, `reasoning.description`, and `score.description`. Do not send `version`; it is not part of the public request shape. + - If `type` is omitted, Langfuse treats the request as `type=llm_as_judge` for backwards compatibility. New clients should send `type` explicitly. Unstable API note: - This surface may evolve while the underlying evaluation data model is being redesigned. Parameters ---------- - name : str - Evaluator name within the authenticated project. - - prompt : str - Prompt template used by the evaluator. - - output_definition : EvaluatorOutputDefinition - Structured output schema the evaluator must return. - - Always send `dataType`. - Do not send `version`; it is an internal storage detail and not part of the public request contract. - - model_config : typing.Optional[EvaluatorModelConfig] - Optional explicit model configuration. Omit or set to `null` to use the project default evaluation model. + request : CreateEvaluatorRequest request_options : typing.Optional[RequestOptions] Request-specific configuration. @@ -95,6 +85,7 @@ def create( EvaluatorOutputDefinition_Numeric, EvaluatorOutputFieldDefinition, ) + from langfuse.unstable.evaluators import CreateEvaluatorRequest_LlmAsJudge client = LangfuseAPI( x_langfuse_sdk_name="YOUR_X_LANGFUSE_SDK_NAME", @@ -105,29 +96,27 @@ def create( base_url="https://yourhost.com/path/to/api", ) client.unstable.evaluators.create( - name="answer-correctness", - prompt="You are grading an answer.\n\nInput:\n{{input}}\n\nOutput:\n{{output}}\n\nReturn a score between 0 and 1.\n", - output_definition=EvaluatorOutputDefinition_Numeric( - data_type=EvaluatorOutputDataType.NUMERIC, - reasoning=EvaluatorOutputFieldDefinition( - description="Explain why the score was assigned.", + request=CreateEvaluatorRequest_LlmAsJudge( + name="answer-correctness", + prompt="You are grading an answer.\n\nInput:\n{{input}}\n\nOutput:\n{{output}}\n\nReturn a score between 0 and 1.\n", + output_definition=EvaluatorOutputDefinition_Numeric( + data_type=EvaluatorOutputDataType.NUMERIC, + reasoning=EvaluatorOutputFieldDefinition( + description="Explain why the score was assigned.", + ), + score=EvaluatorOutputFieldDefinition( + description="Correctness score between 0 and 1.", + ), ), - score=EvaluatorOutputFieldDefinition( - description="Correctness score between 0 and 1.", + model_config=EvaluatorModelConfig( + provider="openai", + model="gpt-4.1-mini", ), ), - model_config=EvaluatorModelConfig( - provider="openai", - model="gpt-4.1-mini", - ), ) """ _response = self._raw_client.create( - name=name, - prompt=prompt, - output_definition=output_definition, - model_config=model_config, - request_options=request_options, + request=request, request_options=request_options ) return _response.data @@ -241,16 +230,15 @@ def with_raw_response(self) -> AsyncRawEvaluatorsClient: async def create( self, *, - name: str, - prompt: str, - output_definition: EvaluatorOutputDefinition, - model_config: typing.Optional[EvaluatorModelConfig] = OMIT, + request: CreateEvaluatorRequest, request_options: typing.Optional[RequestOptions] = None, ) -> Evaluator: """ Create an evaluator in the authenticated project. - Use evaluators to define **how** Langfuse should score data: the prompt, the expected structured output, and the optional model configuration. + Use evaluators to define **how** Langfuse should score data. + LLM-as-a-judge evaluators define a prompt, expected structured output, and optional model configuration. + Code evaluators define source code and a runtime language. Naming behavior: - If this is a new evaluator name in your project, Langfuse creates version `1`. @@ -263,30 +251,22 @@ async def create( 3. Read the returned `outputDefinition.dataType` so the client knows whether future scores will be numeric, boolean, or categorical. 4. Create one or more evaluation rules that reference the returned evaluator family using `name` and `scope`. + Code evaluator validation: + - At creation, Langfuse only validates the request shape + - The `sourceCode` itself is not executed here. It is first run (preflight-tested against a sample observation) when you link the evaluator to an evaluation rule, so runtime errors in the code surface at evaluation-rule creation, not at evaluator creation. + Recovery guidance: - `422` with `code=evaluator_preflight_failed`: the evaluator cannot run with the resolved model configuration. Add a valid explicit `modelConfig`, or configure the project's default evaluation model, then retry the same request. - `400` with `code=invalid_body`: the request shape is malformed. Use the structured `details.issues` array to fix the specific fields and retry. - - `400` with `code=invalid_body` on `outputDefinition`: send `dataType`, `reasoning.description`, and `score.description`. Do not send `version`; it is not part of the public request shape. + - `400` with `code=invalid_body` on `outputDefinition`: for `type=llm_as_judge`, send `dataType`, `reasoning.description`, and `score.description`. Do not send `version`; it is not part of the public request shape. + - If `type` is omitted, Langfuse treats the request as `type=llm_as_judge` for backwards compatibility. New clients should send `type` explicitly. Unstable API note: - This surface may evolve while the underlying evaluation data model is being redesigned. Parameters ---------- - name : str - Evaluator name within the authenticated project. - - prompt : str - Prompt template used by the evaluator. - - output_definition : EvaluatorOutputDefinition - Structured output schema the evaluator must return. - - Always send `dataType`. - Do not send `version`; it is an internal storage detail and not part of the public request contract. - - model_config : typing.Optional[EvaluatorModelConfig] - Optional explicit model configuration. Omit or set to `null` to use the project default evaluation model. + request : CreateEvaluatorRequest request_options : typing.Optional[RequestOptions] Request-specific configuration. @@ -306,6 +286,7 @@ async def create( EvaluatorOutputDefinition_Numeric, EvaluatorOutputFieldDefinition, ) + from langfuse.unstable.evaluators import CreateEvaluatorRequest_LlmAsJudge client = AsyncLangfuseAPI( x_langfuse_sdk_name="YOUR_X_LANGFUSE_SDK_NAME", @@ -319,32 +300,30 @@ async def create( async def main() -> None: await client.unstable.evaluators.create( - name="answer-correctness", - prompt="You are grading an answer.\n\nInput:\n{{input}}\n\nOutput:\n{{output}}\n\nReturn a score between 0 and 1.\n", - output_definition=EvaluatorOutputDefinition_Numeric( - data_type=EvaluatorOutputDataType.NUMERIC, - reasoning=EvaluatorOutputFieldDefinition( - description="Explain why the score was assigned.", + request=CreateEvaluatorRequest_LlmAsJudge( + name="answer-correctness", + prompt="You are grading an answer.\n\nInput:\n{{input}}\n\nOutput:\n{{output}}\n\nReturn a score between 0 and 1.\n", + output_definition=EvaluatorOutputDefinition_Numeric( + data_type=EvaluatorOutputDataType.NUMERIC, + reasoning=EvaluatorOutputFieldDefinition( + description="Explain why the score was assigned.", + ), + score=EvaluatorOutputFieldDefinition( + description="Correctness score between 0 and 1.", + ), ), - score=EvaluatorOutputFieldDefinition( - description="Correctness score between 0 and 1.", + model_config=EvaluatorModelConfig( + provider="openai", + model="gpt-4.1-mini", ), ), - model_config=EvaluatorModelConfig( - provider="openai", - model="gpt-4.1-mini", - ), ) asyncio.run(main()) """ _response = await self._raw_client.create( - name=name, - prompt=prompt, - output_definition=output_definition, - model_config=model_config, - request_options=request_options, + request=request, request_options=request_options ) return _response.data diff --git a/langfuse/api/unstable/evaluators/raw_client.py b/langfuse/api/unstable/evaluators/raw_client.py index f599e3298..30034d033 100644 --- a/langfuse/api/unstable/evaluators/raw_client.py +++ b/langfuse/api/unstable/evaluators/raw_client.py @@ -23,8 +23,6 @@ from ...core.pydantic_utilities import parse_obj_as from ...core.request_options import RequestOptions from ...core.serialization import convert_and_respect_annotation_metadata -from ..commons.types.evaluator_model_config import EvaluatorModelConfig -from ..commons.types.evaluator_output_definition import EvaluatorOutputDefinition from ..errors.errors.access_denied_error import ( AccessDeniedError as unstable_errors_errors_access_denied_error_AccessDeniedError, ) @@ -43,6 +41,7 @@ ) from ..errors.errors.unprocessable_content_error import UnprocessableContentError from ..errors.types.public_api_error import PublicApiError +from .types.create_evaluator_request import CreateEvaluatorRequest from .types.evaluator import Evaluator from .types.evaluators import Evaluators @@ -57,16 +56,15 @@ def __init__(self, *, client_wrapper: SyncClientWrapper): def create( self, *, - name: str, - prompt: str, - output_definition: EvaluatorOutputDefinition, - model_config: typing.Optional[EvaluatorModelConfig] = OMIT, + request: CreateEvaluatorRequest, request_options: typing.Optional[RequestOptions] = None, ) -> HttpResponse[Evaluator]: """ Create an evaluator in the authenticated project. - Use evaluators to define **how** Langfuse should score data: the prompt, the expected structured output, and the optional model configuration. + Use evaluators to define **how** Langfuse should score data. + LLM-as-a-judge evaluators define a prompt, expected structured output, and optional model configuration. + Code evaluators define source code and a runtime language. Naming behavior: - If this is a new evaluator name in your project, Langfuse creates version `1`. @@ -79,30 +77,22 @@ def create( 3. Read the returned `outputDefinition.dataType` so the client knows whether future scores will be numeric, boolean, or categorical. 4. Create one or more evaluation rules that reference the returned evaluator family using `name` and `scope`. + Code evaluator validation: + - At creation, Langfuse only validates the request shape + - The `sourceCode` itself is not executed here. It is first run (preflight-tested against a sample observation) when you link the evaluator to an evaluation rule, so runtime errors in the code surface at evaluation-rule creation, not at evaluator creation. + Recovery guidance: - `422` with `code=evaluator_preflight_failed`: the evaluator cannot run with the resolved model configuration. Add a valid explicit `modelConfig`, or configure the project's default evaluation model, then retry the same request. - `400` with `code=invalid_body`: the request shape is malformed. Use the structured `details.issues` array to fix the specific fields and retry. - - `400` with `code=invalid_body` on `outputDefinition`: send `dataType`, `reasoning.description`, and `score.description`. Do not send `version`; it is not part of the public request shape. + - `400` with `code=invalid_body` on `outputDefinition`: for `type=llm_as_judge`, send `dataType`, `reasoning.description`, and `score.description`. Do not send `version`; it is not part of the public request shape. + - If `type` is omitted, Langfuse treats the request as `type=llm_as_judge` for backwards compatibility. New clients should send `type` explicitly. Unstable API note: - This surface may evolve while the underlying evaluation data model is being redesigned. Parameters ---------- - name : str - Evaluator name within the authenticated project. - - prompt : str - Prompt template used by the evaluator. - - output_definition : EvaluatorOutputDefinition - Structured output schema the evaluator must return. - - Always send `dataType`. - Do not send `version`; it is an internal storage detail and not part of the public request contract. - - model_config : typing.Optional[EvaluatorModelConfig] - Optional explicit model configuration. Omit or set to `null` to use the project default evaluation model. + request : CreateEvaluatorRequest request_options : typing.Optional[RequestOptions] Request-specific configuration. @@ -114,20 +104,9 @@ def create( _response = self._client_wrapper.httpx_client.request( "api/public/unstable/evaluators", method="POST", - json={ - "name": name, - "prompt": prompt, - "outputDefinition": convert_and_respect_annotation_metadata( - object_=output_definition, - annotation=EvaluatorOutputDefinition, - direction="write", - ), - "modelConfig": convert_and_respect_annotation_metadata( - object_=model_config, - annotation=typing.Optional[EvaluatorModelConfig], - direction="write", - ), - }, + json=convert_and_respect_annotation_metadata( + object_=request, annotation=CreateEvaluatorRequest, direction="write" + ), request_options=request_options, omit=OMIT, ) @@ -671,16 +650,15 @@ def __init__(self, *, client_wrapper: AsyncClientWrapper): async def create( self, *, - name: str, - prompt: str, - output_definition: EvaluatorOutputDefinition, - model_config: typing.Optional[EvaluatorModelConfig] = OMIT, + request: CreateEvaluatorRequest, request_options: typing.Optional[RequestOptions] = None, ) -> AsyncHttpResponse[Evaluator]: """ Create an evaluator in the authenticated project. - Use evaluators to define **how** Langfuse should score data: the prompt, the expected structured output, and the optional model configuration. + Use evaluators to define **how** Langfuse should score data. + LLM-as-a-judge evaluators define a prompt, expected structured output, and optional model configuration. + Code evaluators define source code and a runtime language. Naming behavior: - If this is a new evaluator name in your project, Langfuse creates version `1`. @@ -693,30 +671,22 @@ async def create( 3. Read the returned `outputDefinition.dataType` so the client knows whether future scores will be numeric, boolean, or categorical. 4. Create one or more evaluation rules that reference the returned evaluator family using `name` and `scope`. + Code evaluator validation: + - At creation, Langfuse only validates the request shape + - The `sourceCode` itself is not executed here. It is first run (preflight-tested against a sample observation) when you link the evaluator to an evaluation rule, so runtime errors in the code surface at evaluation-rule creation, not at evaluator creation. + Recovery guidance: - `422` with `code=evaluator_preflight_failed`: the evaluator cannot run with the resolved model configuration. Add a valid explicit `modelConfig`, or configure the project's default evaluation model, then retry the same request. - `400` with `code=invalid_body`: the request shape is malformed. Use the structured `details.issues` array to fix the specific fields and retry. - - `400` with `code=invalid_body` on `outputDefinition`: send `dataType`, `reasoning.description`, and `score.description`. Do not send `version`; it is not part of the public request shape. + - `400` with `code=invalid_body` on `outputDefinition`: for `type=llm_as_judge`, send `dataType`, `reasoning.description`, and `score.description`. Do not send `version`; it is not part of the public request shape. + - If `type` is omitted, Langfuse treats the request as `type=llm_as_judge` for backwards compatibility. New clients should send `type` explicitly. Unstable API note: - This surface may evolve while the underlying evaluation data model is being redesigned. Parameters ---------- - name : str - Evaluator name within the authenticated project. - - prompt : str - Prompt template used by the evaluator. - - output_definition : EvaluatorOutputDefinition - Structured output schema the evaluator must return. - - Always send `dataType`. - Do not send `version`; it is an internal storage detail and not part of the public request contract. - - model_config : typing.Optional[EvaluatorModelConfig] - Optional explicit model configuration. Omit or set to `null` to use the project default evaluation model. + request : CreateEvaluatorRequest request_options : typing.Optional[RequestOptions] Request-specific configuration. @@ -728,20 +698,9 @@ async def create( _response = await self._client_wrapper.httpx_client.request( "api/public/unstable/evaluators", method="POST", - json={ - "name": name, - "prompt": prompt, - "outputDefinition": convert_and_respect_annotation_metadata( - object_=output_definition, - annotation=EvaluatorOutputDefinition, - direction="write", - ), - "modelConfig": convert_and_respect_annotation_metadata( - object_=model_config, - annotation=typing.Optional[EvaluatorModelConfig], - direction="write", - ), - }, + json=convert_and_respect_annotation_metadata( + object_=request, annotation=CreateEvaluatorRequest, direction="write" + ), request_options=request_options, omit=OMIT, ) diff --git a/langfuse/api/unstable/evaluators/types/__init__.py b/langfuse/api/unstable/evaluators/types/__init__.py index 6e7a13233..650598592 100644 --- a/langfuse/api/unstable/evaluators/types/__init__.py +++ b/langfuse/api/unstable/evaluators/types/__init__.py @@ -6,13 +6,31 @@ from importlib import import_module if typing.TYPE_CHECKING: - from .create_evaluator_request import CreateEvaluatorRequest - from .evaluator import Evaluator + from .code_evaluator import CodeEvaluator + from .create_code_evaluator_request import CreateCodeEvaluatorRequest + from .create_evaluator_request import ( + CreateEvaluatorRequest, + CreateEvaluatorRequest_Code, + CreateEvaluatorRequest_LlmAsJudge, + ) + from .create_llm_as_judge_evaluator_request import CreateLlmAsJudgeEvaluatorRequest + from .evaluator import Evaluator, Evaluator_Code, Evaluator_LlmAsJudge + from .evaluator_base import EvaluatorBase from .evaluators import Evaluators + from .llm_as_judge_evaluator import LlmAsJudgeEvaluator _dynamic_imports: typing.Dict[str, str] = { + "CodeEvaluator": ".code_evaluator", + "CreateCodeEvaluatorRequest": ".create_code_evaluator_request", "CreateEvaluatorRequest": ".create_evaluator_request", + "CreateEvaluatorRequest_Code": ".create_evaluator_request", + "CreateEvaluatorRequest_LlmAsJudge": ".create_evaluator_request", + "CreateLlmAsJudgeEvaluatorRequest": ".create_llm_as_judge_evaluator_request", "Evaluator": ".evaluator", + "EvaluatorBase": ".evaluator_base", + "Evaluator_Code": ".evaluator", + "Evaluator_LlmAsJudge": ".evaluator", "Evaluators": ".evaluators", + "LlmAsJudgeEvaluator": ".llm_as_judge_evaluator", } @@ -43,4 +61,17 @@ def __dir__(): return sorted(lazy_attrs) -__all__ = ["CreateEvaluatorRequest", "Evaluator", "Evaluators"] +__all__ = [ + "CodeEvaluator", + "CreateCodeEvaluatorRequest", + "CreateEvaluatorRequest", + "CreateEvaluatorRequest_Code", + "CreateEvaluatorRequest_LlmAsJudge", + "CreateLlmAsJudgeEvaluatorRequest", + "Evaluator", + "EvaluatorBase", + "Evaluator_Code", + "Evaluator_LlmAsJudge", + "Evaluators", + "LlmAsJudgeEvaluator", +] diff --git a/langfuse/api/unstable/evaluators/types/code_evaluator.py b/langfuse/api/unstable/evaluators/types/code_evaluator.py new file mode 100644 index 000000000..f8648603d --- /dev/null +++ b/langfuse/api/unstable/evaluators/types/code_evaluator.py @@ -0,0 +1,31 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +import pydantic +import typing_extensions +from ....core.serialization import FieldMetadata +from ...commons.types.code_evaluator_source_code_language import ( + CodeEvaluatorSourceCodeLanguage, +) +from .evaluator_base import EvaluatorBase + + +class CodeEvaluator(EvaluatorBase): + source_code: typing_extensions.Annotated[str, FieldMetadata(alias="sourceCode")] = ( + pydantic.Field() + ) + """ + Source code executed for each matched observation. + """ + + source_code_language: typing_extensions.Annotated[ + CodeEvaluatorSourceCodeLanguage, FieldMetadata(alias="sourceCodeLanguage") + ] = pydantic.Field() + """ + Runtime language for `sourceCode`. + """ + + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict( + extra="allow", frozen=True + ) diff --git a/langfuse/api/unstable/evaluators/types/create_code_evaluator_request.py b/langfuse/api/unstable/evaluators/types/create_code_evaluator_request.py new file mode 100644 index 000000000..860c15f9a --- /dev/null +++ b/langfuse/api/unstable/evaluators/types/create_code_evaluator_request.py @@ -0,0 +1,36 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +import pydantic +import typing_extensions +from ....core.pydantic_utilities import UniversalBaseModel +from ....core.serialization import FieldMetadata +from ...commons.types.code_evaluator_source_code_language import ( + CodeEvaluatorSourceCodeLanguage, +) + + +class CreateCodeEvaluatorRequest(UniversalBaseModel): + name: str = pydantic.Field() + """ + Evaluator name within the authenticated project. + """ + + source_code: typing_extensions.Annotated[str, FieldMetadata(alias="sourceCode")] = ( + pydantic.Field() + ) + """ + Code executed for each matched observation. + """ + + source_code_language: typing_extensions.Annotated[ + CodeEvaluatorSourceCodeLanguage, FieldMetadata(alias="sourceCodeLanguage") + ] = pydantic.Field() + """ + Runtime language for `sourceCode`. + """ + + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict( + extra="allow", frozen=True + ) diff --git a/langfuse/api/unstable/evaluators/types/create_evaluator_request.py b/langfuse/api/unstable/evaluators/types/create_evaluator_request.py index 7616d99ee..a866aa4c5 100644 --- a/langfuse/api/unstable/evaluators/types/create_evaluator_request.py +++ b/langfuse/api/unstable/evaluators/types/create_evaluator_request.py @@ -1,50 +1,66 @@ # This file was auto-generated by Fern from our API Definition. +from __future__ import annotations + import typing import pydantic import typing_extensions from ....core.pydantic_utilities import UniversalBaseModel from ....core.serialization import FieldMetadata +from ...commons.types.code_evaluator_source_code_language import ( + CodeEvaluatorSourceCodeLanguage, +) from ...commons.types.evaluator_model_config import EvaluatorModelConfig from ...commons.types.evaluator_output_definition import EvaluatorOutputDefinition -class CreateEvaluatorRequest(UniversalBaseModel): +class CreateEvaluatorRequest_LlmAsJudge(UniversalBaseModel): """ Request body for creating an evaluator. If the same `name` already exists in your project, Langfuse creates the next version and returns it. Existing evaluation rules in the same project are then moved to that new latest version automatically. + If `type` is omitted, Langfuse defaults it to `llm_as_judge` for backwards compatibility. """ - name: str = pydantic.Field() - """ - Evaluator name within the authenticated project. - """ - - prompt: str = pydantic.Field() - """ - Prompt template used by the evaluator. - """ - + type: typing.Literal["llm_as_judge"] = "llm_as_judge" + name: str + prompt: str output_definition: typing_extensions.Annotated[ EvaluatorOutputDefinition, FieldMetadata(alias="outputDefinition") - ] = pydantic.Field() - """ - Structured output schema the evaluator must return. - - Always send `dataType`. - Do not send `version`; it is an internal storage detail and not part of the public request contract. - """ - + ] model_config_: typing_extensions.Annotated[ typing.Optional[EvaluatorModelConfig], FieldMetadata(alias="modelConfig") - ] = pydantic.Field(default=None) + ] = None + + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict( + extra="allow", frozen=True + ) + + +class CreateEvaluatorRequest_Code(UniversalBaseModel): """ - Optional explicit model configuration. Omit or set to `null` to use the project default evaluation model. + Request body for creating an evaluator. + + If the same `name` already exists in your project, Langfuse creates the next version and returns it. + Existing evaluation rules in the same project are then moved to that new latest version automatically. + If `type` is omitted, Langfuse defaults it to `llm_as_judge` for backwards compatibility. """ + type: typing.Literal["code"] = "code" + name: str + source_code: typing_extensions.Annotated[str, FieldMetadata(alias="sourceCode")] + source_code_language: typing_extensions.Annotated[ + CodeEvaluatorSourceCodeLanguage, FieldMetadata(alias="sourceCodeLanguage") + ] + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict( extra="allow", frozen=True ) + + +CreateEvaluatorRequest = typing_extensions.Annotated[ + typing.Union[CreateEvaluatorRequest_LlmAsJudge, CreateEvaluatorRequest_Code], + pydantic.Field(discriminator="type"), +] diff --git a/langfuse/api/unstable/evaluators/types/create_llm_as_judge_evaluator_request.py b/langfuse/api/unstable/evaluators/types/create_llm_as_judge_evaluator_request.py new file mode 100644 index 000000000..09e121b1b --- /dev/null +++ b/langfuse/api/unstable/evaluators/types/create_llm_as_judge_evaluator_request.py @@ -0,0 +1,43 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +import pydantic +import typing_extensions +from ....core.pydantic_utilities import UniversalBaseModel +from ....core.serialization import FieldMetadata +from ...commons.types.evaluator_model_config import EvaluatorModelConfig +from ...commons.types.evaluator_output_definition import EvaluatorOutputDefinition + + +class CreateLlmAsJudgeEvaluatorRequest(UniversalBaseModel): + name: str = pydantic.Field() + """ + Evaluator name within the authenticated project. + """ + + prompt: str = pydantic.Field() + """ + Prompt template used by the evaluator. + """ + + output_definition: typing_extensions.Annotated[ + EvaluatorOutputDefinition, FieldMetadata(alias="outputDefinition") + ] = pydantic.Field() + """ + Structured output schema the evaluator must return. + + Always send `dataType`. + Do not send `version`; it is an internal storage detail and not part of the public request contract. + """ + + model_config_: typing_extensions.Annotated[ + typing.Optional[EvaluatorModelConfig], FieldMetadata(alias="modelConfig") + ] = pydantic.Field(default=None) + """ + Optional explicit model configuration. Omit or set to `null` to use the project default evaluation model. + """ + + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict( + extra="allow", frozen=True + ) diff --git a/langfuse/api/unstable/evaluators/types/evaluator.py b/langfuse/api/unstable/evaluators/types/evaluator.py index 8023839fc..69295e0fd 100644 --- a/langfuse/api/unstable/evaluators/types/evaluator.py +++ b/langfuse/api/unstable/evaluators/types/evaluator.py @@ -1,5 +1,7 @@ # This file was auto-generated by Fern from our API Definition. +from __future__ import annotations + import datetime as dt import typing @@ -7,30 +9,27 @@ import typing_extensions from ....core.pydantic_utilities import UniversalBaseModel from ....core.serialization import FieldMetadata +from ...commons.types.code_evaluator_source_code_language import ( + CodeEvaluatorSourceCodeLanguage, +) from ...commons.types.evaluator_model_config import EvaluatorModelConfig from ...commons.types.evaluator_scope import EvaluatorScope -from ...commons.types.evaluator_type import EvaluatorType from ...commons.types.public_evaluator_output_definition import ( PublicEvaluatorOutputDefinition, ) -class Evaluator(UniversalBaseModel): +class Evaluator_LlmAsJudge(UniversalBaseModel): """ One evaluator that can be used for scoring. - An evaluator describes **how** to score data: - - prompt - - extracted prompt variables - - output schema - - optional explicit model configuration + An evaluator describes **how** to score data. It does not define **which** live objects are evaluated. That is the job of `evaluation-rules`. For agent clients, the most important fields are: - - `variables`: use these exact names when building the evaluation-rule `mapping` array - - `outputDefinition`: tells you the expected score type and the evaluator's response instructions - - `modelConfig`: tells you whether the evaluator uses the project default model (`null`) or an explicit provider/model + - `type`: determines which evaluator fields are present + - `variables`: for LLM evaluators, use these exact names when building the evaluation-rule `mapping` array. LLM evaluators require every variable to be mapped. Code evaluators always expose the fixed runtime payload fields and Langfuse maps them automatically. Versioning behavior: - `GET /evaluators` returns the latest version of each available evaluator. @@ -38,81 +37,78 @@ class Evaluator(UniversalBaseModel): - Evaluation rules always run against the latest version for the selected evaluator name within the same source (`project` or `managed`). """ - id: str = pydantic.Field() - """ - Identifier of this evaluator. - """ + type: typing.Literal["llm_as_judge"] = "llm_as_judge" + prompt: str + output_definition: typing_extensions.Annotated[ + PublicEvaluatorOutputDefinition, FieldMetadata(alias="outputDefinition") + ] + model_config_: typing_extensions.Annotated[ + typing.Optional[EvaluatorModelConfig], FieldMetadata(alias="modelConfig") + ] = None + id: str + name: str + version: int + scope: EvaluatorScope + variables: typing.List[str] + evaluation_rule_count: typing_extensions.Annotated[ + int, FieldMetadata(alias="evaluationRuleCount") + ] + created_at: typing_extensions.Annotated[ + dt.datetime, FieldMetadata(alias="createdAt") + ] + updated_at: typing_extensions.Annotated[ + dt.datetime, FieldMetadata(alias="updatedAt") + ] - name: str = pydantic.Field() - """ - Evaluator name. - """ + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict( + extra="allow", frozen=True + ) - version: int = pydantic.Field() - """ - Version number of this evaluator. - """ - scope: EvaluatorScope = pydantic.Field() - """ - Where this evaluator comes from: your project or Langfuse-managed defaults. +class Evaluator_Code(UniversalBaseModel): """ + One evaluator that can be used for scoring. - type: EvaluatorType = pydantic.Field() - """ - Evaluator engine type. Currently always `llm_as_judge`. - """ + An evaluator describes **how** to score data. - prompt: str = pydantic.Field() - """ - Prompt template used during evaluation. - """ - - variables: typing.List[str] = pydantic.Field() - """ - Variables extracted from the evaluator prompt. - - Every variable in this list must be mapped exactly once when creating an evaluation rule. - """ + It does not define **which** live objects are evaluated. That is the job of `evaluation-rules`. - output_definition: typing_extensions.Annotated[ - PublicEvaluatorOutputDefinition, FieldMetadata(alias="outputDefinition") - ] = pydantic.Field() - """ - Structured output schema returned by this evaluator. - - Responses always include `dataType` and omit the internal output-definition `version`. - Use `dataType` to decide how future scores should be interpreted. - """ + For agent clients, the most important fields are: + - `type`: determines which evaluator fields are present + - `variables`: for LLM evaluators, use these exact names when building the evaluation-rule `mapping` array. LLM evaluators require every variable to be mapped. Code evaluators always expose the fixed runtime payload fields and Langfuse maps them automatically. - model_config_: typing_extensions.Annotated[ - typing.Optional[EvaluatorModelConfig], FieldMetadata(alias="modelConfig") - ] = pydantic.Field(default=None) - """ - Explicit model configuration, or `null` when the project default evaluation model is used. + Versioning behavior: + - `GET /evaluators` returns the latest version of each available evaluator. + - `GET /evaluators/{id}` can return an older version. + - Evaluation rules always run against the latest version for the selected evaluator name within the same source (`project` or `managed`). """ + type: typing.Literal["code"] = "code" + source_code: typing_extensions.Annotated[str, FieldMetadata(alias="sourceCode")] + source_code_language: typing_extensions.Annotated[ + CodeEvaluatorSourceCodeLanguage, FieldMetadata(alias="sourceCodeLanguage") + ] + id: str + name: str + version: int + scope: EvaluatorScope + variables: typing.List[str] evaluation_rule_count: typing_extensions.Annotated[ int, FieldMetadata(alias="evaluationRuleCount") - ] = pydantic.Field() - """ - Number of evaluation rules in the project that currently use this evaluator version. - """ - + ] created_at: typing_extensions.Annotated[ dt.datetime, FieldMetadata(alias="createdAt") - ] = pydantic.Field() - """ - Timestamp when this evaluator was created. - """ - + ] updated_at: typing_extensions.Annotated[ dt.datetime, FieldMetadata(alias="updatedAt") - ] = pydantic.Field() - """ - Timestamp when this evaluator was last updated. - """ + ] model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict( extra="allow", frozen=True ) + + +Evaluator = typing_extensions.Annotated[ + typing.Union[Evaluator_LlmAsJudge, Evaluator_Code], + pydantic.Field(discriminator="type"), +] diff --git a/langfuse/api/unstable/evaluators/types/evaluator_base.py b/langfuse/api/unstable/evaluators/types/evaluator_base.py new file mode 100644 index 000000000..7a8362657 --- /dev/null +++ b/langfuse/api/unstable/evaluators/types/evaluator_base.py @@ -0,0 +1,64 @@ +# This file was auto-generated by Fern from our API Definition. + +import datetime as dt +import typing + +import pydantic +import typing_extensions +from ....core.pydantic_utilities import UniversalBaseModel +from ....core.serialization import FieldMetadata +from ...commons.types.evaluator_scope import EvaluatorScope + + +class EvaluatorBase(UniversalBaseModel): + id: str = pydantic.Field() + """ + Identifier of this evaluator. + """ + + name: str = pydantic.Field() + """ + Evaluator name. + """ + + version: int = pydantic.Field() + """ + Version number of this evaluator. + """ + + scope: EvaluatorScope = pydantic.Field() + """ + Where this evaluator comes from: your project or Langfuse-managed defaults. + """ + + variables: typing.List[str] = pydantic.Field() + """ + Variables that can be mapped when creating an evaluation rule. + + LLM evaluators require every variable to be mapped exactly once. Code evaluators always expose the fixed runtime payload fields and Langfuse maps them automatically. + """ + + evaluation_rule_count: typing_extensions.Annotated[ + int, FieldMetadata(alias="evaluationRuleCount") + ] = pydantic.Field() + """ + Number of evaluation rules in the project that currently use this evaluator version. + """ + + created_at: typing_extensions.Annotated[ + dt.datetime, FieldMetadata(alias="createdAt") + ] = pydantic.Field() + """ + Timestamp when this evaluator was created. + """ + + updated_at: typing_extensions.Annotated[ + dt.datetime, FieldMetadata(alias="updatedAt") + ] = pydantic.Field() + """ + Timestamp when this evaluator was last updated. + """ + + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict( + extra="allow", frozen=True + ) diff --git a/langfuse/api/unstable/evaluators/types/llm_as_judge_evaluator.py b/langfuse/api/unstable/evaluators/types/llm_as_judge_evaluator.py new file mode 100644 index 000000000..0cf186f47 --- /dev/null +++ b/langfuse/api/unstable/evaluators/types/llm_as_judge_evaluator.py @@ -0,0 +1,40 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +import pydantic +import typing_extensions +from ....core.serialization import FieldMetadata +from ...commons.types.evaluator_model_config import EvaluatorModelConfig +from ...commons.types.public_evaluator_output_definition import ( + PublicEvaluatorOutputDefinition, +) +from .evaluator_base import EvaluatorBase + + +class LlmAsJudgeEvaluator(EvaluatorBase): + prompt: str = pydantic.Field() + """ + Prompt template used during evaluation. + """ + + output_definition: typing_extensions.Annotated[ + PublicEvaluatorOutputDefinition, FieldMetadata(alias="outputDefinition") + ] = pydantic.Field() + """ + Structured output schema returned by this evaluator. + + Responses always include `dataType` and omit the internal output-definition `version`. + Use `dataType` to decide how future scores should be interpreted. + """ + + model_config_: typing_extensions.Annotated[ + typing.Optional[EvaluatorModelConfig], FieldMetadata(alias="modelConfig") + ] = pydantic.Field(default=None) + """ + Explicit model configuration, or `null` when the project default evaluation model is used. + """ + + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict( + extra="allow", frozen=True + )